Tutorial: http://pandas.pydata.org/pandas-docs/stable/10min.html

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

##### Object Creation

Create a series

In [11]:
s = pd.Series([10, 33, 75, np.nan, 106, 8])

In [30]:
s

0     10.0
1     33.0
2     75.0
3      NaN
4    106.0
5      8.0
dtype: float64

Create a DataFrame by passing a numpy array w/datetime index & labeled columns

In [31]:
dates = pd.date_range('20170608', periods=16) ##, tz='Africa/Kampala')
#pd.date_range?

In [32]:
dates

DatetimeIndex(['2017-06-08', '2017-06-09', '2017-06-10', '2017-06-11',
               '2017-06-12', '2017-06-13', '2017-06-14', '2017-06-15',
               '2017-06-16', '2017-06-17', '2017-06-18', '2017-06-19',
               '2017-06-20', '2017-06-21', '2017-06-22', '2017-06-23'],
              dtype='datetime64[ns]', freq='D')

In [39]:
# create a dataframe here
# pd.DataFrame? pd.DataFrame(self, data=None, index=None, columns=None, dtype=None, copy=False)
df = pd.DataFrame(np.random.randn(16,5), index=dates, columns=list('ABCDE'))  #data dims match index: 16==16 | cols= 3=='ABC'

In [40]:
df

Unnamed: 0,A,B,C,D,E
2017-06-08,0.801781,1.136717,1.188838,-0.40212,-0.235515
2017-06-09,0.372936,-1.435038,-1.286853,0.831206,-1.135913
2017-06-10,0.18756,0.460868,-1.805518,0.58242,0.399372
2017-06-11,2.088195,0.691398,-0.261565,-0.139224,-0.487922
2017-06-12,1.244276,1.2096,0.033265,1.376887,2.805198
2017-06-13,0.036693,1.028335,-0.196918,0.335599,-0.255349
2017-06-14,0.23079,0.280319,-0.289356,0.279247,1.028231
2017-06-15,1.853709,0.300233,2.032685,0.660566,-0.946793
2017-06-16,1.214562,0.474831,-1.545191,2.401952,1.478247
2017-06-17,0.432238,0.779668,1.082635,-1.632081,-0.745487


Create a second DF - passing dict of objects

In [43]:
df2 = pd.DataFrame({ 'A' : 1.,
                        'B' : pd.Timestamp('20130102'),
                         'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                         'D' : np.array([3] * 4,dtype='int32'),
                         'E' : pd.Categorical(["test","train","test","train"]),
                         'F' : 'foo' }) 

In [44]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [46]:
df2.dtypes     #detects type of data

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [51]:
# If you’re using IPython, tab completion for column names (as well as public attributes) is automatically enabled.
# Here’s a subset of the attributes that will be completed:
df2.   #tab completion

SyntaxError: invalid syntax (<ipython-input-51-f33bfe1f6a59>, line 3)

VIEWING DATA

In [52]:
df.index

DatetimeIndex(['2017-06-08', '2017-06-09', '2017-06-10', '2017-06-11',
               '2017-06-12', '2017-06-13', '2017-06-14', '2017-06-15',
               '2017-06-16', '2017-06-17', '2017-06-18', '2017-06-19',
               '2017-06-20', '2017-06-21', '2017-06-22', '2017-06-23'],
              dtype='datetime64[ns]', freq='D')

In [53]:
df.columns

Index([u'A', u'B', u'C', u'D', u'E'], dtype='object')

In [54]:
df.values

array([[ 0.80178103,  1.13671672,  1.18883826, -0.4021196 , -0.23551548],
       [ 0.37293603, -1.43503796, -1.28685305,  0.83120639, -1.13591308],
       [ 0.18755988,  0.46086831, -1.80551754,  0.58242006,  0.39937236],
       [ 2.08819505,  0.69139766, -0.26156534, -0.13922403, -0.48792182],
       [ 1.24427569,  1.20959971,  0.03326541,  1.37688746,  2.80519755],
       [ 0.03669339,  1.02833508, -0.19691752,  0.3355986 , -0.25534894],
       [ 0.23079024,  0.28031887, -0.28935576,  0.27924685,  1.02823084],
       [ 1.85370879,  0.30023301,  2.03268451,  0.66056559, -0.94679261],
       [ 1.21456156,  0.47483095, -1.54519117,  2.40195208,  1.47824677],
       [ 0.4322375 ,  0.77966836,  1.08263496, -1.63208089, -0.74548661],
       [ 0.73840275,  1.2520091 , -0.33780033, -1.04703191,  1.60174434],
       [ 0.76990115,  0.51315186,  1.93035886,  0.9793123 , -0.52022208],
       [-0.7353359 , -1.2757567 , -0.09617301, -0.05641755, -1.17687007],
       [-0.60202451, -0.14064203,  0.8

In [56]:
# quick descriptive stats
df.describe()   

Unnamed: 0,A,B,C,D,E
count,16.0,16.0,16.0,16.0,16.0
mean,0.712616,0.297807,0.155551,0.228682,0.157019
std,0.802061,0.961161,1.217622,1.056823,1.184841
min,-0.735336,-1.687746,-1.805518,-1.632081,-1.17687
25%,0.219983,0.175079,-0.420289,-0.247545,-0.795813
50%,0.754152,0.493991,-0.146545,0.307423,-0.245432
75%,1.252181,1.05543,1.109186,0.868233,1.113092
max,2.088195,1.252009,2.032685,2.401952,2.805198


In [60]:
# transposing data
df.transpose?

In [62]:
df.T

Unnamed: 0,2017-06-08 00:00:00,2017-06-09 00:00:00,2017-06-10 00:00:00,2017-06-11 00:00:00,2017-06-12 00:00:00,2017-06-13 00:00:00,2017-06-14 00:00:00,2017-06-15 00:00:00,2017-06-16 00:00:00,2017-06-17 00:00:00,2017-06-18 00:00:00,2017-06-19 00:00:00,2017-06-20 00:00:00,2017-06-21 00:00:00,2017-06-22 00:00:00,2017-06-23 00:00:00
A,0.801781,0.372936,0.18756,2.088195,1.244276,0.036693,0.23079,1.853709,1.214562,0.432238,0.738403,0.769901,-0.735336,-0.602025,1.492279,1.275898
B,1.136717,-1.435038,0.460868,0.691398,1.2096,1.028335,0.280319,0.300233,0.474831,0.779668,1.252009,0.513152,-1.275757,-0.140642,1.17697,-1.687746
C,1.188838,-1.286853,-1.805518,-0.261565,0.033265,-0.196918,-0.289356,2.032685,-1.545191,1.082635,-0.3378,1.930359,-0.096173,0.879304,-0.667756,1.828858
D,-0.40212,0.831206,0.58242,-0.139224,1.376887,0.335599,0.279247,0.660566,2.401952,-1.632081,-1.047032,0.979312,-0.056418,-0.196021,1.122184,-1.437563
E,-0.235515,-1.135913,0.399372,-0.487922,2.805198,-0.255349,1.028231,-0.946793,1.478247,-0.745487,1.601744,-0.520222,-1.17687,1.367677,-0.951567,0.28747


In [64]:
# sorting by an axis
df.sort_index(axis=1, ascending=False)

Unnamed: 0,E,D,C,B,A
2017-06-08,-0.235515,-0.40212,1.188838,1.136717,0.801781
2017-06-09,-1.135913,0.831206,-1.286853,-1.435038,0.372936
2017-06-10,0.399372,0.58242,-1.805518,0.460868,0.18756
2017-06-11,-0.487922,-0.139224,-0.261565,0.691398,2.088195
2017-06-12,2.805198,1.376887,0.033265,1.2096,1.244276
2017-06-13,-0.255349,0.335599,-0.196918,1.028335,0.036693
2017-06-14,1.028231,0.279247,-0.289356,0.280319,0.23079
2017-06-15,-0.946793,0.660566,2.032685,0.300233,1.853709
2017-06-16,1.478247,2.401952,-1.545191,0.474831,1.214562
2017-06-17,-0.745487,-1.632081,1.082635,0.779668,0.432238


In [68]:
df.sort_values(by='E', ascending = False)

Unnamed: 0,A,B,C,D,E
2017-06-12,1.244276,1.2096,0.033265,1.376887,2.805198
2017-06-18,0.738403,1.252009,-0.3378,-1.047032,1.601744
2017-06-16,1.214562,0.474831,-1.545191,2.401952,1.478247
2017-06-21,-0.602025,-0.140642,0.879304,-0.196021,1.367677
2017-06-14,0.23079,0.280319,-0.289356,0.279247,1.028231
2017-06-10,0.18756,0.460868,-1.805518,0.58242,0.399372
2017-06-23,1.275898,-1.687746,1.828858,-1.437563,0.28747
2017-06-08,0.801781,1.136717,1.188838,-0.40212,-0.235515
2017-06-13,0.036693,1.028335,-0.196918,0.335599,-0.255349
2017-06-11,2.088195,0.691398,-0.261565,-0.139224,-0.487922


##### A note about selections:
Note While standard Python / Numpy expressions for selecting and setting are intuitive and come in handy for interactive work, 
for production code, we recommend the optimized pandas data access methods, .at, .iat, .loc, .iloc and .ix.