## Pandas 10 minute Tutorial
Notes taken from the official *10 Minutes to pandas* guide: http://pandas.pydata.org/pandas-docs/stable/10min.html

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Create a Pandas **series**

In [18]:
# A Series built from a plain list gets a default integer index (0..n-1).
s = pd.Series(data=[1, 2, 3, 6, 7])

s

0    1
1    2
2    3
3    6
4    7
dtype: int64

## Create a Pandas **DataFrame** by passing a NumPy array, an index, and column labels

In [19]:
# Five consecutive calendar days starting 2016-06-24; used as the row index below.
dates = pd.date_range(start='20160624', periods=5, freq='D')
dates

DatetimeIndex(['2016-06-24', '2016-06-25', '2016-06-26', '2016-06-27',
               '2016-06-28'],
              dtype='datetime64[ns]', freq='D')

In [20]:
# 5x4 frame of standard-normal draws: rows labelled by `dates`, columns by name.
# No seed is set in the cells shown here, so re-running produces different numbers.
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=dates,
    columns=['one', 'two', 'three', 'four'],
)
df

Unnamed: 0,one,two,three,four
2016-06-24,-0.865824,-0.259413,-0.278707,-0.083715
2016-06-25,2.37769,-0.603749,-0.364608,-0.829026
2016-06-26,0.195804,0.305053,1.295641,-1.087357
2016-06-27,0.926671,-0.749013,-0.218765,-0.159252
2016-06-28,-2.008621,0.431193,-0.481636,0.869175


## Create a Pandas **DataFrame** by passing a dictionary of objects

In [21]:
# Each key becomes a column and each list supplies that column's values.
# The mapping is deliberately NOT named `dict`: binding that name would shadow
# the builtin `dict` type for every cell executed afterwards.
column_data = {'x': [1, 2, 3], 'y': [2, 3, 1], 'z': [0, 4, 5]}
df2 = pd.DataFrame(column_data)
df2

Unnamed: 0,x,y,z
0,1,2,0
1,2,3,4
2,3,1,5


In [22]:
# Build a frame from a dict whose values have different types. The scalar
# entries (A, B, F) are repeated on every row; C, D and E each carry four values.
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp('20130102'),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo' })
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(df2.dtypes)
df2

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [23]:
# The three building blocks of a DataFrame: row labels, column labels, raw values.
# print() is used as a function so the cell runs on Python 2 and Python 3 alike.
print(df2.index)
print(df2.columns)
print(df2.values)

Int64Index([0, 1, 2, 3], dtype='int64')
Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]


## **Transposing** your data

In [24]:
# Swap rows and columns: the dates become column headers.
df.transpose()

Unnamed: 0,2016-06-24 00:00:00,2016-06-25 00:00:00,2016-06-26 00:00:00,2016-06-27 00:00:00,2016-06-28 00:00:00
one,-0.865824,2.37769,0.195804,0.926671,-2.008621
two,-0.259413,-0.603749,0.305053,-0.749013,0.431193
three,-0.278707,-0.364608,1.295641,-0.218765,-0.481636
four,-0.083715,-0.829026,-1.087357,-0.159252,0.869175


## **Sorting** by an axis

In [25]:
# Reorder the columns alphabetically by their labels (four, one, three, two).
df.sort_index(axis='columns', ascending=True)

Unnamed: 0,four,one,three,two
2016-06-24,-0.083715,-0.865824,-0.278707,-0.259413
2016-06-25,-0.829026,2.37769,-0.364608,-0.603749
2016-06-26,-1.087357,0.195804,1.295641,0.305053
2016-06-27,-0.159252,0.926671,-0.218765,-0.749013
2016-06-28,0.869175,-2.008621,-0.481636,0.431193


In [26]:
# Reorder the rows by their index labels, latest date first.
df.sort_index(axis='index', ascending=False)

Unnamed: 0,one,two,three,four
2016-06-28,-2.008621,0.431193,-0.481636,0.869175
2016-06-27,0.926671,-0.749013,-0.218765,-0.159252
2016-06-26,0.195804,0.305053,1.295641,-1.087357
2016-06-25,2.37769,-0.603749,-0.364608,-0.829026
2016-06-24,-0.865824,-0.259413,-0.278707,-0.083715


In [27]:
# Reorder the rows by the values in column 'one', smallest first.
df.sort_values(by='one', axis=0, ascending=True)

Unnamed: 0,one,two,three,four
2016-06-28,-2.008621,0.431193,-0.481636,0.869175
2016-06-24,-0.865824,-0.259413,-0.278707,-0.083715
2016-06-26,0.195804,0.305053,1.295641,-1.087357
2016-06-27,0.926671,-0.749013,-0.218765,-0.159252
2016-06-25,2.37769,-0.603749,-0.364608,-0.829026


## DataFrame Indexing

- .at
- .iat
- .loc
- .iloc
- .ix

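An accessor with an *i* prefix (`.iat`, `.iloc`) takes integer positions.  
One without it (`.at`, `.loc`) takes labels.  
*.ix* accepts both, but it was deprecated in pandas 0.20 and removed in pandas 1.0 — prefer `.loc` / `.iloc`.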

In [65]:
# Accessor cheat-sheet:
#   .at / .loc   -> label based
#   .iat / .iloc -> integer-position based
#   .ix          -> mixed integer/label
# NOTE: the row selected on the next line is never displayed, because it is
# neither printed nor the last expression of the cell.
df.iloc[1]
print(df.columns[1:])
comp1 = df[df.columns[1:]]
predictors = ['two', 'three', 'four']
comp2 = df[predictors]

# Element-wise comparison: selecting columns via a slice of the column Index
# and via an explicit list of names gives the same sub-frame.
print(comp1 == comp2)

Index([u'two', u'three', u'four'], dtype='object')
             two three  four
2016-06-24  True  True  True
2016-06-25  True  True  True
2016-06-26  True  True  True
2016-06-27  True  True  True
2016-06-28  True  True  True


- Positional slicing (as with NumPy arrays) works too: an integer slice inside `[]` selects rows

In [52]:
# A bare integer slice inside [] selects rows; `0:` keeps every row from the first onward.
df[0:]

Unnamed: 0,one,two,three,four
2016-06-24,-0.865824,-0.259413,-0.278707,-0.083715
2016-06-25,2.37769,-0.603749,-0.364608,-0.829026
2016-06-26,0.195804,0.305053,1.295641,-1.087357
2016-06-27,0.926671,-0.749013,-0.218765,-0.159252
2016-06-28,-2.008621,0.431193,-0.481636,0.869175


**Boolean indexing**

In [57]:
# Keep only the rows where column 'one' is positive. Bracket access and
# attribute access build the same boolean mask, so both lines print the same rows.
print(df[df['one'] > 0])
print(df[df.one > 0])

                 one       two     three      four
2016-06-25  2.377690 -0.603749 -0.364608 -0.829026
2016-06-26  0.195804  0.305053  1.295641 -1.087357
2016-06-27  0.926671 -0.749013 -0.218765 -0.159252
                 one       two     three      four
2016-06-25  2.377690 -0.603749 -0.364608 -0.829026
2016-06-26  0.195804  0.305053  1.295641 -1.087357
2016-06-27  0.926671 -0.749013 -0.218765 -0.159252


In [55]:
# Element-wise mask: the frame keeps its shape and every cell that is not
# strictly positive is replaced by NaN.
df[df.gt(0)]

Unnamed: 0,one,two,three,four
2016-06-24,,,,
2016-06-25,2.37769,,,
2016-06-26,0.195804,0.305053,1.295641,
2016-06-27,0.926671,,,
2016-06-28,,0.431193,,0.869175


In [61]:
# Work on a copy of `df` with an extra string column 'E', leaving `df` untouched.
df2 = df.assign(E=['one', 'two', 'three', 'four', 'five'])
# isin() builds a boolean mask that is True where 'E' matches any listed value.
df2[df2['E'].isin(['three'])]

Unnamed: 0,one,two,three,four,E
2016-06-26,0.195804,0.305053,1.295641,-1.087357,three


## Missing Data
- .reindex
- .dropna
- .fillna
- .isnull

In [73]:
# Re-label the rows as A..E. None of these labels exist in df2's date index,
# so every cell of the result is missing (NaN).
df3 = df2.reindex(index=list('ABCDE'))
df3

Unnamed: 0,one,two,three,four,E
A,,,,,
B,,,,,
C,,,,,
D,,,,,
E,,,,,


In [74]:
# Drop every row that contains at least one missing value; here that is all of them.
df3.dropna(axis=0, how='any')

Unnamed: 0,one,two,three,four,E


In [75]:
# Replace every missing value with the constant 1.
df3.fillna(value=1)

Unnamed: 0,one,two,three,four,E
A,1.0,1.0,1.0,1.0,1
B,1.0,1.0,1.0,1.0,1
C,1.0,1.0,1.0,1.0,1
D,1.0,1.0,1.0,1.0,1
E,1.0,1.0,1.0,1.0,1


In [76]:
# Boolean frame marking which cells are missing (function form of df3.isnull()).
pd.isnull(df3)

Unnamed: 0,one,two,three,four,E
A,True,True,True,True,True
B,True,True,True,True,True
C,True,True,True,True,True
D,True,True,True,True,True
E,True,True,True,True,True


## Operations

- .mean - *get mean*
- .apply - *apply a function to data*
- .value_counts - *histogram (count of each distinct value)*; **a Series method** (`DataFrame.value_counts` was only added in pandas 1.1)
- .str.lower - *string methods*
- pd.concat - *concatenate pandas objects along an axis, analogous to np.concatenate()*

In [106]:
# Show the original frame again as a reference for the operations below.
df

Unnamed: 0,one,two,three,four
2016-06-24,-0.865824,-0.259413,-0.278707,-0.083715
2016-06-25,2.37769,-0.603749,-0.364608,-0.829026
2016-06-26,0.195804,0.305053,1.295641,-1.087357
2016-06-27,0.926671,-0.749013,-0.218765,-0.159252
2016-06-28,-2.008621,0.431193,-0.481636,0.869175


In [86]:
# axis=0 (the default) averages down the rows, giving one mean per column.
df.mean(axis=0)

one      0.125144
two     -0.175186
three   -0.009615
four    -0.258035
dtype: float64

In [87]:
# axis=1 averages across the columns, giving one mean per row (per date).
df.mean(axis=1)

2016-06-24   -0.371915
2016-06-25    0.145077
2016-06-26    0.177285
2016-06-27   -0.050090
2016-06-28   -0.297472
Freq: D, dtype: float64

In [94]:
# Count how often each distinct value occurs in column 'one'.
# In the output shown, every random float is unique, so each count is 1.
df.loc[:, 'one'].value_counts()

 0.926671    1
-0.865824    1
 2.377690    1
 0.195804    1
-2.008621    1
Name: one, dtype: int64

In [105]:
# Split the frame into its four column Series, then glue them back together.
pieces = [df.one, df.two, df.three, df.four]
# axis=1 places the Series side by side as columns, rebuilding the original frame.
print(pd.concat(pieces, axis=1))
# The default (axis=0) stacks them end to end into one long Series, so each
# date label appears once per original column.
print(pd.concat(pieces))

                 one       two     three      four
2016-06-24 -0.865824 -0.259413 -0.278707 -0.083715
2016-06-25  2.377690 -0.603749 -0.364608 -0.829026
2016-06-26  0.195804  0.305053  1.295641 -1.087357
2016-06-27  0.926671 -0.749013 -0.218765 -0.159252
2016-06-28 -2.008621  0.431193 -0.481636  0.869175
2016-06-24   -0.865824
2016-06-25    2.377690
2016-06-26    0.195804
2016-06-27    0.926671
2016-06-28   -2.008621
2016-06-24   -0.259413
2016-06-25   -0.603749
2016-06-26    0.305053
2016-06-27   -0.749013
2016-06-28    0.431193
2016-06-24   -0.278707
2016-06-25   -0.364608
2016-06-26    1.295641
2016-06-27   -0.218765
2016-06-28   -0.481636
2016-06-24   -0.083715
2016-06-25   -0.829026
2016-06-26   -1.087357
2016-06-27   -0.159252
2016-06-28    0.869175
dtype: float64
