## Pandas 10 minute Tutorial
notes taken from http://pandas.pydata.org/pandas-docs/stable/10min.html

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



## Create a Pandas **series**

In [2]:
# A Series built from a plain list receives the default 0..n-1 integer index.
s = pd.Series(data=[1, 2, 3, 6, 7])

s

0    1
1    2
2    3
3    6
4    7
dtype: int64

## Create a Pandas **DataFrame** by passing a NumPy array, an index and column labels

In [3]:
# Five consecutive calendar days beginning 2016-06-24.
dates = pd.date_range(start='20160624', periods=5)
dates

DatetimeIndex(['2016-06-24', '2016-06-25', '2016-06-26', '2016-06-27',
               '2016-06-28'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Draw the sample data from a dedicated, seeded generator so the frame -- and
# every result computed from it further down -- is identical on each
# Restart & Run All, without touching numpy's global random state.
# NOTE: outputs saved from an earlier, unseeded run will differ until re-run.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(5, 4), index=dates, columns=['one', 'two', 'three', 'four'])
df

Unnamed: 0,one,two,three,four
2016-06-24,-2.910848,1.103582,0.206304,-0.436187
2016-06-25,1.138587,-0.013138,0.4781,2.298781
2016-06-26,1.121629,-0.0555,-0.506361,0.454096
2016-06-27,-1.136933,0.125587,-0.740266,0.485956
2016-06-28,-1.781667,-1.672843,0.832271,-0.368503


## Create a Pandas **DataFrame** by passing a dictionary of objects

In [5]:
# Mapping of column name -> column values; each key becomes one column.
# Deliberately not called `dict`: rebinding that name would shadow the builtin
# type for every later cell in the kernel session.
column_data = {'x': [1, 2, 3], 'y': [2, 3, 1], 'z': [0, 4, 5]}
df2 = pd.DataFrame(column_data)
df2

Unnamed: 0,x,y,z
0,1,2,0
1,2,3,4
2,3,1,5


In [6]:
# Each dictionary value may have its own type; scalars ('A', 'B', 'F') are
# broadcast to the length set by the array-like values (4 rows here).
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp('20130102'),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo' })
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df2.dtypes)
df2

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
# The three pieces a DataFrame is made of: row labels, column labels, raw values.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df2.index)
print(df2.columns)
print(df2.values)

Int64Index([0, 1, 2, 3], dtype='int64')
Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]


## **Transposing** your data

In [8]:
# Swap rows and columns: the dates become column headers.
df.transpose()

Unnamed: 0,2016-06-24 00:00:00,2016-06-25 00:00:00,2016-06-26 00:00:00,2016-06-27 00:00:00,2016-06-28 00:00:00
one,-2.910848,1.138587,1.121629,-1.136933,-1.781667
two,1.103582,-0.013138,-0.0555,0.125587,-1.672843
three,0.206304,0.4781,-0.506361,-0.740266,0.832271
four,-0.436187,2.298781,0.454096,0.485956,-0.368503


## **sorting** by an axis

In [9]:
# Order the columns alphabetically by their labels.
df.sort_index(axis='columns', ascending=True)

Unnamed: 0,four,one,three,two
2016-06-24,-0.436187,-2.910848,0.206304,1.103582
2016-06-25,2.298781,1.138587,0.4781,-0.013138
2016-06-26,0.454096,1.121629,-0.506361,-0.0555
2016-06-27,0.485956,-1.136933,-0.740266,0.125587
2016-06-28,-0.368503,-1.781667,0.832271,-1.672843


In [10]:
# Order the rows by their index labels, latest date first.
df.sort_index(axis='index', ascending=False)

Unnamed: 0,one,two,three,four
2016-06-28,-1.781667,-1.672843,0.832271,-0.368503
2016-06-27,-1.136933,0.125587,-0.740266,0.485956
2016-06-26,1.121629,-0.0555,-0.506361,0.454096
2016-06-25,1.138587,-0.013138,0.4781,2.298781
2016-06-24,-2.910848,1.103582,0.206304,-0.436187


In [11]:
# Order the rows by the values in column 'one', smallest first.
df.sort_values(by=['one'], ascending=True)

Unnamed: 0,one,two,three,four
2016-06-24,-2.910848,1.103582,0.206304,-0.436187
2016-06-28,-1.781667,-1.672843,0.832271,-0.368503
2016-06-27,-1.136933,0.125587,-0.740266,0.485956
2016-06-26,1.121629,-0.0555,-0.506361,0.454096
2016-06-25,1.138587,-0.013138,0.4781,2.298781


## DataFrame Indexing

- .at
- .iat
- .loc
- .iloc
- .ix

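Accessors with a leading *i* (`.iat`, `.iloc`) take integer positions.  
Accessors without it (`.at`, `.loc`) take labels.  
*.ix* accepts both (deprecated in later pandas releases in favour of `.loc` / `.iloc`).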

In [12]:
# .at, .iat, .loc, .iloc, .ix
#   no i - label
#   i    - integer
#   ix   - mixed integer/label
df.iloc[1]  # second row by position; not shown, since only a cell's last expression is displayed
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df.columns[1:])
comp1 = df[df.columns[1:]]
predictors = ['two', 'three', 'four']
comp2 = df[predictors]

# Selecting with a slice of the column Index and with an explicit list of
# names picks the same columns, so the element-wise comparison is all True.
print(comp1 == comp2)

Index([u'two', u'three', u'four'], dtype='object')
             two three  four
2016-06-24  True  True  True
2016-06-25  True  True  True
2016-06-26  True  True  True
2016-06-27  True  True  True
2016-06-28  True  True  True


- Integer slicing (as with NumPy arrays) works too

In [13]:
# A bare integer slice on a DataFrame selects rows by position; 0: keeps every row.
df[0:]

Unnamed: 0,one,two,three,four
2016-06-24,-2.910848,1.103582,0.206304,-0.436187
2016-06-25,1.138587,-0.013138,0.4781,2.298781
2016-06-26,1.121629,-0.0555,-0.506361,0.454096
2016-06-27,-1.136933,0.125587,-0.740266,0.485956
2016-06-28,-1.781667,-1.672843,0.832271,-0.368503


**Boolean indexing**

In [14]:
# Two spellings of the same row filter: bracket access and attribute access
# to column 'one'. print() with one argument works on Python 2 and Python 3.
print(df[df['one'] > 0])
print(df[df.one > 0])

                 one       two     three      four
2016-06-25  1.138587 -0.013138  0.478100  2.298781
2016-06-26  1.121629 -0.055500 -0.506361  0.454096
                 one       two     three      four
2016-06-25  1.138587 -0.013138  0.478100  2.298781
2016-06-26  1.121629 -0.055500 -0.506361  0.454096


In [15]:
# Indexing with a boolean frame keeps the cells where the condition holds
# and leaves every other cell missing (NaN); the shape is unchanged.
df[df>0]

Unnamed: 0,one,two,three,four
2016-06-24,,1.103582,0.206304,
2016-06-25,1.138587,,0.4781,2.298781
2016-06-26,1.121629,,,0.454096
2016-06-27,,0.125587,,0.485956
2016-06-28,,,0.832271,


In [16]:
# Work on a copy so the extra string column is not added to df itself.
df2 = df.copy()
df2['E'] = ['one', 'two', 'three', 'four', 'five']
# isin() yields a boolean mask; only rows whose E is in the list are kept.
df2[df2['E'].isin(['three'])]

Unnamed: 0,one,two,three,four,E
2016-06-26,1.121629,-0.0555,-0.506361,0.454096,three


## Missing Data
- .reindex
- .dropna
- .fillna
- .isnull

In [17]:
# Reindex onto row labels that do not exist in df2's date index, which yields
# a frame whose every cell is missing -- handy for the demos that follow.
df3 = df2.reindex(index=list('ABCDE'))
df3

Unnamed: 0,one,two,three,four,E
A,,,,,
B,,,,,
C,,,,,
D,,,,,
E,,,,,


In [18]:
# Discard every row holding at least one missing value (here: all of them).
df3.dropna(axis=0, how='any')

Unnamed: 0,one,two,three,four,E


In [19]:
# Replace every missing cell with the constant 1.
df3.fillna(value=1)

Unnamed: 0,one,two,three,four,E
A,1.0,1.0,1.0,1.0,1
B,1.0,1.0,1.0,1.0,1
C,1.0,1.0,1.0,1.0,1
D,1.0,1.0,1.0,1.0,1
E,1.0,1.0,1.0,1.0,1


In [20]:
# Boolean frame of the same shape: True wherever a value is missing.
pd.isnull(df3)

Unnamed: 0,one,two,three,four,E
A,True,True,True,True,True
B,True,True,True,True,True
C,True,True,True,True,True
D,True,True,True,True,True
E,True,True,True,True,True


## Operations

- .mean - *get mean*
- .apply - *apply a function to data*
- .value_counts - *histogram* **only works on series**
- .str.lower - *string methods*
- pd.concat - *concatenate pandas objects along an axis, analogous to np.concatenate()*
- .merge - *merges dataframes*

In [21]:
# Show the frame again for reference before computing on it.
df

Unnamed: 0,one,two,three,four
2016-06-24,-2.910848,1.103582,0.206304,-0.436187
2016-06-25,1.138587,-0.013138,0.4781,2.298781
2016-06-26,1.121629,-0.0555,-0.506361,0.454096
2016-06-27,-1.136933,0.125587,-0.740266,0.485956
2016-06-28,-1.781667,-1.672843,0.832271,-0.368503


In [22]:
df.mean(axis=0)  # axis=0 (the default): average down the rows, giving one mean per column

one     -0.713847
two     -0.102462
three    0.054010
four     0.486828
dtype: float64

In [23]:
df.mean(axis=1)  # axis=1: average across the columns, giving one mean per row

2016-06-24   -0.509287
2016-06-25    0.975583
2016-06-26    0.253466
2016-06-27   -0.316414
2016-06-28   -0.747686
Freq: D, dtype: float64

In [24]:
# Count how often each distinct value occurs in column 'one'.
df.one.value_counts()

-1.136933    1
 1.138587    1
-2.910848    1
-1.781667    1
 1.121629    1
Name: one, dtype: int64

In [25]:
# The four columns of df as separate Series.
pieces = [df.one, df.two, df.three, df.four]
# axis=1 places the Series side by side as columns, rebuilding the frame.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(pd.concat(pieces, axis=1))
# The default axis=0 stacks them end to end into one long Series, so each
# date label appears once per original column.
print(pd.concat(pieces))

                 one       two     three      four
2016-06-24 -2.910848  1.103582  0.206304 -0.436187
2016-06-25  1.138587 -0.013138  0.478100  2.298781
2016-06-26  1.121629 -0.055500 -0.506361  0.454096
2016-06-27 -1.136933  0.125587 -0.740266  0.485956
2016-06-28 -1.781667 -1.672843  0.832271 -0.368503
2016-06-24   -2.910848
2016-06-25    1.138587
2016-06-26    1.121629
2016-06-27   -1.136933
2016-06-28   -1.781667
2016-06-24    1.103582
2016-06-25   -0.013138
2016-06-26   -0.055500
2016-06-27    0.125587
2016-06-28   -1.672843
2016-06-24    0.206304
2016-06-25    0.478100
2016-06-26   -0.506361
2016-06-27   -0.740266
2016-06-28    0.832271
2016-06-24   -0.436187
2016-06-25    2.298781
2016-06-26    0.454096
2016-06-27    0.485956
2016-06-28   -0.368503
dtype: float64


In [28]:
# Two small frames that share the key column 'label'.
left = pd.DataFrame({'label': ['hi', 'bye'], 'lval': [1, 2]})
right = pd.DataFrame({'label': ['hi', 'bye'], 'rval': [3, 4]})

# print() with a single argument behaves the same on Python 2 and Python 3.
print(left)
print(right)

# Join on the shared column; naming it explicitly matches what merge would
# pick by default (the columns common to both frames) but documents the key.
pd.merge(left, right, on='label')

  label  lval
0    hi     1
1   bye     2
  label  rval
0    hi     3
1   bye     4


Unnamed: 0,label,lval,rval
0,hi,1,3
1,bye,2,4
