## Pandas
[Pandas](http://pandas.pydata.org/) is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

In [1]:
import pandas as pd

### Data structure

* Series

In [2]:
ser_obj = pd.Series(range(10, 20))
print type(ser_obj)

<class 'pandas.core.series.Series'>


In [3]:
print ser_obj.values
print ser_obj.index

[10 11 12 13 14 15 16 17 18 19]
RangeIndex(start=0, stop=10, step=1)


In [4]:
print ser_obj.head(3)

0    10
1    11
2    12
dtype: int64


In [5]:
print ser_obj

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64


In [6]:
print ser_obj[0]
print ser_obj[8]

10
18


In [7]:
print ser_obj * 2
print ser_obj > 15

0    20
1    22
2    24
3    26
4    28
5    30
6    32
7    34
8    36
9    38
dtype: int64
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool


In [8]:
# dict => Series
year_data = {2001: 17.8, 2002: 20.1, 2003: 16.5}
ser_obj2 = pd.Series(year_data)
print ser_obj2.head()
print ser_obj2.index

2001    17.8
2002    20.1
2003    16.5
dtype: float64
Int64Index([2001, 2002, 2003], dtype='int64')


In [9]:
ser_obj2.name = 'temp'
ser_obj2.index.name = 'year'
print ser_obj2.head()

year
2001    17.8
2002    20.1
2003    16.5
Name: temp, dtype: float64


* DataFrame

In [10]:
import numpy as np

# ndarray => DataFrame
array = np.random.randn(5, 4)
print array

df_obj = pd.DataFrame(array)
print df_obj.head()

[[ 0.19159895 -0.36144372  1.70971588  2.09489632]
 [ 0.32124797 -0.81449591  0.96105235  0.32863114]
 [-0.48900331 -0.43196077  1.76879307 -0.12742134]
 [ 0.69202904  0.13948337  0.2678797   1.10410093]
 [-0.74824118 -1.06493021 -0.07768127  0.16062383]]
          0         1         2         3
0  0.191599 -0.361444  1.709716  2.094896
1  0.321248 -0.814496  0.961052  0.328631
2 -0.489003 -0.431961  1.768793 -0.127421
3  0.692029  0.139483  0.267880  1.104101
4 -0.748241 -1.064930 -0.077681  0.160624


In [11]:
# dict => DataFrame
dict_data = {'A': 1.,
             'B': pd.Timestamp('20161217'),
             'C': pd.Series(1, index=list(range(4)), dtype='float32'),
             'D': np.array([3] * 4, dtype='int32'),
             'E': pd.Categorical(['Python', 'Java', 'C++', 'C#']),
             'F': 'PyData'}

df_obj2 = pd.DataFrame(dict_data)
print df_obj2.head()

     A          B    C  D       E       F
0  1.0 2016-12-17  1.0  3  Python  PyData
1  1.0 2016-12-17  1.0  3    Java  PyData
2  1.0 2016-12-17  1.0  3     C++  PyData
3  1.0 2016-12-17  1.0  3      C#  PyData


In [12]:
print df_obj2['A']
print type(df_obj2['A'])

print df_obj2.A

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64
<class 'pandas.core.series.Series'>
0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64


In [13]:
# Add column
df_obj2['G'] = df_obj2['D'] + 4
print df_obj2.head()

     A          B    C  D       E       F  G
0  1.0 2016-12-17  1.0  3  Python  PyData  7
1  1.0 2016-12-17  1.0  3    Java  PyData  7
2  1.0 2016-12-17  1.0  3     C++  PyData  7
3  1.0 2016-12-17  1.0  3      C#  PyData  7


In [14]:
# Delete column
del df_obj2['G']
print df_obj2.head()

     A          B    C  D       E       F
0  1.0 2016-12-17  1.0  3  Python  PyData
1  1.0 2016-12-17  1.0  3    Java  PyData
2  1.0 2016-12-17  1.0  3     C++  PyData
3  1.0 2016-12-17  1.0  3      C#  PyData


In [15]:
print type(ser_obj.index)
print type(df_obj2.index)

print df_obj2.index

<class 'pandas.indexes.range.RangeIndex'>
<class 'pandas.indexes.numeric.Int64Index'>
Int64Index([0, 1, 2, 3], dtype='int64')


In [16]:
# Index does not support mutable operations
# df_obj2.index[0] = 2 # Error

### Data Process

* Series Index

In [17]:
ser_obj = pd.Series(range(5), ['a', 'b', 'c', 'd', 'e'])
print ser_obj.head()

a    0
b    1
c    2
d    3
e    4
dtype: int64


In [18]:
# row index
print ser_obj['a']
print ser_obj[0]

0
0


In [19]:
# slicing index
print ser_obj[1:3]     # exclusive
print ser_obj['b':'d'] # inclusive

b    1
c    2
dtype: int64
b    1
c    2
d    3
dtype: int64


In [20]:
# discrete index
print ser_obj[[0, 2, 4]]
print ser_obj[['a', 'e']]

a    0
c    2
e    4
dtype: int64
a    0
e    4
dtype: int64


In [21]:
# bool index
ser_bool = ser_obj > 2
print ser_bool
print ser_obj[ser_bool]

print ser_obj[ser_obj > 2]

a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int64
d    3
e    4
dtype: int64


* DataFrame Index

In [22]:
df_obj = pd.DataFrame(np.random.randn(5, 4), columns = ['a', 'b', 'c', 'd'])
print df_obj

          a         b         c         d
0  2.630777  1.017585  0.452941 -1.510812
1  0.499951  0.562357 -0.855192  0.092463
2 -1.601111 -0.222469  0.842450 -0.801463
3 -0.685564 -1.108363 -0.953491  1.791629
4  0.159935 -1.128503 -0.046154  1.364497


In [23]:
# column index
print 'Column index'
print df_obj['a']
print type(df_obj[['a']])

# discrete index
print 'Discrete index'
print df_obj[['a', 'c']]
print df_obj[[1, 3]]

Column index
0    2.630777
1    0.499951
2   -1.601111
3   -0.685564
4    0.159935
Name: a, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Discrete index
          a         c
0  2.630777  0.452941
1  0.499951 -0.855192
2 -1.601111  0.842450
3 -0.685564 -0.953491
4  0.159935 -0.046154
          b         d
0  1.017585 -1.510812
1  0.562357  0.092463
2 -0.222469 -0.801463
3 -1.108363  1.791629
4 -1.128503  1.364497


* Three kinds of index methods

In [24]:
print ser_obj
print df_obj

a    0
b    1
c    2
d    3
e    4
dtype: int64
          a         b         c         d
0  2.630777  1.017585  0.452941 -1.510812
1  0.499951  0.562357 -0.855192  0.092463
2 -1.601111 -0.222469  0.842450 -0.801463
3 -0.685564 -1.108363 -0.953491  1.791629
4  0.159935 -1.128503 -0.046154  1.364497


In [25]:
# label index => loc
# Series
print ser_obj['b': 'd']
print ser_obj.loc['b': 'd']

# DataFrame
print df_obj['a']
print df_obj.loc[0:2, 'a']

b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
dtype: int64
0    2.630777
1    0.499951
2   -1.601111
3   -0.685564
4    0.159935
Name: a, dtype: float64
0    2.630777
1    0.499951
2   -1.601111
Name: a, dtype: float64


In [26]:
# int location index => iloc
print ser_obj[1:3]
print ser_obj.iloc[1:3]

print df_obj.iloc[0:2, 0] # It's different from df_obj.loc[0:2, 'a']

b    1
c    2
dtype: int64
b    1
c    2
dtype: int64
0    2.630777
1    0.499951
Name: a, dtype: float64


In [27]:
# mixed index => ix
print ser_obj.ix[1:3]
print ser_obj.ix['b': 'c']

print df_obj.ix[0:2, 0]  # label index first and then location index

b    1
c    2
dtype: int64
b    1
c    2
dtype: int64
0    2.630777
1    0.499951
2   -1.601111
Name: a, dtype: float64


* Calculation and align

In [28]:
s1 = pd.Series(range(10, 20), index = range(10))
s2 = pd.Series(range(20, 25), index = range(5))

print 's1: '
print s1

print 's2: '
print s2

s1: 
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64
s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int64


In [29]:
print s1 + s2

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [30]:
df1 = pd.DataFrame(np.ones((2, 2)), columns = ['a', 'b'])
df2 = pd.DataFrame(np.ones((3, 3)), columns = ['a', 'b', 'c'])

print 'df1: '
print df1

print 'df2: '
print df2

df1: 
     a    b
0  1.0  1.0
1  1.0  1.0
df2: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0


In [31]:
print df1 + df2

     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN


In [32]:
print s1.add(s2, fill_value = 100)

0     30.0
1     32.0
2     34.0
3     36.0
4     38.0
5    115.0
6    116.0
7    117.0
8    118.0
9    119.0
dtype: float64


In [33]:
df1.sub(df2, fill_value = 2.)

Unnamed: 0,a,b,c
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,1.0,1.0,1.0


In [34]:
# fill NaN by default
s3 = s1 + s2
print s3

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [35]:
print s3.fillna(-1)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    -1.0
6    -1.0
7    -1.0
8    -1.0
9    -1.0
dtype: float64


In [36]:
df3 = df1 + df2
print df3

     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN


In [37]:
print df3.fillna(100)

       a      b      c
0    2.0    2.0  100.0
1    2.0    2.0  100.0
2  100.0  100.0  100.0


* Functions

In [38]:
df = pd.DataFrame(np.random.randn(5, 4) - 1)
print df

print np.abs(df)

          0         1         2         3
0 -2.259169  0.799284 -1.798008 -0.021779
1 -0.698317 -1.487234 -2.004944  0.028553
2 -2.128286 -1.874384 -0.048466 -0.219495
3  1.056847 -0.656604 -2.294863 -0.110693
4 -1.787468 -0.581718 -1.045755 -0.430043
          0         1         2         3
0  2.259169  0.799284  1.798008  0.021779
1  0.698317  1.487234  2.004944  0.028553
2  2.128286  1.874384  0.048466  0.219495
3  1.056847  0.656604  2.294863  0.110693
4  1.787468  0.581718  1.045755  0.430043


In [39]:
# apply => works on column or row
print df.apply(lambda x : x.max())

0    1.056847
1    0.799284
2   -0.048466
3    0.028553
dtype: float64


In [40]:
# apply =>  works on rows
print df.apply(lambda x: x.max(), axis = 1)

0    0.799284
1    0.028553
2   -0.048466
3    1.056847
4   -0.430043
dtype: float64


In [41]:
# applymap => works on each element
print df.applymap(lambda x : '%0.2f' % x)

       0      1      2      3
0  -2.26   0.80  -1.80  -0.02
1  -0.70  -1.49  -2.00   0.03
2  -2.13  -1.87  -0.05  -0.22
3   1.06  -0.66  -2.29  -0.11
4  -1.79  -0.58  -1.05  -0.43


* sorting

In [42]:
s4 = pd.Series(range(10, 16), index = np.random.randint(8, size=6))
print s4

2    10
5    11
0    12
0    13
2    14
2    15
dtype: int64


In [43]:
print s4.sort_index()

0    12
0    13
2    10
2    14
2    15
5    11
dtype: int64


In [44]:
df4 = pd.DataFrame(np.random.randn(3, 4),
                  index=np.random.randint(3, size=3),
                  columns=np.random.randint(4, size=4))
df4

Unnamed: 0,3,3.1,0,1
1,1.112702,-1.99015,-0.38245,-0.128823
1,-0.380712,1.183447,0.013907,0.561071
1,-1.604982,0.79415,-1.877383,0.56367


In [45]:
# df4.sort_index(ascending = False)
df4.sort_index(axis=1)

Unnamed: 0,0,1,3,3.1
1,-0.38245,-0.128823,1.112702,-1.99015
1,0.013907,0.561071,-0.380712,1.183447
1,-1.877383,0.56367,-1.604982,0.79415


In [46]:
df4.sort_values(by=1) # by axis name

Unnamed: 0,3,3.1,0,1
1,1.112702,-1.99015,-0.38245,-0.128823
1,-0.380712,1.183447,0.013907,0.561071
1,-1.604982,0.79415,-1.877383,0.56367


* Handle missing data

In [47]:
df_data = pd.DataFrame([np.random.randn(3), [1., np.nan, np.nan],
                       [4., np.nan, np.nan], [1., np.nan, 2.]])
df_data.head()

Unnamed: 0,0,1,2
0,-1.160321,0.947642,-0.485632
1,1.0,,
2,4.0,,
3,1.0,,2.0


In [48]:
df_data.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,True
2,False,True,True
3,False,True,False


In [49]:
df_data.dropna()
# df_data.dropna(axis=1)

Unnamed: 0,0,1,2
0,-1.160321,0.947642,-0.485632


In [50]:
df_data.fillna(-100.)

Unnamed: 0,0,1,2
0,-1.160321,0.947642,-0.485632
1,1.0,-100.0,-100.0
2,4.0,-100.0,-100.0
3,1.0,-100.0,2.0
