# Getting Started w/ Pandas

- high-level data structures
- built on top of Numpy

In [1]:
#from pandas import Series, Dataframe
import pandas as pd
import numpy as np

from pandas import Series, DataFrame

## Pandas Data Structures

### Series
- one-dimensional array-like object containing an array of data and an associated array of data labels, called its index

In [2]:
obj = Series([4, 7, -5, 3])

In [3]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
obj2 = Series([4,7,-5,3], index=['a', 'b', 'c', 'd'])
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [5]:
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [6]:
obj2[['a', 'b']]

a    4
b    7
dtype: int64

In [7]:
obj2[obj2>0]

a    4
b    7
d    3
dtype: int64

In [8]:
obj2*2

a     8
b    14
c   -10
d     6
dtype: int64

In [9]:
'b' in obj2

True

In [10]:
'e' in obj2

False

### Create a Series from Python Dict

In [11]:
data = {'a': 1, 'b': 2, 'c': 3, 'd':4}

In [12]:
obj3 = Series(data)

In [13]:
obj3

a    1
b    2
c    3
d    4
dtype: int64

In [14]:
### Passing index in the Series with missing data
obj4 = Series(data, index=['a', 'b', 'c', 'd','e'])
obj4

a    1.0
b    2.0
c    3.0
d    4.0
e    NaN
dtype: float64

### How to identify missing values in Pandas?

In [15]:
pd.isnull(obj4)

a    False
b    False
c    False
d    False
e     True
dtype: bool

In [16]:
pd.notnull(obj4)

a     True
b     True
c     True
d     True
e    False
dtype: bool

### How to set Series attributes?

In [17]:
obj4.name = "Letters Mapping"

In [18]:
obj4.index.name = 'Letters'

In [19]:
obj4

Letters
a    1.0
b    2.0
c    3.0
d    4.0
e    NaN
Name: Letters Mapping, dtype: float64

## DataFrame
- tabular, spreadsheet-like data structures
- it has both row and column index


### 2 ways to create DataFrame

### 1. The most common way is from a dict of equal-length lists or arrays

In [20]:
# Build the frame from a dict that maps each column name to an equal-length list.
data = dict(
    letters=list('abcd'),
    numbers=list(range(1, 5)),
    year=list(range(2010, 2014)),
)
frame = DataFrame(data)
frame

Unnamed: 0,letters,numbers,year
0,a,1,2010
1,b,2,2011
2,c,3,2012
3,d,4,2013


In [21]:
# can pass columns
DataFrame(data, columns=['year', 'letters', 'numbers'])

Unnamed: 0,year,letters,numbers
0,2010,a,1
1,2011,b,2
2,2012,c,3
3,2013,d,4


### 2. Nested dict

In [22]:
nested_data = {'Male': {2010: 2.4, 2011: 2.5}, 'Female': {2010: 2.6, 2012: 3}}
DataFrame(nested_data)

Unnamed: 0,Female,Male
2010,2.6,2.4
2011,,2.5
2012,3.0,


In [23]:
DataFrame(nested_data, index=[2009, 2010, 2011, 2012])

Unnamed: 0,Female,Male
2009,,
2010,2.6,2.4
2011,,2.5
2012,3.0,


## Fundamental Mechanics of interacting with the data in Series and DataFrame

### Reindexing

In [24]:
obj = Series([1,2,3,4], index=['b', 'd','c','a'])
obj

b    1
d    2
c    3
a    4
dtype: int64

In [25]:
new_obj = obj.reindex(['a', 'b', 'c', 'd'])
new_obj

a    4
b    1
c    3
d    2
dtype: int64

In [26]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a    4
b    1
c    3
d    2
e    0
dtype: int64

In [27]:
### for ordered data like time series
obj3 = Series(['a', 'b', 'c'], index=[0,2,4])
obj3

0    a
2    b
4    c
dtype: object

In [28]:
# fill values forward
obj3.reindex(range(6), method='ffill')

#try bfill and see what happens...

0    a
1    a
2    b
3    b
4    c
5    c
dtype: object

In [29]:
import numpy as np

In [30]:
# 3x3 grid holding 0..8, with letter row labels and year-string column labels.
grid = np.arange(9).reshape(3, 3)
df = DataFrame(grid, index=list('abc'), columns=['2010', '2011', '2012'])
df

Unnamed: 0,2010,2011,2012
a,0,1,2
b,3,4,5
c,6,7,8


In [31]:
df.reindex(['a','b','c', 'd'])

Unnamed: 0,2010,2011,2012
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,,,


In [32]:
df.reindex(columns=['2013', '2012', '2011', '2010'])

Unnamed: 0,2013,2012,2011,2010
a,,2,1,0
b,,5,4,3
c,,8,7,6


In [33]:
df

Unnamed: 0,2010,2011,2012
a,0,1,2
b,3,4,5
c,6,7,8


In [34]:
# succinct way: reorder rows and columns in a single call.
# Labels that are not in the frame ('d', '2013') come back as all-NaN.
# `.ix` was removed in pandas 1.0 and `.loc` raises KeyError for missing
# labels, so reindex is the call that keeps this behaviour.
df.reindex(index=['b', 'c', 'a', 'd'], columns=['2013', '2012', '2011', '2010'])

Unnamed: 0,2013,2012,2011,2010
b,,5.0,4.0,3.0
c,,8.0,7.0,6.0
a,,2.0,1.0,0.0
d,,,,


### Dropping entries from an axis

In [35]:
# For Series,
series_obj = Series(np.arange(5.), index=['a', 'b', 'c','d', 'e'])
series_obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [36]:
# delete single index
# NOTE(review): this drops from `obj` (the int64 Series built in the
# Reindexing section), not from `series_obj` created in the previous cell --
# confirm which one was intended.
new_obj = obj.drop('c')
new_obj

b    1
d    2
a    4
dtype: int64

In [37]:
# delete multiple indexes
# NOTE(review): as in the single-label example, this operates on `obj`
# rather than `series_obj` -- confirm which one was intended.
new_obj = obj.drop(['a', 'b'])
new_obj

d    2
c    3
dtype: int64

In [38]:
# For DataFrame
data = DataFrame(np.arange(16).reshape((4,4)), index=[2010, 2011, 2012, 2013], columns=['a', 'b', 'c','d'])
data

Unnamed: 0,a,b,c,d
2010,0,1,2,3
2011,4,5,6,7
2012,8,9,10,11
2013,12,13,14,15


In [39]:
# drop multiple rows
data.drop([2011, 2012])

Unnamed: 0,a,b,c,d
2010,0,1,2,3
2013,12,13,14,15


In [40]:
# drop multiple columns
data.drop(['b', 'c'], axis=1)

Unnamed: 0,a,d
2010,0,3
2011,4,7
2012,8,11
2013,12,15


# Indexing

In [41]:
# Series
obj = Series(np.arange(4.0), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [42]:
obj['b']

1.0

In [43]:
obj.iloc[0:2]  # positional slice: the rows at positions 0 and 1 (the end position is exclusive)

a    0.0
b    1.0
dtype: float64

In [44]:
obj[['b', 'c', 'd']] #retrieves all rows with the index

b    1.0
c    2.0
d    3.0
dtype: float64

In [45]:
# Retrieve the rows at integer positions 1 and 3. Use .iloc explicitly:
# passing integers to [] on a label-indexed Series is deprecated as a
# positional lookup in recent pandas.
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [46]:
obj[obj < 2] # retrieves all elements less than 2

a    0.0
b    1.0
dtype: float64

In [47]:
obj['b':'c'] # index slicing is inclusive

b    1.0
c    2.0
dtype: float64

In [48]:
obj['b':'c'] = 5 # sets the value of the index to 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

## DataFrame Indexing

In [49]:
df = DataFrame(np.arange(16).reshape((4,4)), index=['a', 'b', 'c', 'd'], columns=['2010', '2011', '2012', '2013'])
df

Unnamed: 0,2010,2011,2012,2013
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [50]:
df['2011'] # retrieves the column

a     1
b     5
c     9
d    13
Name: 2011, dtype: int64

In [51]:
df[['2011', '2012']] # retrieves multiple columns

Unnamed: 0,2011,2012
a,1,2
b,5,6
c,9,10
d,13,14


In [52]:
df.iloc[2:4]  # select the rows at positions 2 and 3 with a positional slice

Unnamed: 0,2010,2011,2012,2013
c,8,9,10,11
d,12,13,14,15


In [53]:
df[ df['2012'] > 8] #selecting rows by boolean

Unnamed: 0,2010,2011,2012,2013
c,8,9,10,11
d,12,13,14,15


In [54]:
df < 8 # scalar comparison

Unnamed: 0,2010,2011,2012,2013
a,True,True,True,True
b,True,True,True,True
c,False,False,False,False
d,False,False,False,False


In [55]:
df[df < 8] = 0 # set those values less than 8
df

Unnamed: 0,2010,2011,2012,2013
a,0,0,0,0
b,0,0,0,0
c,8,9,10,11
d,12,13,14,15


In [56]:
# Label-based indexer .loc selects a subset of rows and columns at once.
# (The original `.ix` indexer was removed in pandas 1.0.)
df.loc['c', ['2011', '2013']]

2011     9
2013    11
Name: c, dtype: int64

In [57]:
# Pass lists of row labels and column labels; rows come back in the order requested.
# (.loc replaces the removed `.ix` indexer.)
df.loc[['c', 'b'], ['2011', '2012']]

Unnamed: 0,2011,2012
c,9,10
b,0,0


In [58]:
# Retrieve the 3rd row of the dataframe by integer position.
# (.iloc replaces the removed `.ix` indexer for positional access.)
df.iloc[2]

2010     8
2011     9
2012    10
2013    11
Name: c, dtype: int64

In [59]:
# Label slicing on both axes; with labels, the end points ('c', '2012') are included.
# (.loc replaces the removed `.ix` indexer.)
df.loc[:'c', :'2012']

Unnamed: 0,2010,2011,2012
a,0,0,0
b,0,0,0
c,8,9,10


In [60]:
# Boolean row filter followed by a positional column slice (first three columns).
# `.ix` mixed both modes in one call but was removed in pandas 1.0, so the
# label/boolean step (.loc) and the positional step (.iloc) are chained.
df.loc[df['2011'] > 10].iloc[:, :3]

Unnamed: 0,2010,2011,2012
d,12,13,14


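INDEXING OPTIONS WITH DATAFRAME
* obj[val] - select a single column or a list of columns
* obj.ix[val] - select a single row or subset of rows (`.ix` was removed in pandas 1.0; use obj.loc[val] for labels or obj.iloc[val] for integer positions)
* obj.ix[:, val] - select a single column or subset of columns (now obj.loc[:, val] or obj.iloc[:, val])
* obj.ix[val1, val2] - select both rows and columns (now obj.loc[val1, val2] or obj.iloc[val1, val2])
* reindex method - conform one or more axes to new indexes
* xs method - select single row or column as a Series by label
* icol, irow methods - select single column or row, respectively, as a Series by integer location (no longer available in current pandas; use obj.iloc[:, i] and obj.iloc[i])
* get_value, set_value methods - select or set a single value by row and column label (removed in pandas 1.0; use obj.at[row, col])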


# Arithmetic and Data Alignment

In [61]:
# Adding Series data structures
series1 = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
series1

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [66]:
series2 = Series(np.arange(3.,8.), index=['e', 'd', 'c', 'b', 'a'])
series2

e    3.0
d    4.0
c    5.0
b    6.0
a    7.0
dtype: float64

In [68]:
series1 + series2 # it propagates NA values when there are no overlap on the index

a    7.0
b    7.0
c    7.0
d    7.0
e    NaN
dtype: float64

In [72]:
# Adding DataFrames
df1 = DataFrame(np.arange(9.0).reshape((3,3)), columns=list('abc'), index=["2010", "2011", "2012"] )
df1

Unnamed: 0,a,b,c
2010,0.0,1.0,2.0
2011,3.0,4.0,5.0
2012,6.0,7.0,8.0


In [75]:
df2 = DataFrame(np.arange(12.).reshape((4,3)), columns=list("bcd"), index=["2013", "2012", "2011", "2010"])
df2

Unnamed: 0,b,c,d
2013,0.0,1.0,2.0
2012,3.0,4.0,5.0
2011,6.0,7.0,8.0
2010,9.0,10.0,11.0


In [77]:
df1 + df2

Unnamed: 0,a,b,c,d
2010,,10.0,12.0,
2011,,10.0,12.0,
2012,,10.0,12.0,
2013,,,,


What if you want to add 2 dataframes and fill the missing values with 0?

In [78]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d
2010,0.0,10.0,12.0,11.0
2011,3.0,10.0,12.0,8.0
2012,6.0,10.0,12.0,5.0
2013,,0.0,1.0,2.0


Arithmetic methods
* add - method for addition(+)
* sub - method for subtraction(-)
* div - method for division(/)
* mul - method for multiplication(*)

In [80]:
#broadcasting
arr = np.arange(12).reshape((3,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [81]:
arr[0] # returns the first row of the 2d array

array([0, 1, 2, 3])

In [83]:
arr - arr[0]

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [87]:
#similar with DataFrames
df = DataFrame(np.arange(12.0).reshape((4,3)), columns=list('abc'), index=["2010", "2011", "2012", "2013"])
df

Unnamed: 0,a,b,c
2010,0.0,1.0,2.0
2011,3.0,4.0,5.0
2012,6.0,7.0,8.0
2013,9.0,10.0,11.0


In [90]:
# Get the 1st row of the dataframe by integer position.
# (.iloc replaces the removed `.ix` indexer; the row labels here are strings,
# so the integer 0 can only be a position.)
series = df.iloc[0]
series

a    0.0
b    1.0
c    2.0
Name: 2010, dtype: float64

In [91]:
df - series

Unnamed: 0,a,b,c
2010,0.0,0.0,0.0
2011,3.0,3.0,3.0
2012,6.0,6.0,6.0
2013,9.0,9.0,9.0


In [92]:
# What if an index value is not found in either the Dataframe's columns or Series index.
series2 = Series(range(3), index=list('bef'))
series2

b    0
e    1
f    2
dtype: int64

In [93]:
df + series2

Unnamed: 0,a,b,c,e,f
2010,,1.0,,,
2011,,4.0,,,
2012,,7.0,,,
2013,,10.0,,,


In [94]:
# What if you want to subtract a column from every other column?
# Take the column as a Series first; its index is the frame's row labels.
series3 = df['b']
series3

2010     1.0
2011     4.0
2012     7.0
2013    10.0
Name: b, dtype: float64

In [97]:
df.sub(series3, axis=0)

Unnamed: 0,a,b,c
2010,-1.0,0.0,1.0
2011,-1.0,0.0,1.0
2012,-1.0,0.0,1.0
2013,-1.0,0.0,1.0


# Function application and mapping

In [99]:
# 4x3 frame of draws from the standard normal distribution.
# NOTE(review): no random seed is set in the visible cells, so re-running
# produces values different from the output shown below.
df = DataFrame(np.random.randn(4,3), columns=list('abc'), index=["2010", "2011", "2012", "2013"])
df

Unnamed: 0,a,b,c
2010,0.543068,-0.618018,-0.684225
2011,-1.168272,-1.154461,0.869636
2012,-0.288331,0.022148,0.0924
2013,-0.149043,0.721728,1.336434


In [100]:
np.abs(df)

Unnamed: 0,a,b,c
2010,0.543068,0.618018,0.684225
2011,1.168272,1.154461,0.869636
2012,0.288331,0.022148,0.0924
2013,0.149043,0.721728,1.336434


In [103]:
def f(x):
    """Return the spread (maximum minus minimum) of the values in x."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest


# With the default axis, apply() hands each column to f in turn.
df.apply(f)

a    1.711340
b    1.876189
c    2.020658
dtype: float64

In [105]:
df.apply(f, axis=1)

2010    1.227293
2011    2.037907
2012    0.380731
2013    1.485476
dtype: float64

In [109]:
# formatting: render every cell as a string with two decimal places.
def to_2dp(x):
    """Return x formatted as a string with two digits after the decimal point."""
    return '%.2f' % x


# The helper is deliberately not called `format`, which would shadow the
# built-in format() for the rest of the notebook.
# applymap() calls the function once per element; on pandas >= 2.1 the same
# operation is spelled DataFrame.map().
df.applymap(to_2dp)

Unnamed: 0,a,b,c
2010,0.54,-0.62,-0.68
2011,-1.17,-1.15,0.87
2012,-0.29,0.02,0.09
2013,-0.15,0.72,1.34


# Sorting and Ranking

In [111]:
obj = Series(range(4), index=list('bacd'))
obj

b    0
a    1
c    2
d    3
dtype: int64

In [112]:
obj.sort_index()

a    1
b    0
c    2
d    3
dtype: int64

In [117]:
df = DataFrame(np.arange(8).reshape((2,4)), index=list('ba'), columns=["2013", "2011", "2012", "2011"])
df

Unnamed: 0,2013,2011,2012,2011.1
b,0,1,2,3
a,4,5,6,7


In [118]:
df.sort_index()

Unnamed: 0,2013,2011,2012,2011.1
a,4,5,6,7
b,0,1,2,3


In [119]:
df.sort_index(axis=1)

Unnamed: 0,2011,2011.1,2012,2013
b,1,3,2,0
a,5,7,6,4


In [120]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,2013,2012,2011,2011.1
b,0,2,1,3
a,4,6,5,7


In [122]:
# sort Series by its values
obj = Series([3, 6, 9, -3])
obj.sort_values()

3   -3
0    3
1    6
2    9
dtype: int64

In [123]:
# missing values are sorted to the end of the Series
obj = Series([np.nan, 3, 6, np.nan ,9, -3])

In [127]:
obj.sort_values(ascending=False)

4    9.0
2    6.0
1    3.0
5   -3.0
0    NaN
3    NaN
dtype: float64

In [128]:
# sorting DataFrame
df = DataFrame({"2015": [4, 7, -3, 2], "2016": [0, 1, 0, 1]})
df


Unnamed: 0,2015,2016
0,4,0
1,7,1
2,-3,0
3,2,1


In [131]:
df.sort_values(by=['2016', '2015'])

Unnamed: 0,2015,2016
2,-3,0
0,4,0
3,2,1
1,7,1


In [135]:
#ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [133]:
obj.rank() # by default rank break ties by assigning each group the MEAN rank

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [134]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [137]:
# dataframes ranking
df = DataFrame({'b': [4.3, 7, 3, 2], 'a': [0,-1, 2, -3], 'c': [-5,2,8,-2.5]})
df

Unnamed: 0,a,b,c
0,0,4.3,-5.0
1,-1,7.0,2.0
2,2,3.0,8.0
3,-3,2.0,-2.5


In [138]:
df.rank(axis=1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,1.0,2.0,3.0
3,1.0,3.0,2.0


## Tie-breaking methods with rank
* average - default: assign the average rank to each entry in the equal group
* min - use the minimum rank for the whole group
* max - use the maximum rank for the whole group
* first - assign ranks in the order the values appear in the data

# What if the indexes have duplicate values?

In [139]:
series = Series(range(5), index=list("aabbc"))
series

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [140]:
series.index.is_unique

False

In [141]:
series['a'] # duplicated indexes return a Series

a    0
a    1
dtype: int64

In [142]:
series['c'] # return scalar

4

In [144]:
# same logic extends to DataFrames: the row index below repeats 'a' and 'b'.
# NOTE(review): no random seed is set in the visible cells, so re-running
# produces values different from the output shown below.
df = DataFrame(np.random.randn(5,3), index=list('aabbc'))
df

Unnamed: 0,0,1,2
a,-1.48652,0.075489,-1.663881
a,-1.789326,-1.556271,0.179653
b,0.089104,-0.896234,-1.129488
b,-1.477344,-0.747518,0.025358
c,0.534238,0.687684,-0.955967


In [145]:
# A duplicated row label returns every matching row as a DataFrame.
# (.loc replaces the removed `.ix` indexer.)
df.loc['b']

Unnamed: 0,0,1,2
b,0.089104,-0.896234,-1.129488
b,-1.477344,-0.747518,0.025358


# Descriptive Statistics for Pandas

In [146]:
df = DataFrame([[1,2], [3, np.nan], [np.nan, np.nan], [4,5]], index=list('aabb'), columns=['2016', '2017'])
df

Unnamed: 0,2016,2017
a,1.0,2.0
a,3.0,
b,,
b,4.0,5.0


In [148]:
df.sum() #get the total of the columns

2016    8.0
2017    7.0
dtype: float64

In [151]:
df.mean(axis=1, skipna=False)

a    1.5
a    NaN
b    NaN
b    4.5
dtype: float64

In [152]:
df.describe() # commonly used summary statistics

Unnamed: 0,2016,2017
count,3.0,2.0
mean,2.666667,3.5
std,1.527525,2.12132
min,1.0,2.0
25%,2.0,2.75
50%,3.0,3.5
75%,3.5,4.25
max,4.0,5.0


### Descriptive and summary statistics
* count  - Number of non-NA values
* describe - compute set of summary statistics for Series and DataFrame column
* min, max - compute minimum and maximum values
* argmin, argmax - compute index locations(integers) at which minimum or maximum value obtained
* idxmin, idxmax - compute index values at which minimum or maximum value obtained
* quantile - compute sample quantile ranging from 0 to 1
* sum            - sum of values
* mean           - mean of values
* median         - median of values
* mad            - mean absolute deviation from the mean value (removed in pandas 2.0)
* var            - sample variances of values
* std            - sample standard deviation of values
* skew           - sample skewness(3rd moment) of values
* kurt           - sample kurtosis(4th moment) of values
* cumsum         - cumulative sum of values
* cummin, cummax - cumulative minimum and maximum of values
* cumprod        - cumulative product of values
* diff           - compute 1st arithmetic difference(useful for time series)
* pct_change     - compute percent change

In [153]:
obj = Series(list('cadaabbcc'))
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [154]:
obj.unique() #unique values

array(['c', 'a', 'd', 'b'], dtype=object)

In [156]:
obj.value_counts() #returns value frequencies

a    3
c    3
b    2
d    1
dtype: int64

In [158]:
# Alternatively, count the values of a plain array by wrapping it in a Series.
# The top-level pd.value_counts() function is deprecated in recent pandas;
# the Series method gives the same frequencies.
Series(obj.values).value_counts(sort=True)

a    3
c    3
b    2
d    1
dtype: int64

In [160]:
# data filtering using isin
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [161]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

# Handling Missing Data

In [164]:
fruits = Series(['apple', 'banana', np.nan, 'citrus'])
fruits.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [165]:
fruits.dropna()

0     apple
1    banana
3    citrus
dtype: object

In [167]:
fruits.fillna('orange')

0     apple
1    banana
2    orange
3    citrus
dtype: object

In [170]:
# Forward fill: each missing value takes the last valid value before it.
# fillna(method='ffill') is deprecated in recent pandas; .ffill() is the direct equivalent.
fruits.ffill()

0     apple
1    banana
2    banana
3    citrus
dtype: object

In [171]:
# Backward fill: each missing value takes the next valid value after it.
# fillna(method='bfill') is deprecated in recent pandas; .bfill() is the direct equivalent.
fruits.bfill()

0     apple
1    banana
2    citrus
3    citrus
dtype: object

In [173]:
fruits.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [174]:
fruits.isnull()

0    False
1    False
2     True
3    False
dtype: bool

# Missing Data Filtering


In [175]:
from numpy import nan as NA

In [176]:
data = Series([1, NA, 3.5, NA, 7])

In [178]:
data.dropna() #or data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [179]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [180]:
# 4x3 frame mixing a complete row, partially missing rows, and a fully missing row.
rows = [
    [1, 2, 3],
    [4, NA, NA],
    [NA, NA, NA],
    [NA, 5, 6],
]
data = DataFrame(rows)
data

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,
2,,,
3,,5.0,6.0


In [184]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [185]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,
3,,5.0,6.0


In [187]:
data[4] = NA # add new column with all NaN
data

Unnamed: 0,0,1,2,4
0,1.0,2.0,3.0,
1,4.0,,,
2,,,,
3,,5.0,6.0,


In [188]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,
2,,,
3,,5.0,6.0


# Filling Missing Data

In [189]:
data

Unnamed: 0,0,1,2,4
0,1.0,2.0,3.0,
1,4.0,,,
2,,,,
3,,5.0,6.0,


In [190]:
data.fillna(0)

Unnamed: 0,0,1,2,4
0,1.0,2.0,3.0,0.0
1,4.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,5.0,6.0,0.0


In [192]:
data.fillna({1: 0.5, 4: -1})

Unnamed: 0,0,1,2,4
0,1.0,2.0,3.0,-1.0
1,4.0,0.5,,-1.0
2,,0.5,,-1.0
3,,5.0,6.0,-1.0
