In [140]:
import pandas as pd
from IPython.display import display

## Pandas Series Object

Indexed one-dimensional data 

In [141]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [142]:
#access series values
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [143]:
#access series index
data.index

RangeIndex(start=0, stop=4, step=1)

In [144]:
#use series index to access value
data[1]

0.5

* In Numpy, there exist implicit index to access values within the array
* In Pandas Series, there can be explicit index to access values within Pandas series

In [145]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

* You can also use dictionary to map keys and values to Series

In [146]:
population_dict = {'california': 38332521,
                  'Texas': 26448193,
                  'New York': 19651127,
                  'Florida': 19552860,
                  'Illinois': 12882135}

population = pd.Series(population_dict)
population

california    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [147]:
# data can either be list or NumPy array, in such case the default index are integers
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [148]:
# data can be a repeated scalar in order to fill in assigned indices
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [149]:
#if data is a dictionary, series indices will be the dict's keys
pd.Series({2:'a', 1:'b', 3:'c'})

2    a
1    b
3    c
dtype: object

In [150]:
#if data is a dictionary, you can explicitely assign indices
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3    c
2    a
dtype: object

## Pandas Dataframe Object

In [151]:
area_dict = {'california': 423967,
            'Texas': 695662,
            'New York': 141297,
            'Florida': 170312,
            'Illinois': 149995}

area = pd.Series(area_dict)

area

california    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [152]:
population_dict = {'california': 38332521,
                  'Texas': 26448193,
                  'New York': 19651127,
                  'Florida': 19552860,
                  'Illinois': 12882135}

population = pd.Series(population_dict)

population

california    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [153]:
#Use two Series object to create a DataFrame object.
#Key names that are common to both series will be the index
states = pd.DataFrame({'population': population,
                      'area':area})

states

Unnamed: 0,population,area
california,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [154]:
#access index label
states.index

Index(['california', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [155]:
#access column labels
states.columns

Index(['population', 'area'], dtype='object')

In [156]:
#access specific column
states['area']

california    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [157]:
#or you can access this way
states.area

california    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [158]:
#add new colunn
states['density'] = states['population'] / states['area']
states

Unnamed: 0,population,area,density
california,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [159]:
#access values
states.values

array([[3.83325210e+07, 4.23967000e+05, 9.04139261e+01],
       [2.64481930e+07, 6.95662000e+05, 3.80187404e+01],
       [1.96511270e+07, 1.41297000e+05, 1.39076746e+02],
       [1.95528600e+07, 1.70312000e+05, 1.14806121e+02],
       [1.28821350e+07, 1.49995000e+05, 8.58837628e+01]])

In [160]:
states.T

Unnamed: 0,california,Texas,New York,Florida,Illinois
population,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
area,423967.0,695662.0,141297.0,170312.0,149995.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [161]:
#you cannot call by index name
states['california']

KeyError: 'california'

In [162]:
#however, you can do this
states['california':'New York']

Unnamed: 0,population,area,density
california,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746


In [163]:
#you can also do this
states[0:3]

Unnamed: 0,population,area,density
california,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746


In [164]:
#masking
states.density > 100

california    False
Texas         False
New York       True
Florida        True
Illinois      False
Name: density, dtype: bool

## Null Values

In [165]:
# include bool values
data = pd.Series([1, np.nan, 'hello', None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [166]:
display(data.notnull())

# masking
display(data[data.notnull()])

0     True
1    False
2     True
3    False
dtype: bool

0        1
2    hello
dtype: object

In [167]:
# in series
data.dropna()

0        1
2    hello
dtype: object

In [168]:
# in DataFrame 
df = pd.DataFrame([[1, np.nan, 2], 
                  [2, 3, 5],
                  [np.nan, 4, 6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [169]:
#drops any row with null values
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [170]:
#drop columns with any null values
df.dropna(axis = 1)

Unnamed: 0,2
0,2
1,5
2,6


In [171]:
# add a new column filled with NaN
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [172]:
# drop column when all its elements are null
df.dropna(axis = 1, how = 'all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [173]:
# at least 3 elements have to be not null from each row in order to be dropped
df.dropna(axis = 0, thresh = 3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


## Fill-in for Null

In [174]:
data = pd.Series([1, np.nan, 2, None, 3], index = list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [175]:
#fill in as 0
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [176]:
# fill in as the value of previous index (index - 1)
data.fillna(method = 'ffill')

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [177]:
# fill in as the value of next index (index + 1)
data.fillna(method = 'bfill')

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [178]:
#in dataframes, you can set the axis for fill method
display(df)

df.fillna(method = 'ffill', axis = 1)

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


## Merging

In [179]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                   'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [180]:
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                   'hire_date': [2004, 2008, 2012, 2014]})
df2

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [181]:
# merge the two dataframes
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


* pd.merge() knows that DataFrame has column Employee and uses this as the key to merge the two dataframes.

In [182]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                   'supervisor': ['Carly', 'Guido', 'Steve']})

display(df3)
display(df4)

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve


In [183]:
display(pd.merge(df3, df4))

Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


many to many join

In [184]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
                              'Engineering', 'Engineering',
                              'HR', 'HR'],
                   'hire_date': ['math', 'spreadsheets', 
                                 'coding', 'linux', 
                                 'spreadsheets', 'organization']})
display(df1)
display(df5)

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,group,hire_date
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization


In [185]:
display(pd.merge(df1, df5))

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organization


You can also assign which keys to merge on

In [186]:
display(df1)
display(df2)

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [187]:
pd.merge(df1, df2, on='employee')

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


## Join

In [188]:
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
                   'food': ['fish', 'beans', 'bread']},
                  columns=['name', 'food'])

df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
                   'drink': ['wine', 'beer']},
                  columns=['name', 'drink'])

display(df6)
display(df7)

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread


Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer


In [189]:
# intersection of the two dataframes
pd.merge(df6, df7)

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [190]:
# which is same as this
pd.merge(df6, df7, how='inner')

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [191]:
#outer join
pd.merge(df6, df7, how='outer')

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine
3,Joseph,,beer


In [192]:
#left join
pd.merge(df6, df7, how='left')

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine


In [193]:
#right join
pd.merge(df6, df7, how='right')

Unnamed: 0,name,food,drink
0,Mary,bread,wine
1,Joseph,,beer


In case you have overlapping column names, you can use suffixes keyword

In [194]:
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                   'rank': [1, 2, 3, 4]})
df8

Unnamed: 0,name,rank
0,Bob,1
1,Jake,2
2,Lisa,3
3,Sue,4


In [195]:
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                   'rank': [3, 1, 4, 2]})
df9

Unnamed: 0,name,rank
0,Bob,3
1,Jake,1
2,Lisa,4
3,Sue,2


In [196]:
# if you set Name as the key the rank columns will automatically be renamed
pd.merge(df8, df9, on='name')

Unnamed: 0,name,rank_x,rank_y
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


In [197]:
#you can set suffixes
pd.merge(df8, df9, on='name', suffixes=["_L", "_R"])

Unnamed: 0,name,rank_L,rank_R
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


## Pandas Aggregation

In [198]:
import seaborn as sns
planets = sns.load_dataset('planets')

#check the shape
display(planets.shape)

#preview the data
display(planets.head())

(1035, 6)

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [199]:
#describes the data
planets.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [200]:
#drop rows containig missing values and describe
display(planets.dropna().describe())

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


## Groupby

Groupby is a strong method that allows you to do the following:
1. Divide and categorize the dataframe based on given key values
2. Allows you to apply aggfunc, transformation, and filtering functions for grouped elements

In [201]:
#create dataframe
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'], 
                   'data': range(6)},
                 columns = ['key', 'data'])

display(df)

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [202]:
#group by key and calculate each groups' sum
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [203]:
#groupby method and find median
planets.groupby('method').median()

Unnamed: 0_level_0,number,orbital_period,mass,distance,year
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Astrometry,1.0,631.18,,17.875,2011.5
Eclipse Timing Variations,2.0,4343.5,5.125,315.36,2010.0
Imaging,1.0,27500.0,,40.395,2009.0
Microlensing,1.0,3300.0,,3840.0,2010.0
Orbital Brightness Modulation,2.0,0.342887,,1180.0,2011.0
Pulsar Timing,3.0,66.5419,,1200.0,1994.0
Pulsation Timing Variations,1.0,1170.0,,,2007.0
Radial Velocity,1.0,360.2,1.26,40.445,2009.0
Transit,1.0,5.714932,1.47,341.0,2012.0
Transit Timing Variations,2.0,57.011,,855.0,2012.5


In [204]:
#groupby method and select orbital_period column
planets.groupby('method')['orbital_period'].median()

method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

In [205]:
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method, group.shape))

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


In [206]:
#groupby method on year and describe year column
planets.groupby('method')['year'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,2011.5,2.12132,2010.0,2010.75,2011.5,2012.25,2013.0
Eclipse Timing Variations,9.0,2010.0,1.414214,2008.0,2009.0,2010.0,2011.0,2012.0
Imaging,38.0,2009.131579,2.781901,2004.0,2008.0,2009.0,2011.0,2013.0
Microlensing,23.0,2009.782609,2.859697,2004.0,2008.0,2010.0,2012.0,2013.0
Orbital Brightness Modulation,3.0,2011.666667,1.154701,2011.0,2011.0,2011.0,2012.0,2013.0
Pulsar Timing,5.0,1998.4,8.38451,1992.0,1992.0,1994.0,2003.0,2011.0
Pulsation Timing Variations,1.0,2007.0,,2007.0,2007.0,2007.0,2007.0,2007.0
Radial Velocity,553.0,2007.518987,4.249052,1989.0,2005.0,2009.0,2011.0,2014.0
Transit,397.0,2011.236776,2.077867,2002.0,2010.0,2012.0,2013.0,2014.0
Transit Timing Variations,4.0,2012.5,1.290994,2011.0,2011.75,2012.5,2013.25,2014.0


In [207]:
#unstack method
planets.groupby('method')['year'].describe().unstack()

       method                       
count  Astrometry                          2.000000
       Eclipse Timing Variations           9.000000
       Imaging                            38.000000
       Microlensing                       23.000000
       Orbital Brightness Modulation       3.000000
       Pulsar Timing                       5.000000
       Pulsation Timing Variations         1.000000
       Radial Velocity                   553.000000
       Transit                           397.000000
       Transit Timing Variations           4.000000
mean   Astrometry                       2011.500000
       Eclipse Timing Variations        2010.000000
       Imaging                          2009.131579
       Microlensing                     2009.782609
       Orbital Brightness Modulation    2011.666667
       Pulsar Timing                    1998.400000
       Pulsation Timing Variations      2007.000000
       Radial Velocity                  2007.518987
       Transit             

In [208]:
#randomly create dataframe
import numpy as np

rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                  'data1': range(6),
                  'data2': rng.randint(0, 10, 6),},
                 columns = ['key', 'data1', 'data2'])

df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [209]:
#method can be a string, numpy method, or built-in python method
df.groupby('key').aggregate(['min', np.median, max])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [210]:
df.groupby('key').aggregate({'data1': [min, np.mean], 'data2': 'max'})

Unnamed: 0_level_0,data1,data1,data2
Unnamed: 0_level_1,min,mean,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,0,1.5,5
B,1,2.5,7
C,2,3.5,9


In [222]:
"""filtering : filter(), filters based on the group's characteristics"""

# filters gorups whose standard deviation in 'data2' is greater than 4
def filter_func(x):
    return x['data2'].std() > 4

display(df)
display(df.groupby('key').std())

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.12132,1.414214
B,2.12132,4.949747
C,2.12132,4.242641


In [224]:
#this will return data which belongs to groups B and C.
display(df.groupby('key').filter(filter_func))

Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


In [225]:
#or you can use lambda function instead
display(df.groupby('key').filter(lambda x: x['data2'].std() > 4))

Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


In [229]:
display(df)
display(df.groupby('key').mean()) #this will group data by 'key' and calculate each groups' mean values

#this will calculate the difference between elements in each group and the grouped elements' mean value
display(df.groupby('key').transform(lambda x: x - x.mean()))

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1.5,4.0
B,2.5,3.5
C,3.5,6.0


Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


In [230]:
"""apply - used when you want to apply a certain function to the grouped results"""

#divide data1's values by the same group's data2 sum value.
def norm_by_data2(x):
    x['data1'] /= x['data2'].sum()
    return x

display(df)
display(df.groupby('key').apply(norm_by_data2))

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


Unnamed: 0,key,data1,data2
0,A,0.0,5
1,B,0.142857,0
2,C,0.166667,3
3,A,0.375,3
4,B,0.571429,7
5,C,0.416667,9


In [231]:
#assign split keys via list
L = [0, 1, 0, 1, 2, 0]
display(df)

# 0 - index 0, 2, 5 elements' sum
# 1 - index 1, 3 elements' sum
# 2 - index 4 elements' sum
display(df.groupby(L).sum())

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


Unnamed: 0,data1,data2
0,7,17
1,4,3
2,4,7


In [234]:
# using dictionary to map index to group keys
display(df)

df2 = df.set_index('key')
display(df2)

# A maps to C1, B and C maps to C2
mapping = {'A': 'C1', 'B': 'C2', 'C': 'C2'}

display(df2.groupby(mapping).sum())

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9


Unnamed: 0,data1,data2
C1,3,8
C2,12,19


In [235]:
#use python function to group
df2.groupby(str.lower).mean()

Unnamed: 0,data1,data2
a,1.5,4.0
b,2.5,3.5
c,3.5,6.0


## Pivot Table

In [212]:
#import titanic dataset
import seaborn as sns
titanic = sns.load_dataset('titanic')

titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [213]:
#group by sex and calculate the mean of survival status
titanic.groupby('sex')[['survived']].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [214]:
#group sex and class and calculate the mean of survival status
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [215]:
#use pivot table to get the same result
#the first parameter is value you want to observe, followed by index, followed by columns
titanic.pivot_table('survived', index='sex', columns='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


Multi-layer pivot table

In [236]:
# 1. this will create a 3-dimensional pivot table. divided into sex, class, and age
# age is divided into two layers
age = pd.cut(titanic['age'], [0, 18, 80])

# 3-dimensional table
display(titanic.pivot_table('survived', ['sex', age], 'class'))

# sum
display(titanic.pivot_table('survived', 
                            ['sex', age], # index
                            'class', # column
                            aggfunc=np.sum))

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 80]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 80]",0.375,0.071429,0.133663


Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",10,14,22
female,"(18, 80]",72,54,25
male,"(0, 18]",4,9,11
male,"(18, 80]",36,6,27


In [237]:
# 2. if you want to add a sub-division within the column, use pd.qcut
fare = pd.qcut(titanic['fare'], 2)

# this is now 4-dimensional pivot table
titanic.pivot_table('survived', 
                    ['sex', age], # index
                    [fare, 'class']) # column

Unnamed: 0_level_0,fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(0, 18]",,1.0,0.714286,0.909091,1.0,0.318182
female,"(18, 80]",,0.88,0.444444,0.972973,0.914286,0.391304
male,"(0, 18]",,0.0,0.26087,0.8,0.818182,0.178571
male,"(18, 80]",0.0,0.098039,0.125,0.391304,0.030303,0.192308


### other pivot table options

In [239]:
# 1. aggfunc
titanic.pivot_table(index='sex', columns='class',
                    aggfunc={'survived': sum, 'fare': 'mean'})

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,91,70,72
male,67.226127,19.741782,12.661633,45,17,47


## Vectorized String Operations

In [240]:
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
names = pd.Series(data)

names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [241]:
# 2. this will skip over null values and capitalize each word in the series1
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

In [242]:
# create text dataset
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                  'Eric Idle', 'Terry Jones', 'Michael Palin'])
monte

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3         Eric Idle
4       Terry Jones
5     Michael Palin
dtype: object

In [244]:
display(monte.str.upper()) #all uppercase
display(monte.str.lower()) #all lowercase

0    GRAHAM CHAPMAN
1       JOHN CLEESE
2     TERRY GILLIAM
3         ERIC IDLE
4       TERRY JONES
5     MICHAEL PALIN
dtype: object

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [245]:
#calculates the length
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [246]:
#returns boolean values for those who start with T
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [247]:
#split function
display(monte.str.split())
display(monte.str.split().str.get(-1))

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [248]:
#slicing
monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

## Time Series

In [249]:
from datetime import datetime
datetime(year=2018, month=2, day=4)

datetime.datetime(2018, 2, 4, 0, 0)

In [250]:
# enables datetime interpretation
from dateutil import parser
date = parser.parse("4th of July, 2015")

date

datetime.datetime(2015, 7, 4, 0, 0)

In [251]:
#returns the day of the week
date.strftime('%A')

'Saturday'

### Time series in Numpy: datetime64

In [252]:
import numpy as np
date = np.array('2018-02-04', dtype=np.datetime64)

date

array('2018-02-04', dtype='datetime64[D]')

In [253]:
#returns the following 12 days from the given date
date + np.arange(12)

array(['2018-02-04', '2018-02-05', '2018-02-06', '2018-02-07',
       '2018-02-08', '2018-02-09', '2018-02-10', '2018-02-11',
       '2018-02-12', '2018-02-13', '2018-02-14', '2018-02-15'],
      dtype='datetime64[D]')

### Time series Example Pandas

In [254]:
date = pd.to_datetime("4th of July, 2015")
date

Timestamp('2015-07-04 00:00:00')

In [255]:
date.strftime('%A')

'Saturday'

In [256]:
date + pd.to_timedelta(np.arange(12), 'D')

DatetimeIndex(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
               '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
               '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
              dtype='datetime64[ns]', freq=None)

### Pandas Time Series : Indexing using Time

In [258]:
# create index using timestamp
index = pd.DatetimeIndex(['2014-07-04', 
                          '2014-08-04', 
                          '2015-07-04', 
                          '2015-08-04'])

data = pd.Series([0, 1, 2, 3], index=index)

display(data)

# slicing using timestamp
display(data['2014-07-04':'2015-07-04'])

# extract data using year
display(data['2015'])

2014-07-04    0
2014-08-04    1
2015-07-04    2
2015-08-04    3
dtype: int64

2014-07-04    0
2014-08-04    1
2015-07-04    2
dtype: int64

2015-07-04    2
2015-08-04    3
dtype: int64

In [259]:
# 1. create datetime index from various types of datetime
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',
                       '2015-Jul-6', '07-07-2015', '20150708'])

dates

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
               '2015-07-08'],
              dtype='datetime64[ns]', freq=None)

In [260]:
# 2. transform dates to period and create periodindex
dates.to_period('D')

PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
             '2015-07-08'],
            dtype='period[D]', freq='D')

In [261]:
# 3. see differences between the dates
dates - dates[0]

TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)

### Pandas Datetime Sequence

In [262]:
# designate the start date and end date
display(pd.date_range('2015-07-03', '2015-07-10'))

# designate start date and periods
display(pd.date_range('2015-07-03', periods=8))

# designate startdate, period, and the units (this case HOUR)
display(pd.date_range('2015-07-03', periods=8, freq='H'))

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2015-07-03 00:00:00', '2015-07-03 01:00:00',
               '2015-07-03 02:00:00', '2015-07-03 03:00:00',
               '2015-07-03 04:00:00', '2015-07-03 05:00:00',
               '2015-07-03 06:00:00', '2015-07-03 07:00:00'],
              dtype='datetime64[ns]', freq='H')

## Advanced Pandas : eval() and query()

* NumPy's vectorization/broadcasting and grouping operations are generally efficient, however, sometimes they depend on temporary mid-point objects, which can create overhead in time and memory usage

### 1. eval()

In [263]:
# create test set
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) 
                      for i in range(4))

In [265]:
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.37454,0.950714,0.731994,0.598658,0.156019,0.155995,0.058084,0.866176,0.601115,0.708073,...,0.119594,0.713245,0.760785,0.561277,0.770967,0.493796,0.522733,0.427541,0.025419,0.107891
1,0.031429,0.63641,0.314356,0.508571,0.907566,0.249292,0.410383,0.755551,0.228798,0.07698,...,0.093103,0.897216,0.900418,0.633101,0.33903,0.34921,0.725956,0.89711,0.887086,0.779876
2,0.642032,0.08414,0.161629,0.898554,0.606429,0.009197,0.101472,0.663502,0.005062,0.160808,...,0.0305,0.037348,0.822601,0.360191,0.127061,0.522243,0.769994,0.215821,0.62289,0.085347
3,0.051682,0.531355,0.540635,0.63743,0.726091,0.975852,0.5163,0.322956,0.795186,0.270832,...,0.990505,0.412618,0.372018,0.776413,0.340804,0.930757,0.858413,0.428994,0.750871,0.754543
4,0.103124,0.902553,0.505252,0.826457,0.32005,0.895523,0.389202,0.010838,0.905382,0.091287,...,0.455657,0.620133,0.277381,0.188121,0.463698,0.353352,0.583656,0.077735,0.974395,0.986211


In [266]:
# add all four dataframes (the classic method)
%timeit df1 + df2 + df3 + df4

88.4 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [267]:
# use eval() instead and compare the time difference
%timeit pd.eval('df1 + df2 + df3 + df4')

43.3 ms ± 1.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### 2. query()

In [268]:
# create test set
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()

Unnamed: 0,A,B,C
0,0.615875,0.525167,0.047354
1,0.330858,0.412879,0.441564
2,0.689047,0.559068,0.23035
3,0.290486,0.695479,0.852587
4,0.42428,0.534344,0.245216


In [269]:
# 1-1. use masking method
# masking from (df.A < 0.5) & (df.B < 0.5)
df[(df.A < 0.5) & (df.B < 0.5)].head()

Unnamed: 0,A,B,C
1,0.330858,0.412879,0.441564
8,0.448611,0.415924,0.481001
10,0.11291,0.394884,0.950129
11,0.191011,0.118751,0.130223
14,0.075723,0.260648,0.956146


In [273]:
# use query instead
df.query('A < 0.5 and B < 0.5').head()

Unnamed: 0,A,B,C
1,0.330858,0.412879,0.441564
8,0.448611,0.415924,0.481001
10,0.11291,0.394884,0.950129
11,0.191011,0.118751,0.130223
14,0.075723,0.260648,0.956146
