<a href="https://colab.research.google.com/github/irinavalenzuela/Applied-Data-Science-Python/blob/main/Week3_dataprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to Data Science in Python

# Week 3: More Data Processing with Pandas

## Merging DataFrames

In [None]:
import pandas as pd

# Create 2 dataframes: staff and students
staff_df=pd.DataFrame([{'Name':'Kelly','Role':'Director of HR'},
                       {'Name':'Sally','Role':'Course liasion'},
                       {'Name':'James','Role':'Grader'}])

# Index by name
staff_df=staff_df.set_index('Name')

# Student dataframe
student_df=pd.DataFrame([{'Name':'James','School':'Business'},
                         {'Name':'Mike','School':'Law'},
                         {'Name':'Sally','School':'Engineering'}])
#Index by name
student_df=student_df.set_index('Name')

#Print the data frames
print(staff_df.head())
print(student_df.head())


                 Role
Name                 
Kelly  Director of HR
Sally  Course liasion
James          Grader
            School
Name              
James     Business
Mike           Law
Sally  Engineering


In [None]:
# Union of these dataframes: merge() and we want outer join
# We want to use the left and right indices as the joining columns

# The first dataframe is the LEFT and the second is the RIGHT

pd.merge(staff_df, student_df, how='outer',left_index=True,right_index=True)


Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Kelly,Director of HR,
Mike,,Law
Sally,Course liasion,Engineering


In [None]:
# Intersection: inner

pd.merge(staff_df,student_df,how='inner',left_index=True,right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Sally,Course liasion,Engineering
James,Grader,Business


In [None]:
# We want a list of all staff regardless of whether they were students or not
# If they are students, we want their details
# We use the LEFT join

pd.merge(staff_df,student_df,how='left',left_index=True,right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kelly,Director of HR,
Sally,Course liasion,Engineering
James,Grader,Business


In [None]:
# Most common way to merge: Using ON parameter (assign a column as the joining one)

# Lets remove index from both dataframes
staff_df=staff_df.reset_index()
student_df=student_df.reset_index()

# Merge using ON parameter
pd.merge(staff_df,student_df,how='right',on='Name')


Unnamed: 0,Name,Role,School
0,James,Grader,Business
1,Mike,,Law
2,Sally,Course liasion,Engineering


In [None]:
# Conflic with dataframes

staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR', 
                          'Location': 'State Street'},
                         {'Name': 'Sally', 'Role': 'Course liasion', 
                          'Location': 'Washington Avenue'},
                         {'Name': 'James', 'Role': 'Grader', 
                          'Location': 'Washington Avenue'}])
student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business', 
                            'Location': '1024 Billiard Avenue'},
                           {'Name': 'Mike', 'School': 'Law', 
                            'Location': 'Fraternity House #22'},
                           {'Name': 'Sally', 'School': 'Engineering', 
                            'Location': '512 Wilson Crescent'}])


# Location in staff is the office location and in students is their homes.

# Merge function appends _x (left frame) or _y (right frame) to help differentiate

# We want all staff information regardless of their student status, but we want student
# details as well: LEFT join and on the Name column

pd.merge(staff_df,student_df,how='left',on='Name')

Unnamed: 0,Name,Role,Location_x,School,Location_y
0,Kelly,Director of HR,State Street,,
1,Sally,Course liasion,Washington Avenue,Engineering,512 Wilson Crescent
2,James,Grader,Washington Avenue,Business,1024 Billiard Avenue


In [None]:
# Multi-indexing and multiple columns
# Use a list of multiple columns to join keys from both data frames: first and last name

# Example:
staff_df = pd.DataFrame([{'First Name': 'Kelly', 'Last Name': 'Desjardins', 
                          'Role': 'Director of HR'},
                         {'First Name': 'Sally', 'Last Name': 'Brooks', 
                          'Role': 'Course liasion'},
                         {'First Name': 'James', 'Last Name': 'Wilde', 
                          'Role': 'Grader'}])
student_df = pd.DataFrame([{'First Name': 'James', 'Last Name': 'Hammond', 
                            'School': 'Business'},
                           {'First Name': 'Mike', 'Last Name': 'Smith', 
                            'School': 'Law'},
                           {'First Name': 'Sally', 'Last Name': 'Brooks', 
                            'School': 'Engineering'}])

# We pass a python list ['First Name','Last Name']

pd.merge(staff_df,student_df,how='inner',on=['First Name','Last Name'])


Unnamed: 0,First Name,Last Name,Role,School
0,Sally,Brooks,Course liasion,Engineering


In [None]:
# Concatening: joining vertically

# US Departement of Education College Scorecard data
# The data is stored in separate CSV's with each year's record

# NOTE: CSV file we are working is messy, I supress Jupyter warning messages
# and tell read_csv and ignore bad lines: start with cell magic %%capture

%%capture
df_2011 = pd.read_csv("MERGED2011_12_PP.csv", error_bad_lines=False)
df_2012 = pd.read_csv("MERGED2012_13_PP.csv", error_bad_lines=False)
df_2013 = pd.read_csv("MERGED2013_14_PP.csv", error_bad_lines=False)


In [None]:
# Let's get a view

df_2011.head(3)

Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:889cb25b86e8bd07b2aa82b253e3fcd0d8c...
1,size 309076268


In [None]:
# Length of each data frame

print(len(df_2011))
print(len(df_2012))
print(len(df_2013))

2
2
2


In [None]:
# Put all three dataframes in a list
frames = [df_2011, df_2012, df_2013]

# Pass the list into the concat()

pd.concat(frames)

Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:889cb25b86e8bd07b2aa82b253e3fcd0d8c...
1,size 309076268
0,oid sha256:55cfd6746fdcc1cb5a29350c5a39c742ae8...
1,size 157050855
0,oid sha256:dbef09960b9dd4392f144a05562af3639d8...
1,size 157811280


In [None]:
# Let's add the number of rows of the 3 dataframes and see if the numbers match

len(df_2011)+len(df_2012)+len(df_2013)

6

In [None]:
# Concat / Key parameter: set extra level of indices.
# Pass a list of keys to correspond to the dataframes

pd.concat(frames, keys=['2011','2012','2013'])

# The indices as years

Unnamed: 0,Unnamed: 1,version https://git-lfs.github.com/spec/v1
2011,0,oid sha256:889cb25b86e8bd07b2aa82b253e3fcd0d8c...
2011,1,size 309076268
2012,0,oid sha256:55cfd6746fdcc1cb5a29350c5a39c742ae8...
2012,1,size 157050855
2013,0,oid sha256:dbef09960b9dd4392f144a05562af3639d8...
2013,1,size 157811280


## Pandas Idioms

In [None]:
# Pandorable: pandas's idioms

import pandas as pd
import numpy as np

# Bring some timing functionality from the timeit module
import timeit

# Census data from US
df = pd.read_csv('census.csv')
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,NPOPCHG_2010,NPOPCHG_2011,NPOPCHG_2012,NPOPCHG_2013,NPOPCHG_2014,NPOPCHG_2015,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,DEATHS2010,DEATHS2011,DEATHS2012,DEATHS2013,DEATHS2014,DEATHS2015,NATURALINC2010,NATURALINC2011,NATURALINC2012,NATURALINC2013,NATURALINC2014,NATURALINC2015,INTERNATIONALMIG2010,...,RESIDUAL2013,RESIDUAL2014,RESIDUAL2015,GQESTIMATESBASE2010,GQESTIMATES2010,GQESTIMATES2011,GQESTIMATES2012,GQESTIMATES2013,GQESTIMATES2014,GQESTIMATES2015,RBIRTH2011,RBIRTH2012,RBIRTH2013,RBIRTH2014,RBIRTH2015,RDEATH2011,RDEATH2012,RDEATH2013,RDEATH2014,RDEATH2015,RNATURALINC2011,RNATURALINC2012,RNATURALINC2013,RNATURALINC2014,RNATURALINC2015,RINTERNATIONALMIG2011,RINTERNATIONALMIG2012,RINTERNATIONALMIG2013,RINTERNATIONALMIG2014,RINTERNATIONALMIG2015,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,4801108,4816089,4830533,4846411,4858979,5034,15947,14981,14444,15878,12568,14226,59689,59062,57938,58334,58305,11089,48811,48357,50843,50228,50330,3137,10878,10705,7095,8106,7975,1357,...,677,-573,1135,116185,116212,115560,115666,116963,119088,119599,12.45302,12.282581,12.01208,12.056286,12.014973,10.183524,10.05636,10.541099,10.380963,10.371556,2.269496,2.22622,1.470981,1.675322,1.643417,1.02772,1.01984,1.002216,1.142716,1.179963,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,55253,55175,55038,55290,55347,89,593,-78,-137,252,57,151,636,615,574,623,600,152,507,558,583,504,467,-1,129,57,-9,119,133,33,...,22,-10,45,455,455,455,455,455,455,455,11.572789,11.138479,10.416194,11.293597,10.846281,9.225478,10.106133,10.579514,9.136393,8.442022,2.347311,1.032347,-0.16332,2.157204,2.404259,0.363924,0.289782,0.290347,0.3263,0.343466,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,186659,190396,195126,199713,203709,928,3466,3737,4730,4587,3996,517,2187,2092,2160,2186,2240,532,1825,1879,1902,2044,1992,-15,362,213,258,142,248,69,...,91,434,58,2307,2307,2307,2249,2304,2308,2309,11.826352,11.096524,11.205586,11.072868,11.104997,9.868812,9.966716,9.867141,10.353587,9.875515,1.95754,1.129809,1.338445,0.719281,1.229482,1.011215,0.912334,0.881921,1.073855,1.095627,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,27226,27159,26973,26815,26489,-116,-115,-67,-186,-158,-326,70,335,300,283,260,269,128,319,291,294,310,309,-58,16,9,-11,-50,-40,2,...,19,-1,-5,3193,3193,3382,3388,3389,3353,3352,12.278483,11.032454,10.455923,9.667584,10.093051,11.692048,10.70148,10.862337,11.526735,11.593877,0.586435,0.330974,-0.406414,-1.859151,-1.500825,-0.146609,-0.257424,-0.11084,-0.074366,0.0,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,22733,22642,22512,22549,22583,-58,-128,-91,-130,37,34,44,266,245,259,247,253,34,278,237,281,211,223,10,-12,8,-22,36,30,2,...,14,-16,-21,2224,2224,2224,2224,2224,2233,2236,11.668202,10.798898,11.471852,10.962917,11.211557,12.194587,10.446281,12.446295,9.365083,9.882124,-0.526385,0.352617,-0.974443,1.597834,1.329434,0.438654,0.705234,0.797272,0.93207,0.930604,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [None]:
# METHOD CHAINING
# Every method on an object returns a reference to that object
# Condense many different operations into one statement of code

# Pandorable way to write code with method chaining
# This code:
# pull out the state and city names as a multiple index, for data that has
# a summary level of 50. Then rename a column

# Start the statement with (): tells python the code statement will be over multiple lines

(df.where(df['SUMLEV']==50) #where function and pass a boolean mask 
    .dropna() # with the results of where function, we drop miss values
    .set_index(['STNAME','CTYNAME'])
    .rename(columns={'ESTIMATESBASE2010':'Estimates Base 2010'}))



Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,Estimates Base 2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,NPOPCHG_2010,NPOPCHG_2011,NPOPCHG_2012,NPOPCHG_2013,NPOPCHG_2014,NPOPCHG_2015,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,DEATHS2010,DEATHS2011,DEATHS2012,DEATHS2013,DEATHS2014,DEATHS2015,NATURALINC2010,NATURALINC2011,NATURALINC2012,NATURALINC2013,NATURALINC2014,NATURALINC2015,INTERNATIONALMIG2010,INTERNATIONALMIG2011,INTERNATIONALMIG2012,...,RESIDUAL2013,RESIDUAL2014,RESIDUAL2015,GQESTIMATESBASE2010,GQESTIMATES2010,GQESTIMATES2011,GQESTIMATES2012,GQESTIMATES2013,GQESTIMATES2014,GQESTIMATES2015,RBIRTH2011,RBIRTH2012,RBIRTH2013,RBIRTH2014,RBIRTH2015,RDEATH2011,RDEATH2012,RDEATH2013,RDEATH2014,RDEATH2015,RNATURALINC2011,RNATURALINC2012,RNATURALINC2013,RNATURALINC2014,RNATURALINC2015,RINTERNATIONALMIG2011,RINTERNATIONALMIG2012,RINTERNATIONALMIG2013,RINTERNATIONALMIG2014,RINTERNATIONALMIG2015,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1
Alabama,Autauga County,50.0,3.0,6.0,1.0,1.0,54571.0,54571.0,54660.0,55253.0,55175.0,55038.0,55290.0,55347.0,89.0,593.0,-78.0,-137.0,252.0,57.0,151.0,636.0,615.0,574.0,623.0,600.0,152.0,507.0,558.0,583.0,504.0,467.0,-1.0,129.0,57.0,-9.0,119.0,133.0,33.0,20.0,16.0,...,22.0,-10.0,45.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,11.572789,11.138479,10.416194,11.293597,10.846281,9.225478,10.106133,10.579514,9.136393,8.442022,2.347311,1.032347,-0.163320,2.157204,2.404259,0.363924,0.289782,0.290347,0.326300,0.343466,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.592270,-2.187333
Alabama,Baldwin County,50.0,3.0,6.0,1.0,3.0,182265.0,182265.0,183193.0,186659.0,190396.0,195126.0,199713.0,203709.0,928.0,3466.0,3737.0,4730.0,4587.0,3996.0,517.0,2187.0,2092.0,2160.0,2186.0,2240.0,532.0,1825.0,1879.0,1902.0,2044.0,1992.0,-15.0,362.0,213.0,258.0,142.0,248.0,69.0,187.0,172.0,...,91.0,434.0,58.0,2307.0,2307.0,2307.0,2249.0,2304.0,2308.0,2309.0,11.826352,11.096524,11.205586,11.072868,11.104997,9.868812,9.966716,9.867141,10.353587,9.875515,1.957540,1.129809,1.338445,0.719281,1.229482,1.011215,0.912334,0.881921,1.073855,1.095627,14.832960,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
Alabama,Barbour County,50.0,3.0,6.0,1.0,5.0,27457.0,27457.0,27341.0,27226.0,27159.0,26973.0,26815.0,26489.0,-116.0,-115.0,-67.0,-186.0,-158.0,-326.0,70.0,335.0,300.0,283.0,260.0,269.0,128.0,319.0,291.0,294.0,310.0,309.0,-58.0,16.0,9.0,-11.0,-50.0,-40.0,2.0,-4.0,-7.0,...,19.0,-1.0,-5.0,3193.0,3193.0,3382.0,3388.0,3389.0,3353.0,3352.0,12.278483,11.032454,10.455923,9.667584,10.093051,11.692048,10.701480,10.862337,11.526735,11.593877,0.586435,0.330974,-0.406414,-1.859151,-1.500825,-0.146609,-0.257424,-0.110840,-0.074366,0.000000,-4.728132,-2.500690,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
Alabama,Bibb County,50.0,3.0,6.0,1.0,7.0,22915.0,22919.0,22861.0,22733.0,22642.0,22512.0,22549.0,22583.0,-58.0,-128.0,-91.0,-130.0,37.0,34.0,44.0,266.0,245.0,259.0,247.0,253.0,34.0,278.0,237.0,281.0,211.0,223.0,10.0,-12.0,8.0,-22.0,36.0,30.0,2.0,10.0,16.0,...,14.0,-16.0,-21.0,2224.0,2224.0,2224.0,2224.0,2224.0,2233.0,2236.0,11.668202,10.798898,11.471852,10.962917,11.211557,12.194587,10.446281,12.446295,9.365083,9.882124,-0.526385,0.352617,-0.974443,1.597834,1.329434,0.438654,0.705234,0.797272,0.932070,0.930604,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
Alabama,Blount County,50.0,3.0,6.0,1.0,9.0,57322.0,57322.0,57373.0,57711.0,57776.0,57734.0,57658.0,57673.0,51.0,338.0,65.0,-42.0,-76.0,15.0,183.0,744.0,710.0,646.0,618.0,603.0,133.0,570.0,592.0,585.0,589.0,590.0,50.0,174.0,118.0,61.0,29.0,13.0,5.0,3.0,19.0,...,-22.0,-14.0,53.0,489.0,489.0,489.0,489.0,489.0,489.0,489.0,12.929686,12.295756,11.185179,10.711314,10.456859,9.905808,10.252236,10.128993,10.208680,10.231421,3.023878,2.043520,1.056186,0.502634,0.225438,0.052136,0.329041,0.346290,0.485302,0.485559,1.807375,-1.177622,-1.748766,-2.062535,-1.369970,1.859511,-0.848580,-1.402476,-1.577232,-0.884411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wyoming,Sweetwater County,50.0,4.0,8.0,56.0,37.0,43806.0,43806.0,43593.0,44041.0,45104.0,45162.0,44925.0,44626.0,-213.0,448.0,1063.0,58.0,-237.0,-299.0,167.0,640.0,595.0,657.0,629.0,620.0,76.0,251.0,273.0,296.0,246.0,262.0,91.0,389.0,322.0,361.0,383.0,358.0,5.0,8.0,0.0,...,-64.0,14.0,-27.0,679.0,679.0,694.0,697.0,731.0,671.0,672.0,14.606203,13.349038,14.556976,13.964279,13.846858,5.728370,6.124853,6.558394,5.461387,5.851414,8.877833,7.224185,7.998582,8.502892,7.995444,0.182578,0.000000,0.044313,0.177606,0.178669,1.072643,16.243199,-5.339774,-14.252889,-14.248864,1.255221,16.243199,-5.295460,-14.075283,-14.070195
Wyoming,Teton County,50.0,4.0,8.0,56.0,39.0,21294.0,21294.0,21297.0,21482.0,21697.0,22347.0,22905.0,23125.0,3.0,185.0,215.0,650.0,558.0,220.0,76.0,259.0,230.0,261.0,249.0,269.0,10.0,87.0,61.0,97.0,68.0,76.0,66.0,172.0,169.0,164.0,181.0,193.0,5.0,48.0,31.0,...,20.0,8.0,-8.0,271.0,271.0,271.0,270.0,268.0,268.0,267.0,12.108745,10.653327,11.851785,11.005038,11.688030,4.067416,2.825448,4.404686,3.005392,3.302194,8.041329,7.827879,7.447098,7.999646,8.385835,2.244092,1.435883,1.634729,2.165650,2.085596,-1.589565,0.972695,19.525929,14.143021,-0.564849,0.654527,2.408578,21.160658,16.308671,1.520747
Wyoming,Uinta County,50.0,4.0,8.0,56.0,41.0,21118.0,21118.0,21102.0,20912.0,20989.0,21022.0,20903.0,20822.0,-16.0,-190.0,77.0,33.0,-119.0,-81.0,73.0,324.0,311.0,316.0,316.0,316.0,49.0,139.0,115.0,136.0,130.0,137.0,24.0,185.0,196.0,180.0,186.0,179.0,2.0,-8.0,-13.0,...,11.0,4.0,3.0,270.0,270.0,245.0,236.0,254.0,254.0,254.0,15.423430,14.844514,15.043679,15.074538,15.146794,6.616842,5.489129,6.474495,6.201550,6.566806,8.806588,9.355385,8.569184,8.872987,8.579988,-0.380825,-0.620510,-0.618886,-0.524747,-0.479329,-17.755986,-4.916350,-6.902954,-14.215862,-12.127022,-18.136812,-5.536861,-7.521840,-14.740608,-12.606351
Wyoming,Washakie County,50.0,4.0,8.0,56.0,43.0,8533.0,8533.0,8545.0,8469.0,8443.0,8443.0,8316.0,8328.0,12.0,-76.0,-26.0,0.0,-127.0,12.0,26.0,108.0,90.0,95.0,96.0,90.0,34.0,79.0,105.0,77.0,70.0,79.0,-8.0,29.0,-15.0,18.0,26.0,11.0,1.0,-3.0,-3.0,...,1.0,-2.0,-11.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,12.695427,10.643330,11.251925,11.456531,10.814708,9.286470,12.417219,9.119981,8.353720,9.492910,3.408957,-1.773888,2.131944,3.102810,1.321798,-0.352651,-0.354778,-0.236883,-0.238678,-0.240327,-11.637475,-0.827815,-2.013502,-17.781491,1.682288,-11.990126,-1.182592,-2.250385,-18.020168,1.441961


In [None]:
# Traditional, non-pandorable way of writing the previous code

# 1st create a new dataframe from the original
df = df[df['SUMLEV']==50] # indexing operator [] which drops nans

# Update the df to have new index, inplace=True
df.set_index(['STNAME','CTYNAME'],inplace=True)

# Set the columns name
df.rename(columns={'ESTIMATESBASE2010':'Estimates Base 2010'})

Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,Estimates Base 2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,NPOPCHG_2010,NPOPCHG_2011,NPOPCHG_2012,NPOPCHG_2013,NPOPCHG_2014,NPOPCHG_2015,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,DEATHS2010,DEATHS2011,DEATHS2012,DEATHS2013,DEATHS2014,DEATHS2015,NATURALINC2010,NATURALINC2011,NATURALINC2012,NATURALINC2013,NATURALINC2014,NATURALINC2015,INTERNATIONALMIG2010,INTERNATIONALMIG2011,INTERNATIONALMIG2012,...,RESIDUAL2013,RESIDUAL2014,RESIDUAL2015,GQESTIMATESBASE2010,GQESTIMATES2010,GQESTIMATES2011,GQESTIMATES2012,GQESTIMATES2013,GQESTIMATES2014,GQESTIMATES2015,RBIRTH2011,RBIRTH2012,RBIRTH2013,RBIRTH2014,RBIRTH2015,RDEATH2011,RDEATH2012,RDEATH2013,RDEATH2014,RDEATH2015,RNATURALINC2011,RNATURALINC2012,RNATURALINC2013,RNATURALINC2014,RNATURALINC2015,RINTERNATIONALMIG2011,RINTERNATIONALMIG2012,RINTERNATIONALMIG2013,RINTERNATIONALMIG2014,RINTERNATIONALMIG2015,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1
Alabama,Autauga County,50,3,6,1,1,54571,54571,54660,55253,55175,55038,55290,55347,89,593,-78,-137,252,57,151,636,615,574,623,600,152,507,558,583,504,467,-1,129,57,-9,119,133,33,20,16,...,22,-10,45,455,455,455,455,455,455,455,11.572789,11.138479,10.416194,11.293597,10.846281,9.225478,10.106133,10.579514,9.136393,8.442022,2.347311,1.032347,-0.163320,2.157204,2.404259,0.363924,0.289782,0.290347,0.326300,0.343466,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.592270,-2.187333
Alabama,Baldwin County,50,3,6,1,3,182265,182265,183193,186659,190396,195126,199713,203709,928,3466,3737,4730,4587,3996,517,2187,2092,2160,2186,2240,532,1825,1879,1902,2044,1992,-15,362,213,258,142,248,69,187,172,...,91,434,58,2307,2307,2307,2249,2304,2308,2309,11.826352,11.096524,11.205586,11.072868,11.104997,9.868812,9.966716,9.867141,10.353587,9.875515,1.957540,1.129809,1.338445,0.719281,1.229482,1.011215,0.912334,0.881921,1.073855,1.095627,14.832960,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
Alabama,Barbour County,50,3,6,1,5,27457,27457,27341,27226,27159,26973,26815,26489,-116,-115,-67,-186,-158,-326,70,335,300,283,260,269,128,319,291,294,310,309,-58,16,9,-11,-50,-40,2,-4,-7,...,19,-1,-5,3193,3193,3382,3388,3389,3353,3352,12.278483,11.032454,10.455923,9.667584,10.093051,11.692048,10.701480,10.862337,11.526735,11.593877,0.586435,0.330974,-0.406414,-1.859151,-1.500825,-0.146609,-0.257424,-0.110840,-0.074366,0.000000,-4.728132,-2.500690,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
Alabama,Bibb County,50,3,6,1,7,22915,22919,22861,22733,22642,22512,22549,22583,-58,-128,-91,-130,37,34,44,266,245,259,247,253,34,278,237,281,211,223,10,-12,8,-22,36,30,2,10,16,...,14,-16,-21,2224,2224,2224,2224,2224,2233,2236,11.668202,10.798898,11.471852,10.962917,11.211557,12.194587,10.446281,12.446295,9.365083,9.882124,-0.526385,0.352617,-0.974443,1.597834,1.329434,0.438654,0.705234,0.797272,0.932070,0.930604,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
Alabama,Blount County,50,3,6,1,9,57322,57322,57373,57711,57776,57734,57658,57673,51,338,65,-42,-76,15,183,744,710,646,618,603,133,570,592,585,589,590,50,174,118,61,29,13,5,3,19,...,-22,-14,53,489,489,489,489,489,489,489,12.929686,12.295756,11.185179,10.711314,10.456859,9.905808,10.252236,10.128993,10.208680,10.231421,3.023878,2.043520,1.056186,0.502634,0.225438,0.052136,0.329041,0.346290,0.485302,0.485559,1.807375,-1.177622,-1.748766,-2.062535,-1.369970,1.859511,-0.848580,-1.402476,-1.577232,-0.884411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wyoming,Sweetwater County,50,4,8,56,37,43806,43806,43593,44041,45104,45162,44925,44626,-213,448,1063,58,-237,-299,167,640,595,657,629,620,76,251,273,296,246,262,91,389,322,361,383,358,5,8,0,...,-64,14,-27,679,679,694,697,731,671,672,14.606203,13.349038,14.556976,13.964279,13.846858,5.728370,6.124853,6.558394,5.461387,5.851414,8.877833,7.224185,7.998582,8.502892,7.995444,0.182578,0.000000,0.044313,0.177606,0.178669,1.072643,16.243199,-5.339774,-14.252889,-14.248864,1.255221,16.243199,-5.295460,-14.075283,-14.070195
Wyoming,Teton County,50,4,8,56,39,21294,21294,21297,21482,21697,22347,22905,23125,3,185,215,650,558,220,76,259,230,261,249,269,10,87,61,97,68,76,66,172,169,164,181,193,5,48,31,...,20,8,-8,271,271,271,270,268,268,267,12.108745,10.653327,11.851785,11.005038,11.688030,4.067416,2.825448,4.404686,3.005392,3.302194,8.041329,7.827879,7.447098,7.999646,8.385835,2.244092,1.435883,1.634729,2.165650,2.085596,-1.589565,0.972695,19.525929,14.143021,-0.564849,0.654527,2.408578,21.160658,16.308671,1.520747
Wyoming,Uinta County,50,4,8,56,41,21118,21118,21102,20912,20989,21022,20903,20822,-16,-190,77,33,-119,-81,73,324,311,316,316,316,49,139,115,136,130,137,24,185,196,180,186,179,2,-8,-13,...,11,4,3,270,270,245,236,254,254,254,15.423430,14.844514,15.043679,15.074538,15.146794,6.616842,5.489129,6.474495,6.201550,6.566806,8.806588,9.355385,8.569184,8.872987,8.579988,-0.380825,-0.620510,-0.618886,-0.524747,-0.479329,-17.755986,-4.916350,-6.902954,-14.215862,-12.127022,-18.136812,-5.536861,-7.521840,-14.740608,-12.606351
Wyoming,Washakie County,50,4,8,56,43,8533,8533,8545,8469,8443,8443,8316,8328,12,-76,-26,0,-127,12,26,108,90,95,96,90,34,79,105,77,70,79,-8,29,-15,18,26,11,1,-3,-3,...,1,-2,-11,140,140,140,140,140,140,140,12.695427,10.643330,11.251925,11.456531,10.814708,9.286470,12.417219,9.119981,8.353720,9.492910,3.408957,-1.773888,2.131944,3.102810,1.321798,-0.352651,-0.354778,-0.236883,-0.238678,-0.240327,-11.637475,-0.827815,-2.013502,-17.781491,1.682288,-11.990126,-1.182592,-2.250385,-18.020168,1.441961


In [None]:
# Time both methods: put into a function and pass the function into the timeit function

# Function for the first approach. Return all 
def first_approach():
  global df
  # Pass the code right here
  return (df.where(df['SUMLEV']==50) 
              .dropna() 
              .set_index(['STNAME','CTYNAME'])
              .rename(columns={'ESTIMATESBASE2010':'Estimates Base 2010'}))
  
# Read in our new dataser

df = pd.read_csv('census.csv')

# Lets run it

timeit.timeit(first_approach,number=10) # 10 times this function will be run

0.2246381019999717

In [None]:
# Second approach

def second_approach():
  global df
  new_df = df[df['SUMLEV']==50] 
  new_df.set_index(['STNAME','CTYNAME'],inplace=True)
  return new_df.rename(columns={'ESTIMATESBASE2010':'Estimates Base 2010'})

df = pd.read_csv('census.csv')

timeit.timeit(second_approach, number=10)

0.06699047899996913

In [None]:
# Another idiom: map across all of the rows in a DataFrame. Function apply

# Census dataframe: 5 columns for each column corresponding one year of estimates
# We want to create some new columns for min or max values

# First, write a function which takes in a particular row of data, and finds
# a min and max values, and return a new row of data. We called this function min_max

def min_max(row):
  # Create some small slice of a row by projecting pop columns
  data = row[['POPESTIMATE2010',
              'POPESTIMATE2011',
              'POPESTIMATE2012',
              'POPESTIMATE2013',
              'POPESTIMATE2014',
              'POPESTIMATE2015']]
  # Create new series and use Numpy min and max functions            
  return pd.Series({'min':np.min(data), 'max':np.max(data)})            

In [None]:
# Apply takes the function and the axis on which to operate as parameters
# To apply across all rows, which is applying on all columns, pass axis == 'columns'

df.apply(min_max,axis='columns').head()

Unnamed: 0,min,max
0,4785161,4858979
1,54660,55347
2,183193,203709
3,26489,27341
4,22512,22861


In [None]:
# Instead of returning a separate series to display, we add two new columns: min and max

def min_max(row):
  data = row[['POPESTIMATE2010',
              'POPESTIMATE2011',
              'POPESTIMATE2012',
              'POPESTIMATE2013',
              'POPESTIMATE2014',
              'POPESTIMATE2015']] 
  # Create a new entry for max:
  row['max']=np.max(data)
  # Create a new entrey for min;
  row['min']=np.min(data)
  return row

# Apply the function across the dataframe
df.apply(min_max,axis='columns')  


Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,NPOPCHG_2010,NPOPCHG_2011,NPOPCHG_2012,NPOPCHG_2013,NPOPCHG_2014,NPOPCHG_2015,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,DEATHS2010,DEATHS2011,DEATHS2012,DEATHS2013,DEATHS2014,DEATHS2015,NATURALINC2010,NATURALINC2011,NATURALINC2012,NATURALINC2013,NATURALINC2014,NATURALINC2015,INTERNATIONALMIG2010,...,RESIDUAL2015,GQESTIMATESBASE2010,GQESTIMATES2010,GQESTIMATES2011,GQESTIMATES2012,GQESTIMATES2013,GQESTIMATES2014,GQESTIMATES2015,RBIRTH2011,RBIRTH2012,RBIRTH2013,RBIRTH2014,RBIRTH2015,RDEATH2011,RDEATH2012,RDEATH2013,RDEATH2014,RDEATH2015,RNATURALINC2011,RNATURALINC2012,RNATURALINC2013,RNATURALINC2014,RNATURALINC2015,RINTERNATIONALMIG2011,RINTERNATIONALMIG2012,RINTERNATIONALMIG2013,RINTERNATIONALMIG2014,RINTERNATIONALMIG2015,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,max,min
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,4801108,4816089,4830533,4846411,4858979,5034,15947,14981,14444,15878,12568,14226,59689,59062,57938,58334,58305,11089,48811,48357,50843,50228,50330,3137,10878,10705,7095,8106,7975,1357,...,1135,116185,116212,115560,115666,116963,119088,119599,12.453020,12.282581,12.012080,12.056286,12.014973,10.183524,10.056360,10.541099,10.380963,10.371556,2.269496,2.226220,1.470981,1.675322,1.643417,1.027720,1.019840,1.002216,1.142716,1.179963,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594,4858979,4785161
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,55253,55175,55038,55290,55347,89,593,-78,-137,252,57,151,636,615,574,623,600,152,507,558,583,504,467,-1,129,57,-9,119,133,33,...,45,455,455,455,455,455,455,455,11.572789,11.138479,10.416194,11.293597,10.846281,9.225478,10.106133,10.579514,9.136393,8.442022,2.347311,1.032347,-0.163320,2.157204,2.404259,0.363924,0.289782,0.290347,0.326300,0.343466,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.592270,-2.187333,55347,54660
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,186659,190396,195126,199713,203709,928,3466,3737,4730,4587,3996,517,2187,2092,2160,2186,2240,532,1825,1879,1902,2044,1992,-15,362,213,258,142,248,69,...,58,2307,2307,2307,2249,2304,2308,2309,11.826352,11.096524,11.205586,11.072868,11.104997,9.868812,9.966716,9.867141,10.353587,9.875515,1.957540,1.129809,1.338445,0.719281,1.229482,1.011215,0.912334,0.881921,1.073855,1.095627,14.832960,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499,203709,183193
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,27226,27159,26973,26815,26489,-116,-115,-67,-186,-158,-326,70,335,300,283,260,269,128,319,291,294,310,309,-58,16,9,-11,-50,-40,2,...,-5,3193,3193,3382,3388,3389,3353,3352,12.278483,11.032454,10.455923,9.667584,10.093051,11.692048,10.701480,10.862337,11.526735,11.593877,0.586435,0.330974,-0.406414,-1.859151,-1.500825,-0.146609,-0.257424,-0.110840,-0.074366,0.000000,-4.728132,-2.500690,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299,27341,26489
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,22733,22642,22512,22549,22583,-58,-128,-91,-130,37,34,44,266,245,259,247,253,34,278,237,281,211,223,10,-12,8,-22,36,30,2,...,-21,2224,2224,2224,2224,2224,2233,2236,11.668202,10.798898,11.471852,10.962917,11.211557,12.194587,10.446281,12.446295,9.365083,9.882124,-0.526385,0.352617,-0.974443,1.597834,1.329434,0.438654,0.705234,0.797272,0.932070,0.930604,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861,22861,22512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,50,4,8,56,37,Wyoming,Sweetwater County,43806,43806,43593,44041,45104,45162,44925,44626,-213,448,1063,58,-237,-299,167,640,595,657,629,620,76,251,273,296,246,262,91,389,322,361,383,358,5,...,-27,679,679,694,697,731,671,672,14.606203,13.349038,14.556976,13.964279,13.846858,5.728370,6.124853,6.558394,5.461387,5.851414,8.877833,7.224185,7.998582,8.502892,7.995444,0.182578,0.000000,0.044313,0.177606,0.178669,1.072643,16.243199,-5.339774,-14.252889,-14.248864,1.255221,16.243199,-5.295460,-14.075283,-14.070195,45162,43593
3189,50,4,8,56,39,Wyoming,Teton County,21294,21294,21297,21482,21697,22347,22905,23125,3,185,215,650,558,220,76,259,230,261,249,269,10,87,61,97,68,76,66,172,169,164,181,193,5,...,-8,271,271,271,270,268,268,267,12.108745,10.653327,11.851785,11.005038,11.688030,4.067416,2.825448,4.404686,3.005392,3.302194,8.041329,7.827879,7.447098,7.999646,8.385835,2.244092,1.435883,1.634729,2.165650,2.085596,-1.589565,0.972695,19.525929,14.143021,-0.564849,0.654527,2.408578,21.160658,16.308671,1.520747,23125,21297
3190,50,4,8,56,41,Wyoming,Uinta County,21118,21118,21102,20912,20989,21022,20903,20822,-16,-190,77,33,-119,-81,73,324,311,316,316,316,49,139,115,136,130,137,24,185,196,180,186,179,2,...,3,270,270,245,236,254,254,254,15.423430,14.844514,15.043679,15.074538,15.146794,6.616842,5.489129,6.474495,6.201550,6.566806,8.806588,9.355385,8.569184,8.872987,8.579988,-0.380825,-0.620510,-0.618886,-0.524747,-0.479329,-17.755986,-4.916350,-6.902954,-14.215862,-12.127022,-18.136812,-5.536861,-7.521840,-14.740608,-12.606351,21102,20822
3191,50,4,8,56,43,Wyoming,Washakie County,8533,8533,8545,8469,8443,8443,8316,8328,12,-76,-26,0,-127,12,26,108,90,95,96,90,34,79,105,77,70,79,-8,29,-15,18,26,11,1,...,-11,140,140,140,140,140,140,140,12.695427,10.643330,11.251925,11.456531,10.814708,9.286470,12.417219,9.119981,8.353720,9.492910,3.408957,-1.773888,2.131944,3.102810,1.321798,-0.352651,-0.354778,-0.236883,-0.238678,-0.240327,-11.637475,-0.827815,-2.013502,-17.781491,1.682288,-11.990126,-1.182592,-2.250385,-18.020168,1.441961,8545,8316


In [None]:
# Lambdas

# Calculate the max of the columns using the apply function

rows = ['POPESTIMATE2010', 'POPESTIMATE2011','POPESTIMATE2012','POPESTIMATE2013','POPESTIMATE2014', 'POPESTIMATE2015']

# Lambda is an unname function. In this case, takes a single parameter, x, and
# returns a single value: max over all columns associated with row x
df.apply(lambda x: np.max(x[rows]),axis=1).head()

0    4858979
1      55347
2     203709
3      27341
4      22861
dtype: int64

In [None]:
# Divide the states into 4 categories: Northeast, Midwest, South and West

# Write a function

def get_state_region(x):
    northeast = ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 
                 'Rhode Island','Vermont','New York','New Jersey','Pennsylvania']
    midwest = ['Illinois','Indiana','Michigan','Ohio','Wisconsin','Iowa',
               'Kansas','Minnesota','Missouri','Nebraska','North Dakota',
               'South Dakota']
    south = ['Delaware','Florida','Georgia','Maryland','North Carolina',
             'South Carolina','Virginia','District of Columbia','West Virginia',
             'Alabama','Kentucky','Mississippi','Tennessee','Arkansas',
             'Louisiana','Oklahoma','Texas']
    west = ['Arizona','Colorado','Idaho','Montana','Nevada','New Mexico','Utah',
            'Wyoming','Alaska','California','Hawaii','Oregon','Washington']
    if x in northeast:
      return "Northeast"
    elif x in midwest:
      return "Midwest"
    elif x in south:
      return "South"
    else:
      return "West"      

In [None]:
# Create a new column called Region, which shows the state's region

# The get_state_region is supposed to work on the state name column 'STNAME'
# Set the apply function on the state name column and pass the function into the apply

df['state_region']= df['STNAME'].apply(lambda x: get_state_region(x))

In [None]:
# Lets see the results
df[['STNAME','state_region']].head()

Unnamed: 0,STNAME,state_region
0,Alabama,South
1,Alabama,South
2,Alabama,South
3,Alabama,South
4,Alabama,South


## Groupby

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Splitting

# US Census
df = pd.read_csv('census.csv')
# Exlude state level summarization
df = df[df['SUMLEV']==50]
df.head()

In [None]:
# Lets get a list of the unique states, then iterate over all states and for each the average

# Run it for 3 times and time it: cell magic function %%timeit

%%timeit -n 3

for state in df['STNAME'].unique():
  # Calculate the average for the particular state
  avg = np.average(df.where(df['STNAME']==state).dropna()['CENSUS2010POP'])
  # Print it
  print('Counties in the state '+ state + ' have an avg population of ' + str(avg))


Counties in the state Alabama have an avg population of 71339.34328358209
Counties in the state Alaska have an avg population of 24490.724137931036
Counties in the state Arizona have an avg population of 426134.4666666667
Counties in the state Arkansas have an avg population of 38878.90666666667
Counties in the state California have an avg population of 642309.5862068966
Counties in the state Colorado have an avg population of 78581.1875
Counties in the state Connecticut have an avg population of 446762.125
Counties in the state Delaware have an avg population of 299311.3333333333
Counties in the state District of Columbia have an avg population of 601723.0
Counties in the state Florida have an avg population of 280616.5671641791
Counties in the state Georgia have an avg population of 60928.63522012578
Counties in the state Hawaii have an avg population of 272060.2
Counties in the state Idaho have an avg population of 35626.86363636364
Counties in the state Illinois have an avg populat

In [None]:
%%timeit -n 3

# Tell panda to group by state name: this is the split

for group, frame in df.groupby('STNAME'):
  # groupby() returns a tuple. 1st value is the key we are trying to group by,
  # state name in this case ("group"). 2nd is the projected dataframe that 
  # was found for that group ("frame")

  # Calculate an average
  avg = np.average(frame['CENSUS2010POP'])
  # Print the results
  print('Counties in state '+ group +' have an avg population of '+ str(avg))


Counties in state Alabama have an avg population of 71339.34328358209
Counties in state Alaska have an avg population of 24490.724137931036
Counties in state Arizona have an avg population of 426134.4666666667
Counties in state Arkansas have an avg population of 38878.90666666667
Counties in state California have an avg population of 642309.5862068966
Counties in state Colorado have an avg population of 78581.1875
Counties in state Connecticut have an avg population of 446762.125
Counties in state Delaware have an avg population of 299311.3333333333
Counties in state District of Columbia have an avg population of 601723.0
Counties in state Florida have an avg population of 280616.5671641791
Counties in state Georgia have an avg population of 60928.63522012578
Counties in state Hawaii have an avg population of 272060.2
Counties in state Idaho have an avg population of 35626.86363636364
Counties in state Illinois have an avg population of 125790.50980392157
Counties in state Indiana have

In [None]:
# Provide a function to groupby and use that to segment your data

# Create function which returns a number between 0 and 2 based on the 1st character
# of the state name
# Then, tell groupby to use this function to split up our dataframe
# IMPORTANT: you need to set the index to be the column you want to groupby first

# Create new function set_batch_number
# If the 1st letter of the parameter is a capital M we'll return 0
# If it is Q, return 1. Otherwise, return 2

df= df.set_index('STNAME')

def set_batch_number(item):
  if item[0]<'M':
    return 0
  if item[0]<'Q':
    return 1
  return 2

# Dataframe grouped by according to the bacth number. 
# We loop through each batch group

for group, frame in df.groupby(set_batch_number):
  print('There are ' + str(len(frame)) + ' records in group ' + str(group) + ' for processing.') 
  

There are 1177 records in group 0 for processing.
There are 1134 records in group 1 for processing.
There are 831 records in group 2 for processing.


In [None]:
# Dataset: housing of airbnb

df = pd.read_csv('listings.csv')
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,...,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",,"The bus stop is 2 blocks away, and frequent. B...","You will have access to 2 bedrooms, a living r...",,Clean up and treat the home the way you'd like...,https://a2.muscache.com/im/pictures/c0842db1-e...,https://a2.muscache.com/im/pictures/c0842db1-e...,https://a2.muscache.com/im/pictures/c0842db1-e...,https://a2.muscache.com/im/pictures/c0842db1-e...,31303940,https://www.airbnb.com/users/show/31303940,Virginia,2015-04-15,"Boston, Massachusetts, United States",We are country and city connecting in our deck...,,,,f,https://a2.muscache.com/im/pictures/5936fef0-b...,https://a2.muscache.com/im/pictures/5936fef0-b...,Roslindale,1,1,"['email', 'phone', 'facebook', 'reviews']",t,f,"Birch Street, Boston, MA 02131, United States",Roslindale,Roslindale,...,2.0,3.0,Real Bed,"{TV,""Wireless Internet"",Kitchen,""Free Parking ...",,$250.00,,,,$35.00,1,$0.00,2,1125,2 weeks ago,,0,0,0,0,2016-09-06,0,,,,,,,,,,f,,,f,moderate,f,f,1,
1,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,none,"The room is in Roslindale, a diverse and prima...","If you don't have a US cell phone, you can tex...",Plenty of safe street parking. Bus stops a few...,Apt has one more bedroom (which I use) and lar...,"If I am at home, I am likely working in my hom...",Pet friendly but please confirm with me if the...,https://a1.muscache.com/im/pictures/39327812/d...,https://a1.muscache.com/im/pictures/39327812/d...,https://a1.muscache.com/im/pictures/39327812/d...,https://a1.muscache.com/im/pictures/39327812/d...,2572247,https://www.airbnb.com/users/show/2572247,Andrea,2012-06-07,"Boston, Massachusetts, United States",I live in Boston and I like to travel and have...,within an hour,100%,100%,f,https://a2.muscache.com/im/users/2572247/profi...,https://a2.muscache.com/im/users/2572247/profi...,Roslindale,1,1,"['email', 'phone', 'facebook', 'linkedin', 'am...",t,t,"Pinehurst Street, Boston, MA 02131, United States",Roslindale,Roslindale,...,1.0,1.0,Real Bed,"{TV,Internet,""Wireless Internet"",""Air Conditio...",,$65.00,$400.00,,$95.00,$10.00,0,$0.00,2,15,a week ago,,26,54,84,359,2016-09-06,36,2014-06-01,2016-08-13,94.0,10.0,9.0,10.0,10.0,9.0,9.0,f,,,t,moderate,f,f,1,1.3
2,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",none,The LOCATION: Roslindale is a safe and diverse...,I am in a scenic part of Boston with a couple ...,"PUBLIC TRANSPORTATION: From the house, quick p...","I am living in the apartment during your stay,...","ABOUT ME: I'm a laid-back, friendly, unmarried...","I encourage you to use my kitchen, cooking and...",https://a2.muscache.com/im/pictures/6ae8335d-9...,https://a2.muscache.com/im/pictures/6ae8335d-9...,https://a2.muscache.com/im/pictures/6ae8335d-9...,https://a2.muscache.com/im/pictures/6ae8335d-9...,16701,https://www.airbnb.com/users/show/16701,Phil,2009-05-11,"Boston, Massachusetts, United States","I am a middle-aged, single male with a wide ra...",within a few hours,100%,88%,t,https://a2.muscache.com/im/users/16701/profile...,https://a2.muscache.com/im/users/16701/profile...,Roslindale,1,1,"['email', 'phone', 'reviews', 'jumio']",t,t,"Ardale St., Boston, MA 02131, United States",Roslindale,Roslindale,...,1.0,1.0,Real Bed,"{TV,""Cable TV"",""Wireless Internet"",""Air Condit...",,$65.00,$395.00,"$1,350.00",,,1,$20.00,3,45,5 days ago,,19,46,61,319,2016-09-06,41,2009-07-19,2016-08-05,98.0,10.0,9.0,10.0,10.0,9.0,10.0,f,,,f,moderate,t,f,1,0.47
3,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,none,Roslindale is a lovely little neighborhood loc...,Please be mindful of the property as it is old...,There are buses that stop right in front of th...,The basement has a washer dryer and gym area. ...,We do live in the house therefore might be som...,- The bathroom and house are shared so please ...,https://a2.muscache.com/im/pictures/39764190-1...,https://a2.muscache.com/im/pictures/39764190-1...,https://a2.muscache.com/im/pictures/39764190-1...,https://a2.muscache.com/im/pictures/39764190-1...,6031442,https://www.airbnb.com/users/show/6031442,Meghna,2013-04-21,"Boston, Massachusetts, United States",My husband and I live on the property. He’s a...,within a few hours,100%,50%,f,https://a2.muscache.com/im/pictures/5d430cde-7...,https://a2.muscache.com/im/pictures/5d430cde-7...,,1,1,"['email', 'phone', 'reviews']",t,f,"Boston, MA, United States",,Roslindale,...,1.0,2.0,Real Bed,"{TV,Internet,""Wireless Internet"",""Air Conditio...",,$75.00,,,$100.00,$50.00,2,$25.00,1,1125,a week ago,,6,16,26,98,2016-09-06,1,2016-08-28,2016-08-28,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,,,f,moderate,f,f,1,1.0
4,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",none,"I love the proximity to downtown, the neighbor...",I have one roommate who lives on the lower lev...,From Logan Airport and South Station you have...,You will have access to the front and side por...,I love my city and really enjoy sharing it wit...,"Please no smoking in the house, porch or on th...",https://a1.muscache.com/im/pictures/97154760/8...,https://a1.muscache.com/im/pictures/97154760/8...,https://a1.muscache.com/im/pictures/97154760/8...,https://a1.muscache.com/im/pictures/97154760/8...,15396970,https://www.airbnb.com/users/show/15396970,Linda,2014-05-11,"Boston, Massachusetts, United States",I work full time for a public school district....,within an hour,100%,100%,t,https://a0.muscache.com/im/users/15396970/prof...,https://a0.muscache.com/im/users/15396970/prof...,Roslindale,1,1,"['email', 'phone', 'reviews', 'kba']",t,t,"Durnell Avenue, Boston, MA 02131, United States",Roslindale,Roslindale,...,1.0,2.0,Real Bed,"{Internet,""Wireless Internet"",""Air Conditionin...",,$79.00,,,,$15.00,1,$0.00,2,31,2 weeks ago,,13,34,59,334,2016-09-06,29,2015-08-18,2016-09-01,99.0,10.0,10.0,10.0,10.0,9.0,10.0,f,,,f,flexible,f,f,1,2.25


In [None]:
# Group by both of these columns
# 1st approach: multiindex

df=df.set_index(['cancellation_policy','review_scores_value'])

# When having multindex, we need to pass the levels

for group, frame in df.groupby(level=(0,1)):
  print(group)

('flexible', 2.0)
('flexible', 4.0)
('flexible', 5.0)
('flexible', 6.0)
('flexible', 7.0)
('flexible', 8.0)
('flexible', 9.0)
('flexible', 10.0)
('moderate', 2.0)
('moderate', 4.0)
('moderate', 6.0)
('moderate', 7.0)
('moderate', 8.0)
('moderate', 9.0)
('moderate', 10.0)
('strict', 2.0)
('strict', 3.0)
('strict', 4.0)
('strict', 5.0)
('strict', 6.0)
('strict', 7.0)
('strict', 8.0)
('strict', 9.0)
('strict', 10.0)
('super_strict_30', 6.0)
('super_strict_30', 7.0)
('super_strict_30', 8.0)
('super_strict_30', 9.0)
('super_strict_30', 10.0)


In [None]:
# Group by cancellation policy and review score, but separate out all 10's from nott 10's

def grouping_fun(item):
  # Check the review_score_value portion of the index.
  # item is in the format of (cancellation, review)
  if item[1]==10.0:
    return (item[0],"10.0")
  else:
    return (item[0],"not 10.0")

for group, frame in df.groupby(by=grouping_fun):
  print(group)      

('flexible', '10.0')
('flexible', 'not 10.0')
('moderate', '10.0')
('moderate', 'not 10.0')
('strict', '10.0')
('strict', 'not 10.0')
('super_strict_30', '10.0')
('super_strict_30', 'not 10.0')


### Aggregation

Agg() returns a single value per column, so one row per group

In [None]:
# Agg() on he groupby object
# With Agg() we can pass in a dictionary of the columns we are interested
# in aggregation along with the function we are looking to apply to aggregate

# Lets reset the index for the airbnb data

df=df.reset_index()

# Lets group by the cancellation policy abd find the average review scores by group

df.groupby('cancellation_policy').agg({'review_scores_value':np.average})

Unnamed: 0_level_0,review_scores_value
cancellation_policy,Unnamed: 1_level_1
flexible,
moderate,
strict,
super_strict_30,


In [None]:
# What is the problem?
# There is a bunch of no numbers. np.average does not ignore NaNs

df.groupby('cancellation_policy').agg({'review_scores_value':np.nanmean})

Unnamed: 0_level_0,review_scores_value
cancellation_policy,Unnamed: 1_level_1
flexible,9.237421
moderate,9.307398
strict,9.081441
super_strict_30,8.537313


In [None]:
# Aggregate by multiple functions or multiple columns

# df.groupby(): a groupby on the data frame by the column 'cancelation'-> new object
# Then, we invoke the agg() to that object
# df.groupby('xxx').agg(): the agg() function is going to apply one or more functions
# to the group dataframes and return a single row per group

df.groupby('cancellation_policy').agg({'review_scores_value':(np.nanmean, np.nanstd),
                                       'reviews_per_month':np.nanmean})

# We have two dictionaries entries. Each indicating which columns we wanted functions
# applied to


Unnamed: 0_level_0,review_scores_value,review_scores_value,reviews_per_month
Unnamed: 0_level_1,nanmean,nanstd,nanmean
cancellation_policy,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
flexible,9.237421,1.096271,1.82921
moderate,9.307398,0.859859,2.391922
strict,9.081441,1.040531,1.873467
super_strict_30,8.537313,0.840785,0.340143


### Transformation

Transform() returns an object that is the same size as the group

In [None]:
# Include the average rating values in a given group by cancellation policy,
# but preserve the dataframe shape

# Define a subset of columns we are interested in

cols = ['cancellation_policy','review_scores_value']

# Transform it and store in its own data frame
transform_df=df[cols].groupby('cancellation_policy').transform(np.nanmean)
transform_df.head()

Unnamed: 0,review_scores_value
0,9.307398
1,9.307398
2,9.307398
3,9.307398
4,9.237421


In [19]:
# Both dataframes have the same indeces

# Rename the column in the transformed version 
transform_df.rename({'review_scores_value':'mean_review_scores'},axis='columns',inplace=True)

# Merge both dataframes
df=df.merge(transform_df,left_index=True,right_index=True)
df.head()



Unnamed: 0,level_0,index,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,...,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value_x,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,mean_review_scores_x,mean_review_scores_y,mean_review_scores_x.1,review_scores_value_y,review_scores_value,mean_review_scores_y.1
0,0,0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",,"The bus stop is 2 blocks away, and frequent. B...","You will have access to 2 bedrooms, a living r...",,Clean up and treat the home the way you'd like...,https://a2.muscache.com/im/pictures/c0842db1-e...,https://a2.muscache.com/im/pictures/c0842db1-e...,https://a2.muscache.com/im/pictures/c0842db1-e...,https://a2.muscache.com/im/pictures/c0842db1-e...,31303940,https://www.airbnb.com/users/show/31303940,Virginia,2015-04-15,"Boston, Massachusetts, United States",We are country and city connecting in our deck...,,,,f,https://a2.muscache.com/im/pictures/5936fef0-b...,https://a2.muscache.com/im/pictures/5936fef0-b...,Roslindale,1,1,"['email', 'phone', 'facebook', 'reviews']",t,f,"Birch Street, Boston, MA 02131, United States",...,,,,$35.00,1,$0.00,2,1125,2 weeks ago,,0,0,0,0,2016-09-06,0,,,,,,,,,,f,,,f,moderate,f,f,1,,9.307398,9.307398,9.307398,9.307398,9.307398,9.307398
1,1,1,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,none,"The room is in Roslindale, a diverse and prima...","If you don't have a US cell phone, you can tex...",Plenty of safe street parking. Bus stops a few...,Apt has one more bedroom (which I use) and lar...,"If I am at home, I am likely working in my hom...",Pet friendly but please confirm with me if the...,https://a1.muscache.com/im/pictures/39327812/d...,https://a1.muscache.com/im/pictures/39327812/d...,https://a1.muscache.com/im/pictures/39327812/d...,https://a1.muscache.com/im/pictures/39327812/d...,2572247,https://www.airbnb.com/users/show/2572247,Andrea,2012-06-07,"Boston, Massachusetts, United States",I live in Boston and I like to travel and have...,within an hour,100%,100%,f,https://a2.muscache.com/im/users/2572247/profi...,https://a2.muscache.com/im/users/2572247/profi...,Roslindale,1,1,"['email', 'phone', 'facebook', 'linkedin', 'am...",t,t,"Pinehurst Street, Boston, MA 02131, United States",...,$400.00,,$95.00,$10.00,0,$0.00,2,15,a week ago,,26,54,84,359,2016-09-06,36,2014-06-01,2016-08-13,94.0,10.0,9.0,10.0,10.0,9.0,9.0,f,,,t,moderate,f,f,1,1.3,9.307398,9.307398,9.307398,9.307398,9.307398,9.307398
2,2,2,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",none,The LOCATION: Roslindale is a safe and diverse...,I am in a scenic part of Boston with a couple ...,"PUBLIC TRANSPORTATION: From the house, quick p...","I am living in the apartment during your stay,...","ABOUT ME: I'm a laid-back, friendly, unmarried...","I encourage you to use my kitchen, cooking and...",https://a2.muscache.com/im/pictures/6ae8335d-9...,https://a2.muscache.com/im/pictures/6ae8335d-9...,https://a2.muscache.com/im/pictures/6ae8335d-9...,https://a2.muscache.com/im/pictures/6ae8335d-9...,16701,https://www.airbnb.com/users/show/16701,Phil,2009-05-11,"Boston, Massachusetts, United States","I am a middle-aged, single male with a wide ra...",within a few hours,100%,88%,t,https://a2.muscache.com/im/users/16701/profile...,https://a2.muscache.com/im/users/16701/profile...,Roslindale,1,1,"['email', 'phone', 'reviews', 'jumio']",t,t,"Ardale St., Boston, MA 02131, United States",...,$395.00,"$1,350.00",,,1,$20.00,3,45,5 days ago,,19,46,61,319,2016-09-06,41,2009-07-19,2016-08-05,98.0,10.0,9.0,10.0,10.0,9.0,10.0,f,,,f,moderate,t,f,1,0.47,9.307398,9.307398,9.307398,9.307398,9.307398,9.307398
3,3,3,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,none,Roslindale is a lovely little neighborhood loc...,Please be mindful of the property as it is old...,There are buses that stop right in front of th...,The basement has a washer dryer and gym area. ...,We do live in the house therefore might be som...,- The bathroom and house are shared so please ...,https://a2.muscache.com/im/pictures/39764190-1...,https://a2.muscache.com/im/pictures/39764190-1...,https://a2.muscache.com/im/pictures/39764190-1...,https://a2.muscache.com/im/pictures/39764190-1...,6031442,https://www.airbnb.com/users/show/6031442,Meghna,2013-04-21,"Boston, Massachusetts, United States",My husband and I live on the property. He’s a...,within a few hours,100%,50%,f,https://a2.muscache.com/im/pictures/5d430cde-7...,https://a2.muscache.com/im/pictures/5d430cde-7...,,1,1,"['email', 'phone', 'reviews']",t,f,"Boston, MA, United States",...,,,$100.00,$50.00,2,$25.00,1,1125,a week ago,,6,16,26,98,2016-09-06,1,2016-08-28,2016-08-28,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,,,f,moderate,f,f,1,1.0,9.307398,9.307398,9.307398,9.307398,9.307398,9.307398
4,4,4,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",none,"I love the proximity to downtown, the neighbor...",I have one roommate who lives on the lower lev...,From Logan Airport and South Station you have...,You will have access to the front and side por...,I love my city and really enjoy sharing it wit...,"Please no smoking in the house, porch or on th...",https://a1.muscache.com/im/pictures/97154760/8...,https://a1.muscache.com/im/pictures/97154760/8...,https://a1.muscache.com/im/pictures/97154760/8...,https://a1.muscache.com/im/pictures/97154760/8...,15396970,https://www.airbnb.com/users/show/15396970,Linda,2014-05-11,"Boston, Massachusetts, United States",I work full time for a public school district....,within an hour,100%,100%,t,https://a0.muscache.com/im/users/15396970/prof...,https://a0.muscache.com/im/users/15396970/prof...,Roslindale,1,1,"['email', 'phone', 'reviews', 'kba']",t,t,"Durnell Avenue, Boston, MA 02131, United States",...,,,,$15.00,1,$0.00,2,31,2 weeks ago,,13,34,59,334,2016-09-06,29,2015-08-18,2016-09-01,99.0,10.0,10.0,10.0,10.0,9.0,10.0,f,,,f,flexible,f,f,1,2.25,9.237421,9.237421,9.237421,9.237421,9.237421,9.237421


In [None]:
# Create the difference between a given row and its group mean

# Create new variable mean_diff and apply absolute
df['mean_diff']=np.absolute(df['review_scores_value']-df['mean_review_scores'])
df['mean_diff'].head()

### Filtering

### Applying