# Data Collection and Cleaning
## Projecting US Food Insecurity in 2020
### By Khyatee Desai

In [77]:
# import necessary libraries
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')

# 1. Feeding America Datasets
Data available for: 09, 10, 14, 15, 16, 17, 18
### Import all the files

In [78]:
directory = "../datasets/feeding_america/"

df_FA_09 = pd.read_excel(directory+'FA_2011_2009.xlsx')
df_FA_10 = pd.read_excel(directory+'FA_2012_2010.xlsx')
df_FA_11 = pd.read_excel(directory+'FA_2013_2011.xlsx') # only by state for some reason..
df_FA_12 = pd.read_excel(directory+'FA_2014_2012.xlsx') # only by state for some reason..
df_FA_13 = pd.read_excel(directory+'FA_2015_2013.xlsx') # only by state for some reason..
df_FA_14 = pd.read_excel(directory+'FA_2016_2014.xlsx')
df_FA_15 = pd.read_excel(directory+'FA_2017_2015.xlsx')
df_FA_16 = pd.read_excel(directory+'FA_2018_2016.xlsx')
df_FA_17 = pd.read_excel(directory+'FA_2019_2017.xlsx')
df_FA_18 = pd.read_excel(directory+'FA_2020_2018.xlsx',header=1)
# df_FAprojection_20 = pd.read_excel(directory+'projection_10.2020.xlsx')


In [79]:
df_FA_10

Unnamed: 0,FIPS,State,"County, State",2010 Food Insecurity Rate,Number of Food Insecure Persons in 2010,Low Threshold in state,Low Threshold Type,High Threshold in state,High Threshold Type,% FI ≤ Low Threshold,% FI Btwn Thresholds,% FI > High Threshold,2010 Child food insecurity rate,Number of Food Insecure Children in 2010,% food insecure children in HH w/ HH incomes below 185 FPL,% of food insecure children in HH w/ HH incomes above 185 FPL,2010 Cost Per Meal,2010 Weighted Annual Food Budget Shortfall
0,1001,AL,"Autauga County, Alabama",0.134,7140,1.3,SNAP,1.85,Other Nutrition Program,0.327,0.208,0.465,0.203,2980.0,0.51,0.49,2.58,3170830
1,1003,AL,"Baldwin County, Alabama",0.134,23570,1.3,SNAP,1.85,Other Nutrition Program,0.347,0.287,0.366,0.238,9720.0,0.59,0.41,2.64,10710730
2,1005,AL,"Barbour County, Alabama",0.232,6440,1.3,SNAP,1.85,Other Nutrition Program,0.479,0.171,0.350,0.258,1600.0,0.87,0.13,2.53,2804540
3,1007,AL,"Bibb County, Alabama",0.157,3550,1.3,SNAP,1.85,Other Nutrition Program,0.358,0.288,0.354,0.249,1300.0,0.64,0.36,2.55,1558200
4,1009,AL,"Blount County, Alabama",0.126,7160,1.3,SNAP,1.85,Other Nutrition Program,0.410,0.305,0.285,0.254,3540.0,0.53,0.47,2.50,3081120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3138,56037,WY,"Sweetwater County, Wyoming",0.112,4720,1.3,SNAP,1.85,Other Nutrition Program,0.272,0.163,0.565,0.181,2040.0,0.40,0.60,2.54,2063630
3139,56039,WY,"Teton County, Wyoming",0.122,2540,1.3,SNAP,1.85,Other Nutrition Program,0.308,0.206,0.486,0.205,840.0,0.55,0.45,3.48,1521490
3140,56041,WY,"Uinta County, Wyoming",0.128,2620,1.3,SNAP,1.85,Other Nutrition Program,0.367,0.189,0.444,0.191,1160.0,0.48,0.52,2.38,1073330
3141,56043,WY,"Washakie County, Wyoming",0.108,900,1.3,SNAP,1.85,Other Nutrition Program,0.349,0.289,0.362,0.148,300.0,0.59,0.41,2.44,378000


### Drop unnecessary features

In [80]:
df_FA_09 = df_FA_09.drop(['State Name', 'County Code','Low Threshold in state', 'High Threshold in state', '% FI ≤ Low Threshold',
            '% FI Btwn Thresholds','% FI > High Threshold', '% of children in FI HH with HH incomes at or below 185% FPL',
              'Number Food Insecure Children','% of children in FI HH with HH incomes above 185% FPL'], axis=1)

In [81]:
df_FA_10 = df_FA_10.drop(['County, State', 'State','Low Threshold in state', 'High Threshold in state', '% FI ≤ Low Threshold',
              '% FI Btwn Thresholds', 'Number of Food Insecure Children in 2010 ',
              '% FI > High Threshold', '% food insecure children in HH w/ HH incomes below 185 FPL',
               '% of food insecure children in HH w/ HH incomes above 185 FPL'], axis=1)

In [82]:
df_FA_11 = df_FA_11.drop(['County, State', 'State','Low Threshold in state', 'High Threshold in state', '% FI ≤ Low Threshold',
                          '% FI Btwn Thresholds', 'Number of Food Insecure Children in 2011',
              '% FI > High Threshold', '% food insecure children in HH w/ HH incomes below 185 FPL',
               '% of food insecure children in HH w/ HH incomes above 185 FPL'], axis=1)

In [83]:
df_FA_12 = df_FA_12.drop(['State Name', 'State','Low Threshold in state', 'High Threshold in state', '% FI ≤ Low Threshold',
                '% FI Btwn Thresholds','% FI > High Threshold', '# of Food Insecure Children in 2012',
               '% food insecure Children in HH w/HH Incomes Below 185 FPL in 2012',
               '% food insecure Children in HH w/HH Incomes Above 185 FPL in 2012'], axis=1)

In [84]:
df_FA_13 = df_FA_13.drop(['State Name', 'State','Low Threshold in state', 'High Threshold in state', '% FI ≤ Low Threshold',
                '% FI Btwn Thresholds','% FI > High Threshold', '# of Food Insecure Children in 2013',
               '% food insecure Children in HH w/HH Incomes Below 185 FPL in 2013',
               '% food insecure Children in HH w/HH Incomes Above 185 FPL in 2013'], axis=1)

In [85]:
df_FA_14 = df_FA_14.drop(['County, State', 'State','Low Threshold in state', 'High Threshold in state', '% FI ≤ Low Threshold',
                '% FI Btwn Thresholds', '% FI > High Threshold', '# of Food Insecure Children in 2014',
               '% food insecure children in HH w/ HH incomes below 185 FPL in 2014',
               '% food insecure children in HH w/ HH incomes above 185 FPL in 2014'], axis=1)

In [86]:
df_FA_15 = df_FA_15.drop(['County, State', 'State','Low Threshold in state', 'High Threshold in state', '% FI ≤ Low Threshold',
                '% FI Btwn Thresholds','% FI > High Threshold', '# of Food Insecure Children in 2015',
               '% food insecure children in HH w/ HH incomes below 185 FPL in 2015',
               '% food insecure children in HH w/ HH incomes above 185 FPL in 2015'], axis=1)

In [87]:
df_FA_16 = df_FA_16.drop(['County, State', 'State','Low Threshold in state', 'High Threshold in state', '% FI ≤ Low Threshold',
                '% FI Btwn Thresholds','% FI > High Threshold', '# of Food Insecure Children in 2016',
               '% food insecure children in HH w/ HH incomes below 185 FPL in 2016',
               '% food insecure children in HH w/ HH incomes above 185 FPL in 2016'], axis=1)

In [88]:
df_FA_17 = df_FA_17.drop(['County, State', 'State','Low Threshold in state', 'High Threshold in state', '% FI ≤ Low Threshold',
                '% FI Btwn Thresholds', '% FI > High Threshold', '# of Food Insecure Children in 2017',
               '% food insecure children in HH w/ HH incomes below 185 FPL in 2017',
               '% food insecure children in HH w/ HH incomes above 185 FPL in 2017'], axis=1)

In [89]:
df_FA_18 = df_FA_18.drop(['County, State', 'State','Low Threshold in state', 'High Threshold in state', '% FI ≤ Low Threshold',
                '% FI Btwn Thresholds', '% FI > High Threshold', '# of Food Insecure Children in 2018',
               '% food insecure children in HH w/ HH incomes below 185 FPL in 2018',
               '% food insecure children in HH w/ HH incomes above 185 FPL in 2018'], axis=1)

### Reformat FIPS Column

In [90]:
# 2009
# drop null rows at the end
df_FA_09.drop(df_FA_09[df_FA_09['FIPS'].isnull()].index, axis=0, inplace=True)
# change FIPS to string and add leading zeros if needed
df_FA_09['FIPS'] = np.where(df_FA_09['FIPS']<10000, 
                        '0'+df_FA_09['FIPS'].astype(int).astype(str), df_FA_09['FIPS'].astype(int).astype(str))
# 2010
df_FA_10.drop(df_FA_10[df_FA_10['FIPS'].isnull()].index, axis=0, inplace=True)
df_FA_10['FIPS'] = np.where(df_FA_10['FIPS']<10000, 
                        '0'+df_FA_10['FIPS'].astype(int).astype(str), df_FA_10['FIPS'].astype(int).astype(str))
# 2011
df_FA_11.drop(df_FA_11[df_FA_11['FIPS'].isnull()].index, axis=0, inplace=True)
df_FA_11['FIPS'] = np.where(df_FA_11['FIPS']<10000, 
                        '0'+df_FA_11['FIPS'].astype(int).astype(str), df_FA_11['FIPS'].astype(int).astype(str))
# 2012
df_FA_12.drop(df_FA_12[df_FA_12['FIPS'].isnull()].index, axis=0, inplace=True)
df_FA_12['FIPS'] = np.where(df_FA_12['FIPS']<10000, 
                        '0'+df_FA_12['FIPS'].astype(int).astype(str), df_FA_12['FIPS'].astype(int).astype(str))
# 2013
df_FA_13.drop(df_FA_13[df_FA_13['FIPS'].isnull()].index, axis=0, inplace=True)
df_FA_13['FIPS'] = np.where(df_FA_13['FIPS']<10000, 
                        '0'+df_FA_13['FIPS'].astype(int).astype(str), df_FA_13['FIPS'].astype(int).astype(str))
# 2014
df_FA_14.drop(df_FA_14[df_FA_14['FIPS'].isnull()].index, axis=0, inplace=True)
df_FA_14['FIPS'] = np.where(df_FA_14['FIPS']<10000, 
                        '0'+df_FA_14['FIPS'].astype(int).astype(str), df_FA_14['FIPS'].astype(int).astype(str))
# 2015
df_FA_15.drop(df_FA_15[df_FA_15['FIPS'].isnull()].index, axis=0, inplace=True)
df_FA_15['FIPS'] = np.where(df_FA_15['FIPS']<10000, 
                        '0'+df_FA_15['FIPS'].astype(int).astype(str), df_FA_15['FIPS'].astype(int).astype(str))
# 2016
df_FA_16.drop(df_FA_16[df_FA_16['FIPS'].isnull()].index, axis=0, inplace=True)
df_FA_16['FIPS'] = np.where(df_FA_16['FIPS']<10000, 
                        '0'+df_FA_16['FIPS'].astype(int).astype(str), df_FA_16['FIPS'].astype(int).astype(str))
# 2017
df_FA_17.drop(df_FA_17[df_FA_17['FIPS'].isnull()].index, axis=0, inplace=True)
df_FA_17['FIPS'] = np.where(df_FA_17['FIPS']<10000, 
                        '0'+df_FA_17['FIPS'].astype(int).astype(str), df_FA_17['FIPS'].astype(int).astype(str))
# 2018
df_FA_18.drop(df_FA_18[df_FA_18['FIPS'].isnull()].index, axis=0, inplace=True)
df_FA_18['FIPS'] = np.where(df_FA_18['FIPS']<10000, 
                        '0'+df_FA_18['FIPS'].astype(int).astype(str), df_FA_18['FIPS'].astype(int).astype(str))



### Add Year column to each df

In [91]:
df_FA_09['Year'] = '2009'
df_FA_10['Year'] = '2010'
df_FA_11['Year'] = '2011'
df_FA_12['Year'] = '2012'
df_FA_13['Year'] = '2013'
df_FA_14['Year'] = '2014'
df_FA_15['Year'] = '2015'
df_FA_16['Year'] = '2016'
df_FA_17['Year'] = '2017'
df_FA_18['Year'] = '2018'

### Rename columns for uniformity

In [92]:
df_FA_10.rename(columns={'2010 Food Insecurity Rate':'FI Rate', 'Number of Food Insecure Persons in 2010':'Number Food Insecure Individuals',
       '2010 Weighted Annual Food Budget Shortfall':'Weighted Annual Dollars', '2010 Cost Per Meal':'Cost Per Meal', 
                         '2010 Child food insecurity rate':'Child FI Rate'}, inplace=True)
df_FA_11.rename(columns={'2011 Food Insecurity Rate':'FI Rate', 'Number of Food Insecure Persons in 2011':'Number Food Insecure Individuals',
       '2011 Weighted Annual Food Budget Shortfall':'Weighted Annual Dollars', '2011 Cost Per Meal':'Cost Per Meal', 
                         '2011 Child Food Insecurity Rate':'Child FI Rate'}, inplace=True)
df_FA_12.rename(columns={'2012 Food Insecurity Rate':'FI Rate', '# of Food Insecure Persons in 2012 ':'Number Food Insecure Individuals',
       '2012 Weighted Annual Food Budget Shortfall':'Weighted Annual Dollars', '2012 Cost Per Meal':'Cost Per Meal', 
                         '2012 Child Food Insecurity Rate':'Child FI Rate'}, inplace=True)
df_FA_13.rename(columns={'2013 Food Insecurity Rate':'FI Rate', '# of Food Insecure Persons in 2013 ':'Number Food Insecure Individuals',
       '2013 Weighted Annual Food Budget Shortfall':'Weighted Annual Dollars', '2013 Cost Per Meal':'Cost Per Meal', 
                         '2013 Child Food Insecurity Rate':'Child FI Rate'}, inplace=True)
df_FA_14.rename(columns={'2014 Food Insecurity Rate':'FI Rate', '# of Food Insecure Persons in 2014':'Number Food Insecure Individuals',
       '2014 Weighted Annual Food Budget Shortfall':'Weighted Annual Dollars', '2014 Cost Per Meal':'Cost Per Meal', 
                         '2014 Child food insecurity rate':'Child FI Rate'}, inplace=True)
df_FA_15.rename(columns={'2015 Food Insecurity Rate':'FI Rate', '# of Food Insecure Persons in 2015':'Number Food Insecure Individuals',
       '2015 Weighted Annual Food Budget Shortfall':'Weighted Annual Dollars', '2015 Cost Per Meal':'Cost Per Meal', 
                         '2015 Child food insecurity rate':'Child FI Rate'}, inplace=True)
df_FA_16.rename(columns={'2016 Food Insecurity Rate':'FI Rate', '# of Food Insecure Persons in 2016':'Number Food Insecure Individuals',
       '2016 Weighted Annual Food Budget Shortfall':'Weighted Annual Dollars', '2016 Cost Per Meal':'Cost Per Meal', 
                         '2016 Child food insecurity rate':'Child FI Rate'}, inplace=True)
df_FA_17.rename(columns={'2017 Food Insecurity Rate':'FI Rate', '# of Food Insecure Persons in 2017':'Number Food Insecure Individuals',
       '2017 Weighted Annual Food Budget Shortfall':'Weighted Annual Dollars', '2017 Cost Per Meal':'Cost Per Meal', 
                         '2017 Child food insecurity rate':'Child FI Rate'}, inplace=True)
df_FA_18.rename(columns={'2018 Food Insecurity Rate':'FI Rate', '# of Food Insecure Persons in 2018':'Number Food Insecure Individuals',
       '2018 Weighted Annual Food Budget Shortfall':'Weighted Annual Dollars', '2018 Cost Per Meal':'Cost Per Meal', 
                         '2018 Child food insecurity rate':'Child FI Rate'}, inplace=True)

### Concatenate all df's to create master dataframe for all years

In [93]:
df_FA = pd.concat([df_FA_09, df_FA_10,df_FA_11,df_FA_12,df_FA_13,df_FA_14,df_FA_15,df_FA_16,df_FA_17,df_FA_18 ])
df_FA

Unnamed: 0,FIPS,FI Rate,Number Food Insecure Individuals,Low Threshold Type,High Threshold Type,Weighted Annual Dollars,Cost Per Meal,Child FI Rate,Year
0,02013,0.151,450.0,SNAP,other nutrition pgm,,,0.248,2009
1,02016,0.136,750.0,SNAP,other nutrition pgm,,,0.178,2009
2,02020,0.119,33260.0,SNAP,other nutrition pgm,,,0.186,2009
3,02050,0.211,3620.0,SNAP,other nutrition pgm,,,0.314,2009
4,02060,0.095,60.0,SNAP,other nutrition pgm,,,0.166,2009
...,...,...,...,...,...,...,...,...,...
3137,56037,0.117,5140.0,SNAP,Other Nutrition Program,2865000.0,3.29,0.154,2018
3138,56039,0.095,2200.0,SNAP,Other Nutrition Program,1683000.0,4.52,0.084,2018
3139,56041,0.135,2780.0,SNAP,Other Nutrition Program,1444000.0,3.07,0.187,2018
3140,56043,0.126,1020.0,SNAP,Other Nutrition Program,562000.0,3.26,0.184,2018


# 2. Unemployment Data

In [94]:
directory = "../datasets/unemployment/"

df_unemp_09 = pd.read_excel(directory + 'laucnty09.xlsx', header=4).drop(0,axis=0)
df_unemp_10 = pd.read_excel(directory + 'laucnty10.xlsx', header=4).drop(0,axis=0)
df_unemp_11 = pd.read_excel(directory + 'laucnty11.xlsx', header=4).drop(0,axis=0)
df_unemp_12 = pd.read_excel(directory + 'laucnty12.xlsx', header=4).drop(0,axis=0)
df_unemp_13 = pd.read_excel(directory + 'laucnty13.xlsx', header=4).drop(0,axis=0)
df_unemp_14 = pd.read_excel(directory + 'laucnty14.xlsx', header=4).drop(0,axis=0)
df_unemp_15 = pd.read_excel(directory + 'laucnty15.xlsx', header=4).drop(0,axis=0)
df_unemp_16 = pd.read_excel(directory + 'laucnty16.xlsx', header=4).drop(0,axis=0)
df_unemp_17 = pd.read_excel(directory + 'laucnty17.xlsx', header=4).drop(0,axis=0)
df_unemp_18 = pd.read_excel(directory + 'laucnty18.xlsx', header=4).drop(0,axis=0)
df_unemp_19 = pd.read_excel(directory + 'laucnty19.xlsx', header=4).drop(0,axis=0)


### Rename columns using data dictionary

In [95]:
df_unemp_09.rename(columns = {'Code':'CN', 'Code.1':'FIPS_state', 'Code.2':'FIPS_county', 
                             'County Name/State Abbreviation': 'State/County', 'Unnamed: 5': 'idk',
                              'Force':'Total_workforce','(%)':'Unemployment_rate'}, inplace=True)

In [96]:
df_unemp_10.rename(columns = {'Code':'CN', 'Code.1':'FIPS_state', 'Code.2':'FIPS_county', 
                             'County Name/State Abbreviation': 'State/County', 'Unnamed: 5': 'idk',
                              'Force':'Total_workforce','(%)':'Unemployment_rate'}, inplace=True)

In [97]:
df_unemp_11.rename(columns = {'Code':'CN', 'Code.1':'FIPS_state', 'Code.2':'FIPS_county', 
                             'County Name/State Abbreviation': 'State/County', 'Unnamed: 5': 'idk',
                              'Force':'Total_workforce','(%)':'Unemployment_rate'}, inplace=True)

In [98]:
df_unemp_12.rename(columns = {'Code':'CN', 'Code.1':'FIPS_state', 'Code.2':'FIPS_county', 
                             'County Name/State Abbreviation': 'State/County', 'Unnamed: 5': 'idk',
                              'Force':'Total_workforce','(%)':'Unemployment_rate'}, inplace=True)

In [99]:
df_unemp_13.rename(columns = {'Code':'CN', 'Code.1':'FIPS_state', 'Code.2':'FIPS_county', 
                             'County Name/State Abbreviation': 'State/County', 'Unnamed: 5': 'idk',
                              'Force':'Total_workforce','(%)':'Unemployment_rate'}, inplace=True)

In [100]:
df_unemp_14.rename(columns = {'Code':'CN', 'Code.1':'FIPS_state', 'Code.2':'FIPS_county', 
                             'County Name/State Abbreviation': 'State/County', 'Unnamed: 5': 'idk',
                              'Force':'Total_workforce','(%)':'Unemployment_rate'}, inplace=True)

In [101]:
df_unemp_15.rename(columns = {'Code':'CN', 'Code.1':'FIPS_state', 'Code.2':'FIPS_county', 
                             'County Name/State Abbreviation': 'State/County', 'Unnamed: 5': 'idk',
                              'Force':'Total_workforce','(%)':'Unemployment_rate'}, inplace=True)

In [102]:
df_unemp_16.rename(columns = {'Code':'CN', 'Code.1':'FIPS_state', 'Code.2':'FIPS_county', 
                             'County Name/State Abbreviation': 'State/County', 'Unnamed: 5': 'idk',
                              'Force':'Total_workforce','(%)':'Unemployment_rate'}, inplace=True)

In [103]:
df_unemp_17.rename(columns = {'Code':'CN', 'Code.1':'FIPS_state', 'Code.2':'FIPS_county', 
                             'County Name/State Abbreviation': 'State/County', 'Unnamed: 5': 'idk',
                              'Force':'Total_workforce','(%)':'Unemployment_rate'}, inplace=True)

In [104]:
df_unemp_18.rename(columns = {'Code':'CN', 'Code.1':'FIPS_state', 'Code.2':'FIPS_county', 
                             'County Name/State Abbreviation': 'State/County', 'Unnamed: 5': 'idk',
                              'Force':'Total_workforce','(%)':'Unemployment_rate'}, inplace=True)

In [105]:
df_unemp_19.rename(columns = {'LAUS Code':'CN', 'Code':'FIPS_state', 'Code.1':'FIPS_county', 
                             'County Name/State Abbreviation': 'State/County', 
                              'Force':'Total_workforce','(%)':'Unemployment_rate'}, inplace=True)

### Drop null rows, drop some columns, and reformat year column

In [106]:
# 2009
# drop last three rows which were null
df_unemp_09.drop(df_unemp_09[df_unemp_09['FIPS_state'].isnull()].index, inplace=True)
# change year column to string
df_unemp_09['Year'] = df_unemp_09['Year'].astype(int).astype(str)
# drop unneeded columns
df_unemp_09.drop(['CN', 'idk'], axis=1, inplace=True)

# 2010
df_unemp_10.drop(df_unemp_10[df_unemp_10['FIPS_state'].isnull()].index, inplace=True)
df_unemp_10['Year'] = df_unemp_10['Year'].astype(int).astype(str)
df_unemp_10.drop(['CN', 'idk'], axis=1, inplace=True)

# 2011
df_unemp_11.drop(df_unemp_11[df_unemp_11['FIPS_state'].isnull()].index, inplace=True)
df_unemp_11['Year'] = df_unemp_11['Year'].astype(int).astype(str)
df_unemp_11.drop(['CN', 'idk'], axis=1, inplace=True)

# 2012
df_unemp_12.drop(df_unemp_12[df_unemp_12['FIPS_state'].isnull()].index, inplace=True)
df_unemp_12['Year'] = df_unemp_12['Year'].astype(int).astype(str)
df_unemp_12.drop(['CN', 'idk'], axis=1, inplace=True)

# 2013
df_unemp_13.drop(df_unemp_13[df_unemp_13['FIPS_state'].isnull()].index, inplace=True)
df_unemp_13['Year'] = df_unemp_13['Year'].astype(int).astype(str)
df_unemp_13.drop(['CN', 'idk'], axis=1, inplace=True)

# 2014
df_unemp_14.drop(df_unemp_14[df_unemp_14['FIPS_state'].isnull()].index, inplace=True)
df_unemp_14['Year'] = df_unemp_14['Year'].astype(int).astype(str)
df_unemp_14.drop(['CN', 'idk'], axis=1, inplace=True)

# 2015
df_unemp_15.drop(df_unemp_15[df_unemp_15['FIPS_state'].isnull()].index, inplace=True)
df_unemp_15['Year'] = df_unemp_15['Year'].astype(int).astype(str)
df_unemp_15.drop(['CN', 'idk'], axis=1, inplace=True)

# 2016
df_unemp_16.drop(df_unemp_16[df_unemp_16['FIPS_state'].isnull()].index, inplace=True)
df_unemp_16['Year'] = df_unemp_16['Year'].astype(int).astype(str)
df_unemp_16.drop(['CN', 'idk'], axis=1, inplace=True)

# 2017
df_unemp_17.drop(df_unemp_17[df_unemp_17['FIPS_state'].isnull()].index, inplace=True)
df_unemp_17['Year'] = df_unemp_17['Year'].astype(int).astype(str)
df_unemp_17.drop(['CN', 'idk'], axis=1, inplace=True)

# 2018
df_unemp_18.drop(df_unemp_18[df_unemp_18['FIPS_state'].isnull()].index, inplace=True)
df_unemp_18['Year'] = df_unemp_18['Year'].astype(int).astype(str)
df_unemp_18.drop(['CN', 'idk'], axis=1, inplace=True)

# 2019
# drop last three rows which were null
df_unemp_19.drop(df_unemp_19[df_unemp_19['FIPS_state'].isnull()].index, inplace=True)
df_unemp_19.drop(['CN'], axis=1, inplace=True)


### Reformat FIPS

In [107]:
# 2009
# add leading zeros to FIPS codes and convert to string
df_unemp_09['FIPS_county'] = np.select([df_unemp_09['FIPS_county']<10, df_unemp_09['FIPS_county']<100],
                    ['00'+df_unemp_09['FIPS_county'].astype(int).astype(str), '0'+df_unemp_09['FIPS_county'].astype(int).astype(str)],
                    default= df_unemp_09['FIPS_county'].astype(int).astype(str))
df_unemp_09['FIPS_state'] = np.where(df_unemp_09['FIPS_state']<10, 
                        '0'+df_unemp_09['FIPS_state'].astype(int).astype(str), df_unemp_09['FIPS_state'].astype(int).astype(str))
# Create main fips code
df_unemp_09['FIPS'] = df_unemp_09['FIPS_state'] + df_unemp_09['FIPS_county']

# 2010
df_unemp_10['FIPS_county'] = np.select([df_unemp_10['FIPS_county']<10, df_unemp_10['FIPS_county']<100],
                    ['00'+df_unemp_10['FIPS_county'].astype(int).astype(str), '0'+df_unemp_10['FIPS_county'].astype(int).astype(str)],
                    default= df_unemp_10['FIPS_county'].astype(int).astype(str))
df_unemp_10['FIPS_state'] = np.where(df_unemp_10['FIPS_state']<10, 
                        '0'+df_unemp_10['FIPS_state'].astype(int).astype(str), df_unemp_10['FIPS_state'].astype(int).astype(str))
df_unemp_10['FIPS'] = df_unemp_10['FIPS_state'] + df_unemp_10['FIPS_county']

# 2011
df_unemp_11['FIPS_county'] = np.select([df_unemp_11['FIPS_county']<10, df_unemp_11['FIPS_county']<100],
                    ['00'+df_unemp_11['FIPS_county'].astype(int).astype(str), '0'+df_unemp_11['FIPS_county'].astype(int).astype(str)],
                    default= df_unemp_11['FIPS_county'].astype(int).astype(str))
df_unemp_11['FIPS_state'] = np.where(df_unemp_11['FIPS_state']<10, 
                        '0'+df_unemp_11['FIPS_state'].astype(int).astype(str), df_unemp_11['FIPS_state'].astype(int).astype(str))
df_unemp_11['FIPS'] = df_unemp_11['FIPS_state'] + df_unemp_11['FIPS_county']

# 2012
df_unemp_12['FIPS_county'] = np.select([df_unemp_12['FIPS_county']<10, df_unemp_12['FIPS_county']<100],
                    ['00'+df_unemp_12['FIPS_county'].astype(int).astype(str), '0'+df_unemp_12['FIPS_county'].astype(int).astype(str)],
                    default= df_unemp_12['FIPS_county'].astype(int).astype(str))
df_unemp_12['FIPS_state'] = np.where(df_unemp_12['FIPS_state']<10, 
                        '0'+df_unemp_12['FIPS_state'].astype(int).astype(str), df_unemp_12['FIPS_state'].astype(int).astype(str))
df_unemp_12['FIPS'] = df_unemp_12['FIPS_state'] + df_unemp_12['FIPS_county']

# 2013
df_unemp_13['FIPS_county'] = np.select([df_unemp_13['FIPS_county']<10, df_unemp_13['FIPS_county']<100],
                    ['00'+df_unemp_13['FIPS_county'].astype(int).astype(str), '0'+df_unemp_13['FIPS_county'].astype(int).astype(str)],
                    default= df_unemp_13['FIPS_county'].astype(int).astype(str))
df_unemp_13['FIPS_state'] = np.where(df_unemp_13['FIPS_state']<10, 
                        '0'+df_unemp_13['FIPS_state'].astype(int).astype(str), df_unemp_13['FIPS_state'].astype(int).astype(str))
df_unemp_13['FIPS'] = df_unemp_13['FIPS_state'] + df_unemp_13['FIPS_county']

# 2014
df_unemp_14['FIPS_county'] = np.select([df_unemp_14['FIPS_county']<10, df_unemp_14['FIPS_county']<100],
                    ['00'+df_unemp_14['FIPS_county'].astype(int).astype(str), '0'+df_unemp_14['FIPS_county'].astype(int).astype(str)],
                    default= df_unemp_14['FIPS_county'].astype(int).astype(str))
df_unemp_14['FIPS_state'] = np.where(df_unemp_14['FIPS_state']<10, 
                        '0'+df_unemp_14['FIPS_state'].astype(int).astype(str), df_unemp_14['FIPS_state'].astype(int).astype(str))
df_unemp_14['FIPS'] = df_unemp_14['FIPS_state'] + df_unemp_14['FIPS_county']

# 2015
df_unemp_15['FIPS_county'] = np.select([df_unemp_15['FIPS_county']<10, df_unemp_15['FIPS_county']<100],
                    ['00'+df_unemp_15['FIPS_county'].astype(int).astype(str), '0'+df_unemp_15['FIPS_county'].astype(int).astype(str)],
                    default= df_unemp_15['FIPS_county'].astype(int).astype(str))
df_unemp_15['FIPS_state'] = np.where(df_unemp_15['FIPS_state']<10, 
                        '0'+df_unemp_15['FIPS_state'].astype(int).astype(str), df_unemp_15['FIPS_state'].astype(int).astype(str))
df_unemp_15['FIPS'] = df_unemp_15['FIPS_state'] + df_unemp_15['FIPS_county']

# 2016
df_unemp_16['FIPS_county'] = np.select([df_unemp_16['FIPS_county']<10, df_unemp_16['FIPS_county']<100],
                    ['00'+df_unemp_16['FIPS_county'].astype(int).astype(str), '0'+df_unemp_16['FIPS_county'].astype(int).astype(str)],
                    default= df_unemp_16['FIPS_county'].astype(int).astype(str))
df_unemp_16['FIPS_state'] = np.where(df_unemp_16['FIPS_state']<10, 
                        '0'+df_unemp_16['FIPS_state'].astype(int).astype(str), df_unemp_16['FIPS_state'].astype(int).astype(str))
df_unemp_16['FIPS'] = df_unemp_16['FIPS_state'] + df_unemp_16['FIPS_county']

# 2017
df_unemp_17['FIPS_county'] = np.select([df_unemp_17['FIPS_county']<10, df_unemp_17['FIPS_county']<100],
                    ['00'+df_unemp_17['FIPS_county'].astype(int).astype(str), '0'+df_unemp_17['FIPS_county'].astype(int).astype(str)],
                    default= df_unemp_17['FIPS_county'].astype(int).astype(str))
df_unemp_17['FIPS_state'] = np.where(df_unemp_17['FIPS_state']<10, 
                        '0'+df_unemp_17['FIPS_state'].astype(int).astype(str), df_unemp_17['FIPS_state'].astype(int).astype(str))
df_unemp_17['FIPS'] = df_unemp_17['FIPS_state'] + df_unemp_17['FIPS_county']

# 2018
df_unemp_18['FIPS_county'] = np.select([df_unemp_18['FIPS_county']<10, df_unemp_18['FIPS_county']<100],
                    ['00'+df_unemp_18['FIPS_county'].astype(int).astype(str), '0'+df_unemp_18['FIPS_county'].astype(int).astype(str)],
                    default= df_unemp_18['FIPS_county'].astype(int).astype(str))
df_unemp_18['FIPS_state'] = np.where(df_unemp_18['FIPS_state']<10, 
                        '0'+df_unemp_18['FIPS_state'].astype(int).astype(str), df_unemp_18['FIPS_state'].astype(int).astype(str))
df_unemp_18['FIPS'] = df_unemp_18['FIPS_state'] + df_unemp_18['FIPS_county']

# 2019
df_unemp_19['FIPS_county'] = np.select([df_unemp_19['FIPS_county']<10, df_unemp_19['FIPS_county']<100],
                    ['00'+df_unemp_19['FIPS_county'].astype(int).astype(str), '0'+df_unemp_19['FIPS_county'].astype(int).astype(str)],
                    default= df_unemp_19['FIPS_county'].astype(int).astype(str))
df_unemp_19['FIPS_state'] = np.where(df_unemp_19['FIPS_state']<10, 
                        '0'+df_unemp_19['FIPS_state'].astype(int).astype(str), df_unemp_19['FIPS_state'].astype(int).astype(str))
df_unemp_19['FIPS'] = df_unemp_19['FIPS_state'] + df_unemp_19['FIPS_county']


### Break down 2019 dataset into 2019 and 2020 dataframes

In [108]:
# Add year column for each, derived from Period column, and then drop period column
df_unemp_20 = df_unemp_19[df_unemp_19['Period'].str.contains('20')]
df_unemp_20['Year'] = '2020'
df_unemp_20.drop('Period', axis=1, inplace=True)

df_unemp_19 = df_unemp_19[df_unemp_19['Period'].str.contains('19')]
df_unemp_19['Year'] = '2019'
df_unemp_19.drop('Period', axis=1, inplace=True)

### Concatenate all df's to create master dataframe of all years

In [109]:
df_unemployment = pd.concat([df_unemp_09, df_unemp_10,df_unemp_11,df_unemp_12,df_unemp_13,df_unemp_14,df_unemp_15,df_unemp_16,
          df_unemp_17,df_unemp_18,df_unemp_19,df_unemp_20])
df_unemployment


Unnamed: 0,FIPS_state,FIPS_county,State/County,Year,Total_workforce,Employed,Unemployed,Unemployment_rate,FIPS
1,01,001,"Autauga County, AL",2009,24703,22301,2402,9.7,01001
2,01,003,"Baldwin County, AL",2009,82451,74403,8048,9.8,01003
3,01,005,"Barbour County, AL",2009,10003,8572,1431,14.3,01005
4,01,007,"Bibb County, AL",2009,8742,7581,1161,13.3,01007
5,01,009,"Blount County, AL",2009,26480,23832,2648,10,01009
...,...,...,...,...,...,...,...,...,...
45062,72,145,"Vega Baja Municipio, PR",2020,12543,11146,1397,11.1,72145
45063,72,147,"Vieques Municipio, PR",2020,2386,2133,253,10.6,72147
45064,72,149,"Villalba Municipio, PR",2020,6603,5969,634,9.6,72149
45065,72,151,"Yabucoa Municipio, PR",2020,7961,7168,793,10,72151


# 3. CPS Data (2019 &2020)
data dict 2019: https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar19.pdf<br>
data dict 2020: https://www2.census.gov/programs-surveys/cps/datasets/2020/march/ASEC2020ddl_pub_full.pdf


In [110]:
df_household_19 = pd.read_csv('../datasets/household/hhpub19.csv')
df_household_20 = pd.read_csv('../datasets/household/hhpub20.csv')

### Map column values to data dictionary

In [111]:
# use np.select to map values on 2019 data
conditions=[df_household_19['GTMETSTA'] ==1,df_household_19['GTMETSTA'] ==2, df_household_19['GTMETSTA'] ==3]
choices = ['HH_Metrop', 'HH_Non-Metrop','N/A']
df_household_19['GTMETSTA'] = np.select(conditions, choices,default='N/A')

conditions=[df_household_19['H_TENURE'] ==0,df_household_19['H_TENURE'] ==1,df_household_19['H_TENURE'] ==2, df_household_19['H_TENURE'] ==3]
choices = ['N/A', 'HH_owned', 'HH_rented','HH_rented_noCash']
df_household_19['H_TENURE'] = np.select(conditions, choices,default='N/A')

conditions=[df_household_19['HDIS_YN'] ==0,df_household_19['HDIS_YN'] ==1,df_household_19['HDIS_YN'] ==2]
choices = ['N/A',  'HH_disabled','HH_not_disabled' ]
df_household_19['HDIS_YN'] = np.select(conditions, choices,default='N/A')

conditions=[df_household_19['HCSP_YN'] ==0,df_household_19['HCSP_YN'] ==1,df_household_19['HCSP_YN'] ==2]
choices = ['N/A','HH_Child_support', 'HH_no_child_support' ]
df_household_19['HCSP_YN'] = np.select(conditions, choices,default='N/A')

conditions=[df_household_19['HINC_UC'] ==0,df_household_19['HINC_UC'] ==1,df_household_19['HINC_UC'] ==2]
choices = ['N/A','HH_unemployment_pay', 'HH_no_unemployment_pay' ]
df_household_19['HINC_UC'] = np.select(conditions, choices,default='N/A')

conditions=[df_household_19['NOW_HCOV'] ==1,df_household_19['NOW_HCOV'] ==2,df_household_19['NOW_HCOV'] ==3]
choices = [ 'HH_health_insured','HH_some_health_insured','HH_no_health_insured' ]
df_household_19['NOW_HCOV'] = np.select(conditions, choices,default='N/A')

In [112]:
# use np.select to map values on 2020 data
conditions=[df_household_20['GTMETSTA'] ==1,df_household_20['GTMETSTA'] ==2, df_household_20['GTMETSTA'] ==3]
choices = ['HH_Metrop', 'HH_Non-Metrop','N/A']
df_household_20['GTMETSTA'] = np.select(conditions, choices,default='N/A')

conditions=[df_household_20['H_TENURE'] ==0,df_household_20['H_TENURE'] ==1,df_household_20['H_TENURE'] ==2, df_household_20['H_TENURE'] ==3]
choices = ['N/A', 'HH_owned', 'HH_rented','HH_rented_noCash']
df_household_20['H_TENURE'] = np.select(conditions, choices,default='N/A')

conditions=[df_household_20['HDIS_YN'] ==0,df_household_20['HDIS_YN'] ==1,df_household_20['HDIS_YN'] ==2]
choices = ['N/A',  'HH_disabled','HH_not_disabled' ]
df_household_20['HDIS_YN'] = np.select(conditions, choices,default='N/A')

conditions=[df_household_20['HCSP_YN'] ==0,df_household_20['HCSP_YN'] ==1,df_household_20['HCSP_YN'] ==2]
choices = ['N/A','HH_Child_support', 'HH_no_child_support' ]
df_household_20['HCSP_YN'] = np.select(conditions, choices,default='N/A')

conditions=[df_household_20['HINC_UC'] ==0,df_household_20['HINC_UC'] ==1,df_household_20['HINC_UC'] ==2]
choices = ['N/A','HH_unemployment_pay', 'HH_no_unemployment_pay' ]
df_household_20['HINC_UC'] = np.select(conditions, choices,default='N/A')

conditions=[df_household_20['NOW_HCOV'] ==1,df_household_20['NOW_HCOV'] ==2,df_household_20['NOW_HCOV'] ==3]
choices = [ 'HH_health_insured','HH_some_health_insured','HH_no_health_insured' ]
df_household_20['NOW_HCOV'] = np.select(conditions, choices,default='N/A')

### Rename columns for interpretability

In [113]:
# rename 2019 data
df_household_19 = df_household_19.loc[:,['GESTFIPS', 'GTCO', 'GTMETSTA', 'HTOTVAL','H_NUMPER', 'HUNDER18',
                 'H_TENURE','HDIS_YN', 'HCSP_YN', 'HINC_UC','NOW_HCOV']]
df_household_19 = df_household_19.rename(columns={'GESTFIPS':'FIPS_state', 'GTCO':'FIPS_county', 'GTMETSTA':'Metro_status',
                               'HEFAMINC':'HH_income', 
                                'H_NUMPER':'HH_size', 'HUNDER18':'Num_minors','H_TENURE':'Rent_vs_Owned',
                               'HDIS_YN':'Disability', 'HCSP_YN':'Child_support', 'HINC_UC':'Unemployment_payments',
                               'NOW_HCOV':'Health_insurance'})


In [114]:
# rename 2020 data
df_household_20 = df_household_20.loc[:,['GESTFIPS', 'GTCO', 'GTMETSTA', 'HTOTVAL','H_NUMPER', 'HUNDER18',
                 'H_TENURE','HDIS_YN', 'HCSP_YN', 'HINC_UC','NOW_HCOV']]
df_household_20 = df_household_20.rename(columns={'GESTFIPS':'FIPS_state', 'GTCO':'FIPS_county', 'GTMETSTA':'Metro_status',
                               'HEFAMINC':'HH_income', 
                                'H_NUMPER':'HH_size', 'HUNDER18':'Num_minors','H_TENURE':'Rent_vs_Owned',
                               'HDIS_YN':'Disability', 'HCSP_YN':'Child_support', 'HINC_UC':'Unemployment_payments',
                               'NOW_HCOV':'Health_insurance'})


In [115]:
df_household_20

Unnamed: 0,FIPS_state,FIPS_county,Metro_status,HTOTVAL,HH_size,Num_minors,Rent_vs_Owned,Disability,Child_support,Unemployment_payments,Health_insurance
0,23,0,HH_Non-Metrop,127449,2,0,HH_owned,HH_not_disabled,HH_no_child_support,HH_no_unemployment_pay,HH_health_insured
1,23,0,HH_Non-Metrop,64680,2,0,HH_owned,HH_not_disabled,HH_no_child_support,HH_no_unemployment_pay,HH_health_insured
2,23,0,HH_Non-Metrop,40002,1,0,HH_owned,HH_not_disabled,HH_no_child_support,HH_no_unemployment_pay,HH_health_insured
3,23,0,HH_Non-Metrop,8424,2,0,HH_rented,HH_not_disabled,HH_no_child_support,HH_no_unemployment_pay,HH_health_insured
4,23,0,HH_Non-Metrop,59114,4,0,HH_owned,HH_not_disabled,HH_no_child_support,HH_no_unemployment_pay,HH_health_insured
...,...,...,...,...,...,...,...,...,...,...,...
91495,15,3,HH_Metrop,40700,1,0,HH_owned,HH_not_disabled,HH_no_child_support,HH_no_unemployment_pay,HH_health_insured
91496,15,3,HH_Metrop,20421,1,0,HH_rented_noCash,HH_not_disabled,HH_no_child_support,HH_no_unemployment_pay,HH_health_insured
91497,15,3,HH_Metrop,72455,2,0,HH_owned,HH_not_disabled,HH_no_child_support,HH_no_unemployment_pay,HH_some_health_insured
91498,15,3,HH_Metrop,13626,1,0,HH_rented,HH_not_disabled,HH_no_child_support,HH_no_unemployment_pay,HH_health_insured


# 4. Demographic Data
Data Dict: https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/cc-est2019-alldata.pdf

In [116]:
df_demographics = pd.read_csv('../datasets/demographics/demographics.csv',encoding='iso-8859-1')

### Map categorical variables to values from data dictionary

In [117]:
df_demographics = df_demographics.loc[:,['STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'YEAR', 'AGEGRP', 'TOT_POP','TOT_MALE', 'TOT_FEMALE',
    'WA_MALE','WA_FEMALE','BA_MALE','BA_FEMALE','IA_MALE','IA_FEMALE','AA_MALE','AA_FEMALE','NA_MALE','NA_FEMALE']]

conditions=[((df_demographics['YEAR'] ==1) | (df_demographics['YEAR'] ==2) | (df_demographics['YEAR'] ==3)),
            df_demographics['YEAR'] ==4, df_demographics['YEAR'] ==5, df_demographics['YEAR'] ==6, 
            df_demographics['YEAR'] ==7, df_demographics['YEAR'] ==8, df_demographics['YEAR'] ==9,
            df_demographics['YEAR'] ==10, df_demographics['YEAR'] ==11, df_demographics['YEAR'] ==12]
choices = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
df_demographics['YEAR'] = np.select(conditions, choices,default='N/A')

conditions=[df_demographics['AGEGRP'] ==0, df_demographics['AGEGRP'] ==1, df_demographics['AGEGRP'] ==2, df_demographics['AGEGRP'] ==3,
            df_demographics['AGEGRP'] ==4, df_demographics['AGEGRP'] ==5, df_demographics['AGEGRP'] ==6, 
            df_demographics['AGEGRP'] ==7, df_demographics['AGEGRP'] ==8, df_demographics['AGEGRP'] ==9,
            df_demographics['AGEGRP'] ==10, df_demographics['AGEGRP'] ==11, df_demographics['AGEGRP'] ==12,
           df_demographics['AGEGRP'] ==13, df_demographics['AGEGRP'] ==14, df_demographics['AGEGRP'] ==15,
           df_demographics['AGEGRP'] ==16, df_demographics['AGEGRP'] ==17, df_demographics['AGEGRP'] ==18]
choices = ['All Ages', 'Age 0 to 4 years', 'Age 5 to 9 years', 'Age 10 to 14 years', 'Age 15 to 19 years', 
           'Age 20 to 24 years', 'Age 25 to 29 years', 'Age 30 to 34 years', 'Age 35 to 39 years', 
           'Age 40 to 44 years', 'Age 45 to 49 years', 'Age 50 to 54 years', 'Age 55 to 59 years',
           'Age 60 to 64 years', 'Age 65 to 69 years', 'Age 70 to 74 years', 'Age 75 to 79 years',
            'Age 80 to 84 years', 'Age 85+']
df_demographics['AGEGRP'] = np.select(conditions, choices,default='N/A')



### Create new columns for totals

In [118]:
df_demographics['TOT_WHITE'] = df_demographics['WA_MALE'] + df_demographics['WA_FEMALE']
df_demographics['TOT_BLACK'] = df_demographics['BA_MALE'] + df_demographics['BA_FEMALE']
df_demographics['TOT_NATIVE'] = df_demographics['IA_MALE'] + df_demographics['IA_FEMALE']
df_demographics['TOT_ASIAN'] = df_demographics['AA_MALE'] + df_demographics['AA_FEMALE']
df_demographics['TOT_PACIFIC'] = df_demographics['NA_MALE'] + df_demographics['NA_FEMALE']

# drop unnecessary cols
df_demographics.drop(['WA_MALE','WA_FEMALE','BA_MALE','BA_FEMALE','IA_MALE','IA_FEMALE',
                      'AA_MALE','AA_FEMALE','NA_MALE','NA_FEMALE'], axis=1, inplace=True)

### Rename columns for interpretability

In [119]:
df_demographics.rename(columns={'YEAR':'Year','STATE':'FIPS_state', 'COUNTY':'FIPS_county', 'STNAME': 'State', 'CTYNAME':'County'}, inplace=True)

### Fix format of FIPS columns

In [120]:
df_demographics['FIPS_county'] = np.select([df_demographics['FIPS_county']<10, df_demographics['FIPS_county']<100],
                    ['00'+df_demographics['FIPS_county'].astype(str), '0'+df_demographics['FIPS_county'].astype(str)],
                    default= df_demographics['FIPS_county'].astype(str))
df_demographics['FIPS_state'] = np.where(df_demographics['FIPS_state']<10, 
                        '0'+df_demographics['FIPS_state'].astype(str), df_demographics['FIPS_state'].astype(str))

# Create main fips code
df_demographics['FIPS'] = df_demographics['FIPS_state'] + df_demographics['FIPS_county']
df_demographics

Unnamed: 0,FIPS_state,FIPS_county,State,County,Year,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,FIPS
0,01,001,Alabama,Autauga County,2010,All Ages,54571,26569,28002,43297,9689,258,484,47,01001
1,01,001,Alabama,Autauga County,2010,Age 0 to 4 years,3579,1866,1713,2727,679,8,28,1,01001
2,01,001,Alabama,Autauga County,2010,Age 5 to 9 years,3991,2001,1990,3047,773,22,38,4,01001
3,01,001,Alabama,Autauga County,2010,Age 10 to 14 years,4290,2171,2119,3278,837,27,41,5,01001
4,01,001,Alabama,Autauga County,2010,Age 15 to 19 years,4290,2213,2077,3213,926,19,39,6,01001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
716371,56,045,Wyoming,Weston County,2019,Age 65 to 69 years,499,280,219,459,1,2,31,0,56045
716372,56,045,Wyoming,Weston County,2019,Age 70 to 74 years,352,180,172,342,0,2,4,0,56045
716373,56,045,Wyoming,Weston County,2019,Age 75 to 79 years,229,107,122,225,0,2,0,0,56045
716374,56,045,Wyoming,Weston County,2019,Age 80 to 84 years,198,82,116,195,0,2,0,0,56045


In [157]:
df_demographics[(df_demographics.Year=='2010') & (df_demographics.AGEGRP=='All Ages')]

Unnamed: 0,FIPS_state,FIPS_county,State,County,Year,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,FIPS
0,01,001,Alabama,Autauga County,2010,All Ages,54571,26569,28002,43297,9689,258,484,47,01001
19,01,001,Alabama,Autauga County,2010,All Ages,54597,26584,28013,43313,9699,258,484,47,01001
38,01,001,Alabama,Autauga County,2010,All Ages,54773,26672,28101,43420,9750,251,497,46,01001
228,01,003,Alabama,Baldwin County,2010,All Ages,182265,89196,93069,159710,17274,1339,1369,121,01003
247,01,003,Alabama,Baldwin County,2010,All Ages,182265,89196,93069,159710,17274,1339,1369,121,01003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715939,56,043,Wyoming,Washakie County,2010,All Ages,8528,4255,4273,8170,24,135,59,8,56043
715958,56,043,Wyoming,Washakie County,2010,All Ages,8530,4252,4278,8170,24,132,59,8,56043
716148,56,045,Wyoming,Weston County,2010,All Ages,7208,3790,3418,6954,22,97,21,3,56045
716167,56,045,Wyoming,Weston County,2010,All Ages,7208,3790,3418,6954,22,97,21,3,56045


# 5. Houselessness

In [121]:
df_houseless_19 = pd.read_excel('../datasets/houseless/houseless_coc.xlsx', sheet_name='2019')
df_houseless_18 = pd.read_excel('../datasets/houseless/houseless_coc.xlsx', sheet_name='2018')
df_houseless_17 = pd.read_excel('../datasets/houseless/houseless_coc.xlsx', sheet_name='2017')
df_houseless_16 = pd.read_excel('../datasets/houseless/houseless_coc.xlsx', sheet_name='2016')
df_houseless_15 = pd.read_excel('../datasets/houseless/houseless_coc.xlsx', sheet_name='2015')
df_houseless_14 = pd.read_excel('../datasets/houseless/houseless_coc.xlsx', sheet_name='2014')
df_houseless_13 = pd.read_excel('../datasets/houseless/houseless_coc.xlsx', sheet_name='2013')
df_houseless_12 = pd.read_excel('../datasets/houseless/houseless_coc.xlsx', sheet_name='2012')
df_houseless_11 = pd.read_excel('../datasets/houseless/houseless_coc.xlsx', sheet_name='2011')
df_houseless_10 = pd.read_excel('../datasets/houseless/houseless_coc.xlsx', sheet_name='2010')
df_houseless_09 = pd.read_excel('../datasets/houseless/houseless_coc.xlsx', sheet_name='2009')


### Trim down unnecessary columns

In [122]:
df_houseless_19 = df_houseless_19.loc[:,['CoC Number', 'CoC Name','Overall Homeless, 2019', 'Sheltered Total Homeless, 2019', 'Unsheltered Homeless, 2019']]
df_houseless_18 = df_houseless_18.loc[:,['CoC Number', 'CoC Name','Overall Homeless, 2018', 'Sheltered Total Homeless, 2018', 'Unsheltered Homeless, 2018']]
df_houseless_17 = df_houseless_17.loc[:,['CoC Number', 'CoC Name','Overall Homeless, 2017', 'Sheltered Total Homeless, 2017', 'Unsheltered Homeless, 2017']]
df_houseless_16 = df_houseless_16.loc[:,['CoC Number', 'CoC Name','Overall Homeless, 2016', 'Sheltered Total Homeless, 2016', 'Unsheltered Homeless, 2016']]
df_houseless_15 = df_houseless_15.loc[:,['CoC Number', 'CoC Name','Overall Homeless, 2015', 'Sheltered Total Homeless, 2015', 'Unsheltered Homeless, 2015']]
df_houseless_14 = df_houseless_14.loc[:,['CoC Number', 'CoC Name','Overall Homeless, 2014', 'Sheltered Total Homeless, 2014', 'Unsheltered Homeless, 2014']]
df_houseless_13 = df_houseless_13.loc[:,['CoC Number', 'CoC Name','Overall Homeless, 2013', 'Sheltered Total Homeless, 2013', 'Unsheltered Homeless, 2013']]
df_houseless_12 = df_houseless_12.loc[:,['CoC Number', 'CoC Name','Overall Homeless, 2012', 'Sheltered Total Homeless, 2012', 'Unsheltered Homeless, 2012']]
df_houseless_11 = df_houseless_11.loc[:,['CoC Number', 'CoC Name','Overall Homeless, 2011', 'Sheltered Total Homeless, 2011', 'Unsheltered Homeless, 2011']]
df_houseless_10 = df_houseless_10.loc[:,['CoC Number', 'CoC Name','Overall Homeless, 2010', 'Sheltered Total Homeless, 2010', 'Unsheltered Homeless, 2010']]
df_houseless_09 = df_houseless_09.loc[:,['CoC Number', 'CoC Name','Overall Homeless, 2009', 'Sheltered Total Homeless, 2009', 'Unsheltered Homeless, 2009']]



### Rename columns for interpretability

In [123]:
df_houseless_19.rename(columns={'Overall Homeless, 2019':'Tot_houseless', 
                       'Sheltered Total Homeless, 2019': 'Sheltered_houseless',
                       'Unsheltered Homeless, 2019': 'Unsheltered_houseless'}, inplace=True)
df_houseless_18.rename(columns={'Overall Homeless, 2018':'Tot_houseless', 
                       'Sheltered Total Homeless, 2018': 'Sheltered_houseless',
                       'Unsheltered Homeless, 2018': 'Unsheltered_houseless'}, inplace=True)
df_houseless_17.rename(columns={'Overall Homeless, 2017':'Tot_houseless', 
                       'Sheltered Total Homeless, 2017': 'Sheltered_houseless',
                       'Unsheltered Homeless, 2017': 'Unsheltered_houseless'}, inplace=True)
df_houseless_16.rename(columns={'Overall Homeless, 2016':'Tot_houseless', 
                       'Sheltered Total Homeless, 2016': 'Sheltered_houseless',
                       'Unsheltered Homeless, 2016': 'Unsheltered_houseless'}, inplace=True)
df_houseless_15.rename(columns={'Overall Homeless, 2015':'Tot_houseless', 
                       'Sheltered Total Homeless, 2015': 'Sheltered_houseless',
                       'Unsheltered Homeless, 2015': 'Unsheltered_houseless'}, inplace=True)
df_houseless_14.rename(columns={'Overall Homeless, 2014':'Tot_houseless', 
                       'Sheltered Total Homeless, 2014': 'Sheltered_houseless',
                       'Unsheltered Homeless, 2014': 'Unsheltered_houseless'}, inplace=True)
df_houseless_13.rename(columns={'Overall Homeless, 2013':'Tot_houseless', 
                       'Sheltered Total Homeless, 2013': 'Sheltered_houseless',
                       'Unsheltered Homeless, 2013': 'Unsheltered_houseless'}, inplace=True)
df_houseless_12.rename(columns={'Overall Homeless, 2012':'Tot_houseless', 
                       'Sheltered Total Homeless, 2012': 'Sheltered_houseless',
                       'Unsheltered Homeless, 2012': 'Unsheltered_houseless'}, inplace=True)
df_houseless_11.rename(columns={'Overall Homeless, 2011':'Tot_houseless', 
                       'Sheltered Total Homeless, 2011': 'Sheltered_houseless',
                       'Unsheltered Homeless, 2011': 'Unsheltered_houseless'}, inplace=True)
df_houseless_10.rename(columns={'Overall Homeless, 2010':'Tot_houseless', 
                       'Sheltered Total Homeless, 2010': 'Sheltered_houseless',
                       'Unsheltered Homeless, 2010': 'Unsheltered_houseless'}, inplace=True)
df_houseless_09.rename(columns={'Overall Homeless, 2009':'Tot_houseless', 
                       'Sheltered Total Homeless, 2009': 'Sheltered_houseless',
                       'Unsheltered Homeless, 2009': 'Unsheltered_houseless'}, inplace=True)


# Subtask: Map CoC's to Counties
Method: merge CoC column to demographics df to get population perCoC.<br>
Join that with houseless df to derive houseless rate per CoC.<br>
Join that to CoC_county to get houseless rate per county.<br>
(optional) Join rates back with demographic df to get number of houseless, per county<br>
Source: https://github.com/tomhbyrne/HUD-CoC-Geography-Crosswalk/blob/master/output/county_coc_match.csv

### Import csv that maps counties to a CoC

In [124]:
CoC_county = pd.read_csv('../datasets/houseless/county_coc_match.csv', encoding='ISO-8859-1')
CoC_county = CoC_county.loc[:,['county_fips','coc_number']]

# rename columns
CoC_county.rename(columns={'county_fips':'FIPS'}, inplace=True)

# drop 2 rows with no FIPS
CoC_county.drop(CoC_county[CoC_county['FIPS'].isnull()].index, axis=0, inplace=True)

### Fix format of FIPS column

In [125]:
# change FIPS to string and add leading zeros if needed
CoC_county['FIPS'] = np.where(CoC_county['FIPS']<10000, 
                        '0'+CoC_county['FIPS'].astype(int).astype(str), CoC_county['FIPS'].astype(int).astype(str))

### Merge CoC mapping with demographics df to get population count per CoC

In [126]:
# get total population by county
avg = df_demographics[(df_demographics['Year']=='2010') & (df_demographics['AGEGRP']=='All Ages')].groupby('FIPS').mean()
# merge with CoC_county to get pop count per CoC
merged = avg.merge(CoC_county, on='FIPS', how='left')
pop_per_coc_10 = merged.groupby('coc_number').sum()['TOT_POP'].reset_index()

# repeat for each year
avg = df_demographics[(df_demographics['Year']=='2011') & (df_demographics['AGEGRP']=='All Ages')].groupby('FIPS').mean()
merged = avg.merge(CoC_county, on='FIPS', how='left')
pop_per_coc_11 = merged.groupby('coc_number').sum()['TOT_POP'].reset_index()

# 2012
avg = df_demographics[(df_demographics['Year']=='2012') & (df_demographics['AGEGRP']=='All Ages')].groupby('FIPS').mean()
merged = avg.merge(CoC_county, on='FIPS', how='left')
pop_per_coc_12 = merged.groupby('coc_number').sum()['TOT_POP'].reset_index()

# 2013
avg = df_demographics[(df_demographics['Year']=='2013') & (df_demographics['AGEGRP']=='All Ages')].groupby('FIPS').mean()
merged = avg.merge(CoC_county, on='FIPS', how='left')
pop_per_coc_13 = merged.groupby('coc_number').sum()['TOT_POP'].reset_index()

# 2014
avg = df_demographics[(df_demographics['Year']=='2014') & (df_demographics['AGEGRP']=='All Ages')].groupby('FIPS').mean()
merged = avg.merge(CoC_county, on='FIPS', how='left')
pop_per_coc_14 = merged.groupby('coc_number').sum()['TOT_POP'].reset_index()

# 2015
avg = df_demographics[(df_demographics['Year']=='2015') & (df_demographics['AGEGRP']=='All Ages')].groupby('FIPS').mean()
merged = avg.merge(CoC_county, on='FIPS', how='left')
pop_per_coc_15 = merged.groupby('coc_number').sum()['TOT_POP'].reset_index()

# 2016
avg = df_demographics[(df_demographics['Year']=='2016') & (df_demographics['AGEGRP']=='All Ages')].groupby('FIPS').mean()
merged = avg.merge(CoC_county, on='FIPS', how='left')
pop_per_coc_16 = merged.groupby('coc_number').sum()['TOT_POP'].reset_index()

# 2017
avg = df_demographics[(df_demographics['Year']=='2017') & (df_demographics['AGEGRP']=='All Ages')].groupby('FIPS').mean()
merged = avg.merge(CoC_county, on='FIPS', how='left')
pop_per_coc_17 = merged.groupby('coc_number').sum()['TOT_POP'].reset_index()

# 2018
avg = df_demographics[(df_demographics['Year']=='2018') & (df_demographics['AGEGRP']=='All Ages')].groupby('FIPS').mean()
merged = avg.merge(CoC_county, on='FIPS', how='left')
pop_per_coc_18 = merged.groupby('coc_number').sum()['TOT_POP'].reset_index()

# 2019
avg = df_demographics[(df_demographics['Year']=='2019') & (df_demographics['AGEGRP']=='All Ages')].groupby('FIPS').mean()
merged = avg.merge(CoC_county, on='FIPS', how='left')
pop_per_coc_19 = merged.groupby('coc_number').sum()['TOT_POP'].reset_index()


### Merge population count per CoC with houselessness df to derive houselessness rate per CoC

In [127]:
# Get houseless rate per CoC 
# 2010
merged = pop_per_coc_10.merge(df_houseless_10, left_on='coc_number', right_on='CoC Number')
merged['Houseless_rate'] = merged['Tot_houseless']/merged['TOT_POP'] 
merged['Sheltered_rate'] = merged['Sheltered_houseless']/merged['TOT_POP'] 
merged['Unsheltered_rate'] = merged['Unsheltered_houseless']/merged['TOT_POP'] 
rates_10 = merged.loc[:,['coc_number', 'Houseless_rate','Sheltered_rate','Unsheltered_rate']]

# 2011
merged = pop_per_coc_11.merge(df_houseless_11, left_on='coc_number', right_on='CoC Number')
merged['Houseless_rate'] = merged['Tot_houseless']/merged['TOT_POP'] 
merged['Sheltered_rate'] = merged['Sheltered_houseless']/merged['TOT_POP'] 
merged['Unsheltered_rate'] = merged['Unsheltered_houseless']/merged['TOT_POP'] 
rates_11 = merged.loc[:,['coc_number', 'Houseless_rate','Sheltered_rate','Unsheltered_rate']]

# 2012
merged = pop_per_coc_12.merge(df_houseless_12, left_on='coc_number', right_on='CoC Number')
merged['Houseless_rate'] = merged['Tot_houseless']/merged['TOT_POP'] 
merged['Sheltered_rate'] = merged['Sheltered_houseless']/merged['TOT_POP'] 
merged['Unsheltered_rate'] = merged['Unsheltered_houseless']/merged['TOT_POP'] 
rates_12 = merged.loc[:,['coc_number', 'Houseless_rate','Sheltered_rate','Unsheltered_rate']]

# 2013
merged = pop_per_coc_13.merge(df_houseless_13, left_on='coc_number', right_on='CoC Number')
merged['Houseless_rate'] = merged['Tot_houseless']/merged['TOT_POP'] 
merged['Sheltered_rate'] = merged['Sheltered_houseless']/merged['TOT_POP'] 
merged['Unsheltered_rate'] = merged['Unsheltered_houseless']/merged['TOT_POP'] 
rates_13 = merged.loc[:,['coc_number', 'Houseless_rate','Sheltered_rate','Unsheltered_rate']]

# 2014
merged = pop_per_coc_14.merge(df_houseless_14, left_on='coc_number', right_on='CoC Number')
merged['Houseless_rate'] = merged['Tot_houseless']/merged['TOT_POP'] 
merged['Sheltered_rate'] = merged['Sheltered_houseless']/merged['TOT_POP'] 
merged['Unsheltered_rate'] = merged['Unsheltered_houseless']/merged['TOT_POP'] 
rates_14 = merged.loc[:,['coc_number', 'Houseless_rate','Sheltered_rate','Unsheltered_rate']]

# 2015
merged = pop_per_coc_15.merge(df_houseless_15, left_on='coc_number', right_on='CoC Number')
merged['Houseless_rate'] = merged['Tot_houseless']/merged['TOT_POP'] 
merged['Sheltered_rate'] = merged['Sheltered_houseless']/merged['TOT_POP'] 
merged['Unsheltered_rate'] = merged['Unsheltered_houseless']/merged['TOT_POP'] 
rates_15 = merged.loc[:,['coc_number', 'Houseless_rate','Sheltered_rate','Unsheltered_rate']]

# 2016
merged = pop_per_coc_16.merge(df_houseless_16, left_on='coc_number', right_on='CoC Number')
merged['Houseless_rate'] = merged['Tot_houseless']/merged['TOT_POP'] 
merged['Sheltered_rate'] = merged['Sheltered_houseless']/merged['TOT_POP'] 
merged['Unsheltered_rate'] = merged['Unsheltered_houseless']/merged['TOT_POP'] 
rates_16 = merged.loc[:,['coc_number', 'Houseless_rate','Sheltered_rate','Unsheltered_rate']]

# 2017
merged = pop_per_coc_17.merge(df_houseless_17, left_on='coc_number', right_on='CoC Number')
merged['Houseless_rate'] = merged['Tot_houseless']/merged['TOT_POP'] 
merged['Sheltered_rate'] = merged['Sheltered_houseless']/merged['TOT_POP'] 
merged['Unsheltered_rate'] = merged['Unsheltered_houseless']/merged['TOT_POP'] 
rates_17 = merged.loc[:,['coc_number', 'Houseless_rate','Sheltered_rate','Unsheltered_rate']]

# 2018
merged = pop_per_coc_18.merge(df_houseless_18, left_on='coc_number', right_on='CoC Number')
merged['Houseless_rate'] = merged['Tot_houseless']/merged['TOT_POP'] 
merged['Sheltered_rate'] = merged['Sheltered_houseless']/merged['TOT_POP'] 
merged['Unsheltered_rate'] = merged['Unsheltered_houseless']/merged['TOT_POP'] 
rates_18 = merged.loc[:,['coc_number', 'Houseless_rate','Sheltered_rate','Unsheltered_rate']]

# 2019
merged = pop_per_coc_19.merge(df_houseless_19, left_on='coc_number', right_on='CoC Number')
merged['Houseless_rate'] = merged['Tot_houseless']/merged['TOT_POP'] 
merged['Sheltered_rate'] = merged['Sheltered_houseless']/merged['TOT_POP'] 
merged['Unsheltered_rate'] = merged['Unsheltered_houseless']/merged['TOT_POP'] 
rates_19 = merged.loc[:,['coc_number', 'Houseless_rate','Sheltered_rate','Unsheltered_rate']]

### Merge CoC rates with CoC_county mapping to get houseless rates per county

In [128]:
# Get houseless rate per county, for each year
df_houseless_10 = rates_10.merge(CoC_county, on='coc_number')
df_houseless_11 = rates_11.merge(CoC_county, on='coc_number')
df_houseless_12 = rates_12.merge(CoC_county, on='coc_number')
df_houseless_13 = rates_13.merge(CoC_county, on='coc_number')
df_houseless_14 = rates_14.merge(CoC_county, on='coc_number')
df_houseless_15 = rates_15.merge(CoC_county, on='coc_number')
df_houseless_16 = rates_16.merge(CoC_county, on='coc_number')
df_houseless_17 = rates_17.merge(CoC_county, on='coc_number')
df_houseless_18 = rates_18.merge(CoC_county, on='coc_number')
df_houseless_19 = rates_19.merge(CoC_county, on='coc_number')

### Add Year column to each df

In [129]:
df_houseless_10['Year'] = '2010'
df_houseless_11['Year'] = '2011'
df_houseless_12['Year'] = '2012'
df_houseless_13['Year'] = '2013'
df_houseless_14['Year'] = '2014'
df_houseless_15['Year'] = '2015'
df_houseless_16['Year'] = '2016'
df_houseless_17['Year'] = '2017'
df_houseless_18['Year'] = '2018'
df_houseless_19['Year'] = '2019'


### Concatenate all df's to create master dataframe of all years

In [130]:
df_houseless = pd.concat([df_houseless_10, df_houseless_11,df_houseless_12,df_houseless_13,df_houseless_14,df_houseless_15,
          df_houseless_16,df_houseless_17,df_houseless_18,df_houseless_19])
df_houseless

Unnamed: 0,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,FIPS,Year
0,AK-500,0.004211,0.003807,0.000404,02020,2010
1,AK-501,0.001508,0.001331,0.000177,02013,2010
2,AK-501,0.001508,0.001331,0.000177,02016,2010
3,AK-501,0.001508,0.001331,0.000177,02050,2010
4,AK-501,0.001508,0.001331,0.000177,02060,2010
...,...,...,...,...,...,...
3158,WY-500,0.000947,0.000731,0.000216,56037,2019
3159,WY-500,0.000947,0.000731,0.000216,56039,2019
3160,WY-500,0.000947,0.000731,0.000216,56041,2019
3161,WY-500,0.000947,0.000731,0.000216,56043,2019


# 6. Rent Prices
Zillow Observed Rent Index (ZORI): A smoothed measure of the typical observed market rate rent across a given region. ZORI is a repeat-rent index that is weighted to the rental housing stock to ensure representativeness across the entire market, not just those homes currently listed for-rent. The index is dollar-denominated by computing the mean of listed rents that fall into the 40th to 60th percentile range for all homes and apartments in a given region, which is once again weighted to reflect the rental housing stock. Details available in ZORI methodology.


### Take averages of months in each year

In [131]:
# import Zillow dataset
df_rent = pd.read_csv('../datasets/rent_prices/rent_prices.csv')

# isolate columns corresponding to each year, and make new column
drop_14 = df_rent.columns[df_rent.columns.str.contains('2014')]
df_rent['2014'] = df_rent.loc[:,drop_14].mean(axis=1)

drop_15 = df_rent.columns[df_rent.columns.str.contains('2015')]
df_rent['2015'] = df_rent.loc[:,drop_15].mean(axis=1)

drop_16 = df_rent.columns[df_rent.columns.str.contains('2016')]
df_rent['2016'] = df_rent.loc[:,drop_16].mean(axis=1)

drop_17 = df_rent.columns[df_rent.columns.str.contains('2017')]
df_rent['2017'] = df_rent.loc[:,drop_17].mean(axis=1)

drop_18 = df_rent.columns[df_rent.columns.str.contains('2018')]
df_rent['2018'] = df_rent.loc[:,drop_18].mean(axis=1)

drop_19 = df_rent.columns[df_rent.columns.str.contains('2019')]
df_rent['2019'] = df_rent.loc[:,drop_19].mean(axis=1)

drop_20 = df_rent.columns[df_rent.columns.str.contains('2020')]
df_rent['2020'] = df_rent.loc[:,drop_20].mean(axis=1)

# drop all monthly data 
to_drop = drop_14.append(drop_15).append(drop_16).append(drop_17).append(drop_18).append(drop_19).append(drop_20)
df_rent.drop(to_drop, axis=1, inplace=True)

### Rename columns

In [132]:
df_rent.rename(columns={'RegionName':'Zipcode', 'MsaName':'City/State'}, inplace=True)

## Subtask: Map Zipcodes to Counties
Method: join county data to each zipcode, then groupby county and take mean

In [133]:
zips = pd.read_csv('../datasets/rent_prices/uszips.csv')


### Clean Zipcode dataset

In [134]:
# drop some columns and rename some
zips =zips.loc[:,['zip', 'lat', 'lng','county_fips']]
zips.rename(columns={'zip': 'Zipcode', 'county_fips': 'FIPS'},inplace=True)

# add leading zero to FIPS values where needed
zips['FIPS'] = np.where(zips['FIPS']<10000, 
                        '0'+zips['FIPS'].astype(str), zips['FIPS'].astype(str))

In [135]:
merged = df_rent.merge(zips, on='Zipcode', how='left')

In [136]:
df_rent = merged.groupby('FIPS').mean().drop(['RegionID', 'Zipcode', 'SizeRank'],axis=1).reset_index()
df_rent

Unnamed: 0,FIPS,2014,2015,2016,2017,2018,2019,2020,lat,lng
0,01073,1019.995960,1049.230909,1070.125758,1090.527273,1127.700000,1165.383333,1199.082222,33.508132,-86.754922
1,01117,1229.755051,1265.133333,1282.000000,1296.611111,1333.997475,1380.585859,1416.266667,33.280850,-86.721547
2,04003,1051.250000,1047.458333,1035.083333,1033.000000,1084.791667,1151.583333,1217.494444,31.535215,-110.189590
3,04013,1095.670228,1164.839760,1224.381957,1274.524778,1349.587550,1443.687661,1537.921840,33.499874,-112.038614
4,04019,928.546429,947.020635,974.154497,1015.562169,1070.978307,1135.423696,1206.153817,32.212688,-110.936371
...,...,...,...,...,...,...,...,...,...,...
307,53061,1390.782323,1490.695286,1610.368754,1717.006818,1793.825455,1868.800000,1924.686667,47.930571,-122.181536
308,53063,808.358297,844.682540,900.673160,959.858225,1031.226190,1086.455318,1146.716667,47.680049,-117.399160
309,55025,1280.044444,1348.669913,1388.861111,1418.138889,1437.537879,1468.611111,1506.958333,43.055933,-89.425323
310,55059,689.000000,695.400000,741.272727,782.416667,818.416667,872.100000,921.800000,42.622560,-87.830000


### Break down into separate df's by year, and add year column

In [137]:
# 2014
# isolate relevant columns, rename, and creat year column
df_rent_14 = df_rent[['FIPS', '2014', 'lat','lng']]
df_rent_14.rename(columns={'2014':'Rent'}, inplace=True)
df_rent_14['Year'] = '2014'

# 2015
df_rent_15 = df_rent[['FIPS', '2015', 'lat','lng']]
df_rent_15.rename(columns={'2015':'Rent'}, inplace=True)
df_rent_15['Year'] = '2015'

# 2016
df_rent_16 = df_rent[['FIPS', '2016', 'lat','lng']]
df_rent_16.rename(columns={'2016':'Rent'}, inplace=True)
df_rent_16['Year'] = '2016'

# 2016
df_rent_16 = df_rent[['FIPS', '2016', 'lat','lng']]
df_rent_16.rename(columns={'2016':'Rent'}, inplace=True)
df_rent_16['Year'] = '2016'

# 2017
df_rent_17 = df_rent[['FIPS', '2017', 'lat','lng']]
df_rent_17.rename(columns={'2017':'Rent'}, inplace=True)
df_rent_17['Year'] = '2017'

# 2018
df_rent_18 = df_rent[['FIPS', '2018', 'lat','lng']]
df_rent_18.rename(columns={'2018':'Rent'}, inplace=True)
df_rent_18['Year'] = '2018'

# 2019
df_rent_19 = df_rent[['FIPS', '2019', 'lat','lng']]
df_rent_19.rename(columns={'2019':'Rent'}, inplace=True)
df_rent_19['Year'] = '2019'

# 2020
df_rent_20 = df_rent[['FIPS', '2020', 'lat','lng']]
df_rent_20.rename(columns={'2020':'Rent'}, inplace=True)
df_rent_20['Year'] = '2020'


### Concatenate all df's to create master dataframe of all years

In [138]:
df_rent = pd.concat([df_rent_14,df_rent_15,df_rent_16,df_rent_17,df_rent_18,df_rent_19,df_rent_20])
df_rent

Unnamed: 0,FIPS,Rent,lat,lng,Year
0,01073,1019.995960,33.508132,-86.754922,2014
1,01117,1229.755051,33.280850,-86.721547,2014
2,04003,1051.250000,31.535215,-110.189590,2014
3,04013,1095.670228,33.499874,-112.038614,2014
4,04019,928.546429,32.212688,-110.936371,2014
...,...,...,...,...,...
307,53061,1924.686667,47.930571,-122.181536,2020
308,53063,1146.716667,47.680049,-117.399160,2020
309,55025,1506.958333,43.055933,-89.425323,2020
310,55059,921.800000,42.622560,-87.830000,2020


## 7. Businesses Data
Data Dict: https://www2.census.gov/programs-surveys/cbp/technical-documentation/records-layouts/2018_record_layouts/county-layout-2018.txt
<br>
naics dict: https://www2.census.gov/programs-surveys/cbp/technical-documentation/reference/naics-descriptions/naics2017.txt

In [139]:
df_business_09 = pd.read_csv('../datasets/businesses/bus_09.txt')
df_business_10 = pd.read_csv('../datasets/businesses/bus_10.txt')
df_business_11 = pd.read_csv('../datasets/businesses/bus_11.txt')
df_business_12 = pd.read_csv('../datasets/businesses/bus_12.txt')
df_business_13 = pd.read_csv('../datasets/businesses/bus_13.txt')
df_business_14 = pd.read_csv('../datasets/businesses/bus_14.txt')
df_business_15 = pd.read_csv('../datasets/businesses/bus_15.txt')
df_business_16 = pd.read_csv('../datasets/businesses/bus_16.txt')
df_business_17 = pd.read_csv('../datasets/businesses/bus_17.txt')
df_business_18 = pd.read_csv('../datasets/businesses/bus_18.txt')


### Drop columns and rename

In [140]:
# 2009
df_business_09 = df_business_09.loc[:,['fipstate', 'fipscty', 'naics', 'est']]
df_business_09 = df_business_09.rename(columns={'fipstate': 'FIPS_state', 'fipscty':'FIPS_county', 
                                'naics':'Industry','est':'Num_establishments'})
# 2010
df_business_10 = df_business_10.loc[:,['fipstate', 'fipscty', 'naics', 'est']]
df_business_10 = df_business_10.rename(columns={'fipstate': 'FIPS_state', 'fipscty':'FIPS_county', 
                                'naics':'Industry','est':'Num_establishments'})
# 2011
df_business_11 = df_business_11.loc[:,['fipstate', 'fipscty', 'naics', 'est']]
df_business_11 = df_business_11.rename(columns={'fipstate': 'FIPS_state', 'fipscty':'FIPS_county', 
                                'naics':'Industry','est':'Num_establishments'})
# 2012
df_business_12 = df_business_12.loc[:,['fipstate', 'fipscty', 'naics', 'est']]
df_business_12 = df_business_12.rename(columns={'fipstate': 'FIPS_state', 'fipscty':'FIPS_county', 
                                'naics':'Industry','est':'Num_establishments'})
# 2013
df_business_13 = df_business_13.loc[:,['fipstate', 'fipscty', 'naics', 'est']]
df_business_13 = df_business_13.rename(columns={'fipstate': 'FIPS_state', 'fipscty':'FIPS_county', 
                                'naics':'Industry','est':'Num_establishments'})
# 2014
df_business_14 = df_business_14.loc[:,['fipstate', 'fipscty', 'naics', 'est']]
df_business_14 = df_business_14.rename(columns={'fipstate': 'FIPS_state', 'fipscty':'FIPS_county', 
                                'naics':'Industry','est':'Num_establishments'})
# 2015
df_business_15 = df_business_15.loc[:,['FIPSTATE', 'FIPSCTY', 'NAICS', 'EST']]
df_business_15 = df_business_15.rename(columns={'FIPSTATE': 'FIPS_state', 'FIPSCTY':'FIPS_county', 
                                'NAICS':'Industry','EST':'Num_establishments'})
# 2016
df_business_16 = df_business_16.loc[:,['fipstate', 'fipscty', 'naics', 'est']]
df_business_16 = df_business_16.rename(columns={'fipstate': 'FIPS_state', 'fipscty':'FIPS_county', 
                                'naics':'Industry','est':'Num_establishments'})
# 2017
df_business_17 = df_business_17.loc[:,['fipstate', 'fipscty', 'naics', 'est']]
df_business_17 = df_business_17.rename(columns={'fipstate': 'FIPS_state', 'fipscty':'FIPS_county', 
                                'naics':'Industry','est':'Num_establishments'})
# 2018
df_business_18 = df_business_18.loc[:,['fipstate', 'fipscty', 'naics', 'est']]
df_business_18 = df_business_18.rename(columns={'fipstate': 'FIPS_state', 'fipscty':'FIPS_county', 
                                'naics':'Industry','est':'Num_establishments'})


### Reformat FIPS codes and join together state+county codes

In [141]:
# Add leading zeros to FIPS codes, and concat state and county FIPS codes
# 2009
df_business_09['FIPS_county'] = np.select([df_business_09['FIPS_county']<10, df_business_09['FIPS_county']<100],
                    ['00'+df_business_09['FIPS_county'].astype(str), '0'+df_business_09['FIPS_county'].astype(str)],
                    default= df_business_09['FIPS_county'].astype(str))
df_business_09['FIPS_state'] = np.where(df_business_09['FIPS_state']<10, 
                        '0'+df_business_09['FIPS_state'].astype(str), df_business_09['FIPS_state'].astype(str))
df_business_09['FIPS'] = df_business_09['FIPS_state'] + df_business_09['FIPS_county']

# 2010
df_business_10['FIPS_county'] = np.select([df_business_10['FIPS_county']<10, df_business_10['FIPS_county']<100],
                    ['00'+df_business_10['FIPS_county'].astype(str), '0'+df_business_10['FIPS_county'].astype(str)],
                    default= df_business_10['FIPS_county'].astype(str))
df_business_10['FIPS_state'] = np.where(df_business_10['FIPS_state']<10, 
                        '0'+df_business_10['FIPS_state'].astype(str), df_business_10['FIPS_state'].astype(str))
df_business_10['FIPS'] = df_business_10['FIPS_state'] + df_business_10['FIPS_county']

# 2011
df_business_11['FIPS_county'] = np.select([df_business_11['FIPS_county']<10, df_business_11['FIPS_county']<100],
                    ['00'+df_business_11['FIPS_county'].astype(str), '0'+df_business_11['FIPS_county'].astype(str)],
                    default= df_business_11['FIPS_county'].astype(str))
df_business_11['FIPS_state'] = np.where(df_business_11['FIPS_state']<10, 
                        '0'+df_business_11['FIPS_state'].astype(str), df_business_11['FIPS_state'].astype(str))
df_business_11['FIPS'] = df_business_11['FIPS_state'] + df_business_11['FIPS_county']

# 2012
df_business_12['FIPS_county'] = np.select([df_business_12['FIPS_county']<10, df_business_12['FIPS_county']<100],
                    ['00'+df_business_12['FIPS_county'].astype(str), '0'+df_business_12['FIPS_county'].astype(str)],
                    default= df_business_12['FIPS_county'].astype(str))
df_business_12['FIPS_state'] = np.where(df_business_12['FIPS_state']<10, 
                        '0'+df_business_12['FIPS_state'].astype(str), df_business_12['FIPS_state'].astype(str))
df_business_12['FIPS'] = df_business_12['FIPS_state'] + df_business_12['FIPS_county']

# 2013
df_business_13['FIPS_county'] = np.select([df_business_13['FIPS_county']<10, df_business_13['FIPS_county']<100],
                    ['00'+df_business_13['FIPS_county'].astype(str), '0'+df_business_13['FIPS_county'].astype(str)],
                    default= df_business_13['FIPS_county'].astype(str))
df_business_13['FIPS_state'] = np.where(df_business_13['FIPS_state']<10, 
                        '0'+df_business_13['FIPS_state'].astype(str), df_business_13['FIPS_state'].astype(str))
df_business_13['FIPS'] = df_business_13['FIPS_state'] + df_business_13['FIPS_county']

# 2014
df_business_14['FIPS_county'] = np.select([df_business_14['FIPS_county']<10, df_business_14['FIPS_county']<100],
                    ['00'+df_business_14['FIPS_county'].astype(str), '0'+df_business_14['FIPS_county'].astype(str)],
                    default= df_business_14['FIPS_county'].astype(str))
df_business_14['FIPS_state'] = np.where(df_business_14['FIPS_state']<10, 
                        '0'+df_business_14['FIPS_state'].astype(str), df_business_14['FIPS_state'].astype(str))
df_business_14['FIPS'] = df_business_14['FIPS_state'] + df_business_14['FIPS_county']

# 2015
df_business_15['FIPS_county'] = np.select([df_business_15['FIPS_county']<10, df_business_15['FIPS_county']<100],
                    ['00'+df_business_15['FIPS_county'].astype(str), '0'+df_business_15['FIPS_county'].astype(str)],
                    default= df_business_15['FIPS_county'].astype(str))
df_business_15['FIPS_state'] = np.where(df_business_15['FIPS_state']<10, 
                        '0'+df_business_15['FIPS_state'].astype(str), df_business_15['FIPS_state'].astype(str))
df_business_15['FIPS'] = df_business_15['FIPS_state'] + df_business_15['FIPS_county']

# 2016
df_business_16['FIPS_county'] = np.select([df_business_16['FIPS_county']<10, df_business_16['FIPS_county']<100],
                    ['00'+df_business_16['FIPS_county'].astype(str), '0'+df_business_16['FIPS_county'].astype(str)],
                    default= df_business_16['FIPS_county'].astype(str))
df_business_16['FIPS_state'] = np.where(df_business_16['FIPS_state']<10, 
                        '0'+df_business_16['FIPS_state'].astype(str), df_business_16['FIPS_state'].astype(str))
df_business_16['FIPS'] = df_business_16['FIPS_state'] + df_business_16['FIPS_county']

# 2017
df_business_17['FIPS_county'] = np.select([df_business_17['FIPS_county']<10, df_business_17['FIPS_county']<100],
                    ['00'+df_business_17['FIPS_county'].astype(str), '0'+df_business_17['FIPS_county'].astype(str)],
                    default= df_business_17['FIPS_county'].astype(str))
df_business_17['FIPS_state'] = np.where(df_business_17['FIPS_state']<10, 
                        '0'+df_business_17['FIPS_state'].astype(str), df_business_17['FIPS_state'].astype(str))
df_business_17['FIPS'] = df_business_17['FIPS_state'] + df_business_17['FIPS_county']

# 2018
df_business_18['FIPS_county'] = np.select([df_business_18['FIPS_county']<10, df_business_18['FIPS_county']<100],
                    ['00'+df_business_18['FIPS_county'].astype(str), '0'+df_business_18['FIPS_county'].astype(str)],
                    default= df_business_18['FIPS_county'].astype(str))
df_business_18['FIPS_state'] = np.where(df_business_18['FIPS_state']<10, 
                        '0'+df_business_18['FIPS_state'].astype(str), df_business_18['FIPS_state'].astype(str))
df_business_18['FIPS'] = df_business_18['FIPS_state'] + df_business_18['FIPS_county']



### Isolate rows for each type of food establishment

In [142]:
# 2009
restaraunts = ['72233/', '722330', '722///', '7224//', '72241/', '722410', '7225//', '72251/', '722511', '722513', '722514', '722515']
restaraunt_indices = df_business_09[df_business_09['Industry'].isin(restaraunts)].index
grocery = ['445///', '4451//', '44511/', '445110', '44512/', '445120', '4452//', '44521/', '445210', '44522/', '445220', '44523/', '445230', '44529/', '445291', '445292', '445299', '4453//', '44531/', '445310', '446///', '4461//', '44611/', '446110']
grocery_indices = df_business_09[df_business_09['Industry'].isin(grocery)].index
wholesale = ['4244//', '42441/', '424410', '42442/', '424420', '42443/', '424430', '42444/', '424440', '42445/', '424450', '42446/', '424460', '42447/', '424470', '42448/', '424480', '42449/', '424490', '4245//', '42451/', '424510', '42452/', '424520', '42459/', '424590']
wholesale_indices = df_business_09[df_business_09['Industry'].isin(wholesale)].index

# 2010
restaraunts = ['72233/', '722330', '722///', '7224//', '72241/', '722410', '7225//', '72251/', '722511', '722513', '722514', '722515']
restaraunt_indices = df_business_10[df_business_10['Industry'].isin(restaraunts)].index
grocery = ['445///', '4451//', '44511/', '445110', '44512/', '445120', '4452//', '44521/', '445210', '44522/', '445220', '44523/', '445230', '44529/', '445291', '445292', '445299', '4453//', '44531/', '445310', '446///', '4461//', '44611/', '446110']
grocery_indices = df_business_10[df_business_10['Industry'].isin(grocery)].index
wholesale = ['4244//', '42441/', '424410', '42442/', '424420', '42443/', '424430', '42444/', '424440', '42445/', '424450', '42446/', '424460', '42447/', '424470', '42448/', '424480', '42449/', '424490', '4245//', '42451/', '424510', '42452/', '424520', '42459/', '424590']
wholesale_indices = df_business_10[df_business_10['Industry'].isin(wholesale)].index

# 2011
restaraunts = ['72233/', '722330', '722///', '7224//', '72241/', '722410', '7225//', '72251/', '722511', '722513', '722514', '722515']
restaraunt_indices = df_business_11[df_business_11['Industry'].isin(restaraunts)].index
grocery = ['445///', '4451//', '44511/', '445110', '44512/', '445120', '4452//', '44521/', '445210', '44522/', '445220', '44523/', '445230', '44529/', '445291', '445292', '445299', '4453//', '44531/', '445310', '446///', '4461//', '44611/', '446110']
grocery_indices = df_business_11[df_business_11['Industry'].isin(grocery)].index
wholesale = ['4244//', '42441/', '424410', '42442/', '424420', '42443/', '424430', '42444/', '424440', '42445/', '424450', '42446/', '424460', '42447/', '424470', '42448/', '424480', '42449/', '424490', '4245//', '42451/', '424510', '42452/', '424520', '42459/', '424590']
wholesale_indices = df_business_11[df_business_11['Industry'].isin(wholesale)].index

# 2012
restaraunts = ['72233/', '722330', '722///', '7224//', '72241/', '722410', '7225//', '72251/', '722511', '722513', '722514', '722515']
restaraunt_indices = df_business_12[df_business_12['Industry'].isin(restaraunts)].index
grocery = ['445///', '4451//', '44511/', '445110', '44512/', '445120', '4452//', '44521/', '445210', '44522/', '445220', '44523/', '445230', '44529/', '445291', '445292', '445299', '4453//', '44531/', '445310', '446///', '4461//', '44611/', '446110']
grocery_indices = df_business_12[df_business_12['Industry'].isin(grocery)].index
wholesale = ['4244//', '42441/', '424410', '42442/', '424420', '42443/', '424430', '42444/', '424440', '42445/', '424450', '42446/', '424460', '42447/', '424470', '42448/', '424480', '42449/', '424490', '4245//', '42451/', '424510', '42452/', '424520', '42459/', '424590']
wholesale_indices = df_business_12[df_business_12['Industry'].isin(wholesale)].index

# 2013
restaraunts = ['72233/', '722330', '722///', '7224//', '72241/', '722410', '7225//', '72251/', '722511', '722513', '722514', '722515']
restaraunt_indices = df_business_13[df_business_13['Industry'].isin(restaraunts)].index
grocery = ['445///', '4451//', '44511/', '445110', '44512/', '445120', '4452//', '44521/', '445210', '44522/', '445220', '44523/', '445230', '44529/', '445291', '445292', '445299', '4453//', '44531/', '445310', '446///', '4461//', '44611/', '446110']
grocery_indices = df_business_13[df_business_13['Industry'].isin(grocery)].index
wholesale = ['4244//', '42441/', '424410', '42442/', '424420', '42443/', '424430', '42444/', '424440', '42445/', '424450', '42446/', '424460', '42447/', '424470', '42448/', '424480', '42449/', '424490', '4245//', '42451/', '424510', '42452/', '424520', '42459/', '424590']
wholesale_indices = df_business_13[df_business_13['Industry'].isin(wholesale)].index

# 2014
restaraunts = ['72233/', '722330', '722///', '7224//', '72241/', '722410', '7225//', '72251/', '722511', '722513', '722514', '722515']
restaraunt_indices = df_business_14[df_business_14['Industry'].isin(restaraunts)].index
grocery = ['445///', '4451//', '44511/', '445110', '44512/', '445120', '4452//', '44521/', '445210', '44522/', '445220', '44523/', '445230', '44529/', '445291', '445292', '445299', '4453//', '44531/', '445310', '446///', '4461//', '44611/', '446110']
grocery_indices = df_business_14[df_business_14['Industry'].isin(grocery)].index
wholesale = ['4244//', '42441/', '424410', '42442/', '424420', '42443/', '424430', '42444/', '424440', '42445/', '424450', '42446/', '424460', '42447/', '424470', '42448/', '424480', '42449/', '424490', '4245//', '42451/', '424510', '42452/', '424520', '42459/', '424590']
wholesale_indices = df_business_14[df_business_14['Industry'].isin(wholesale)].index

# 2015
restaraunts = ['72233/', '722330', '722///', '7224//', '72241/', '722410', '7225//', '72251/', '722511', '722513', '722514', '722515']
restaraunt_indices = df_business_15[df_business_15['Industry'].isin(restaraunts)].index
grocery = ['445///', '4451//', '44511/', '445110', '44512/', '445120', '4452//', '44521/', '445210', '44522/', '445220', '44523/', '445230', '44529/', '445291', '445292', '445299', '4453//', '44531/', '445310', '446///', '4461//', '44611/', '446110']
grocery_indices = df_business_15[df_business_15['Industry'].isin(grocery)].index
wholesale = ['4244//', '42441/', '424410', '42442/', '424420', '42443/', '424430', '42444/', '424440', '42445/', '424450', '42446/', '424460', '42447/', '424470', '42448/', '424480', '42449/', '424490', '4245//', '42451/', '424510', '42452/', '424520', '42459/', '424590']
wholesale_indices = df_business_15[df_business_15['Industry'].isin(wholesale)].index

# 2016
restaraunts = ['72233/', '722330', '722///', '7224//', '72241/', '722410', '7225//', '72251/', '722511', '722513', '722514', '722515']
restaraunt_indices = df_business_16[df_business_16['Industry'].isin(restaraunts)].index
grocery = ['445///', '4451//', '44511/', '445110', '44512/', '445120', '4452//', '44521/', '445210', '44522/', '445220', '44523/', '445230', '44529/', '445291', '445292', '445299', '4453//', '44531/', '445310', '446///', '4461//', '44611/', '446110']
grocery_indices = df_business_16[df_business_16['Industry'].isin(grocery)].index
wholesale = ['4244//', '42441/', '424410', '42442/', '424420', '42443/', '424430', '42444/', '424440', '42445/', '424450', '42446/', '424460', '42447/', '424470', '42448/', '424480', '42449/', '424490', '4245//', '42451/', '424510', '42452/', '424520', '42459/', '424590']
wholesale_indices = df_business_16[df_business_16['Industry'].isin(wholesale)].index

# 2017
restaraunts = ['72233/', '722330', '722///', '7224//', '72241/', '722410', '7225//', '72251/', '722511', '722513', '722514', '722515']
restaraunt_indices = df_business_17[df_business_17['Industry'].isin(restaraunts)].index
grocery = ['445///', '4451//', '44511/', '445110', '44512/', '445120', '4452//', '44521/', '445210', '44522/', '445220', '44523/', '445230', '44529/', '445291', '445292', '445299', '4453//', '44531/', '445310', '446///', '4461//', '44611/', '446110']
grocery_indices = df_business_17[df_business_17['Industry'].isin(grocery)].index
wholesale = ['4244//', '42441/', '424410', '42442/', '424420', '42443/', '424430', '42444/', '424440', '42445/', '424450', '42446/', '424460', '42447/', '424470', '42448/', '424480', '42449/', '424490', '4245//', '42451/', '424510', '42452/', '424520', '42459/', '424590']
wholesale_indices = df_business_17[df_business_17['Industry'].isin(wholesale)].index

# 2018
restaraunts = ['72233/', '722330', '722///', '7224//', '72241/', '722410', '7225//', '72251/', '722511', '722513', '722514', '722515']
restaraunt_indices = df_business_18[df_business_18['Industry'].isin(restaraunts)].index
grocery = ['445///', '4451//', '44511/', '445110', '44512/', '445120', '4452//', '44521/', '445210', '44522/', '445220', '44523/', '445230', '44529/', '445291', '445292', '445299', '4453//', '44531/', '445310', '446///', '4461//', '44611/', '446110']
grocery_indices = df_business_18[df_business_18['Industry'].isin(grocery)].index
wholesale = ['4244//', '42441/', '424410', '42442/', '424420', '42443/', '424430', '42444/', '424440', '42445/', '424450', '42446/', '424460', '42447/', '424470', '42448/', '424480', '42449/', '424490', '4245//', '42451/', '424510', '42452/', '424520', '42459/', '424590']
wholesale_indices = df_business_18[df_business_18['Industry'].isin(wholesale)].index


### Group dataframe by type of establishment, and sum # of businesses in each county

In [143]:
# 2009
restaraunt_df = df_business_09.loc[restaraunt_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
restaraunt_df.rename(columns={'Num_establishments':'Num_restaraunts'},inplace=True)
grocery_df = df_business_09.loc[grocery_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
grocery_df.rename(columns={'Num_establishments':'Num_grocery'},inplace=True)
wholesale_df = df_business_09.loc[wholesale_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
wholesale_df.rename(columns={'Num_establishments':'Num_wholesale'},inplace=True)
# merge three df's together
df_food_09 = wholesale_df.merge(restaraunt_df, how='outer',on='FIPS').merge(grocery_df, on='FIPS', how='outer')

# 2010
restaraunt_df = df_business_10.loc[restaraunt_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
restaraunt_df.rename(columns={'Num_establishments':'Num_restaraunts'},inplace=True)
grocery_df = df_business_10.loc[grocery_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
grocery_df.rename(columns={'Num_establishments':'Num_grocery'},inplace=True)
wholesale_df = df_business_10.loc[wholesale_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
wholesale_df.rename(columns={'Num_establishments':'Num_wholesale'},inplace=True)
# merge three df's together
df_food_10 = wholesale_df.merge(restaraunt_df, how='outer',on='FIPS').merge(grocery_df, on='FIPS', how='outer')

# 2011
restaraunt_df = df_business_11.loc[restaraunt_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
restaraunt_df.rename(columns={'Num_establishments':'Num_restaraunts'},inplace=True)
grocery_df = df_business_11.loc[grocery_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
grocery_df.rename(columns={'Num_establishments':'Num_grocery'},inplace=True)
wholesale_df = df_business_11.loc[wholesale_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
wholesale_df.rename(columns={'Num_establishments':'Num_wholesale'},inplace=True)
# merge three df's together
df_food_11 = wholesale_df.merge(restaraunt_df, how='outer',on='FIPS').merge(grocery_df, on='FIPS', how='outer')

# 2012
restaraunt_df = df_business_12.loc[restaraunt_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
restaraunt_df.rename(columns={'Num_establishments':'Num_restaraunts'},inplace=True)
grocery_df = df_business_12.loc[grocery_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
grocery_df.rename(columns={'Num_establishments':'Num_grocery'},inplace=True)
wholesale_df = df_business_12.loc[wholesale_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
wholesale_df.rename(columns={'Num_establishments':'Num_wholesale'},inplace=True)
# merge three df's together
df_food_12 = wholesale_df.merge(restaraunt_df, how='outer',on='FIPS').merge(grocery_df, on='FIPS', how='outer')

# 2013
restaraunt_df = df_business_13.loc[restaraunt_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
restaraunt_df.rename(columns={'Num_establishments':'Num_restaraunts'},inplace=True)
grocery_df = df_business_13.loc[grocery_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
grocery_df.rename(columns={'Num_establishments':'Num_grocery'},inplace=True)
wholesale_df = df_business_13.loc[wholesale_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
wholesale_df.rename(columns={'Num_establishments':'Num_wholesale'},inplace=True)
# merge three df's together
df_food_13 = wholesale_df.merge(restaraunt_df, how='outer',on='FIPS').merge(grocery_df, on='FIPS', how='outer')

# 2014
restaraunt_df = df_business_14.loc[restaraunt_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
restaraunt_df.rename(columns={'Num_establishments':'Num_restaraunts'},inplace=True)
grocery_df = df_business_14.loc[grocery_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
grocery_df.rename(columns={'Num_establishments':'Num_grocery'},inplace=True)
wholesale_df = df_business_14.loc[wholesale_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
wholesale_df.rename(columns={'Num_establishments':'Num_wholesale'},inplace=True)
# merge three df's together
df_food_14 = wholesale_df.merge(restaraunt_df, how='outer',on='FIPS').merge(grocery_df, on='FIPS', how='outer')

# 2015
restaraunt_df = df_business_15.loc[restaraunt_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
restaraunt_df.rename(columns={'Num_establishments':'Num_restaraunts'},inplace=True)
grocery_df = df_business_15.loc[grocery_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
grocery_df.rename(columns={'Num_establishments':'Num_grocery'},inplace=True)
wholesale_df = df_business_15.loc[wholesale_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
wholesale_df.rename(columns={'Num_establishments':'Num_wholesale'},inplace=True)
# merge three df's together
df_food_15 = wholesale_df.merge(restaraunt_df, how='outer',on='FIPS').merge(grocery_df, on='FIPS', how='outer')

# 2016
restaraunt_df = df_business_16.loc[restaraunt_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
restaraunt_df.rename(columns={'Num_establishments':'Num_restaraunts'},inplace=True)
grocery_df = df_business_16.loc[grocery_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
grocery_df.rename(columns={'Num_establishments':'Num_grocery'},inplace=True)
wholesale_df = df_business_16.loc[wholesale_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
wholesale_df.rename(columns={'Num_establishments':'Num_wholesale'},inplace=True)
# merge three df's together
df_food_16 = wholesale_df.merge(restaraunt_df, how='outer',on='FIPS').merge(grocery_df, on='FIPS', how='outer')

# 2017
restaraunt_df = df_business_17.loc[restaraunt_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
restaraunt_df.rename(columns={'Num_establishments':'Num_restaraunts'},inplace=True)
grocery_df = df_business_17.loc[grocery_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
grocery_df.rename(columns={'Num_establishments':'Num_grocery'},inplace=True)
wholesale_df = df_business_17.loc[wholesale_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
wholesale_df.rename(columns={'Num_establishments':'Num_wholesale'},inplace=True)
# merge three df's together
df_food_17 = wholesale_df.merge(restaraunt_df, how='outer',on='FIPS').merge(grocery_df, on='FIPS', how='outer')

# 2018
restaraunt_df = df_business_18.loc[restaraunt_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
restaraunt_df.rename(columns={'Num_establishments':'Num_restaraunts'},inplace=True)
grocery_df = df_business_18.loc[grocery_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
grocery_df.rename(columns={'Num_establishments':'Num_grocery'},inplace=True)
wholesale_df = df_business_18.loc[wholesale_indices,['FIPS', 'Num_establishments']].groupby('FIPS').sum().reset_index()
wholesale_df.rename(columns={'Num_establishments':'Num_wholesale'},inplace=True)
# merge three df's together
df_food_18 = wholesale_df.merge(restaraunt_df, how='outer',on='FIPS').merge(grocery_df, on='FIPS', how='outer')



### Add Year column to each df and impute nulls with zero

In [144]:
df_food_09['Year'] = '2009'
df_food_09.fillna(0, inplace=True)
df_food_10['Year'] = '2010'
df_food_10.fillna(0, inplace=True)
df_food_11['Year'] = '2011'
df_food_11.fillna(0, inplace=True)
df_food_12['Year'] = '2012'
df_food_12.fillna(0, inplace=True)
df_food_13['Year'] = '2013'
df_food_13.fillna(0, inplace=True)
df_food_14['Year'] = '2014'
df_food_14.fillna(0, inplace=True)
df_food_15['Year'] = '2015'
df_food_15.fillna(0, inplace=True)
df_food_16['Year'] = '2016'
df_food_16.fillna(0, inplace=True)
df_food_17['Year'] = '2017'
df_food_17.fillna(0, inplace=True)
df_food_18['Year'] = '2018'
df_food_18.fillna(0, inplace=True)


### Concatenate all df's to create master dataframe of all years

In [145]:
df_food = pd.concat([df_food_09,df_food_10,df_food_11,df_food_12,df_food_13,df_food_14,df_food_15,
          df_food_16,df_food_17,df_food_18])
df_food

Unnamed: 0,FIPS,Num_wholesale,Num_restaraunts,Num_grocery,Year
0,01001,71.0,13.0,109.0,2009
1,01005,5.0,7.0,109.0,2009
2,01007,3.0,21.0,92.0,2009
3,01009,19.0,32.0,186.0,2009
4,01013,13.0,106.0,29.0,2009
...,...,...,...,...,...
3105,29211,0.0,0.0,18.0,2018
3106,46121,0.0,0.0,12.0,2018
3107,48155,0.0,0.0,3.0,2018
3108,51045,0.0,0.0,14.0,2018


# Merge together all dataframes

In [146]:
# all: df_rent,df_houseless, df_demographics, df_unemployment, df_FA, df_food
merge1 = df_rent.merge(df_houseless, on=['FIPS','Year'],how='outer')


In [147]:
merge2 = merge1.merge(df_demographics, on=['FIPS','Year'],how='outer')


In [148]:
merge3 = merge2.merge(df_unemployment, on=['FIPS','Year','FIPS_state','FIPS_county'],how='outer')


In [149]:
merge4 = merge3.merge(df_FA, on=['FIPS','Year'],how='outer')


In [150]:
df_all = merge4.merge(df_food, on=['FIPS','Year'],how='outer')

In [154]:
df_all[df_all.Year=='2010']

Unnamed: 0,FIPS,Rent,lat,lng,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,FIPS_state,FIPS_county,State,County,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,State/County,Total_workforce,Employed,Unemployed,Unemployment_rate,FI Rate,Number Food Insecure Individuals,Low Threshold Type,High Threshold Type,Weighted Annual Dollars,Cost Per Meal,Child FI Rate,Num_wholesale,Num_restaraunts,Num_grocery
64479,02020,,,,2010,AK-500,0.004211,0.003807,0.000404,02,020,Alaska,Anchorage Municipality,All Ages,291826.0,148209.0,143617.0,198531.0,17125.0,23875.0,24034.0,6097.0,"Anchorage Borough/municipality, AK",157923,147539,10384,6.6,0.123,35090.0,SNAP,Other Nutrition Program,15945680.0,2.64,0.17,279.0,457.0,3566.0
64480,02020,,,,2010,AK-500,0.004211,0.003807,0.000404,02,020,Alaska,Anchorage Municipality,Age 0 to 4 years,21961.0,11349.0,10612.0,12199.0,1347.0,1942.0,1765.0,747.0,"Anchorage Borough/municipality, AK",157923,147539,10384,6.6,0.123,35090.0,SNAP,Other Nutrition Program,15945680.0,2.64,0.17,279.0,457.0,3566.0
64481,02020,,,,2010,AK-500,0.004211,0.003807,0.000404,02,020,Alaska,Anchorage Municipality,Age 5 to 9 years,20618.0,10542.0,10076.0,11423.0,1338.0,1880.0,1936.0,696.0,"Anchorage Borough/municipality, AK",157923,147539,10384,6.6,0.123,35090.0,SNAP,Other Nutrition Program,15945680.0,2.64,0.17,279.0,457.0,3566.0
64482,02020,,,,2010,AK-500,0.004211,0.003807,0.000404,02,020,Alaska,Anchorage Municipality,Age 10 to 14 years,20443.0,10407.0,10036.0,11668.0,1350.0,1923.0,2013.0,628.0,"Anchorage Borough/municipality, AK",157923,147539,10384,6.6,0.123,35090.0,SNAP,Other Nutrition Program,15945680.0,2.64,0.17,279.0,457.0,3566.0
64483,02020,,,,2010,AK-500,0.004211,0.003807,0.000404,02,020,Alaska,Anchorage Municipality,Age 15 to 19 years,21187.0,10990.0,10197.0,12605.0,1416.0,2099.0,2096.0,624.0,"Anchorage Borough/municipality, AK",157923,147539,10384,6.6,0.123,35090.0,SNAP,Other Nutrition Program,15945680.0,2.64,0.17,279.0,457.0,3566.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950013,26999,,,,2010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,39.0,0.0
950014,02280,,,,2010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,37.0
950015,02999,,,,2010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,59.0
950016,04999,,,,2010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,38.0


In [152]:
df_all.to_csv('cleaned_data.csv')