# Data Cleaning Part 2: Handle Missing Data

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import os
# pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Import cleaned dataset produced from first cleaning notebook

In [139]:
# read the csv written by the first cleaning notebook, using its 'Unnamed: 0' column as the row index
CLEANED_DATA_PATH = '../datasets/cleaned_data.csv'
df = pd.read_csv(CLEANED_DATA_PATH, index_col='Unnamed: 0')


### Drop unnecessary features

In [145]:
# keep the "All Ages" rows followed by the rows that have no age-group value at all,
# then remove the features that are not needed downstream
all_ages_rows = df[df.AGEGRP == 'All Ages']
no_age_rows = df[df.AGEGRP.isnull()]
unneeded_cols = ['lat', 'lng', 'AGEGRP', 'Number Food Insecure Individuals', 'Child FI Rate']
df = pd.concat([all_ages_rows, no_age_rows]).drop(columns=unneeded_cols)
df

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,FIPS_state,FIPS_county,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,State/County,Total_workforce,Employed,Unemployed,Unemployment_rate,FI Rate,Low Threshold Type,High Threshold Type,Weighted Annual Dollars,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery
0,1073,1019.995960,2014,AL-500,0.001396,0.000961,0.000435,1.0,73.0,Alabama,Jefferson County,659972.0,312491.0,347481.0,355635.0,284082.0,2029.0,10378.0,347.0,"Jefferson County, AL",312131,292505,19626,6.3,0.197,SNAP,Other Nutrition Program,67209000.0,2.93,483.0,2693.0,400.0
19,1117,1229.755051,2014,AL-500,0.001396,0.000961,0.000435,1.0,117.0,Alabama,Shelby County,206280.0,100304.0,105976.0,174094.0,24247.0,805.0,4403.0,101.0,"Shelby County, AL",107208,102400,4808,4.5,0.105,SNAP,Other Nutrition Program,12614000.0,3.37,1.0,743.0,2706.0
38,4003,1051.250000,2014,AZ-500,0.001469,0.000826,0.000643,4.0,3.0,Arizona,Cochise County,127314.0,64661.0,62653.0,112238.0,5737.0,2165.0,2757.0,510.0,"Cochise County, AZ",50969,46682,4287,8.4,0.161,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",10417000.0,2.81,800.0,72.0,340.0
57,4013,1095.670228,2014,AZ-502,0.001318,0.001084,0.000235,4.0,13.0,Arizona,Maricopa County,4093648.0,2024659.0,2068989.0,3449404.0,235660.0,112383.0,172425.0,11190.0,"Maricopa County, AZ",1.962e+06,1.84812e+06,113878,5.8,0.158,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",319193000.0,2.90,2389.0,16857.0,6320.0
76,4019,928.546429,2014,AZ-501,0.002170,0.001705,0.000465,4.0,19.0,Arizona,Pima County,1004229.0,494684.0,509545.0,858334.0,41043.0,42683.0,31905.0,2266.0,"Pima County, AZ",463126,435183,27943,6,0.154,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",77182000.0,2.85,3591.0,838.0,1604.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950228,49999,,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,0.0,0.0
950229,51999,,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,16.0,0.0,0.0
950230,53999,,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,16.0,0.0,0.0
950231,54999,,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,0.0,0.0


# Feeding America Dataset: Impute Missing Values
The Feeding America datasets on food insecurity are missing county-level data for the years 2011, 2012, and 2013. I will use the 2010 and 2014 values to calculate the average yearly change for each county and fill in the missing years by linear interpolation.
### Isolate rows from main df with missing values for Feeding America data

In [146]:
# one slice per year (2010 through 2014), restricted to FIPS plus the three
# Feeding America features that need to be filled in
fi_cols = ['FIPS', 'FI Rate', 'Cost Per Meal', 'Weighted Annual Dollars']
fi_10, fi_11, fi_12, fi_13, fi_14 = (
    df.loc[df.Year == yr, fi_cols] for yr in range(2010, 2015)
)
fi_13

Unnamed: 0,FIPS,FI Rate,Cost Per Meal,Weighted Annual Dollars
364603,2020,,,
364622,2013,,,
364641,2016,,,
364660,2050,,,
364679,2060,,,
...,...,...,...,...
950078,27999,,,
950079,29999,,,
950080,2270,,,
950081,16999,,,


### Merge together 2010 and 2014 dataframes

In [147]:
# pair each county's 2010 values with its 2014 values; the merge suffixes (_x = 2010, _y = 2014)
# are replaced with year-tagged column names so the columns are easier to read
year_tagged_names = {
    'FI Rate_x': 'FI Rate 10',
    'Cost Per Meal_x': 'Meal cost 10',
    'FI Rate_y': 'FI Rate 14',
    'Cost Per Meal_y': 'Meal cost 14',
    'Weighted Annual Dollars_x': 'Dollars 10',
    'Weighted Annual Dollars_y': 'Dollars 14',
}
fi_10_and_14 = pd.merge(fi_10, fi_14, on='FIPS', how='inner').rename(columns=year_tagged_names)
fi_10_and_14

Unnamed: 0,FIPS,FI Rate 10,Meal cost 10,Dollars 10,FI Rate 14,Meal cost 14,Dollars 14
0,2020,0.123,2.64,15945680.0,0.127,3.30,22075000.0
1,2020,0.123,2.64,15945680.0,0.127,3.30,22075000.0
2,2020,0.123,2.64,15945680.0,0.127,3.30,22075000.0
3,2013,0.145,2.69,250040.0,0.143,4.13,343000.0
4,2013,0.145,2.69,250040.0,0.143,4.13,343000.0
...,...,...,...,...,...,...,...
10165,25999,,,,,,
10166,26999,,,,,,
10167,2999,,,,,,
10168,4999,,,,,,


### Impute missing values for each year
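Find the difference between the 2014 and 2010 values for each feature, then add one quarter of that difference cumulatively for each missing year, so that 2011 = 2010 + 1/4 of the difference, 2012 = 2010 + 2/4, and 2013 = 2010 + 3/4 (linear interpolation between 2010 and 2014).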

In [148]:
# calculate the 2010 -> 2014 change for each feature of interest
fi_10_and_14['Rate_diff'] = fi_10_and_14['FI Rate 14'] - fi_10_and_14['FI Rate 10']
fi_10_and_14['Meal_cost_dif'] = fi_10_and_14['Meal cost 14'] - fi_10_and_14['Meal cost 10']
fi_10_and_14['Dollar_diff'] = fi_10_and_14['Dollars 14'] - fi_10_and_14['Dollars 10']

# Fill in 2011, 2012 and 2013 for each feature by linear interpolation: every year is the
# previous year's value plus one quarter of the four-year change.
# The 'Dollars' feature previously added the quarter-step to the 2010 value for all three
# years, which gave 2011, 2012 and 2013 the same number; it now chains year to year like
# the other two features.
interpolated_features = [
    ('FI Rate', 'Rate_diff'),
    ('Meal cost', 'Meal_cost_dif'),
    ('Dollars', 'Dollar_diff'),
]
for feature, diff_col in interpolated_features:
    yearly_step = fi_10_and_14[diff_col] / 4
    for prev_yr, yr in [(10, 11), (11, 12), (12, 13)]:
        prev_col = '{} {}'.format(feature, prev_yr)
        new_col = '{} {}'.format(feature, yr)
        fi_10_and_14[new_col] = fi_10_and_14[prev_col] + yearly_step

# the 2010/2014 merge repeats identical rows for the same FIPS; keep a single copy of each
fi_10_and_14.drop_duplicates(inplace=True)


### Create new df for each year with imputed values

In [149]:
# 2011 rows with the original Feeding America columns replaced by the imputed 2011 values.
# merge() defaults to an inner join, so only rows whose FIPS appears in fi_10_and_14 are kept.
imputed_11 = fi_10_and_14[['FIPS', 'FI Rate 11', 'Meal cost 11', 'Dollars 11']]
df_11 = (
    df[df.Year == 2011]
    .merge(imputed_11, on='FIPS')
    .drop(columns=['FI Rate', 'Weighted Annual Dollars', 'Cost Per Meal'])
    .rename(columns={
        'FI Rate 11': 'FI Rate',
        'Meal cost 11': 'Cost Per Meal',
        'Dollars 11': 'Weighted Annual Dollars',
    })
)


In [150]:
# 2012 rows with the original Feeding America columns replaced by the imputed 2012 values.
# merge() defaults to an inner join, so only rows whose FIPS appears in fi_10_and_14 are kept.
imputed_12 = fi_10_and_14[['FIPS', 'FI Rate 12', 'Meal cost 12', 'Dollars 12']]
df_12 = (
    df[df.Year == 2012]
    .merge(imputed_12, on='FIPS')
    .drop(columns=['FI Rate', 'Weighted Annual Dollars', 'Cost Per Meal'])
    .rename(columns={
        'FI Rate 12': 'FI Rate',
        'Meal cost 12': 'Cost Per Meal',
        'Dollars 12': 'Weighted Annual Dollars',
    })
)


In [151]:
# 2013 rows with the original Feeding America columns replaced by the imputed 2013 values.
# merge() defaults to an inner join, so only rows whose FIPS appears in fi_10_and_14 are kept.
imputed_13 = fi_10_and_14[['FIPS', 'FI Rate 13', 'Meal cost 13', 'Dollars 13']]
df_13 = (
    df[df.Year == 2013]
    .merge(imputed_13, on='FIPS')
    .drop(columns=['FI Rate', 'Weighted Annual Dollars', 'Cost Per Meal'])
    .rename(columns={
        'FI Rate 13': 'FI Rate',
        'Meal cost 13': 'Cost Per Meal',
        'Dollars 13': 'Weighted Annual Dollars',
    })
)


### Concatenate new dfs back with main df

In [152]:
# remove the original 2011-2013 rows from the main df (dropped by index label) ...
imputed_years = [2011, 2012, 2013]
stale_labels = df.index[df.Year.isin(imputed_years)]
df_trimmed = df.drop(index=stale_labels)

# ... and append the per-year frames that carry the imputed values
df = pd.concat([df_trimmed, df_11, df_12, df_13])

In [153]:
# look at the 2009 rows of the recombined frame
is_2009 = df.Year == 2009
df.loc[is_2009]

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,FIPS_state,FIPS_county,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,State/County,Total_workforce,Employed,Unemployed,Unemployment_rate,FI Rate,Low Threshold Type,High Threshold Type,Weighted Annual Dollars,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery
916515,1001,,2009,,,,,1.0,1.0,,,,,,,,,,,"Autauga County, AL",24703,22301,2402,9.7,0.143,SNAP,other nutrition pgm,3085290.0,2.60096,71.0,13.0,109.0
916516,1003,,2009,,,,,1.0,3.0,,,,,,,,,,,"Baldwin County, AL",82451,74403,8048,9.8,0.142,SNAP,other nutrition pgm,10884090.0,2.66446,0.0,588.0,627.0
916517,1005,,2009,,,,,1.0,5.0,,,,,,,,,,,"Barbour County, AL",10003,8572,1431,14.3,0.236,SNAP,other nutrition pgm,3245490.0,2.77114,5.0,7.0,109.0
916518,1007,,2009,,,,,1.0,7.0,,,,,,,,,,,"Bibb County, AL",8742,7581,1161,13.3,0.178,SNAP,other nutrition pgm,1658350.0,2.59842,3.0,21.0,92.0
916519,1009,,2009,,,,,1.0,9.0,,,,,,,,,,,"Blount County, AL",26480,23832,2648,10,0.137,SNAP,other nutrition pgm,3305260.0,2.55270,19.0,32.0,186.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
949991,17999,,2009,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,12.0,122.0
949992,18999,,2009,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,11.0,0.0
949993,23999,,2009,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54.0,0.0
949994,6999,,2009,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,418.0
