# Data Cleaning Part Two: Missing Data
## Projecting US Food Insecurity in 2020
### By Khyatee Desai

In [2]:
# import necessary libraries
import pandas as pd
import numpy as np
import os
# pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
import warnings
import pickle
# warnings.filterwarnings('ignore')

### Import dataset produced from first cleaning notebook

In [3]:
with open('../pickled/partially_cleaned_data.pickle', "rb") as input_file:
    df = pickle.load(input_file) 
    
# retain "all ages" rows and observations that dont have age data, then drop un needed features
# df = pd.concat([df[df.AGEGRP=='All Ages'],df[df.AGEGRP.isnull()]]).drop(['lat', 'lng','AGEGRP',
#                                 'Number Food Insecure Individuals','Child FI Rate'], axis=1)
 

In [4]:
df

Unnamed: 0,FIPS,Rent,lat,lng,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,FIPS_state,FIPS_county,State,County,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,State/County,Total_workforce,Employed,Unemployed,Unemployment_rate,FI Rate,Number Food Insecure Individuals,Low Threshold Type,High Threshold Type,Weighted Annual Dollars,Cost Per Meal,Child FI Rate,Num_wholesale,Num_restaraunts,Num_grocery
0,01073,1019.99596,33.508132,-86.754922,2014,AL-500,0.001396,0.000961,0.000435,01,073,Alabama,Jefferson County,All Ages,659972.0,312491.0,347481.0,355635.0,284082.0,2029.0,10378.0,347.0,"Jefferson County, AL",312131,292505,19626,6.3,0.197,129800.0,SNAP,Other Nutrition Program,67209000.0,2.93,0.232,483.0,2693.0,400.0
1,01073,1019.99596,33.508132,-86.754922,2014,AL-500,0.001396,0.000961,0.000435,01,073,Alabama,Jefferson County,Age 0 to 4 years,43043.0,22156.0,20887.0,21115.0,19752.0,73.0,675.0,25.0,"Jefferson County, AL",312131,292505,19626,6.3,0.197,129800.0,SNAP,Other Nutrition Program,67209000.0,2.93,0.232,483.0,2693.0,400.0
2,01073,1019.99596,33.508132,-86.754922,2014,AL-500,0.001396,0.000961,0.000435,01,073,Alabama,Jefferson County,Age 5 to 9 years,42888.0,21829.0,21059.0,20147.0,20880.0,168.0,685.0,24.0,"Jefferson County, AL",312131,292505,19626,6.3,0.197,129800.0,SNAP,Other Nutrition Program,67209000.0,2.93,0.232,483.0,2693.0,400.0
3,01073,1019.99596,33.508132,-86.754922,2014,AL-500,0.001396,0.000961,0.000435,01,073,Alabama,Jefferson County,Age 10 to 14 years,41638.0,20943.0,20695.0,19345.0,20716.0,128.0,652.0,25.0,"Jefferson County, AL",312131,292505,19626,6.3,0.197,129800.0,SNAP,Other Nutrition Program,67209000.0,2.93,0.232,483.0,2693.0,400.0
4,01073,1019.99596,33.508132,-86.754922,2014,AL-500,0.001396,0.000961,0.000435,01,073,Alabama,Jefferson County,Age 15 to 19 years,41267.0,20820.0,20447.0,19913.0,19941.0,123.0,624.0,26.0,"Jefferson County, AL",312131,292505,19626,6.3,0.197,129800.0,SNAP,Other Nutrition Program,67209000.0,2.93,0.232,483.0,2693.0,400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950228,49999,,,,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,0.0,0.0
950229,51999,,,,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16.0,0.0,0.0
950230,53999,,,,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16.0,0.0,0.0
950231,54999,,,,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,0.0,0.0


# Unemployment Dataset: Fix Datatypes
Unemployment data was saved as a string due to special characters. Need to remove, convert to float, then add back into main df
### remove special characters '-' and change numeric datatypes to floats

In [55]:
# Total Workforce Column
df_workf = df[['FIPS','Year','Total_workforce']]
df_workf = df_workf.dropna()
df_workf.drop(df_workf[df_workf['Total_workforce'].values == '–'].index,inplace=True)
df_workf = df_workf['Total_workforce'].astype(float)

# Employed column
df_employed = df[['FIPS','Year','Employed']]
df_employed = df_employed.dropna()
df_employed.drop(df_employed[df_employed['Employed'].values == '–'].index,inplace=True)
df_employed = df_employed['Employed'].astype(float)

# Unemployed column
df_unemployed = df[['FIPS','Year','Unemployed']]
df_unemployed = df_unemployed.dropna()
df_unemployed.drop(df_unemployed[df_unemployed['Unemployed'].values == '–'].index,inplace=True)
df_unemployed = df_unemployed['Unemployed'].astype(float)

# Unemployment Rate column
df_unemploy_rate = df[['FIPS','Year','Unemployment_rate']]
df_unemploy_rate = df_unemploy_rate.dropna()
df_unemploy_rate.drop(df_unemploy_rate[df_unemploy_rate['Unemployment_rate'].values == '–'].index,inplace=True)
df_unemploy_rate = df_unemploy_rate['Unemployment_rate'].astype(float)


### Drop old version from main df, and add float versions back in

In [56]:
df = df.drop(['Total_workforce','Employed','Unemployed','Unemployment_rate'],axis=1)
df = pd.concat([df,df_workf,df_employed,df_unemployed,df_unemploy_rate],axis=1)

In [57]:
df.dtypes

FIPS                        object
Rent                       float64
Year                        object
coc_number                  object
Houseless_rate             float64
Sheltered_rate             float64
Unsheltered_rate           float64
FIPS_state                  object
FIPS_county                 object
State                       object
County                      object
TOT_POP                    float64
TOT_MALE                   float64
TOT_FEMALE                 float64
TOT_WHITE                  float64
TOT_BLACK                  float64
TOT_NATIVE                 float64
TOT_ASIAN                  float64
TOT_PACIFIC                float64
State/County                object
FI Rate                    float64
Low Threshold Type          object
High Threshold Type         object
Weighted Annual Dollars    float64
Cost Per Meal              float64
Num_wholesale              float64
Num_restaraunts            float64
Num_grocery                float64
Total_workforce     

# Feeding America Dataset: Impute Missing Values
The Feeding America datasets on food insecurity are missing county-level data for the years 2011, 2012, and 2013. The following process uses data from 2010 and 2014 to calculate average yearly change, and fill in the missing values.
### Isolate rows from main df with missing values for Feeding America data

In [58]:
# create separate dataframe for each year, including features that need to be added
fi_10 = df[df.Year=='2010'][['FIPS','FI Rate','Cost Per Meal','Weighted Annual Dollars']]
fi_11 = df[df.Year=='2011'][['FIPS','FI Rate','Cost Per Meal','Weighted Annual Dollars']]
fi_12 = df[df.Year=='2012'][['FIPS','FI Rate','Cost Per Meal','Weighted Annual Dollars']]
fi_13 = df[df.Year=='2013'][['FIPS','FI Rate','Cost Per Meal','Weighted Annual Dollars']]
fi_14 = df[df.Year=='2014'][['FIPS','FI Rate','Cost Per Meal','Weighted Annual Dollars']]


### Merge together 2010 and 2014 dataframes

In [59]:
# merge and rename columns for interpretability
fi_10_and_14 = fi_10.merge(fi_14, on='FIPS',how='inner')
fi_10_and_14.rename(columns={'FI Rate_x':'FI Rate 10', 'Cost Per Meal_x': 'Meal cost 10', 
                             'FI Rate_y':'FI Rate 14', 'Cost Per Meal_y': 'Meal cost 14',
                            'Weighted Annual Dollars_x':'Dollars 10','Weighted Annual Dollars_y':'Dollars 14'}, inplace=True)
fi_10_and_14

Unnamed: 0,FIPS,FI Rate 10,Meal cost 10,Dollars 10,FI Rate 14,Meal cost 14,Dollars 14
0,02020,0.123,2.64,15945680.0,0.127,3.30,22075000.0
1,02020,0.123,2.64,15945680.0,0.127,3.30,22075000.0
2,02020,0.123,2.64,15945680.0,0.127,3.30,22075000.0
3,02013,0.145,2.69,250040.0,0.143,4.13,343000.0
4,02013,0.145,2.69,250040.0,0.143,4.13,343000.0
...,...,...,...,...,...,...,...
10165,25999,,,,,,
10166,26999,,,,,,
10167,02999,,,,,,
10168,04999,,,,,,


### Impute missing values for each year
Find difference between 2014 and 2010, and add 1/4 of the difference consecutively for each year with missing data

In [60]:
# calculate difference between 2014 and 2010 for each feature of interest
fi_10_and_14['Rate_diff'] = fi_10_and_14['FI Rate 14'] - fi_10_and_14['FI Rate 10']
fi_10_and_14['Meal_cost_dif'] = fi_10_and_14['Meal cost 14'] - fi_10_and_14['Meal cost 10']
fi_10_and_14['Dollar_diff'] = fi_10_and_14['Dollars 14'] - fi_10_and_14['Dollars 10']

# fill in the missing values for each feature and each of the three years
fi_10_and_14['FI Rate 11'] = fi_10_and_14['FI Rate 10'] + fi_10_and_14['Rate_diff']/4
fi_10_and_14['FI Rate 12'] = fi_10_and_14['FI Rate 11'] + fi_10_and_14['Rate_diff']/4
fi_10_and_14['FI Rate 13'] = fi_10_and_14['FI Rate 12'] + fi_10_and_14['Rate_diff']/4

fi_10_and_14['Meal cost 11'] = fi_10_and_14['Meal cost 10'] + fi_10_and_14['Meal_cost_dif']/4
fi_10_and_14['Meal cost 12'] = fi_10_and_14['Meal cost 11'] + fi_10_and_14['Meal_cost_dif']/4
fi_10_and_14['Meal cost 13'] = fi_10_and_14['Meal cost 12'] + fi_10_and_14['Meal_cost_dif']/4

fi_10_and_14['Dollars 11'] = fi_10_and_14['Dollars 10'] + fi_10_and_14['Dollar_diff']/4
fi_10_and_14['Dollars 12'] = fi_10_and_14['Dollars 10'] + fi_10_and_14['Dollar_diff']/4
fi_10_and_14['Dollars 13'] = fi_10_and_14['Dollars 10'] + fi_10_and_14['Dollar_diff']/4

fi_10_and_14.drop_duplicates(inplace=True)


### Create new df for each year with imputed values

In [61]:
df_11 = df[df.Year=='2011']
df_11 = df_11.merge(fi_10_and_14[['FIPS','FI Rate 11','Meal cost 11','Dollars 11']], on='FIPS')
df_11.drop(['FI Rate','Weighted Annual Dollars','Cost Per Meal'],axis=1,inplace=True)
df_11.rename(columns={'FI Rate 11':'FI Rate','Meal cost 11':'Cost Per Meal','Dollars 11':'Weighted Annual Dollars'},inplace=True)


In [62]:
df_12 = df[df.Year=='2012']
df_12 = df_12.merge(fi_10_and_14[['FIPS','FI Rate 12','Meal cost 12','Dollars 12']], on='FIPS')
df_12.drop(['FI Rate','Weighted Annual Dollars','Cost Per Meal'],axis=1,inplace=True)
df_12.rename(columns={'FI Rate 12':'FI Rate','Meal cost 12':'Cost Per Meal','Dollars 12':'Weighted Annual Dollars'},inplace=True)


In [63]:
df_13 = df[df.Year=='2013']
df_13 = df_13.merge(fi_10_and_14[['FIPS','FI Rate 13','Meal cost 13','Dollars 13']], on='FIPS')
df_13.drop(['FI Rate','Weighted Annual Dollars','Cost Per Meal'],axis=1,inplace=True)
df_13.rename(columns={'FI Rate 13':'FI Rate','Meal cost 13':'Cost Per Meal','Dollars 13':'Weighted Annual Dollars'},inplace=True)


### Concatenate new dfs back with main df

In [64]:
# drop older missing data from main df
df_trimmed = df.drop(df[df.Year.isin(['2011', '2012','2013'])].index)

# add new data
df = pd.concat([df_trimmed, df_11, df_12, df_13])
df

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,FIPS_state,FIPS_county,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,State/County,FI Rate,Low Threshold Type,High Threshold Type,Weighted Annual Dollars,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate
0,01073,1019.995960,2014,AL-500,0.001396,0.000961,0.000435,01,073,Alabama,Jefferson County,659972.0,312491.0,347481.0,355635.0,284082.0,2029.0,10378.0,347.0,"Jefferson County, AL",0.19700,SNAP,Other Nutrition Program,67209000.0,2.930,483.0,2693.0,400.0,312131.0,292505.0,19626.0,6.3
19,01117,1229.755051,2014,AL-500,0.001396,0.000961,0.000435,01,117,Alabama,Shelby County,206280.0,100304.0,105976.0,174094.0,24247.0,805.0,4403.0,101.0,"Shelby County, AL",0.10500,SNAP,Other Nutrition Program,12614000.0,3.370,1.0,743.0,2706.0,107208.0,102400.0,4808.0,4.5
38,04003,1051.250000,2014,AZ-500,0.001469,0.000826,0.000643,04,003,Arizona,Cochise County,127314.0,64661.0,62653.0,112238.0,5737.0,2165.0,2757.0,510.0,"Cochise County, AZ",0.16100,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",10417000.0,2.810,800.0,72.0,340.0,50969.0,46682.0,4287.0,8.4
57,04013,1095.670228,2014,AZ-502,0.001318,0.001084,0.000235,04,013,Arizona,Maricopa County,4093648.0,2024659.0,2068989.0,3449404.0,235660.0,112383.0,172425.0,11190.0,"Maricopa County, AZ",0.15800,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",319193000.0,2.900,2389.0,16857.0,6320.0,1961997.0,1848119.0,113878.0,5.8
76,04019,928.546429,2014,AZ-501,0.002170,0.001705,0.000465,04,019,Arizona,Pima County,1004229.0,494684.0,509545.0,858334.0,41043.0,42683.0,31905.0,2266.0,"Pima County, AZ",0.15400,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",77182000.0,2.850,3591.0,838.0,1604.0,463126.0,435183.0,27943.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3292,22999,,2013,,,,,,,,,,,,,,,,,,,,,,,0.0,131.0,0.0,,,,
3293,25999,,2013,,,,,,,,,,,,,,,,,,,,,,,0.0,104.0,3.0,,,,
3294,02270,,2013,,,,,,,,,,,,,,,,,,0.26875,,,380250.0,3.105,0.0,0.0,96.0,,,,
3295,16999,,2013,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,51.0,,,,


In [65]:
df.Year.value_counts()

2020    32502
2019    13129
2010     9709
2018     3328
2017     3317
2014     3308
2016     3308
2015     3306
2012     3298
2013     3297
2011     3296
2009     3239
Name: Year, dtype: int64

# Remove Duplicates per County
Some FIPS (counties) are present multiple times with slightly varying observations (likely taken from different sources.) The following process groups the dataframe by FIPS, while taking the mean of the observations, so that there is one row per county.

In [66]:
# Check value counts per FIPS code, indicating there are multiple entries in some cases
df[df.Year=='2010'].FIPS.value_counts()

17031    12
17197    12
06037    12
25017    12
25009     9
         ..
72149     1
72001     1
16999     1
24999     1
72085     1
Name: FIPS, Length: 3245, dtype: int64

In [67]:
# perform groupby's on the years containing duplicates
no_dups_10 = df[df.Year=='2010'].groupby('FIPS').mean().reset_index()
no_dups_19 = df[df.Year=='2019'].groupby('FIPS').mean().reset_index()
no_dups_20 = df[df.Year=='2020'].groupby('FIPS').mean().reset_index()


In [70]:
# re-add in the columns that were removed during groupby operation
no_dups_10 = no_dups_10.merge(df[df.Year=='2010'][['Year','FIPS','Low Threshold Type','High Threshold Type', 'State',
                                                   'coc_number','County','State/County']],on='FIPS')
no_dups_19 = no_dups_19.merge(df[df.Year=='2019'][['Year','FIPS','Low Threshold Type','High Threshold Type', 'State',
                                         'coc_number','County','State/County']],on='FIPS')
no_dups_20 = no_dups_20.merge(df[df.Year=='2020'][['Year','FIPS','Low Threshold Type','High Threshold Type', 'State',
                                         'coc_number','County','State/County']],on='FIPS')


In [71]:
# drop older data from main df, which contained duplicates
df_trimmed = df.drop(df[df.Year.isin(['2010', '2019','2020'])].index)

# add newly grouped data with no duplicates back into main df
ndf = pd.concat([df_trimmed, no_dups_10, no_dups_19, no_dups_20])
ndf

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,FIPS_state,FIPS_county,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,State/County,FI Rate,Low Threshold Type,High Threshold Type,Weighted Annual Dollars,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate
0,01073,1019.995960,2014,AL-500,0.001396,0.000961,0.000435,01,073,Alabama,Jefferson County,659972.0,312491.0,347481.0,355635.0,284082.0,2029.0,10378.0,347.0,"Jefferson County, AL",0.197,SNAP,Other Nutrition Program,67209000.0,2.93,483.0,2693.0,400.0,312131.0,292505.000,19626.000,6.30
19,01117,1229.755051,2014,AL-500,0.001396,0.000961,0.000435,01,117,Alabama,Shelby County,206280.0,100304.0,105976.0,174094.0,24247.0,805.0,4403.0,101.0,"Shelby County, AL",0.105,SNAP,Other Nutrition Program,12614000.0,3.37,1.0,743.0,2706.0,107208.0,102400.000,4808.000,4.50
38,04003,1051.250000,2014,AZ-500,0.001469,0.000826,0.000643,04,003,Arizona,Cochise County,127314.0,64661.0,62653.0,112238.0,5737.0,2165.0,2757.0,510.0,"Cochise County, AZ",0.161,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",10417000.0,2.81,800.0,72.0,340.0,50969.0,46682.000,4287.000,8.40
57,04013,1095.670228,2014,AZ-502,0.001318,0.001084,0.000235,04,013,Arizona,Maricopa County,4093648.0,2024659.0,2068989.0,3449404.0,235660.0,112383.0,172425.0,11190.0,"Maricopa County, AZ",0.158,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",319193000.0,2.90,2389.0,16857.0,6320.0,1961997.0,1848119.000,113878.000,5.80
76,04019,928.546429,2014,AZ-501,0.002170,0.001705,0.000465,04,019,Arizona,Pima County,1004229.0,494684.0,509545.0,858334.0,41043.0,42683.0,31905.0,2266.0,"Pima County, AZ",0.154,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",77182000.0,2.85,3591.0,838.0,1604.0,463126.0,435183.000,27943.000,6.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32497,72153,,2020,,,,,,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,,9261.5,8086.875,1174.625,12.55
32498,72153,,2020,,,,,,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,,9261.5,8086.875,1174.625,12.55
32499,72153,,2020,,,,,,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,,9261.5,8086.875,1174.625,12.55
32500,72153,,2020,,,,,,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,,9261.5,8086.875,1174.625,12.55


In [76]:
ndf.Year.value_counts()

2020    32502
2019    13129
2010     9709
2018     3328
2017     3317
2014     3308
2016     3308
2015     3306
2012     3298
2013     3297
2011     3296
2009     3239
Name: Year, dtype: int64

In [77]:

with open('../pickled/fully_cleaned_data.pickle', "wb") as output_file:
    pickle.dump(ndf, output_file)