# Projecting Food Insecurity Rates in the US by County
## Secondary Data Cleaning
The following process imports a cleaned dataset produced from cleaning_pt1.ipynb. This notebook is used for additional cleaning, such as adjusting datatypes, imputing missing data, and removing/averaging duplicate observations.
### Flatiron School Data Science Capstone<br>By Khyatee Desai

In [10]:
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_columns', None)
import warnings
import pickle
# warnings.filterwarnings('ignore')

### Import dataset produced from first cleaning notebook
For the MVP model, the AGEGRP column is dropped to reduce number of observations and data prep involved in incorporating age as a feature

In [14]:
with open('../pickled/partially_cleaned_data.pickle', "rb") as input_file:
    df = pickle.load(input_file) 
    
# retain "all ages" rows and observations that dont have age data, then drop un needed features
df.drop(['FIPS_state', 'FIPS_county'], axis=1, inplace=True)
 

In [15]:
df.sample(5)

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,hh_med_income,pop_disabled,pop_hs_grad,pop_bachelors,pop_grad_degree,pop_priv_health,pop_public_health,pop_no_health,pop_total,percent_hh_poverty,hh_avg_size,pop_65+,hh_no_vehicle,num_hh,pop_non_citizen,hh_SNAP,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,TOT_LATINX,State/County,Total_workforce,Employed,Unemployed,Unemployment_rate,FI Rate,Low Threshold Type,High Threshold Type,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery
25245,48185,,2013,TX-701,0.005209,0.004048,0.001161,43994.0,403.0,6869.0,1361.0,687.0,14017.0,7375.0,5099.0,26663.0,12.7,2.65,3636.0,2602.0,8848.0,1692.0,1302.0,Texas,Grimes County,26792.0,14625.0,12167.0,21686.0,4405.0,227.0,91.0,24.0,6127.0,"Grimes County, TX",11818,11110,708,6.0,,,,,,,
6125,48209,,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Hays County, TX",123724,116770,6954,5.6,,,,,,,
76162,40093,,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Major County, OK",3906,3726,180,4.6,,,,,,,
79206,47185,,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"White County, TN",12455,12012,443,3.6,,,,,,,
66101,21147,,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"McCreary County, KY",4463,4095,368,8.2,,,,,,,


# Unemployment Dataset: Fix Datatypes
Unemployment data was saved as a string due to the presence of special characters "-". The following cell drops these characters, converts the column to a float datatype, then adds it back into the main df
### remove special characters '-' and change numeric datatypes to floats

In [16]:
# Total Workforce Column
df_workf = df[['FIPS','Year','Total_workforce']]
df_workf = df_workf.dropna()
df_workf.drop(df_workf[df_workf['Total_workforce'].values == '–'].index,inplace=True)
df_workf = df_workf['Total_workforce'].astype(float)

# Employed column
df_employed = df[['FIPS','Year','Employed']]
df_employed = df_employed.dropna()
df_employed.drop(df_employed[df_employed['Employed'].values == '–'].index,inplace=True)
df_employed = df_employed['Employed'].astype(float)

# Unemployed column
df_unemployed = df[['FIPS','Year','Unemployed']]
df_unemployed = df_unemployed.dropna()
df_unemployed.drop(df_unemployed[df_unemployed['Unemployed'].values == '–'].index,inplace=True)
df_unemployed = df_unemployed['Unemployed'].astype(float)

# Unemployment Rate column
df_unemploy_rate = df[['FIPS','Year','Unemployment_rate']]
df_unemploy_rate = df_unemploy_rate.dropna()
df_unemploy_rate.drop(df_unemploy_rate[df_unemploy_rate['Unemployment_rate'].values == '–'].index,inplace=True)
df_unemploy_rate = df_unemploy_rate['Unemployment_rate'].astype(float)


### Drop old version from main df, and add float versions back in

In [17]:
df = df.drop(['Total_workforce','Employed','Unemployed','Unemployment_rate'],axis=1)
df = pd.concat([df,df_workf,df_employed,df_unemployed,df_unemploy_rate],axis=1)

In [18]:
df.dtypes

FIPS                    object
Rent                   float64
Year                    object
coc_number              object
Houseless_rate         float64
Sheltered_rate         float64
Unsheltered_rate       float64
hh_med_income          float64
pop_disabled           float64
pop_hs_grad            float64
pop_bachelors          float64
pop_grad_degree        float64
pop_priv_health        float64
pop_public_health      float64
pop_no_health          float64
pop_total              float64
percent_hh_poverty     float64
hh_avg_size            float64
pop_65+                float64
hh_no_vehicle          float64
num_hh                 float64
pop_non_citizen        float64
hh_SNAP                float64
State                   object
County                  object
TOT_POP                float64
TOT_MALE               float64
TOT_FEMALE             float64
TOT_WHITE              float64
TOT_BLACK              float64
TOT_NATIVE             float64
TOT_ASIAN              float64
TOT_PACI

# Feeding America Dataset: Impute Missing Values
The Feeding America datasets on food insecurity only contain data at the State level for the years 2011, 2012, and 2013, however County-level data is needed for mapping, and to have enough observations for modeling. The following process uses data from 2010 and 2014 to calculate average yearly change per county, and then fills in the missing values.
### Isolate rows from main df with missing values for Feeding America data, as well as 2010 and 2014, which will be used to calculate average yearly change

In [19]:
# create separate dataframe for each year, including features that need to be added
fi_10 = df[df.Year=='2010'][['FIPS','FI Rate','Cost Per Meal']]
fi_11 = df[df.Year=='2011'][['FIPS','FI Rate','Cost Per Meal']]
fi_12 = df[df.Year=='2012'][['FIPS','FI Rate','Cost Per Meal']]
fi_13 = df[df.Year=='2013'][['FIPS','FI Rate','Cost Per Meal']]
fi_14 = df[df.Year=='2014'][['FIPS','FI Rate','Cost Per Meal']]


### Merge together 2010 and 2014 dataframes

In [20]:
# merge and rename columns for interpretability
fi_10_and_14 = fi_10.merge(fi_14, on='FIPS',how='inner')
fi_10_and_14.rename(columns={'FI Rate_x':'FI Rate 10', 'Cost Per Meal_x': 'Meal cost 10', 
                             'FI Rate_y':'FI Rate 14', 'Cost Per Meal_y': 'Meal cost 14'}, inplace=True)
fi_10_and_14

Unnamed: 0,FIPS,FI Rate 10,Meal cost 10,FI Rate 14,Meal cost 14
0,02020,0.123,2.64,0.127,3.30
1,02020,0.123,2.64,0.127,3.30
2,02020,0.123,2.64,0.127,3.30
3,02013,0.145,2.69,0.143,4.13
4,02013,0.145,2.69,0.143,4.13
...,...,...,...,...,...
10399,25999,,,,
10400,26999,,,,
10401,02999,,,,
10402,04999,,,,


### Impute missing values for each year
This cell finds the difference between 2014 and 2010, and add 1/4 of the difference consecutively for each year with missing data.

In [21]:
# calculate difference between 2014 and 2010 for each feature of interest
fi_10_and_14['Rate_diff'] = fi_10_and_14['FI Rate 14'] - fi_10_and_14['FI Rate 10']
fi_10_and_14['Meal_cost_dif'] = fi_10_and_14['Meal cost 14'] - fi_10_and_14['Meal cost 10']

# fill in the missing values for each feature and each of the three years
fi_10_and_14['FI Rate 11'] = fi_10_and_14['FI Rate 10'] + fi_10_and_14['Rate_diff']/4
fi_10_and_14['FI Rate 12'] = fi_10_and_14['FI Rate 11'] + fi_10_and_14['Rate_diff']/4
fi_10_and_14['FI Rate 13'] = fi_10_and_14['FI Rate 12'] + fi_10_and_14['Rate_diff']/4

fi_10_and_14['Meal cost 11'] = fi_10_and_14['Meal cost 10'] + fi_10_and_14['Meal_cost_dif']/4
fi_10_and_14['Meal cost 12'] = fi_10_and_14['Meal cost 11'] + fi_10_and_14['Meal_cost_dif']/4
fi_10_and_14['Meal cost 13'] = fi_10_and_14['Meal cost 12'] + fi_10_and_14['Meal_cost_dif']/4

fi_10_and_14.drop_duplicates(inplace=True)


### Create new df for each year with newly created values

In [22]:
df_11 = df[df.Year=='2011']
df_11 = df_11.merge(fi_10_and_14[['FIPS','FI Rate 11','Meal cost 11']], on='FIPS')
df_11.drop(['FI Rate','Cost Per Meal'],axis=1,inplace=True)
df_11.rename(columns={'FI Rate 11':'FI Rate','Meal cost 11':'Cost Per Meal'},inplace=True)


In [23]:
df_12 = df[df.Year=='2012']
df_12 = df_12.merge(fi_10_and_14[['FIPS','FI Rate 12','Meal cost 12']], on='FIPS')
df_12.drop(['FI Rate','Cost Per Meal'],axis=1,inplace=True)
df_12.rename(columns={'FI Rate 12':'FI Rate','Meal cost 12':'Cost Per Meal'},inplace=True)


In [24]:
df_13 = df[df.Year=='2013']
df_13 = df_13.merge(fi_10_and_14[['FIPS','FI Rate 13','Meal cost 13']], on='FIPS')
df_13.drop(['FI Rate','Cost Per Meal'],axis=1,inplace=True)
df_13.rename(columns={'FI Rate 13':'FI Rate','Meal cost 13':'Cost Per Meal'},inplace=True)


### Concatenate new dfs back with main df

In [25]:
# drop older missing data from main df
df_trimmed = df.drop(df[df.Year.isin(['2011', '2012','2013'])].index)

# add new data
df = pd.concat([df_trimmed, df_11, df_12, df_13])
df

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,hh_med_income,pop_disabled,pop_hs_grad,pop_bachelors,pop_grad_degree,pop_priv_health,pop_public_health,pop_no_health,pop_total,percent_hh_poverty,hh_avg_size,pop_65+,hh_no_vehicle,num_hh,pop_non_citizen,hh_SNAP,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,TOT_LATINX,State/County,FI Rate,Low Threshold Type,High Threshold Type,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate
0,01073,1019.995960,2014,AL-500,0.013259,0.009128,0.004130,45239.0,2036.0,117854.0,81626.0,52774.0,431638.0,211570.0,81336.0,658834.0,14.8,2.48,87036.0,93630.0,259397.0,17519.0,39967.0,Alabama,Jefferson County,659972.0,312491.0,347481.0,355635.0,284082.0,2029.0,10378.0,347.0,24099.0,"Jefferson County, AL",0.197,SNAP,Other Nutrition Program,2.93,483.0,2693.0,400.0,312131.0,292505.0,19626.0,6.3
1,01117,1229.755051,2014,AL-500,0.013259,0.009128,0.004130,69723.0,1165.0,28911.0,35773.0,18511.0,159655.0,42429.0,19175.0,201168.0,6.2,2.65,23404.0,19762.0,74790.0,7624.0,4706.0,Alabama,Shelby County,206280.0,100304.0,105976.0,174094.0,24247.0,805.0,4403.0,101.0,11872.0,"Shelby County, AL",0.105,SNAP,Other Nutrition Program,3.37,1.0,743.0,2706.0,107208.0,102400.0,4808.0,4.5
2,04003,1051.250000,2014,AZ-500,0.013954,0.007850,0.006104,45974.0,886.0,21109.0,12968.0,7566.0,76099.0,50498.0,14868.0,130807.0,13.1,2.47,23593.0,16328.0,48846.0,7947.0,7812.0,Arizona,Cochise County,127314.0,64661.0,62653.0,112238.0,5737.0,2165.0,2757.0,510.0,44374.0,"Cochise County, AZ",0.161,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.81,800.0,72.0,340.0,50969.0,46682.0,4287.0,8.4
3,04013,1095.670228,2014,AZ-502,0.012524,0.010296,0.002228,53689.0,4602.0,593094.0,490927.0,273108.0,2444443.0,1204681.0,646167.0,3947382.0,12.7,2.74,507428.0,546028.0,1424244.0,373532.0,171581.0,Arizona,Maricopa County,4093648.0,2024659.0,2068989.0,3449404.0,235660.0,112383.0,172425.0,11190.0,1239835.0,"Maricopa County, AZ",0.158,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.90,2389.0,16857.0,6320.0,1961997.0,1848119.0,113878.0,5.8
4,04019,928.546429,2014,AZ-501,0.020613,0.016196,0.004418,46233.0,2324.0,149147.0,115392.0,81406.0,592298.0,364938.0,141211.0,993144.0,13.2,2.50,162075.0,149710.0,386155.0,69636.0,57099.0,Arizona,Pima County,1004229.0,494684.0,509545.0,858334.0,41043.0,42683.0,31905.0,2266.0,363063.0,"Pima County, AZ",0.154,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.85,3591.0,838.0,1604.0,463126.0,435183.0,27943.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3371,21999,,2013,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,17.0,16.0,,,,
3372,22999,,2013,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,131.0,0.0,,,,
3373,25999,,2013,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,104.0,3.0,,,,
3374,16999,,2013,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,51.0,,,,


In [26]:
# check to see that number of observations is retained
df.Year.value_counts()

2020    32502
2019    13207
2010     9787
2018     3406
2017     3395
2014     3386
2016     3386
2015     3384
2012     3378
2013     3376
2011     3375
2009     3239
Name: Year, dtype: int64

# Remove Duplicates per County
Some FIPS (counties) are present multiple times with slightly varying observations (likely taken from different sources.) The following process groups the dataframe by FIPS, while taking the mean of the observations, so that there is one row per county.

In [27]:
# Check value counts per FIPS code, indicating there are multiple entries in some cases
df[df.Year=='2010'].FIPS.value_counts()

06037    12
25017    12
17197    12
17031    12
25009     9
         ..
26999     1
24999     1
16999     1
21999     1
04999     1
Name: FIPS, Length: 3245, dtype: int64

In [28]:
# perform groupby's on the years containing duplicates
no_dups_10 = df[df.Year=='2010'].groupby('FIPS').mean().reset_index()
no_dups_19 = df[df.Year=='2019'].groupby('FIPS').mean().reset_index()
no_dups_20 = df[df.Year=='2020'].groupby('FIPS').mean().reset_index()


In [29]:
# re-add in the columns that were removed during groupby operation
no_dups_10 = no_dups_10.merge(df[df.Year=='2010'][['Year','FIPS','Low Threshold Type','High Threshold Type', 'State',
                                                   'coc_number','County','State/County']],on='FIPS')
no_dups_19 = no_dups_19.merge(df[df.Year=='2019'][['Year','FIPS','Low Threshold Type','High Threshold Type', 'State',
                                         'coc_number','County','State/County']],on='FIPS')
no_dups_20 = no_dups_20.merge(df[df.Year=='2020'][['Year','FIPS','Low Threshold Type','High Threshold Type', 'State',
                                         'coc_number','County','State/County']],on='FIPS')


In [30]:
# drop older data from main df, which contained duplicates
df_trimmed = df.drop(df[df.Year.isin(['2010', '2019','2020'])].index)

# add newly grouped data with no duplicates back into main df
ndf = pd.concat([df_trimmed, no_dups_10, no_dups_19, no_dups_20])
ndf

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,hh_med_income,pop_disabled,pop_hs_grad,pop_bachelors,pop_grad_degree,pop_priv_health,pop_public_health,pop_no_health,pop_total,percent_hh_poverty,hh_avg_size,pop_65+,hh_no_vehicle,num_hh,pop_non_citizen,hh_SNAP,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,TOT_LATINX,State/County,FI Rate,Low Threshold Type,High Threshold Type,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate
0,01073,1019.995960,2014,AL-500,0.013259,0.009128,0.004130,45239.0,2036.0,117854.0,81626.0,52774.0,431638.0,211570.0,81336.0,658834.0,14.8,2.48,87036.0,93630.0,259397.0,17519.0,39967.0,Alabama,Jefferson County,659972.0,312491.0,347481.0,355635.0,284082.0,2029.0,10378.0,347.0,24099.0,"Jefferson County, AL",0.197,SNAP,Other Nutrition Program,2.93,483.0,2693.0,400.0,312131.0,292505.000,19626.000,6.30
1,01117,1229.755051,2014,AL-500,0.013259,0.009128,0.004130,69723.0,1165.0,28911.0,35773.0,18511.0,159655.0,42429.0,19175.0,201168.0,6.2,2.65,23404.0,19762.0,74790.0,7624.0,4706.0,Alabama,Shelby County,206280.0,100304.0,105976.0,174094.0,24247.0,805.0,4403.0,101.0,11872.0,"Shelby County, AL",0.105,SNAP,Other Nutrition Program,3.37,1.0,743.0,2706.0,107208.0,102400.000,4808.000,4.50
2,04003,1051.250000,2014,AZ-500,0.013954,0.007850,0.006104,45974.0,886.0,21109.0,12968.0,7566.0,76099.0,50498.0,14868.0,130807.0,13.1,2.47,23593.0,16328.0,48846.0,7947.0,7812.0,Arizona,Cochise County,127314.0,64661.0,62653.0,112238.0,5737.0,2165.0,2757.0,510.0,44374.0,"Cochise County, AZ",0.161,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.81,800.0,72.0,340.0,50969.0,46682.000,4287.000,8.40
3,04013,1095.670228,2014,AZ-502,0.012524,0.010296,0.002228,53689.0,4602.0,593094.0,490927.0,273108.0,2444443.0,1204681.0,646167.0,3947382.0,12.7,2.74,507428.0,546028.0,1424244.0,373532.0,171581.0,Arizona,Maricopa County,4093648.0,2024659.0,2068989.0,3449404.0,235660.0,112383.0,172425.0,11190.0,1239835.0,"Maricopa County, AZ",0.158,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.90,2389.0,16857.0,6320.0,1961997.0,1848119.000,113878.000,5.80
4,04019,928.546429,2014,AZ-501,0.020613,0.016196,0.004418,46233.0,2324.0,149147.0,115392.0,81406.0,592298.0,364938.0,141211.0,993144.0,13.2,2.50,162075.0,149710.0,386155.0,69636.0,57099.0,Arizona,Pima County,1004229.0,494684.0,509545.0,858334.0,41043.0,42683.0,31905.0,2266.0,363063.0,"Pima County, AZ",0.154,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.85,3591.0,838.0,1604.0,463126.0,435183.000,27943.000,6.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32497,72153,,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,9261.5,8086.875,1174.625,12.55
32498,72153,,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,9261.5,8086.875,1174.625,12.55
32499,72153,,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,9261.5,8086.875,1174.625,12.55
32500,72153,,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,9261.5,8086.875,1174.625,12.55


In [31]:
ndf.Year.value_counts()

2020    32502
2019    13207
2010     9787
2018     3406
2017     3395
2014     3386
2016     3386
2015     3384
2009     3239
2013     1785
2012     1785
2011     1785
Name: Year, dtype: int64

### Save the fully cleaned dataframe using pickle

In [32]:
# save with pickle to preserve datatypes
with open('../pickled/fully_cleaned_data.pickle', "wb") as output_file:
    pickle.dump(ndf, output_file)