# Projecting Food Insecurity Rates in the US by County
## Feature Engineering
The following process imports a cleaned dataset produced from [cleaning_pt2.ipynb.](notebooks/cleaning_pt2) This notebook is used to produce new features that will be used in the modeling process.
### Flatiron School Data Science Capstone<br>By Khyatee Desai

In [165]:
import pandas as pd
import numpy as np
from itertools import combinations
import sklearn
from sklearn.linear_model import LinearRegression
pd.set_option('display.max_columns', None)
import pickle
import warnings
warnings.filterwarnings('ignore')

In [166]:
with open('../pickled/fully_cleaned_data.pickle', "rb") as input_file:
    df = pickle.load(input_file) 
df.head()

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,hh_med_income,pop_disabled,pop_hs_grad,pop_bachelors,pop_grad_degree,pop_priv_health,pop_public_health,pop_no_health,pop_total,percent_hh_poverty,hh_avg_size,pop_65+,hh_no_vehicle,num_hh,pop_non_citizen,hh_SNAP,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,TOT_LATINX,State/County,FI Rate,Low Threshold Type,High Threshold Type,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate
1,1117,1229.755051,2014,AL-500,0.013259,0.009128,0.00413,69723.0,22792.0,28911.0,35773.0,18511.0,159655.0,42429.0,19175.0,201168.0,6.2,2.65,23404.0,19762.0,74790.0,7624.0,4706.0,Alabama,Shelby County,206280.0,100304.0,105976.0,174094.0,24247.0,805.0,4403.0,101.0,11872.0,"Shelby County, AL",0.105,SNAP,Other Nutrition Program,3.37,1.0,743.0,2706.0,107208.0,102400.0,4808.0,4.5
2,4003,1051.25,2014,AZ-500,0.013954,0.00785,0.006104,45974.0,20341.0,21109.0,12968.0,7566.0,76099.0,50498.0,14868.0,130807.0,13.1,2.47,23593.0,16328.0,48846.0,7947.0,7812.0,Arizona,Cochise County,127314.0,64661.0,62653.0,112238.0,5737.0,2165.0,2757.0,510.0,44374.0,"Cochise County, AZ",0.161,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.81,800.0,72.0,340.0,50969.0,46682.0,4287.0,8.4
5,4021,947.52904,2014,AZ-500,0.013954,0.00785,0.006104,50248.0,46667.0,77438.0,30434.0,15731.0,224547.0,138907.0,53302.0,390160.0,11.5,2.9,61319.0,49435.0,126128.0,25388.0,15654.0,Arizona,Pinal County,395322.0,206610.0,188712.0,328808.0,19282.0,26951.0,7543.0,1669.0,115971.0,"Pinal County, AZ",0.153,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.78,387.0,67.0,190.0,153748.0,142996.0,10752.0,7.0
7,5119,811.0,2014,AR-500,0.017366,0.009233,0.008133,46410.0,50250.0,71905.0,53233.0,30063.0,242709.0,125676.0,56673.0,388752.0,12.0,2.49,47591.0,60501.0,153323.0,13584.0,19565.0,Arkansas,Pulaski County,392952.0,188577.0,204375.0,232228.0,141871.0,1899.0,8860.0,257.0,23410.0,"Pulaski County, AR",0.211,SNAP,Other Nutrition Program,2.93,57.0,1628.0,5578.0,184355.0,174206.0,10149.0,5.5
11,6019,1224.241667,2014,CA-514,0.022104,0.006046,0.016058,45201.0,111800.0,127519.0,73566.0,36047.0,459151.0,386910.0,173241.0,948844.0,22.2,3.18,98074.0,100365.0,292550.0,131496.0,56911.0,California,Fresno County,960567.0,479305.0,481262.0,743185.0,56151.0,28677.0,101616.0,2490.0,499032.0,"Fresno County, CA",0.16,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.77,79.0,805.0,267.0,438262.0,387533.0,50729.0,11.6


### Create a percentage for each demographic

In [167]:
df.replace(-888888888.0,np.nan,inplace=True)

In [168]:
df['Percent_male'] = df['TOT_MALE']/df['TOT_POP']
df['Percent_female'] = df['TOT_FEMALE']/df['TOT_POP']
df['Percent_white'] = df['TOT_WHITE']/df['TOT_POP']
df['Percent_Black'] = df['TOT_BLACK']/df['TOT_POP']
df['Percent_native'] = df['TOT_NATIVE']/df['TOT_POP']
df['Percent_asian'] = df['TOT_ASIAN']/df['TOT_POP']
df['Percent_pacific'] = df['TOT_PACIFIC']/df['TOT_POP']
df['Percent_latinX'] = df['TOT_LATINX']/df['TOT_POP']

### Percentage PoC
Percentage of a county that is not white

In [169]:
df['Percent_PoC'] = 1-df['Percent_white']

### Workforce as a percentage of total population
Workforce represented as percentage, so that it can be compared across different counties

In [170]:
# percent of population that is working
df['Percent_working'] = df['Total_workforce']/df['TOT_POP']


### Sum of Food Establishments
Total number of food retail businesses, which is the sum of the three categories (wholesalers, restaraunts, and grocery stores)

In [171]:
df['Total_food_retail'] = df['Num_wholesale'].fillna(0)+ df['Num_restaraunts'].fillna(0)+df['Num_grocery'].fillna(0)

### Population divided by number of food establishments 
Looking at prevalence of food establishments as a function of population - aka how many Food Retail establishments exist per person within a County

In [172]:
df['Food_retail_per_person'] = df['Total_food_retail']/df['TOT_POP']


## ACS Survey Percentages
Take percentages of all features taken from the ACS survey, which use a slightly different (~5%) total population number for the denominator

In [173]:
df['Percent_disabled'] = df['pop_disabled']/df['TOT_POP']
df['Percent_hs_grad'] = df['pop_hs_grad']/df['TOT_POP']
df['Percent_bachelors'] = df['pop_bachelors']/df['TOT_POP']
df['Percent_grad_degree'] = df['pop_grad_degree']/df['TOT_POP']
df['Percent_priv_health'] = df['pop_priv_health']/df['TOT_POP']
df['Percent_public_health'] = df['pop_public_health']/df['TOT_POP']
df['Percent_no_health'] = df['pop_no_health']/df['TOT_POP']
df['Percent_65+'] = df['pop_65+']/df['TOT_POP']
df['Percent_non_citizen'] = df['pop_non_citizen']/df['TOT_POP']
df['Percent_hh_no_vehicle'] = df['hh_no_vehicle']/df['num_hh']
df['Percent_hh_SNAP'] = df['hh_SNAP']/df['num_hh']

### Drop raw count columns after deriving percentages

In [174]:
df.drop(['TOT_MALE','TOT_FEMALE','TOT_WHITE','TOT_BLACK','TOT_NATIVE','TOT_ASIAN','TOT_PACIFIC','TOT_LATINX',
         'pop_disabled', 'pop_hs_grad','pop_bachelors', 'pop_grad_degree', 'pop_priv_health', 'pop_public_health',
        'pop_no_health', 'pop_65+','pop_non_citizen','hh_no_vehicle','hh_SNAP','pop_total'],axis=1,inplace=True)

## Create Percentages for CPS 2020 Data

In [175]:
with open('../pickled/cps_20_data.pickle', "rb") as input_file:
    df_cps_20 = pickle.load(input_file) 

### Education Columns

In [176]:
df_cps_20['Percent_hs_grad'] = df_cps_20['pop_hs_grad'] / df_cps_20['Num_respondants_b']
df_cps_20['Percent_bachelors'] = df_cps_20['pop_bachelors'] / df_cps_20['Num_respondants_b']
df_cps_20['Percent_grad_degree'] = df_cps_20['pop_grad_degree'] / df_cps_20['Num_respondants_b']

### Citizenship Status Columns

In [177]:
df_cps_20['Percent_non_citizen'] = df_cps_20['pop_non_citizen'] / df_cps_20['Num_respondants_b']

### Disability Status Columns

In [178]:
df_cps_20['Percent_disabled'] = df_cps_20['HH_disabled']/ (df_cps_20['HH_disabled']+df_cps_20['HH_not_disabled'] )


### Health Insurance columns

In [179]:
df_cps_20['Percent_no_health'] = (df_cps_20['HH_not_insured'])/(df_cps_20['HH_health_insured']+df_cps_20['HH_not_insured']+df_cps_20['HH_some_insured'])
df_cps_20['Percent_priv_health'] = (df_cps_20['HH_health_priv']+df_cps_20['HH_some_insured_priv']) /(df_cps_20['HH_not_insured_priv'] + df_cps_20['HH_some_insured_priv']+df_cps_20['HH_health_priv'])
df_cps_20['Percent_public_health'] = (df_cps_20['HH_insured_pub']+df_cps_20['HH_some_insured_pub']) /(df_cps_20['HH_no_health_pub'] + df_cps_20['HH_some_insured_pub']+df_cps_20['HH_insured_pub'])


In [180]:
df_cps_20 = df_cps_20.loc[:,['Year','HH_income','HH_size', 'Percent_hs_grad','Percent_bachelors','Percent_grad_degree',
    'Percent_non_citizen','Percent_disabled','Percent_no_health','Percent_priv_health','Percent_public_health']].reset_index()

df_cps_20.rename(columns={'index':'FIPS', 'HH_income':'hh_med_income','HH_size':'hh_avg_size' },inplace=True)

### Concatenate 2020 CPS data with main df

In [181]:
df_no_20 = df[~df.Year.isin(['2020'])]
df_20 = df[df.Year=='2020']

df_20_dropped = df_20.drop(df_cps_20.columns[1:],axis=1)
df_20 = df_cps_20.merge(df_20_dropped, on='FIPS', how='outer')

df = pd.concat([df_no_20, df_20])

## Impute Missing 2020 Data
2020 Data is only available for 329 Counties, as the CPS surveys are only performed on a subset of the population. The following cells use survey responses to impute missing values for all of the other counties, by taking an average rate of change.

Method: 
- retrieve fips for counties with missing values
- determine average change from 2019-2020 for CPS columns
- apply average rate of change to missing counties

### Determine which Counties have missing 2020 data

In [183]:
missing_fips = df_20['FIPS'][df_20.Year.isnull()].tolist()
not_missing = df_20['FIPS'][~df_20.Year.isnull()].tolist()
cps_columns = df_cps_20.columns


### Calculate mean values and percent change for 2019 & 2020

In [196]:
df_19 = df[cps_columns][df.Year == '2019']
means_19_20 = df_19.mean().reset_index().rename(columns={0:'2019 Mean'})

df_20 = df[cps_columns][df.Year == '2020']
means_19_20['2020 Mean'] = df_20.mean().values      
means_19_20[['2019 Mean', '2020 Mean']].pct_change(axis=1)

Unnamed: 0,2019 Mean,2020 Mean
0,,
1,,
2,,0.337527
3,,-0.302658
4,,5.909285
5,,1.31709
6,,-0.118185
7,,0.89165
8,,-0.979186
9,,-0.577699


### Apply percent change to generate missing values for 2020
Percent change rate is applied to the 2019 values of counties that are missing values for 2020.

In [198]:
df_19_20 = df_19[df_19.FIPS.isin(missing_fips)].drop('Year',axis=1)
df_19_20['hh_med_income'] = df_19_20['hh_med_income']+ df_19_20['hh_med_income']*0.337527
df_19_20['hh_avg_size'] = df_19_20['hh_avg_size']+ df_19_20['hh_avg_size']*-0.302658
df_19_20['Percent_hs_grad'] = df_19_20['Percent_hs_grad']+ df_19_20['Percent_hs_grad']*5.909285
df_19_20['Percent_bachelors'] = df_19_20['Percent_bachelors']+ df_19_20['Percent_bachelors']*1.317090
df_19_20['Percent_grad_degree'] = df_19_20['Percent_grad_degree']+ df_19_20['Percent_grad_degree']* -0.118185
df_19_20['Percent_non_citizen'] = df_19_20['Percent_non_citizen']+ df_19_20['Percent_non_citizen']* 0.891650
df_19_20['Percent_disabled'] = df_19_20['Percent_disabled']+ df_19_20['Percent_disabled']* -0.979186
df_19_20['Percent_no_health'] = df_19_20['Percent_no_health']+ df_19_20['Percent_no_health']* -0.577699
df_19_20['Percent_priv_health'] = df_19_20['Percent_priv_health']+ df_19_20['Percent_priv_health']* 0.166211
df_19_20['Percent_public_health'] = df_19_20['Percent_public_health']+ df_19_20['Percent_public_health']* 0.249853


In [None]:
df_no_20 = df[~df.Year.isin(['2020'])]
df_20 = df[df.Year=='2020']

# Impute Missing 2020 Data
Some features, such as demographic information, follow a predictable pattern and are unlikely to be sharply affected by the pandemic, therefore these columns will be imputed using previous yearly data to calculate estimated 2020 values.
### Determine yearly percent change for demographics columns

In [189]:
df.groupby('Year')[['Percent_male','Percent_female','Percent_white','Percent_Black','Percent_native',
        'Percent_asian','Percent_pacific','Percent_latinX','Percent_PoC','Percent_disabled', 'Percent_hs_grad',
     'Percent_bachelors','Percent_grad_degree','Percent_priv_health','Percent_public_health','Percent_no_health']].mean()


Unnamed: 0_level_0,Percent_male,Percent_female,Percent_white,Percent_Black,Percent_native,Percent_asian,Percent_pacific,Percent_latinX,Percent_PoC,Percent_disabled,Percent_hs_grad,Percent_bachelors,Percent_grad_degree,Percent_priv_health,Percent_public_health,Percent_no_health
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2009,,,,,,,,,,,,,,,,
2010,0.49976,0.50024,0.860438,0.089954,0.019532,0.01271,0.000908,0.084927,0.139562,,0.23862,0.084513,0.043306,,,
2011,0.499932,0.500068,0.841847,0.108535,0.017176,0.014607,0.001066,0.064408,0.158153,,0.236454,0.084289,0.043865,,,
2012,0.500045,0.499955,0.840321,0.108828,0.017346,0.015112,0.001099,0.065701,0.159679,0.149006,0.23586,0.085601,0.044779,0.629601,0.329167,0.1442
2013,0.500621,0.499379,0.835858,0.112749,0.017073,0.015767,0.001072,0.067735,0.164142,0.15039,0.236118,0.086668,0.046105,0.621235,0.337594,0.146237
2014,0.501397,0.498603,0.854739,0.088712,0.023968,0.012575,0.001248,0.088109,0.145261,0.152615,0.240772,0.087907,0.045556,0.621966,0.345114,0.141837
2015,0.501522,0.498478,0.854864,0.088925,0.02328,0.012556,0.001235,0.089125,0.145136,0.154337,0.242014,0.088375,0.045686,0.624687,0.355432,0.131151
2016,0.501484,0.498516,0.853099,0.088804,0.024151,0.012837,0.001289,0.092048,0.146901,0.156237,0.241082,0.090091,0.046972,0.627707,0.364867,0.120421
2017,0.501559,0.498441,0.851285,0.089523,0.024186,0.013302,0.001348,0.093032,0.148715,0.156265,0.240144,0.092036,0.048537,0.631937,0.372242,0.109065
2018,0.501599,0.498401,0.849804,0.089717,0.02445,0.013796,0.00136,0.093854,0.150196,0.156663,0.239894,0.093567,0.049798,0.634967,0.381047,0.098594


In [53]:
percent_change = df.groupby('Year')[['Percent_male','Percent_female','Percent_white','Percent_Black','Percent_native',
        'Percent_asian','Percent_pacific','Percent_latinX','Percent_PoC','Percent_65+']].mean().pct_change().mean()
percent_change


Percent_male       0.000274
Percent_female    -0.000274
Percent_white     -0.001753
Percent_Black      0.008249
Percent_native     0.012824
Percent_asian      0.027599
Percent_pacific    0.019269
Percent_latinX     0.024408
Percent_PoC        0.011422
Percent_65+       -0.052165
dtype: float64

### Multiply each 2019 value by the calculated percent change by above, to derive a 2020 estimate

In [54]:
dems_19_20 = df[['FIPS','Percent_male','Percent_female','Percent_white','Percent_Black','Percent_native',
        'Percent_asian','Percent_pacific','Percent_latinX','Percent_PoC']][df.Year=='2019'].drop_duplicates()

dems_19_20['Percent_male'] = dems_19_20['Percent_male'] + dems_19_20['Percent_male']*0.000274
dems_19_20['Percent_female'] = dems_19_20['Percent_female'] + dems_19_20['Percent_female']* -0.000274
dems_19_20['Percent_white'] = dems_19_20['Percent_white']+ dems_19_20['Percent_white']* -0.001753
dems_19_20['Percent_Black'] = dems_19_20['Percent_Black'] +dems_19_20['Percent_Black']* 0.008249
dems_19_20['Percent_native'] = dems_19_20['Percent_native']+ dems_19_20['Percent_native']* 0.012824
dems_19_20['Percent_asian'] = dems_19_20['Percent_asian'] +dems_19_20['Percent_asian']* 0.027599
dems_19_20['Percent_pacific'] = dems_19_20['Percent_pacific'] +dems_19_20['Percent_pacific']* 0.019269
dems_19_20['Percent_latinX'] = dems_19_20['Percent_latinX'] +dems_19_20['Percent_latinX']* 0.024408
dems_19_20['Percent_PoC'] = dems_19_20['Percent_PoC'] + dems_19_20['Percent_PoC']* 0.011422


### Drop nulls, and add the newly calculated 2020 values into the main df

In [55]:
df_20_dropped_null = df[df.Year=='2020'].drop(['Percent_male','Percent_female','Percent_white','Percent_Black',
            'Percent_native','Percent_asian','Percent_pacific','Percent_latinX','Percent_PoC'],axis=1)
df = df[df.Year!='2020']
df_20_new = df_20_dropped_null.merge(dems_19_20,on='FIPS',how='outer')

df = pd.concat([df, df_20_new])

# Polynomial Features
**Note:** Decided to omit polynomial features, because they decreased model performance.

In [56]:
# only using derived percentages, ignore raw counts
continuous_features = ['Rent', 'Houseless_rate','Sheltered_rate', 'Unsheltered_rate', 'TOT_POP',
       'Cost Per Meal', 'Num_wholesale','Num_restaraunts', 'Num_grocery',  'Unemployment_rate', 'Percent_male', 
         'Percent_female','Percent_white', 'Percent_Black', 'Percent_native', 'Percent_asian',
       'Percent_pacific', 'Percent_latinX','Percent_working', 'Total_food_retail','Food_retail_per_person',
        'Percent_disabled','Percent_hs_grad','Percent_bachelors','Percent_grad_degree','Percent_priv_health',
            'Percent_public_health','Percent_no_health','Percent_65+','Percent_non_citizen','Percent_hh_no_vehicle',
                    'Percent_hh_SNAP','percent_hh_poverty','hh_avg_size','num_hh','hh_med_income']


In [57]:
## add squared and cubed polynomials for each continuous feature
# for feat in continuous_features:
#     df[feat+'^2'] = df[feat]**2
#     df[feat+'^3'] = df[feat]**3


# Interaction Features
Create an interaction feature for each combination of continuous features, and add best ones to dataframe

In [58]:
# Generate combinations of features
y = df.dropna()[['FI Rate']]
X = df.dropna()[continuous_features]
interactions = list(combinations(X.columns, 2))
interaction_dict = {}

# run simple regression model with each possible interaction, and save R-squared for each interaction in a dictionary
for interaction in interactions:
    X_copy = X.copy()
    X_copy['interact'] = X_copy[interaction[0]] * X_copy[interaction[1]] 
    X_copy = X_copy.replace([np.inf, -np.inf], 0)
    model = LinearRegression()
    model.fit(X_copy, y)
    interaction_dict[model.score(X_copy, y)] = interaction 


### Add best 50 interactions to dataframe

In [59]:
# Sort the interactions dictionary, and add best 50 interactions to dataframe
top_interactions = sorted(interaction_dict.keys(), reverse = True)[:50]
for interaction in top_interactions:
    feature1 = interaction_dict[interaction][0]
    feature2 = interaction_dict[interaction][1]
    df[feature1+'_X_'+feature2] = df[feature1] * df[feature2] #also add to new_features df


# Log Transformations
Take natural log of each continuous feature, and add these log features to dataframe

In [60]:
for feat in continuous_features:
    df['log_'+feat] = df[feat].map(lambda x: np.log(x))
df = df.replace([np.inf, -np.inf], 0)

# Dummy Variables
High and Low Threshold programs delineate the assistance programs provided by State

In [61]:
# Create dummy variables for high and low threshold programs, and add to dataframe
hi_thresh_dummies = pd.get_dummies(df['High Threshold Type'].astype(str), dtype=int)
hi_thresh_dummies['other'] = hi_thresh_dummies['Other Nutrition Program'] + hi_thresh_dummies['other nutrition pgm']
hi_thresh_dummies.drop(['Other Nutrition Program','other nutrition pgm','nan'],axis=1,inplace=True) # drop last col
hi_thresh_dummies.rename(columns = {'SNAP, Other Nutrition Programs': 'SNAP_other'}, inplace=True)
hi_thresh_dummies.columns = 'Hi_thresh_'+hi_thresh_dummies.columns

low_thresh_dummies = pd.get_dummies(df['Low Threshold Type'].astype(str), dtype=int)
low_thresh_dummies.drop('nan', axis=1,inplace=True)
low_thresh_dummies.rename(columns = {'SNAP, Other Nutrition Programs': 'SNAP_other'}, inplace=True)
low_thresh_dummies.columns = 'Lo_thresh_'+low_thresh_dummies.columns

df = pd.concat([df, low_thresh_dummies, hi_thresh_dummies],axis=1)


### Pickle the new dataframe
Save the dataframe with all new features added (demographic percentages, interactions, logs, and dummies)

In [62]:
with open('../pickled/feature_engineered_data.pickle', "wb") as output_file:
    pickle.dump(df, output_file)