# Projecting Food Insecurity Rates in the US by County
## Feature Engineering
The following process imports a cleaned dataset produced from [cleaning_pt2.ipynb.](notebooks/cleaning_pt2) This notebook is used to produce new features that will be used in the modeling process.
### Flatiron School Data Science Capstone<br>By Khyatee Desai

In [35]:
import pandas as pd
import numpy as np
from itertools import combinations
import sklearn
from sklearn.linear_model import LinearRegression
pd.set_option('display.max_columns', None)
import pickle
import warnings
warnings.filterwarnings('ignore')

In [36]:
with open('../pickled/fully_cleaned_data.pickle', "rb") as input_file:
    df = pickle.load(input_file) 
df.head()

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,hh_med_income,pop_disabled,pop_hs_grad,pop_bachelors,pop_grad_degree,pop_priv_health,pop_public_health,pop_no_health,pop_total,percent_hh_poverty,hh_avg_size,pop_65+,hh_no_vehicle,num_hh,pop_non_citizen,hh_SNAP,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,TOT_LATINX,State/County,FI Rate,Low Threshold Type,High Threshold Type,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate
0,1073,1019.99596,2014,AL-500,0.013259,0.009128,0.00413,45239.0,94584.0,117854.0,81626.0,52774.0,431638.0,211570.0,81336.0,658834.0,14.8,2.48,87036.0,93630.0,259397.0,17519.0,39967.0,Alabama,Jefferson County,659972.0,312491.0,347481.0,355635.0,284082.0,2029.0,10378.0,347.0,24099.0,"Jefferson County, AL",0.197,SNAP,Other Nutrition Program,2.93,483.0,2693.0,400.0,312131.0,292505.0,19626.0,6.3
1,1117,1229.755051,2014,AL-500,0.013259,0.009128,0.00413,69723.0,22792.0,28911.0,35773.0,18511.0,159655.0,42429.0,19175.0,201168.0,6.2,2.65,23404.0,19762.0,74790.0,7624.0,4706.0,Alabama,Shelby County,206280.0,100304.0,105976.0,174094.0,24247.0,805.0,4403.0,101.0,11872.0,"Shelby County, AL",0.105,SNAP,Other Nutrition Program,3.37,1.0,743.0,2706.0,107208.0,102400.0,4808.0,4.5
2,4003,1051.25,2014,AZ-500,0.013954,0.00785,0.006104,45974.0,20341.0,21109.0,12968.0,7566.0,76099.0,50498.0,14868.0,130807.0,13.1,2.47,23593.0,16328.0,48846.0,7947.0,7812.0,Arizona,Cochise County,127314.0,64661.0,62653.0,112238.0,5737.0,2165.0,2757.0,510.0,44374.0,"Cochise County, AZ",0.161,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.81,800.0,72.0,340.0,50969.0,46682.0,4287.0,8.4
3,4013,1095.670228,2014,AZ-502,0.012524,0.010296,0.002228,53689.0,399455.0,593094.0,490927.0,273108.0,2444443.0,1204681.0,646167.0,3947382.0,12.7,2.74,507428.0,546028.0,1424244.0,373532.0,171581.0,Arizona,Maricopa County,4093648.0,2024659.0,2068989.0,3449404.0,235660.0,112383.0,172425.0,11190.0,1239835.0,"Maricopa County, AZ",0.158,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.9,2389.0,16857.0,6320.0,1961997.0,1848119.0,113878.0,5.8
4,4019,928.546429,2014,AZ-501,0.020613,0.016196,0.004418,46233.0,133694.0,149147.0,115392.0,81406.0,592298.0,364938.0,141211.0,993144.0,13.2,2.5,162075.0,149710.0,386155.0,69636.0,57099.0,Arizona,Pima County,1004229.0,494684.0,509545.0,858334.0,41043.0,42683.0,31905.0,2266.0,363063.0,"Pima County, AZ",0.154,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.85,3591.0,838.0,1604.0,463126.0,435183.0,27943.0,6.0


### Create a percentage for each demographic

In [37]:
df.replace(-888888888.0,np.nan,inplace=True)

In [38]:
df['Percent_male'] = df['TOT_MALE']/df['TOT_POP']
df['Percent_female'] = df['TOT_FEMALE']/df['TOT_POP']
df['Percent_white'] = df['TOT_WHITE']/df['TOT_POP']
df['Percent_Black'] = df['TOT_BLACK']/df['TOT_POP']
df['Percent_native'] = df['TOT_NATIVE']/df['TOT_POP']
df['Percent_asian'] = df['TOT_ASIAN']/df['TOT_POP']
df['Percent_pacific'] = df['TOT_PACIFIC']/df['TOT_POP']
df['Percent_latinX'] = df['TOT_LATINX']/df['TOT_POP']

### Percentage PoC
Percentage of a county that is not white

In [39]:
df['Percent_PoC'] = 1-df['Percent_white']

### Workforce as a percentage of total population
Workforce represented as percentage, so that it can be compared across different counties

In [40]:
# percent of population that is working
df['Percent_working'] = df['Total_workforce']/df['TOT_POP']


### Sum of Food Establishments
Total number of food retail businesses, which is the sum of the three categories (wholesalers, restaraunts, and grocery stores)

In [41]:
df['Total_food_retail'] = df['Num_wholesale'].fillna(0)+ df['Num_restaraunts'].fillna(0)+df['Num_grocery'].fillna(0)

### Population divided by number of food establishments 
Looking at prevalence of food establishments as a function of population - aka how many Food Retail establishments exist per person within a County

In [42]:
df['Food_retail_per_person'] = df['Total_food_retail']/df['TOT_POP']


## ACS Survey Percentages
Take percentages of all features taken from the ACS survey, which use a slightly different (~5%) total population number for the denominator

In [43]:
df['Percent_disabled'] = df['pop_disabled']/df['TOT_POP']
df['Percent_hs_grad'] = df['pop_hs_grad']/df['TOT_POP']
df['Percent_bachelors'] = df['pop_bachelors']/df['TOT_POP']
df['Percent_grad_degree'] = df['pop_grad_degree']/df['TOT_POP']
df['Percent_priv_health'] = df['pop_priv_health']/df['TOT_POP']
df['Percent_public_health'] = df['pop_public_health']/df['TOT_POP']
df['Percent_no_health'] = df['pop_no_health']/df['TOT_POP']
df['Percent_65+'] = df['pop_65+']/df['TOT_POP']
df['Percent_non_citizen'] = df['pop_non_citizen']/df['TOT_POP']
df['Percent_hh_no_vehicle'] = df['hh_no_vehicle']/df['num_hh']
df['Percent_hh_SNAP'] = df['hh_SNAP']/df['num_hh']

### Drop raw count columns after deriving percentages

In [44]:
df.drop(['TOT_MALE','TOT_FEMALE','TOT_WHITE','TOT_BLACK','TOT_NATIVE','TOT_ASIAN','TOT_PACIFIC','TOT_LATINX',
         'pop_disabled', 'pop_hs_grad','pop_bachelors', 'pop_grad_degree', 'pop_priv_health', 'pop_public_health',
        'pop_no_health', 'pop_65+','pop_non_citizen','hh_no_vehicle','hh_SNAP','pop_total'],axis=1,inplace=True)

## Create Percentages for CPS 2020 Data

In [45]:
with open('../pickled/cps_20_data.pickle', "rb") as input_file:
    df_cps_20 = pickle.load(input_file) 

### Education Columns

In [46]:
df_cps_20['Percent_hs_grad'] = df_cps_20['pop_hs_grad'] / df_cps_20['Num_respondants_b']
df_cps_20['Percent_bachelors'] = df_cps_20['pop_bachelors'] / df_cps_20['Num_respondants_b']
df_cps_20['Percent_grad_degree'] = df_cps_20['pop_grad_degree'] / df_cps_20['Num_respondants_b']

### Citizenship Status Columns

In [47]:
df_cps_20['Percent_non_citizen'] = df_cps_20['pop_non_citizen'] / df_cps_20['Num_respondants_b']

### Disability Status Columns

In [48]:
df_cps_20['Percent_disabled'] = df_cps_20['HH_disabled']/ (df_cps_20['HH_disabled']+df_cps_20['HH_not_disabled'] )


### Health Insurance columns

In [49]:
df_cps_20['Percent_no_health'] = (df_cps_20['HH_not_insured'])/(df_cps_20['HH_health_insured']+df_cps_20['HH_not_insured']+df_cps_20['HH_some_insured'])
df_cps_20['Percent_priv_health'] = (df_cps_20['HH_health_priv']+df_cps_20['HH_some_insured_priv']) /(df_cps_20['HH_not_insured_priv'] + df_cps_20['HH_some_insured_priv']+df_cps_20['HH_health_priv'])
df_cps_20['Percent_public_health'] = (df_cps_20['HH_insured_pub']+df_cps_20['HH_some_insured_pub']) /(df_cps_20['HH_no_health_pub'] + df_cps_20['HH_some_insured_pub']+df_cps_20['HH_insured_pub'])


In [50]:
df_cps_20 = df_cps_20.loc[:,['Year','HH_income','HH_size', 'Percent_hs_grad','Percent_bachelors','Percent_grad_degree',
    'Percent_non_citizen','Percent_disabled','Percent_no_health','Percent_priv_health','Percent_public_health']].reset_index()

df_cps_20.rename(columns={'index':'FIPS', 'HH_income':'hh_med_income','HH_size':'hh_avg_size' },inplace=True)

### Concatenate 2020 CPS data with main df

In [51]:
df_no_20 = df[~df.Year.isin(['2020'])]
df_20 = df[df.Year=='2020']

df_20_dropped = df_20.drop(df_cps_20.columns[1:],axis=1)
df_20 = df_cps_20.merge(df_20_dropped, on='FIPS', how='outer')

df = pd.concat([df_no_20, df_20])

## Impute Missing 2020 Data
Some features, such as demographic information, follow a predictable pattern and are unlikely to be sharply affected by the pandemic, therefore these columns will be imputed using previous yearly data to calculate estimated 2020 values.

### Determine yearly percent change for demographics columns

In [52]:
df.groupby('Year')[['Percent_male','Percent_female','Percent_white','Percent_Black','Percent_native',
        'Percent_asian','Percent_pacific','Percent_latinX','Percent_PoC','Percent_disabled', 'Percent_hs_grad',
     'Percent_bachelors','Percent_grad_degree','Percent_priv_health','Percent_public_health','Percent_no_health']].mean()


Unnamed: 0_level_0,Percent_male,Percent_female,Percent_white,Percent_Black,Percent_native,Percent_asian,Percent_pacific,Percent_latinX,Percent_PoC,Percent_disabled,Percent_hs_grad,Percent_bachelors,Percent_grad_degree,Percent_priv_health,Percent_public_health,Percent_no_health
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2009,,,,,,,,,,,,,,,,
2010,0.499638,0.500362,0.857873,0.089985,0.021619,0.012652,0.001112,0.084234,0.142127,,0.238241,0.084323,0.043198,,,
2011,0.499541,0.500459,0.8397,0.110203,0.018216,0.013882,0.00113,0.064932,0.1603,,0.23748,0.084378,0.043308,,,
2012,0.499773,0.500227,0.838104,0.110548,0.018413,0.014356,0.001165,0.066211,0.161896,0.149516,0.236568,0.085763,0.044233,0.627519,0.331782,0.14602
2013,0.499853,0.500147,0.835338,0.111947,0.018647,0.014926,0.001204,0.067619,0.164662,0.150693,0.237064,0.087005,0.045306,0.619998,0.339705,0.146622
2014,0.500625,0.499375,0.850215,0.092106,0.022659,0.014644,0.001266,0.091117,0.149785,0.150614,0.237156,0.090137,0.047232,0.625153,0.341686,0.14075
2015,0.500818,0.499182,0.848638,0.092428,0.022878,0.01511,0.001301,0.092774,0.151362,0.151874,0.237364,0.091465,0.048343,0.62807,0.350993,0.130266
2016,0.500852,0.499148,0.847006,0.092782,0.023124,0.015556,0.001332,0.094446,0.152994,0.15326,0.236615,0.093128,0.049572,0.631643,0.360194,0.118918
2017,0.500875,0.499125,0.845457,0.093154,0.023354,0.015933,0.001368,0.096121,0.154543,0.153586,0.235818,0.094946,0.050888,0.635518,0.367423,0.107984
2018,0.500904,0.499096,0.843955,0.093561,0.023547,0.016242,0.001411,0.097676,0.156045,0.153959,0.23552,0.096597,0.052234,0.638463,0.376279,0.097646


In [53]:
percent_change = df.groupby('Year')[['Percent_male','Percent_female','Percent_white','Percent_Black','Percent_native',
        'Percent_asian','Percent_pacific','Percent_latinX','Percent_PoC','Percent_65+']].mean().pct_change().mean()
percent_change



Percent_male       0.000274
Percent_female    -0.000274
Percent_white     -0.001753
Percent_Black      0.008249
Percent_native     0.012824
Percent_asian      0.027599
Percent_pacific    0.019269
Percent_latinX     0.024408
Percent_PoC        0.011422
Percent_65+       -0.052165
dtype: float64

### Multiply each 2019 value by the calculated percent change by above, to derive a 2020 estimate

In [54]:
dems_19_20 = df[['FIPS','Percent_male','Percent_female','Percent_white','Percent_Black','Percent_native',
        'Percent_asian','Percent_pacific','Percent_latinX','Percent_PoC']][df.Year=='2019'].drop_duplicates()

dems_19_20['Percent_male'] = dems_19_20['Percent_male'] + dems_19_20['Percent_male']*0.000274
dems_19_20['Percent_female'] = dems_19_20['Percent_female'] + dems_19_20['Percent_female']* -0.000274
dems_19_20['Percent_white'] = dems_19_20['Percent_white']+ dems_19_20['Percent_white']* -0.001753
dems_19_20['Percent_Black'] = dems_19_20['Percent_Black'] +dems_19_20['Percent_Black']* 0.008249
dems_19_20['Percent_native'] = dems_19_20['Percent_native']+ dems_19_20['Percent_native']* 0.012824
dems_19_20['Percent_asian'] = dems_19_20['Percent_asian'] +dems_19_20['Percent_asian']* 0.027599
dems_19_20['Percent_pacific'] = dems_19_20['Percent_pacific'] +dems_19_20['Percent_pacific']* 0.019269
dems_19_20['Percent_latinX'] = dems_19_20['Percent_latinX'] +dems_19_20['Percent_latinX']* 0.024408
dems_19_20['Percent_PoC'] = dems_19_20['Percent_PoC'] + dems_19_20['Percent_PoC']* 0.011422


### Drop nulls, and add the newly calculated 2020 values into the main df

In [55]:
df_20_dropped_null = df[df.Year=='2020'].drop(['Percent_male','Percent_female','Percent_white','Percent_Black',
            'Percent_native','Percent_asian','Percent_pacific','Percent_latinX','Percent_PoC'],axis=1)
df = df[df.Year!='2020']
df_20_new = df_20_dropped_null.merge(dems_19_20,on='FIPS',how='outer')

df = pd.concat([df, df_20_new])

# Polynomial Features
**Note:** Decided to omit polynomial features, because they decreased model performance.

In [56]:
# only using derived percentages, ignore raw counts
continuous_features = ['Rent', 'Houseless_rate','Sheltered_rate', 'Unsheltered_rate', 'TOT_POP',
       'Cost Per Meal', 'Num_wholesale','Num_restaraunts', 'Num_grocery',  'Unemployment_rate', 'Percent_male', 
         'Percent_female','Percent_white', 'Percent_Black', 'Percent_native', 'Percent_asian',
       'Percent_pacific', 'Percent_latinX','Percent_working', 'Total_food_retail','Food_retail_per_person',
        'Percent_disabled','Percent_hs_grad','Percent_bachelors','Percent_grad_degree','Percent_priv_health',
            'Percent_public_health','Percent_no_health','Percent_65+','Percent_non_citizen','Percent_hh_no_vehicle',
                    'Percent_hh_SNAP','percent_hh_poverty','hh_avg_size','num_hh','hh_med_income']


In [57]:
## add squared and cubed polynomials for each continuous feature
# for feat in continuous_features:
#     df[feat+'^2'] = df[feat]**2
#     df[feat+'^3'] = df[feat]**3


# Interaction Features
Create an interaction feature for each combination of continuous features, and add best ones to dataframe

In [58]:
# Generate combinations of features
y = df.dropna()[['FI Rate']]
X = df.dropna()[continuous_features]
interactions = list(combinations(X.columns, 2))
interaction_dict = {}

# run simple regression model with each possible interaction, and save R-squared for each interaction in a dictionary
for interaction in interactions:
    X_copy = X.copy()
    X_copy['interact'] = X_copy[interaction[0]] * X_copy[interaction[1]] 
    X_copy = X_copy.replace([np.inf, -np.inf], 0)
    model = LinearRegression()
    model.fit(X_copy, y)
    interaction_dict[model.score(X_copy, y)] = interaction 


### Add best 50 interactions to dataframe

In [59]:
# Sort the interactions dictionary, and add best 50 interactions to dataframe
top_interactions = sorted(interaction_dict.keys(), reverse = True)[:50]
for interaction in top_interactions:
    feature1 = interaction_dict[interaction][0]
    feature2 = interaction_dict[interaction][1]
    df[feature1+'_X_'+feature2] = df[feature1] * df[feature2] #also add to new_features df


# Log Transformations
Take natural log of each continuous feature, and add these log features to dataframe

In [60]:
for feat in continuous_features:
    df['log_'+feat] = df[feat].map(lambda x: np.log(x))
df = df.replace([np.inf, -np.inf], 0)

# Dummy Variables
High and Low Threshold programs delineate the assistance programs provided by State

In [61]:
# Create dummy variables for high and low threshold programs, and add to dataframe
hi_thresh_dummies = pd.get_dummies(df['High Threshold Type'].astype(str), dtype=int)
hi_thresh_dummies['other'] = hi_thresh_dummies['Other Nutrition Program'] + hi_thresh_dummies['other nutrition pgm']
hi_thresh_dummies.drop(['Other Nutrition Program','other nutrition pgm','nan'],axis=1,inplace=True) # drop last col
hi_thresh_dummies.rename(columns = {'SNAP, Other Nutrition Programs': 'SNAP_other'}, inplace=True)
hi_thresh_dummies.columns = 'Hi_thresh_'+hi_thresh_dummies.columns

low_thresh_dummies = pd.get_dummies(df['Low Threshold Type'].astype(str), dtype=int)
low_thresh_dummies.drop('nan', axis=1,inplace=True)
low_thresh_dummies.rename(columns = {'SNAP, Other Nutrition Programs': 'SNAP_other'}, inplace=True)
low_thresh_dummies.columns = 'Lo_thresh_'+low_thresh_dummies.columns

df = pd.concat([df, low_thresh_dummies, hi_thresh_dummies],axis=1)


### Pickle the new dataframe
Save the dataframe with all new features added (demographic percentages, interactions, logs, and dummies)

In [62]:
with open('../pickled/feature_engineered_data.pickle', "wb") as output_file:
    pickle.dump(df, output_file)