# Projecting Food Insecurity Rates in the US by County
## Feature Engineering
The following process imports a cleaned dataset produced from [cleaning_pt2.ipynb.](notebooks/cleaning_pt2) This notebook is used to produce new features that will be used in the modeling process.
### Flatiron School Data Science Capstone<br>By Khyatee Desai

In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
import sklearn
from sklearn.linear_model import LinearRegression
pd.set_option('display.max_columns', None)
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
with open('../pickled/fully_cleaned_data.pickle', "rb") as input_file:
    df = pickle.load(input_file) 
df.sample(3)

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,hh_med_income,pop_disabled,pop_hs_grad,pop_bachelors,pop_grad_degree,pop_priv_health,pop_public_health,pop_no_health,pop_total,percent_hh_poverty,hh_avg_size,pop_65+,hh_no_vehicle,num_hh,pop_non_citizen,hh_SNAP,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,TOT_LATINX,State/County,FI Rate,Low Threshold Type,High Threshold Type,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate
10619,48161,,2019,TX-604,0.004923,0.003163,0.00176,49471.0,190.0,1411.0,1503.0,1461.0,10874.0,7222.0,2553.0,0.0,13.8,2520.0,1734.0,364.0,6758.0,553.0,920.0,Texas,Freestone County,19717.0,10246.0,9471.0,15718.0,3124.0,288.0,176.0,6.0,3169.0,"Freestone County, TX",,,,,,,,6400.5,6097.0,303.5,4.725
35454,28103,,2017,MS-501,0.003442,0.002227,0.001215,30808.0,309.0,2155.0,589.0,333.0,4398.0,5655.0,1529.0,10942.0,33.4,2.68,1591.0,569.0,4002.0,132.0,1357.0,Mississippi,Noxubee County,10711.0,5062.0,5649.0,2866.0,7688.0,39.0,28.0,2.0,141.0,"Noxubee County, MS",0.269,SNAP,Other Nutrition Program,3.19,0.0,47.0,0.0,3907.0,3621.0,286.0,7.3
55554,42033,,2009,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Clearfield County, PA",0.158,SNAP,other nutrition pgm,2.2987,,,,40730.0,36674.0,4056.0,10.0


In [3]:
df.replace(-888888888.0,np.nan,inplace=True)

In [4]:
df['Percent_male'] = df['TOT_MALE']/df['TOT_POP']
df['Percent_female'] = df['TOT_FEMALE']/df['TOT_POP']
df['Percent_white'] = df['TOT_WHITE']/df['TOT_POP']
df['Percent_Black'] = df['TOT_BLACK']/df['TOT_POP']
df['Percent_native'] = df['TOT_NATIVE']/df['TOT_POP']
df['Percent_asian'] = df['TOT_ASIAN']/df['TOT_POP']
df['Percent_pacific'] = df['TOT_PACIFIC']/df['TOT_POP']
df['Percent_latinX'] = df['TOT_LATINX']/df['TOT_POP']

## Percentage PoC
Percentage of a county that is not white

In [5]:
df['Percent_PoC'] = 1-df['Percent_white']

### Workforce as a percentage of total population
Workforce represented as percentage, so that it can be compared across different counties

In [6]:
# percent of population that is working
df['Percent_working'] = df['Total_workforce']/df['TOT_POP']


### ACS Survey Percentages
Take percentages of all features taken from the ACS survey, which use a slightly different (~5%) total population number for the denominator

In [7]:
df['Percent_disabled'] = df['pop_disabled']/df['TOT_POP']
df['Percent_hs_grad'] = df['pop_hs_grad']/df['TOT_POP']
df['Percent_bachelors'] = df['pop_bachelors']/df['TOT_POP']
df['Percent_grad_degree'] = df['pop_grad_degree']/df['TOT_POP']
df['Percent_priv_health'] = df['pop_priv_health']/df['TOT_POP']
df['Percent_public_health'] = df['pop_public_health']/df['TOT_POP']
df['Percent_no_health'] = df['pop_no_health']/df['TOT_POP']
df['Percent_65+'] = df['pop_65+']/df['TOT_POP']
df['Percent_non_citizen'] = df['pop_non_citizen']/df['TOT_POP']
df['Percent_hh_no_vehicle'] = df['hh_no_vehicle']/df['num_hh']
df['Percent_hh_SNAP'] = df['hh_SNAP']/df['num_hh']

## Drop raw count columns after deriving percentages

In [8]:
df.drop(['TOT_MALE','TOT_FEMALE','TOT_WHITE','TOT_BLACK','TOT_NATIVE','TOT_ASIAN','TOT_PACIFIC','TOT_LATINX',
         'pop_disabled', 'pop_hs_grad','pop_bachelors', 'pop_grad_degree', 'pop_priv_health', 'pop_public_health',
        'pop_no_health', 'pop_65+','pop_non_citizen','hh_no_vehicle','hh_SNAP','pop_total'],axis=1,inplace=True)

### Sum of Food Establishments
Total number of food retail businesses, which is the sum of the three categories (wholesalers, restaraunts, and grocery stores)

In [9]:
df['Total_food_retail'] = df['Num_wholesale'].fillna(0)+ df['Num_restaraunts'].fillna(0)+df['Num_grocery'].fillna(0)

### Population divided by number of food establishments 
Looking at prevalence of food establishments as a function of population - aka how many Food Retail establishments exist per person within a County

In [10]:
df['Food_retail_per_person'] = df['Total_food_retail']/df['TOT_POP']


# Polynomial Features
**Note:** Decided to omit polynomial features, because they decreased model performance.

In [14]:
# only using derived percentages, ignore raw counts
continuous_features = ['Rent', 'Houseless_rate','Sheltered_rate', 'Unsheltered_rate', 'TOT_POP',
       'Cost Per Meal', 'Num_wholesale','Num_restaraunts', 'Num_grocery',  'Unemployment_rate', 'Percent_male', 
         'Percent_female','Percent_white', 'Percent_Black', 'Percent_native', 'Percent_asian',
       'Percent_pacific', 'Percent_latinX','Percent_working', 'Total_food_retail','Food_retail_per_person',
        'Percent_disabled','Percent_hs_grad','Percent_bachelors','Percent_grad_degree','Percent_priv_health',
            'Percent_public_health','Percent_no_health','Percent_65+','Percent_non_citizen','Percent_hh_no_vehicle',
                    'Percent_hh_SNAP','percent_hh_poverty','hh_avg_size','num_hh','hh_med_income']


In [15]:
## add squared and cubed polynomials for each continuous feature
# for feat in continuous_features:
#     df[feat+'^2'] = df[feat]**2
#     df[feat+'^3'] = df[feat]**3


# Interaction Features
Create an interaction feature for each combination of continuous features, and add best ones to dataframe

In [16]:
# Generate combinations of features
y = df.dropna()[['FI Rate']]
X = df.dropna()[continuous_features]
interactions = list(combinations(X.columns, 2))
interaction_dict = {}

# run simple regression model with each possible interaction, and save R-squared for each interaction in a dictionary
for interaction in interactions:
    X_copy = X.copy()
    X_copy['interact'] = X_copy[interaction[0]] * X_copy[interaction[1]] 
    X_copy = X_copy.replace([np.inf, -np.inf], 0)
    model = LinearRegression()
    model.fit(X_copy, y)
    interaction_dict[model.score(X_copy, y)] = interaction 


### Add best 50 interactions to dataframe

In [17]:
# Sort the interactions dictionary, and add best 50 interactions to dataframe
top_interactions = sorted(interaction_dict.keys(), reverse = True)[:50]
for interaction in top_interactions:
    feature1 = interaction_dict[interaction][0]
    feature2 = interaction_dict[interaction][1]
    df[feature1+'_X_'+feature2] = df[feature1] * df[feature2] #also add to new_features df


# Log Transformations
Take natural log of each continuous feature, and add these log features to dataframe

In [18]:
for feat in continuous_features:
    df['log_'+feat] = df[feat].map(lambda x: np.log(x))
df = df.replace([np.inf, -np.inf], 0)

# Dummy Variables
High and Low Threshold programs delineate the assistance programs provided by State

In [19]:
# Create dummy variables for high and low threshold programs, and add to dataframe
hi_thresh_dummies = pd.get_dummies(df['High Threshold Type'].astype(str), dtype=int)
hi_thresh_dummies['other'] = hi_thresh_dummies['Other Nutrition Program'] + hi_thresh_dummies['other nutrition pgm']
hi_thresh_dummies.drop(['Other Nutrition Program','other nutrition pgm','nan'],axis=1,inplace=True) # drop last col
hi_thresh_dummies.rename(columns = {'SNAP, Other Nutrition Programs': 'SNAP_other'}, inplace=True)
hi_thresh_dummies.columns = 'Hi_thresh_'+hi_thresh_dummies.columns

low_thresh_dummies = pd.get_dummies(df['Low Threshold Type'].astype(str), dtype=int)
low_thresh_dummies.drop('nan', axis=1,inplace=True)
low_thresh_dummies.rename(columns = {'SNAP, Other Nutrition Programs': 'SNAP_other'}, inplace=True)
low_thresh_dummies.columns = 'Lo_thresh_'+low_thresh_dummies.columns

df = pd.concat([df, low_thresh_dummies, hi_thresh_dummies],axis=1)


### Pickle the new dataframe
Save the dataframe with all new features added (demographic percentages, interactions, logs, and dummies)

In [16]:
df[(df.FIPS=='01003')&(df.Year.isin(['2017','2018','2019']))]

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,hh_med_income,percent_hh_poverty,hh_avg_size,num_hh,State,County,TOT_POP,State/County,FI Rate,Low Threshold Type,High Threshold Type,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate,Percent_male,Percent_female,Percent_white,Percent_Black,Percent_native,Percent_asian,Percent_pacific,Percent_latinX,Percent_PoC,Percent_working,Percent_disabled,Percent_hs_grad,Percent_bachelors,Percent_grad_degree,Percent_priv_health,Percent_public_health,Percent_no_health,Percent_65+,Percent_non_citizen,Percent_hh_no_vehicle,Percent_hh_SNAP,Total_food_retail,Food_retail_per_person
34098,1003,,2017,AL-501,0.009188,0.00517,0.004018,52562.0,8.2,2.63,76133.0,Alabama,Baldwin County,212521.0,"Baldwin County, AL",0.116,SNAP,Other Nutrition Program,3.57,47.0,1950.0,778.0,92456.0,88711.0,3745.0,4.1,0.485684,0.514316,0.873208,0.090589,0.007684,0.01047,0.000649,0.045553,0.126792,0.435044,0.006809,0.187139,0.137572,0.069316,0.657624,0.32977,0.101976,0.179211,0.016163,0.03352,0.081003,2775.0,0.013058
36905,1003,,2018,AL-501,0.008286,0.004782,0.003504,55962.0,7.3,2.61,78622.0,Alabama,Baldwin County,217855.0,"Baldwin County, AL",0.129,SNAP,Other Nutrition Program,3.58,58.0,1955.0,551.0,95233.0,91809.0,3424.0,3.6,0.485194,0.514806,0.874104,0.089344,0.007578,0.010507,0.000652,0.046531,0.125896,0.437139,0.005586,0.186266,0.139685,0.071809,0.662817,0.331973,0.09577,0.18359,0.017677,0.034405,0.075373,2564.0,0.011769
4,1003,,2019,AL-501,0.007538,0.004523,0.003015,58320.0,6.9,28759.0,80930.0,Alabama,Baldwin County,223234.0,"Baldwin County, AL",,,,,,,,96640.75,94392.75,2248.0,2.325,0.484904,0.515096,0.87441,0.087769,0.007803,0.010661,0.00069,0.047188,0.12559,0.432912,0.002123,0.046704,0.063951,0.142456,0.664944,0.344504,0.083675,0.064108,0.016351,0.032818,0.077462,0.0,0.0
5,1003,,2019,AL-501,0.007538,0.004523,0.003015,58320.0,6.9,28759.0,80930.0,Alabama,Baldwin County,223234.0,"Baldwin County, AL",,,,,,,,96640.75,94392.75,2248.0,2.325,0.484904,0.515096,0.87441,0.087769,0.007803,0.010661,0.00069,0.047188,0.12559,0.432912,0.002123,0.046704,0.063951,0.142456,0.664944,0.344504,0.083675,0.064108,0.016351,0.032818,0.077462,0.0,0.0
6,1003,,2019,AL-501,0.007538,0.004523,0.003015,58320.0,6.9,28759.0,80930.0,Alabama,Baldwin County,223234.0,"Baldwin County, AL",,,,,,,,96640.75,94392.75,2248.0,2.325,0.484904,0.515096,0.87441,0.087769,0.007803,0.010661,0.00069,0.047188,0.12559,0.432912,0.002123,0.046704,0.063951,0.142456,0.664944,0.344504,0.083675,0.064108,0.016351,0.032818,0.077462,0.0,0.0
7,1003,,2019,AL-501,0.007538,0.004523,0.003015,58320.0,6.9,28759.0,80930.0,Alabama,Baldwin County,223234.0,"Baldwin County, AL",,,,,,,,96640.75,94392.75,2248.0,2.325,0.484904,0.515096,0.87441,0.087769,0.007803,0.010661,0.00069,0.047188,0.12559,0.432912,0.002123,0.046704,0.063951,0.142456,0.664944,0.344504,0.083675,0.064108,0.016351,0.032818,0.077462,0.0,0.0


In [21]:
with open('../pickled/feature_engineered_data.pickle', "wb") as output_file:
    pickle.dump(df, output_file)

In [22]:
df[df['Year']=='2020']

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,hh_med_income,percent_hh_poverty,hh_avg_size,num_hh,State,County,TOT_POP,State/County,FI Rate,Low Threshold Type,High Threshold Type,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate,Percent_male,Percent_female,Percent_white,Percent_Black,Percent_native,Percent_asian,Percent_pacific,Percent_latinX,Percent_PoC,Percent_working,Percent_disabled,Percent_hs_grad,Percent_bachelors,Percent_grad_degree,Percent_priv_health,Percent_public_health,Percent_no_health,Percent_65+,Percent_non_citizen,Percent_hh_no_vehicle,Percent_hh_SNAP,Total_food_retail,Food_retail_per_person,Unemployment_rate_X_Percent_Black,Unemployment_rate_X_Percent_latinX,Unemployment_rate_X_Percent_white,Houseless_rate_X_Sheltered_rate,Percent_latinX_X_hh_med_income,Unemployment_rate_X_hh_avg_size,Percent_pacific_X_Percent_hs_grad,Percent_Black_X_Percent_hh_SNAP,Rent_X_Unemployment_rate,Sheltered_rate_X_Percent_grad_degree,Rent_X_Percent_non_citizen,Unemployment_rate_X_Percent_non_citizen,Cost Per Meal_X_Percent_Black,Percent_white_X_Percent_hh_SNAP,Percent_latinX_X_Percent_hh_SNAP,Percent_pacific_X_Percent_public_health,Percent_Black_X_Percent_hs_grad,Unemployment_rate_X_Percent_female,Percent_white_X_Percent_pacific,Houseless_rate_X_Percent_pacific,Percent_latinX_X_percent_hh_poverty,Percent_Black_X_Percent_grad_degree,Unemployment_rate_X_Percent_hs_grad,Rent_X_Percent_Black,Percent_latinX_X_Percent_hs_grad,Percent_disabled_X_num_hh,Unemployment_rate_X_Percent_hh_SNAP,Percent_pacific_X_Percent_65+,Percent_Black_X_hh_med_income,Percent_latinX_X_Percent_bachelors,Sheltered_rate_X_Percent_pacific,Percent_Black_X_percent_hh_poverty,Percent_Black_X_Percent_public_health,Percent_female_X_Percent_hs_grad,Percent_white_X_percent_hh_poverty,TOT_POP_X_Percent_disabled,Percent_white_X_Percent_public_health,Percent_pacific_X_Percent_hh_SNAP,Percent_hs_grad_X_hh_avg_size,Percent_hh_SNAP_X_percent_hh_poverty,Num_grocery_X_Percent_disabled,Percent_male_X_Percent_hh_SNAP,Percent_female_X_Percent_hh_SNAP,Rent_X_Percent_hs_grad,Percent_asian_X_Percent_pacific,Percent_latinX_X_Percent_priv_health,Percent_hs_grad_X_Percent_hh_SNAP,Percent_latinX_X_Percent_public_health,Sheltered_rate_X_Percent_no_health,Percent_white_X_Percent_priv_health,log_Rent,log_Houseless_rate,log_Sheltered_rate,log_Unsheltered_rate,log_TOT_POP,log_Cost Per Meal,log_Num_wholesale,log_Num_restaraunts,log_Num_grocery,log_Unemployment_rate,log_Percent_male,log_Percent_female,log_Percent_white,log_Percent_Black,log_Percent_native,log_Percent_asian,log_Percent_pacific,log_Percent_latinX,log_Percent_working,log_Total_food_retail,log_Food_retail_per_person,log_Percent_disabled,log_Percent_hs_grad,log_Percent_bachelors,log_Percent_grad_degree,log_Percent_priv_health,log_Percent_public_health,log_Percent_no_health,log_Percent_65+,log_Percent_non_citizen,log_Percent_hh_no_vehicle,log_Percent_hh_SNAP,log_percent_hh_poverty,log_hh_avg_size,log_num_hh,log_hh_med_income,Lo_thresh_SNAP,Lo_thresh_SNAP_other,Hi_thresh_SNAP,Hi_thresh_SNAP_other,Hi_thresh_other
0,01001,,2020,,,,,,,,,,,,"Autauga County, AL",,,,,,,,25485.1,24078.400,1406.700,5.53,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.710188,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,0,0,0,0,0
1,01001,,2020,,,,,,,,,,,,"Autauga County, AL",,,,,,,,25485.1,24078.400,1406.700,5.53,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.710188,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,0,0,0,0,0
2,01001,,2020,,,,,,,,,,,,"Autauga County, AL",,,,,,,,25485.1,24078.400,1406.700,5.53,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.710188,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,0,0,0,0,0
3,01001,,2020,,,,,,,,,,,,"Autauga County, AL",,,,,,,,25485.1,24078.400,1406.700,5.53,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.710188,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,0,0,0,0,0
4,01001,,2020,,,,,,,,,,,,"Autauga County, AL",,,,,,,,25485.1,24078.400,1406.700,5.53,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.710188,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32497,72153,,2020,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,9261.5,8086.875,1174.625,12.55,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.529721,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,0,0,0,0,0
32498,72153,,2020,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,9261.5,8086.875,1174.625,12.55,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.529721,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,0,0,0,0,0
32499,72153,,2020,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,9261.5,8086.875,1174.625,12.55,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.529721,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,0,0,0,0,0
32500,72153,,2020,,,,,,,,,,,,"Yauco Municipio, PR",,,,,,,,9261.5,8086.875,1174.625,12.55,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.529721,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,0,0,0,0,0
