# Projecting Food Insecurity Rates in the US by County
## Feature Engineering
The following process imports a cleaned dataset produced from [cleaning_pt2.ipynb.](notebooks/cleaning_pt2) This notebook is used to produce new features that will be used in the modeling process.
### Flatiron School Data Science Capstone<br>By Khyatee Desai

In [2]:
import pandas as pd
import numpy as np
from itertools import combinations
import sklearn
from sklearn.linear_model import LinearRegression
pd.set_option('display.max_columns', None)
import pickle
import warnings
warnings.filterwarnings('ignore')

In [3]:
with open('../pickled/fully_cleaned_data.pickle', "rb") as input_file:
    df = pickle.load(input_file) 
df.sample(3)

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,hh_med_income,pop_disabled,pop_hs_grad,pop_bachelors,pop_grad_degree,pop_priv_health,pop_public_health,pop_no_health,pop_total,percent_hh_poverty,hh_avg_size,pop_65+,hh_no_vehicle,num_hh,pop_non_citizen,hh_SNAP,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,TOT_LATINX,State/County,FI Rate,Low Threshold Type,High Threshold Type,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate
64,1041,,2012,AL-507,0.004722,0.004349,0.000373,37309.0,312.0,3691.0,607.0,438.0,8984.0,5256.0,1586.0,13971.0,13.6,2.44,2108.0,1751.0,5648.0,274.0,952.0,Alabama,Crenshaw County,13915.0,6756.0,7159.0,10049.0,3344.0,75.0,213.0,11.0,217.0,"Crenshaw County, AL",0.168,,,2.89,0.0,0.0,26.0,6251.0,5732.0,519.0,8.3
12593,26033,,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Chippewa County, MI",,,,,,,,16551.8,14961.9,1589.9,9.6
36895,2195,,2018,AK-501,0.0197,0.015106,0.004594,66907.0,97.0,680.0,469.0,120.0,2047.0,1321.0,296.0,3255.0,6.9,2.72,701.0,185.0,1170.0,162.0,130.0,Alaska,Petersburg Borough,3244.0,1696.0,1548.0,2450.0,57.0,358.0,115.0,29.0,187.0,"Petersburg Borough, AK",0.138,SNAP,Other Nutrition Program,3.73,18.0,27.0,9.0,1379.0,1261.0,118.0,8.6


In [5]:
df.replace(-888888888.0,np.nan,inplace=True)

In [7]:
df['Percent_male'] = df['TOT_MALE']/df['TOT_POP']
df['Percent_female'] = df['TOT_FEMALE']/df['TOT_POP']
df['Percent_white'] = df['TOT_WHITE']/df['TOT_POP']
df['Percent_Black'] = df['TOT_BLACK']/df['TOT_POP']
df['Percent_native'] = df['TOT_NATIVE']/df['TOT_POP']
df['Percent_asian'] = df['TOT_ASIAN']/df['TOT_POP']
df['Percent_pacific'] = df['TOT_PACIFIC']/df['TOT_POP']
df['Percent_latinX'] = df['TOT_LATINX']/df['TOT_POP']

## Percentage PoC
Percentage of a county that is not white

In [8]:
df['Percent_PoC'] = 1-df['Percent_white']

### Workforce as a percentage of total population
Workforce represented as percentage, so that it can be compared across different counties

In [9]:
# percent of population that is working
df['Percent_working'] = df['Total_workforce']/df['TOT_POP']


### ACS Survey Percentages
Take percentages of all features taken from the ACS survey, which use a slightly different (~5%) total population number for the denominator

In [10]:
df['Percent_disabled'] = df['pop_disabled']/df['TOT_POP']
df['Percent_hs_grad'] = df['pop_hs_grad']/df['TOT_POP']
df['Percent_bachelors'] = df['pop_bachelors']/df['TOT_POP']
df['Percent_grad_degree'] = df['pop_grad_degree']/df['TOT_POP']
df['Percent_priv_health'] = df['pop_priv_health']/df['TOT_POP']
df['Percent_public_health'] = df['pop_public_health']/df['TOT_POP']
df['Percent_no_health'] = df['pop_no_health']/df['TOT_POP']
df['Percent_65+'] = df['pop_65+']/df['TOT_POP']
df['Percent_non_citizen'] = df['pop_non_citizen']/df['TOT_POP']
df['Percent_hh_no_vehicle'] = df['hh_no_vehicle']/df['num_hh']
df['Percent_hh_SNAP'] = df['hh_SNAP']/df['num_hh']

## Drop raw count columns after deriving percentages

In [11]:
df.drop(['TOT_MALE','TOT_FEMALE','TOT_WHITE','TOT_BLACK','TOT_NATIVE','TOT_ASIAN','TOT_PACIFIC','TOT_LATINX',
         'pop_disabled', 'pop_hs_grad','pop_bachelors', 'pop_grad_degree', 'pop_priv_health', 'pop_public_health',
        'pop_no_health', 'pop_65+','pop_non_citizen','hh_no_vehicle','hh_SNAP','pop_total'],axis=1,inplace=True)

### Sum of Food Establishments
Total number of food retail businesses, which is the sum of the three categories (wholesalers, restaraunts, and grocery stores)

In [12]:
df['Total_food_retail'] = df['Num_wholesale'].fillna(0)+ df['Num_restaraunts'].fillna(0)+df['Num_grocery'].fillna(0)

### Population divided by number of food establishments 
Looking at prevalence of food establishments as a function of population - aka how many Food Retail establishments exist per person within a County

In [13]:
df['Food_retail_per_person'] = df['Total_food_retail']/df['TOT_POP']


# Polynomial Features
**Note:** Decided to omit polynomial features, because they decreased model performance.

In [14]:
# only using derived percentages, ignore raw counts
continuous_features = ['Rent', 'Houseless_rate','Sheltered_rate', 'Unsheltered_rate', 'TOT_POP',
       'Cost Per Meal', 'Num_wholesale','Num_restaraunts', 'Num_grocery',  'Unemployment_rate', 'Percent_male', 
         'Percent_female','Percent_white', 'Percent_Black', 'Percent_native', 'Percent_asian',
       'Percent_pacific', 'Percent_latinX','Percent_working', 'Total_food_retail','Food_retail_per_person',
        'Percent_disabled','Percent_hs_grad','Percent_bachelors','Percent_grad_degree','Percent_priv_health',
            'Percent_public_health','Percent_no_health','Percent_65+','Percent_non_citizen','Percent_hh_no_vehicle',
                    'Percent_hh_SNAP','percent_hh_poverty','hh_avg_size','num_hh','hh_med_income']


In [15]:
## add squared and cubed polynomials for each continuous feature
# for feat in continuous_features:
#     df[feat+'^2'] = df[feat]**2
#     df[feat+'^3'] = df[feat]**3


# Interaction Features
Create an interaction feature for each combination of continuous features, and add best ones to dataframe

In [16]:
# Generate combinations of features
y = df.dropna()[['FI Rate']]
X = df.dropna()[continuous_features]
interactions = list(combinations(X.columns, 2))
interaction_dict = {}

# run simple regression model with each possible interaction, and save R-squared for each interaction in a dictionary
for interaction in interactions:
    X_copy = X.copy()
    X_copy['interact'] = X_copy[interaction[0]] * X_copy[interaction[1]] 
    X_copy = X_copy.replace([np.inf, -np.inf], 0)
    model = LinearRegression()
    model.fit(X_copy, y)
    interaction_dict[model.score(X_copy, y)] = interaction 


### Add best 50 interactions to dataframe

In [17]:
# Sort the interactions dictionary, and add best 50 interactions to dataframe
top_interactions = sorted(interaction_dict.keys(), reverse = True)[:50]
for interaction in top_interactions:
    feature1 = interaction_dict[interaction][0]
    feature2 = interaction_dict[interaction][1]
    df[feature1+'_X_'+feature2] = df[feature1] * df[feature2] #also add to new_features df


# Log Transformations
Take natural log of each continuous feature, and add these log features to dataframe

In [18]:
for feat in continuous_features:
    df['log_'+feat] = df[feat].map(lambda x: np.log(x))
df = df.replace([np.inf, -np.inf], 0)

# Dummy Variables
High and Low Threshold programs delineate the assistance programs provided by State

In [19]:
# Create dummy variables for high and low threshold programs, and add to dataframe
hi_thresh_dummies = pd.get_dummies(df['High Threshold Type'].astype(str), dtype=int)
hi_thresh_dummies['other'] = hi_thresh_dummies['Other Nutrition Program'] + hi_thresh_dummies['other nutrition pgm']
hi_thresh_dummies.drop(['Other Nutrition Program','other nutrition pgm','nan'],axis=1,inplace=True) # drop last col
hi_thresh_dummies.rename(columns = {'SNAP, Other Nutrition Programs': 'SNAP_other'}, inplace=True)
hi_thresh_dummies.columns = 'Hi_thresh_'+hi_thresh_dummies.columns

low_thresh_dummies = pd.get_dummies(df['Low Threshold Type'].astype(str), dtype=int)
low_thresh_dummies.drop('nan', axis=1,inplace=True)
low_thresh_dummies.rename(columns = {'SNAP, Other Nutrition Programs': 'SNAP_other'}, inplace=True)
low_thresh_dummies.columns = 'Lo_thresh_'+low_thresh_dummies.columns

df = pd.concat([df, low_thresh_dummies, hi_thresh_dummies],axis=1)


### Pickle the new dataframe
Save the dataframe with all new features added (demographic percentages, interactions, logs, and dummies)

In [20]:
df.sample(5)

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,hh_med_income,percent_hh_poverty,hh_avg_size,num_hh,State,County,TOT_POP,State/County,FI Rate,Low Threshold Type,High Threshold Type,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate,Percent_male,Percent_female,Percent_white,Percent_Black,Percent_native,Percent_asian,Percent_pacific,Percent_latinX,Percent_PoC,Percent_working,Percent_disabled,Percent_hs_grad,Percent_bachelors,Percent_grad_degree,Percent_priv_health,Percent_public_health,Percent_no_health,Percent_65+,Percent_non_citizen,Percent_hh_no_vehicle,Percent_hh_SNAP,Total_food_retail,Food_retail_per_person,Unemployment_rate_X_Percent_Black,Unemployment_rate_X_Percent_latinX,Unemployment_rate_X_Percent_white,Houseless_rate_X_Sheltered_rate,Percent_latinX_X_hh_med_income,Unemployment_rate_X_hh_avg_size,Percent_pacific_X_Percent_hs_grad,Percent_Black_X_Percent_hh_SNAP,Rent_X_Unemployment_rate,Sheltered_rate_X_Percent_grad_degree,Rent_X_Percent_non_citizen,Unemployment_rate_X_Percent_non_citizen,Cost Per Meal_X_Percent_Black,Percent_white_X_Percent_hh_SNAP,Percent_latinX_X_Percent_hh_SNAP,Percent_pacific_X_Percent_public_health,Percent_Black_X_Percent_hs_grad,Unemployment_rate_X_Percent_female,Percent_white_X_Percent_pacific,Houseless_rate_X_Percent_pacific,Percent_latinX_X_percent_hh_poverty,Percent_Black_X_Percent_grad_degree,Unemployment_rate_X_Percent_hs_grad,Rent_X_Percent_Black,Percent_latinX_X_Percent_hs_grad,Percent_disabled_X_num_hh,Unemployment_rate_X_Percent_hh_SNAP,Percent_pacific_X_Percent_65+,Percent_Black_X_hh_med_income,Percent_latinX_X_Percent_bachelors,Sheltered_rate_X_Percent_pacific,Percent_Black_X_percent_hh_poverty,Percent_Black_X_Percent_public_health,Percent_female_X_Percent_hs_grad,Percent_white_X_percent_hh_poverty,TOT_POP_X_Percent_disabled,Percent_white_X_Percent_public_health,Percent_pacific_X_Percent_hh_SNAP,Percent_hs_grad_X_hh_avg_size,Percent_hh_SNAP_X_percent_hh_poverty,Num_grocery_X_Percent_disabled,Percent_male_X_Percent_hh_SNAP,Percent_female_X_Percent_hh_SNAP,Rent_X_Percent_hs_grad,Percent_asian_X_Percent_pacific,Percent_latinX_X_Percent_priv_health,Percent_hs_grad_X_Percent_hh_SNAP,Percent_latinX_X_Percent_public_health,Sheltered_rate_X_Percent_no_health,Percent_white_X_Percent_priv_health,log_Rent,log_Houseless_rate,log_Sheltered_rate,log_Unsheltered_rate,log_TOT_POP,log_Cost Per Meal,log_Num_wholesale,log_Num_restaraunts,log_Num_grocery,log_Unemployment_rate,log_Percent_male,log_Percent_female,log_Percent_white,log_Percent_Black,log_Percent_native,log_Percent_asian,log_Percent_pacific,log_Percent_latinX,log_Percent_working,log_Total_food_retail,log_Food_retail_per_person,log_Percent_disabled,log_Percent_hs_grad,log_Percent_bachelors,log_Percent_grad_degree,log_Percent_priv_health,log_Percent_public_health,log_Percent_no_health,log_Percent_65+,log_Percent_non_citizen,log_Percent_hh_no_vehicle,log_Percent_hh_SNAP,log_percent_hh_poverty,log_hh_avg_size,log_num_hh,log_hh_med_income,Lo_thresh_SNAP,Lo_thresh_SNAP_other,Hi_thresh_SNAP,Hi_thresh_SNAP_other,Hi_thresh_other
53515,6069,,2009,,,,,,,,,,,,"San Benito County, CA",0.175,SNAP,other nutrition pgm,3.26136,46.0,223.0,61.0,24864.0,21392.0,3472.0,14.0,,,,,,,,,,,,,,,,,,,,,,330.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.182144,3.828641,5.407172,4.110874,2.639057,,,,,,,,,,5.799093,,,,,,,,,,,,,,,,,1,0,0,0,1
18015,34025,2394.186508,2020,,,,,,,,,,,,"Monmouth County, NJ",,,,,,,,328093.5,298079.3,30014.2,9.15,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,21906.806548,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7.780799,,,,,,,,,2.213754,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,0,0,0,0,0
9122,41049,,2019,OR-505,0.04395,0.012771,0.031179,54269.0,10.6,1221.0,4108.0,Oregon,Morrow County,11603.0,"Morrow County, OR",,,,,,,,5681.0,5503.75,177.25,3.125,0.508575,0.491425,0.928984,0.011204,0.025166,0.005947,0.003189,0.377402,0.071016,0.489615,0.000431,0.086185,0.052142,0.038094,0.629837,0.387745,0.092735,0.067483,0.037835,0.022395,0.200828,0.0,0.0,0.035012,1.179382,2.903075,0.000561,20481.250625,3815.625,0.000275,0.00225,,0.000486,,0.118235,,0.186566,0.075793,0.001236,0.000966,1.535702,0.002962,0.00014,4.000465,0.000427,0.269327,,0.032526,1.770232,0.627586,0.000215,608.02982,0.019678,4.1e-05,0.118762,0.004344,0.042353,9.847229,5.0,0.360208,0.00064,105.231406,2.128773,,0.102136,0.098692,,1.9e-05,0.237702,0.017308,0.146336,0.001184,0.585109,,-3.124697,-4.360568,-3.468006,9.359019,,,,,1.139434,-0.676142,-0.710447,-0.073664,-4.491485,-3.682265,-5.124912,-5.748101,-0.974443,-0.714136,0.0,0.0,-7.749581,-2.451264,-2.953791,-3.267709,-0.462294,-0.947409,-2.378013,-2.695886,-3.27452,-3.798903,-1.605308,2.360854,7.107425,8.320692,10.901708,0,0,0,0,0
1524,28049,,2011,MS-500,0.01384,0.006535,0.007306,39290.0,18.1,2.69,88159.0,Mississippi,Hinds County,248205.0,"Hinds County, MS",0.2425,,,2.7,109.0,635.0,379.0,118305.0,107045.0,11260.0,9.5,0.470583,0.529417,0.283399,0.699249,0.001491,0.007639,0.000266,0.014899,0.716601,0.476642,,0.152608,0.104796,0.0641,,,,,0.012212,0.382752,0.16857,1123.0,0.004524,6.642862,0.14154,2.692289,9e-05,585.380714,25.555,4.1e-05,0.117873,,0.000419,,0.116011,1.887971,0.047773,0.002512,,0.106711,5.029464,7.5e-05,4e-06,0.269671,0.044822,1.449773,,0.002274,,1.601419,,27473.47769,0.001561,2e-06,12.6564,,0.080793,5.129518,,,4.5e-05,0.410515,3.051125,,0.079326,0.089244,,2e-06,,0.025725,,,,,-4.280165,-5.030613,-4.919117,12.42201,0.993252,4.691348,6.453625,5.937536,2.251292,-0.753783,-0.635978,-1.2609,-0.357749,-6.508507,-4.874509,-8.232356,-4.206463,-0.740989,7.023759,-5.398251,,-1.879885,-2.255735,-2.747307,,,,,-4.405362,-0.960369,-1.780402,2.895912,0.989541,11.386897,10.578725,0,0,0,0,0
55841,48049,,2009,,,,,,,,,,,,"Brown County, TX",0.177,SNAP,other nutrition pgm,2.38506,,,,19013.0,17717.0,1296.0,6.8,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.869224,,,,1.916923,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,1,0,0,0,1


In [21]:
with open('../pickled/feature_engineered_data.pickle', "wb") as output_file:
    pickle.dump(df, output_file)