# Feature Engineering
## Projecting Food Insecurity Rates in 2020
### Khyatee Desai

In [415]:
# import necessary libraries
import pandas as pd
import numpy as np
from itertools import combinations
import sklearn
from sklearn.linear_model import LinearRegression
pd.set_option('display.max_columns', None)
import pickle
import warnings
warnings.filterwarnings('ignore')

In [416]:
# Read the cleaned dataset from its pickle file and preview the first three rows
with open('../pickled/fully_cleaned_data.pickle', mode='rb') as pickle_in:
    df = pickle.load(pickle_in)
df.head(3)

Unnamed: 0,FIPS,Rent,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,State,County,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,TOT_LATINX,State/County,FI Rate,Low Threshold Type,High Threshold Type,Cost Per Meal,Num_wholesale,Num_restaraunts,Num_grocery,Total_workforce,Employed,Unemployed,Unemployment_rate
0,1073,1019.99596,2014,AL-500,0.001396,0.000961,0.000435,Alabama,Jefferson County,659972.0,312491.0,347481.0,355635.0,284082.0,2029.0,10378.0,347.0,24099.0,"Jefferson County, AL",0.197,SNAP,Other Nutrition Program,2.93,483.0,2693.0,400.0,312131.0,292505.0,19626.0,6.3
19,1117,1229.755051,2014,AL-500,0.001396,0.000961,0.000435,Alabama,Shelby County,206280.0,100304.0,105976.0,174094.0,24247.0,805.0,4403.0,101.0,11872.0,"Shelby County, AL",0.105,SNAP,Other Nutrition Program,3.37,1.0,743.0,2706.0,107208.0,102400.0,4808.0,4.5
38,4003,1051.25,2014,AZ-500,0.001469,0.000826,0.000643,Arizona,Cochise County,127314.0,64661.0,62653.0,112238.0,5737.0,2165.0,2757.0,510.0,44374.0,"Cochise County, AZ",0.161,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",2.81,800.0,72.0,340.0,50969.0,46682.0,4287.0,8.4


# Derived Features
New columns, derived from existing features
### Demographics Percentage Breakdown

In [417]:
# Each (new column, count column) pair below becomes count / TOT_POP.
# The list order is the order in which the new columns are appended to df.
demographic_shares = [
    ('Percent_male', 'TOT_MALE'),
    ('Percent_female', 'TOT_FEMALE'),
    ('Percent_white', 'TOT_WHITE'),
    ('Percent_Black', 'TOT_BLACK'),
    ('Percent_native', 'TOT_NATIVE'),
    ('Percent_asian', 'TOT_ASIAN'),
    ('Percent_pacific', 'TOT_PACIFIC'),
    ('Percent_latinX', 'TOT_LATINX'),
]
for share_col, count_col in demographic_shares:
    df[share_col] = df[count_col] / df['TOT_POP']

### Workforce as a percentage of total population

In [418]:
# Workforce size relative to total population (stored as a fraction, not multiplied by 100)
df['Percent_working'] = df['Total_workforce'].div(df['TOT_POP'])


### Sum of Food Establishments

In [419]:
# Wholesale + restaurant + grocery counts; a missing value in any of the three yields NaN
df['Total_food_retail'] = (
    df['Num_wholesale']
    .add(df['Num_restaraunts'])
    .add(df['Num_grocery'])
)

### Number of food establishments divided by population
Looking at prevalence of food establishments as a function of population

In [420]:
# Food establishments per resident: total establishment count over total population
df['Food_retail_per_person'] = df['Total_food_retail'].div(df['TOT_POP'])


### Scale Continuous Features

In [421]:
# Min-max scaling of the continuous features into 'scaled_<name>' columns.
# Currently disabled: every line below is commented out, so this cell adds nothing to df.
# NOTE(review): the list names 'Percent_workforce', but the column this notebook creates is
# 'Percent_working' -- correct the name before re-enabling, otherwise df[feat] would
# presumably raise a KeyError.
# to_scale = ['Rent', 'Houseless_rate',
#        'Sheltered_rate', 'Unsheltered_rate','TOT_POP', 'TOT_MALE', 'TOT_FEMALE', 'TOT_WHITE',
#        'TOT_BLACK', 'TOT_NATIVE', 'TOT_ASIAN', 'TOT_PACIFIC', 'TOT_LATINX',
#        'FI Rate', 'Cost Per Meal', 'Num_wholesale',
#        'Num_restaraunts', 'Num_grocery', 'Total_workforce', 'Employed',
#        'Unemployed', 'Unemployment_rate', 'Percent_male', 'Percent_female',
#        'Percent_white', 'Percent_Black', 'Percent_native', 'Percent_asian',
#        'Percent_pacific', 'Percent_workforce', 'Total_food_retail',
#        'Food_retail_per_person']

# for feat in to_scale:
#     df['scaled_'+feat] = (df[feat] - min(df[feat])) / (max(df[feat]) - min(df[feat]))

# Polynomial Features

In [422]:
# Numeric columns used by the interaction search and the log transforms.
# The raw demographic head counts (TOT_MALE, TOT_WHITE, ...) are left out in favour of
# the derived Percent_* columns; TOT_POP and the Num_* establishment counts are kept.
# NOTE(review): 'Percent_latinX' is not listed here -- confirm whether that is intentional.
continuous_features = [
    'Rent',
    'Houseless_rate',
    'Sheltered_rate',
    'Unsheltered_rate',
    'TOT_POP',
    'Cost Per Meal',
    'Num_wholesale',
    'Num_restaraunts',
    'Num_grocery',
    'Unemployment_rate',
    'Percent_male',
    'Percent_female',
    'Percent_white',
    'Percent_Black',
    'Percent_native',
    'Percent_asian',
    'Percent_pacific',
    'Percent_working',
    'Total_food_retail',
    'Food_retail_per_person',
]


Note: Decided to omit polynomial features, because they decreased model performance.

In [423]:
# Squared and cubed copies of every continuous feature.
# Disabled: the loop is commented out, so no '^2' / '^3' columns are added to df.
# for feat in continuous_features:
#     df[feat+'^2'] = df[feat]**2
#     df[feat+'^3'] = df[feat]**3


# Interaction Features

In [424]:
# Generate combinations of features
y = df.dropna()[['FI Rate']]
X = df.dropna()[continuous_features]
interactions = list(combinations(X.columns, 2))
interaction_dict = {}
for interaction in interactions:
    X_copy = X.copy()
    X_copy['interact'] = X_copy[interaction[0]] * X_copy[interaction[1]] 
    X_copy = X_copy.replace([np.inf, -np.inf], 0)
    model = LinearRegression() #run model with each possible interaction
    model.fit(X_copy, y)
    interaction_dict[model.score(X_copy, y)] = interaction # add R-squared for each interaction to a dictionary


In [425]:
# Add best 30 interactions to model
top_interactions = sorted(interaction_dict.keys(), reverse = True)[:30]
for interaction in top_interactions:
    feature1 = interaction_dict[interaction][0]
    feature2 = interaction_dict[interaction][1]
    df[feature1+'_X_'+feature2] = df[feature1] * df[feature2] #also add to new_features df


# Log Transformations

In [426]:
# Add a natural-log copy of every continuous feature as 'log_<name>'
for column in continuous_features:
    df['log_' + column] = df[column].map(np.log)

# log(0) produces -inf; swap every +/-inf for 0. This runs over the entire frame,
# not only the new log_ columns.
infinite_values = [np.inf, -np.inf]
df = df.replace(infinite_values, 0)

# Dummy Variables

In [427]:
# One-hot encode the high- and low-threshold program types.
# Values are cast to str first, so missing entries show up as a literal 'nan' column,
# which is dropped from both sets of dummies.

# High threshold: the two spellings of "other nutrition program" are summed into a
# single 'other' column and the originals removed; every column gets a 'Hi_thresh_' prefix.
hi_thresh_dummies = (
    pd.get_dummies(df['High Threshold Type'].astype(str), dtype=int)
    .assign(other=lambda d: d['Other Nutrition Program'] + d['other nutrition pgm'])
    .drop(columns=['Other Nutrition Program', 'other nutrition pgm', 'nan'])
    .rename(columns={'SNAP, Other Nutrition Programs': 'SNAP_other'})
    .add_prefix('Hi_thresh_')
)

# Low threshold: same treatment minus the merge step; prefix is 'Lo_thresh_'.
low_thresh_dummies = (
    pd.get_dummies(df['Low Threshold Type'].astype(str), dtype=int)
    .drop(columns=['nan'])
    .rename(columns={'SNAP, Other Nutrition Programs': 'SNAP_other'})
    .add_prefix('Lo_thresh_')
)

# Append the dummy columns to df: low-threshold set first, then high-threshold
df = pd.concat([df, low_thresh_dummies, hi_thresh_dummies], axis=1)


### Pickle the df with new features

In [428]:
# Serialize the feature-engineered frame to its pickle file
with open('../pickled/feature_engineered_data.pickle', mode='wb') as pickle_out:
    pickle.dump(df, pickle_out)