# Projecting Food Insecurity Rates in the US by County
## Feature Engineering
This notebook imports the cleaned dataset produced by [cleaning_pt2.ipynb](notebooks/cleaning_pt2) and engineers new features from it for use in the modeling process.
### Flatiron School Data Science Capstone<br>By Khyatee Desai

In [276]:
# Data handling (pandas/numpy), pairwise feature combinations (itertools) and the
# linear model used to score interaction terms (scikit-learn).
import pandas as pd
import numpy as np
from itertools import combinations
import sklearn
from sklearn.linear_model import LinearRegression
# Display every column when a dataframe is rendered instead of truncating wide frames.
pd.set_option('display.max_columns', None)
import pickle
import warnings
# NOTE(review): this suppresses every warning raised after this point, including
# pandas and scikit-learn warnings that may signal real data problems.
warnings.filterwarnings('ignore')

In [277]:
# Read the cleaned dataset from its pickle into the working dataframe.
with open('../pickled/fully_cleaned_data.pickle', mode='rb') as cleaned_pickle:
    df = pickle.load(cleaned_pickle)


### Create a percentage for each demographic

In [278]:
# Swap the -888888888.0 placeholder for NaN, in place, so it is handled as missing data.
df.replace(to_replace=-888888888.0, value=np.nan, inplace=True)

In [279]:
# Express each demographic count as a fraction (0-1) of the total population.
# Keys are the new columns; values are the raw count columns they are derived from.
demographic_shares = {
    'Percent_male': 'TOT_MALE',
    'Percent_female': 'TOT_FEMALE',
    'Percent_white': 'TOT_WHITE',
    'Percent_Black': 'TOT_BLACK',
    'Percent_native': 'TOT_NATIVE',
    'Percent_asian': 'TOT_ASIAN',
    'Percent_pacific': 'TOT_PACIFIC',
    'Percent_latinX': 'TOT_LATINX',
}
for share_column, count_column in demographic_shares.items():
    df[share_column] = df[count_column] / df['TOT_POP']

### Percentage PoC
Percentage of a county that is not white

In [280]:
# Non-white share of the population: the complement of the white share.
white_share = df['Percent_white']
df['Percent_PoC'] = 1 - white_share

### Workforce as a percentage of total population
Workforce represented as percentage, so that it can be compared across different counties

In [281]:
# Workforce size as a fraction of the total population.
workforce = df['Total_workforce']
df['Percent_working'] = workforce / df['TOT_POP']


### Sum of Food Establishments
Total number of food retail businesses, which is the sum of three categories (wholesalers, restaurants, and grocery stores)

In [282]:
# Sum the three establishment counts; a missing count is treated as zero so that one
# absent category does not turn the whole total into NaN.
wholesale_count = df['Num_wholesale'].fillna(0)
restaurant_count = df['Num_restaraunts'].fillna(0)
grocery_count = df['Num_grocery'].fillna(0)
df['Total_food_retail'] = wholesale_count + restaurant_count + grocery_count

### Number of food establishments divided by population
Prevalence of food establishments relative to population, i.e. how many food retail establishments exist per person within a county

In [283]:
# Food retail establishments per resident (establishment total divided by population).
establishments = df['Total_food_retail']
df['Food_retail_per_person'] = establishments / df['TOT_POP']


## ACS Survey Percentages
Take percentages of all features taken from the ACS survey, which use a slightly different (~5%) total population number for the denominator

In [284]:
# Convert ACS survey counts into shares (fractions between 0 and 1).
#
# Person-level counts are divided by the ACS population total ('pop_total') rather than
# 'TOT_POP': the two totals come from different sources and differ by roughly 5%, so
# mixing them would skew every share by that gap.
# NOTE(review): 'pop_total' is taken to be the ACS total population that the section
# text refers to; confirm against the cleaning notebook.
acs_population = df['pop_total']
acs_person_counts = {
    'Percent_disabled': 'pop_disabled',
    'Percent_hs_grad': 'pop_hs_grad',
    'Percent_bachelors': 'pop_bachelors',
    'Percent_grad_degree': 'pop_grad_degree',
    'Percent_priv_health': 'pop_priv_health',
    'Percent_public_health': 'pop_public_health',
    'Percent_no_health': 'pop_no_health',
    'Percent_65+': 'pop_65+',
    'Percent_non_citizen': 'pop_non_citizen',
}
for share_column, count_column in acs_person_counts.items():
    df[share_column] = df[count_column] / acs_population

# Household-level counts are divided by the number of households.
households = df['num_hh']
df['Percent_hh_no_vehicle'] = df['hh_no_vehicle'] / households
df['Percent_hh_SNAP'] = df['hh_SNAP'] / households

### Drop raw count columns after deriving percentages

In [285]:
# Remove the raw count columns now that their percentage counterparts exist.
census_count_columns = [
    'TOT_MALE', 'TOT_FEMALE', 'TOT_WHITE', 'TOT_BLACK',
    'TOT_NATIVE', 'TOT_ASIAN', 'TOT_PACIFIC', 'TOT_LATINX',
]
acs_count_columns = [
    'pop_disabled', 'pop_hs_grad', 'pop_bachelors', 'pop_grad_degree',
    'pop_priv_health', 'pop_public_health', 'pop_no_health', 'pop_65+',
    'pop_non_citizen', 'hh_no_vehicle', 'hh_SNAP', 'pop_total',
]
df.drop(columns=census_count_columns + acs_count_columns, inplace=True)

## Create Percentages for CPS 2020 Data

In [286]:
# Read the 2020 CPS dataset from its pickle.
with open('../pickled/cps_20_data.pickle', mode='rb') as cps_pickle:
    df_cps_20 = pickle.load(cps_pickle)

### Education Columns

In [287]:
# Education counts as a fraction of the 'Num_respondants_b' respondent count.
cps_respondents = df_cps_20['Num_respondants_b']
cps_education_counts = {
    'Percent_hs_grad': 'pop_hs_grad',
    'Percent_bachelors': 'pop_bachelors',
    'Percent_grad_degree': 'pop_grad_degree',
}
for share_column, count_column in cps_education_counts.items():
    df_cps_20[share_column] = df_cps_20[count_column] / cps_respondents

### Citizenship Status Columns

In [288]:
# Non-citizen count as a fraction of the 'Num_respondants_b' respondent count.
non_citizen_count = df_cps_20['pop_non_citizen']
df_cps_20['Percent_non_citizen'] = non_citizen_count / df_cps_20['Num_respondants_b']

### Disability Status Columns

In [289]:
# Disabled share: the disabled count over the sum of the disabled and not-disabled counts.
disability_total = df_cps_20['HH_disabled'] + df_cps_20['HH_not_disabled']
df_cps_20['Percent_disabled'] = df_cps_20['HH_disabled'] / disability_total


### Health Insurance columns

In [290]:
# Uninsured share: the not-insured count over the sum of the three coverage categories.
coverage_total = (
    df_cps_20['HH_health_insured'] + df_cps_20['HH_not_insured'] + df_cps_20['HH_some_insured']
)
df_cps_20['Percent_no_health'] = df_cps_20['HH_not_insured'] / coverage_total

# Private coverage share: fully plus partially covered, over all three private categories.
private_covered = df_cps_20['HH_health_priv'] + df_cps_20['HH_some_insured_priv']
private_total = (
    df_cps_20['HH_not_insured_priv'] + df_cps_20['HH_some_insured_priv'] + df_cps_20['HH_health_priv']
)
df_cps_20['Percent_priv_health'] = private_covered / private_total

# Public coverage share: fully plus partially covered, over all three public categories.
public_covered = df_cps_20['HH_insured_pub'] + df_cps_20['HH_some_insured_pub']
public_total = (
    df_cps_20['HH_no_health_pub'] + df_cps_20['HH_some_insured_pub'] + df_cps_20['HH_insured_pub']
)
df_cps_20['Percent_public_health'] = public_covered / public_total


In [291]:
# Keep only the columns needed downstream, then move the index into a regular column.
cps_columns_to_keep = [
    'Year', 'HH_income', 'HH_size',
    'Percent_hs_grad', 'Percent_bachelors', 'Percent_grad_degree',
    'Percent_non_citizen', 'Percent_disabled',
    'Percent_no_health', 'Percent_priv_health', 'Percent_public_health',
]
df_cps_20 = df_cps_20[cps_columns_to_keep].reset_index()

# Align column names with the main dataframe; the former index becomes the FIPS key.
df_cps_20 = df_cps_20.rename(
    columns={'index': 'FIPS', 'HH_income': 'hh_med_income', 'HH_size': 'hh_avg_size'}
)

### Concatenate 2020 CPS data with main df

In [292]:
# Split the main dataframe into its 2020 rows and all other rows.
year_is_2020 = df['Year'].isin(['2020'])
df_no_20 = df[~year_is_2020]
df_20 = df[year_is_2020]

# Every CPS column except the first (the FIPS key) is removed from the 2020 rows so the
# merge brings in the CPS version of those columns instead of producing duplicates.
cps_value_columns = df_cps_20.columns[1:]
df_20_dropped = df_20.drop(columns=cps_value_columns)
df_20 = df_cps_20.merge(df_20_dropped, how='outer', on='FIPS')

# Stack the untouched years back together with the rebuilt 2020 rows.
df = pd.concat([df_no_20, df_20])

## Impute Missing 2020 Data

In [295]:
# Mean of each demographic share per year, used to see which years lack demographic data.
demographic_share_columns = [
    'Percent_male', 'Percent_female', 'Percent_white', 'Percent_Black', 'Percent_native',
    'Percent_asian', 'Percent_pacific', 'Percent_latinX', 'Percent_PoC',
]
df.groupby('Year')[demographic_share_columns].mean()





Unnamed: 0_level_0,Percent_male,Percent_female,Percent_white,Percent_Black,Percent_native,Percent_asian,Percent_pacific,Percent_latinX,Percent_PoC
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009,,,,,,,,,
2010,0.499638,0.500362,0.857873,0.089985,0.021619,0.012652,0.001112,0.084234,0.142127
2011,0.499541,0.500459,0.8397,0.110203,0.018216,0.013882,0.00113,0.064932,0.1603
2012,0.499773,0.500227,0.838104,0.110548,0.018413,0.014356,0.001165,0.066211,0.161896
2013,0.499853,0.500147,0.835338,0.111947,0.018647,0.014926,0.001204,0.067619,0.164662
2014,0.500625,0.499375,0.850215,0.092106,0.022659,0.014644,0.001266,0.091117,0.149785
2015,0.500818,0.499182,0.848638,0.092428,0.022878,0.01511,0.001301,0.092774,0.151362
2016,0.500852,0.499148,0.847006,0.092782,0.023124,0.015556,0.001332,0.094446,0.152994
2017,0.500875,0.499125,0.845457,0.093154,0.023354,0.015933,0.001368,0.096121,0.154543
2018,0.500904,0.499096,0.843955,0.093561,0.023547,0.016242,0.001411,0.097676,0.156045


# Polynomial Features
**Note:** Decided to omit polynomial features, because they decreased model performance.

In [14]:
# Continuous columns used for the interaction and log-transform steps below.
# The list mixes derived shares with a few remaining totals (e.g. 'TOT_POP', 'num_hh').
continuous_features = [
    'Rent',
    'Houseless_rate',
    'Sheltered_rate',
    'Unsheltered_rate',
    'TOT_POP',
    'Cost Per Meal',
    'Num_wholesale',
    'Num_restaraunts',
    'Num_grocery',
    'Unemployment_rate',
    'Percent_male',
    'Percent_female',
    'Percent_white',
    'Percent_Black',
    'Percent_native',
    'Percent_asian',
    'Percent_pacific',
    'Percent_latinX',
    'Percent_working',
    'Total_food_retail',
    'Food_retail_per_person',
    'Percent_disabled',
    'Percent_hs_grad',
    'Percent_bachelors',
    'Percent_grad_degree',
    'Percent_priv_health',
    'Percent_public_health',
    'Percent_no_health',
    'Percent_65+',
    'Percent_non_citizen',
    'Percent_hh_no_vehicle',
    'Percent_hh_SNAP',
    'percent_hh_poverty',
    'hh_avg_size',
    'num_hh',
    'hh_med_income',
]


In [15]:
## add squared and cubed polynomials for each continuous feature
# for feat in continuous_features:
#     df[feat+'^2'] = df[feat]**2
#     df[feat+'^3'] = df[feat]**3


# Interaction Features
Create an interaction feature for each pairwise combination of continuous features, and add the best ones to the dataframe

In [16]:
# Score every pairwise interaction of the continuous features.
# Only rows with no missing value in any column are used; the filtered frame is built
# once so the target and the features are guaranteed to come from the same rows.
complete_rows = df.dropna()
y = complete_rows[['FI Rate']]
X = complete_rows[continuous_features]
interactions = list(combinations(X.columns, 2))
interaction_dict = {}

# Zeroing infinite values in the base features gives the same result for every
# candidate, so it is done once here instead of on a fresh full copy per iteration.
X_base = X.replace([np.inf, -np.inf], 0)

# For each candidate pair, fit a linear regression on all base features plus the single
# product term and record the in-sample R-squared.
# NOTE(review): the dictionary is keyed by score, so two interactions with exactly the
# same R-squared overwrite one another. The {score: (feature_a, feature_b)} layout is
# kept because the cell that selects the top interactions reads it in that form.
for interaction in interactions:
    first_feature, second_feature = interaction
    # The product is taken from the untouched features, then cleaned the same way.
    product = (X[first_feature] * X[second_feature]).replace([np.inf, -np.inf], 0)
    X_copy = X_base.assign(interact=product)
    model = LinearRegression()
    model.fit(X_copy, y)
    interaction_dict[model.score(X_copy, y)] = interaction


### Add best 50 interactions to dataframe

In [17]:
# Take the 50 highest R-squared scores and add the matching product columns to the dataframe.
top_interactions = sorted(interaction_dict, reverse=True)[:50]
for score in top_interactions:
    left_feature, right_feature = interaction_dict[score]
    interaction_name = left_feature + '_X_' + right_feature
    df[interaction_name] = df[left_feature] * df[right_feature]


# Log Transformations
Take natural log of each continuous feature, and add these log features to dataframe

In [18]:
# Add a 'log_<feature>' column holding the element-wise natural log of each continuous feature.
for feature_name in continuous_features:
    log_column = 'log_' + feature_name
    df[log_column] = df[feature_name].map(np.log)
# log(0) yields -inf; every infinite value in the dataframe is replaced with 0.
df = df.replace([np.inf, -np.inf], 0)

# Dummy Variables
High and Low Threshold programs delineate the assistance programs provided by State

In [19]:
# One-hot encode the high and low threshold program types and append them to the dataframe.
# astype(str) turns missing values into the literal category 'nan', which is dropped below.
high_threshold_type = df['High Threshold Type'].astype(str)
hi_thresh_dummies = pd.get_dummies(high_threshold_type, dtype=int)
# Two spellings of the same category are folded into a single 'other' indicator.
hi_thresh_dummies['other'] = (
    hi_thresh_dummies['Other Nutrition Program'] + hi_thresh_dummies['other nutrition pgm']
)
hi_thresh_dummies = hi_thresh_dummies.drop(
    columns=['Other Nutrition Program', 'other nutrition pgm', 'nan']
)
hi_thresh_dummies = hi_thresh_dummies.rename(
    columns={'SNAP, Other Nutrition Programs': 'SNAP_other'}
).add_prefix('Hi_thresh_')

low_threshold_type = df['Low Threshold Type'].astype(str)
low_thresh_dummies = pd.get_dummies(low_threshold_type, dtype=int)
low_thresh_dummies = low_thresh_dummies.drop(columns='nan')
low_thresh_dummies = low_thresh_dummies.rename(
    columns={'SNAP, Other Nutrition Programs': 'SNAP_other'}
).add_prefix('Lo_thresh_')

# Column order of the result: original columns, low-threshold dummies, high-threshold dummies.
df = pd.concat([df, low_thresh_dummies, hi_thresh_dummies], axis=1)


### Pickle the new dataframe
Save the dataframe with all new features added (demographic percentages, interactions, logs, and dummies)

In [21]:
# Persist the dataframe, with all engineered features, for the modeling step.
with open('../pickled/feature_engineered_data.pickle', mode='wb') as feature_pickle:
    pickle.dump(df, feature_pickle)