# <img src = "./resources/GA.png" width = "25" height = "25" /> <span style = "color:Purple" > Project 5 : Food Insecurity Regression Study </span> 
---
## <span style = "color:Green" > Preprocessing / Modeling </span>      

#### Ira Seidman, Alec Edgecliffe-Johnson, Ryan McDonald, Andrew Roberts - General Assembly 
---

In [None]:
# Data manipulation imports
import pandas as pd
import numpy as np

# Graphing imports
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer, KNNImputer #imputer imports
from copy import copy, deepcopy #copy imports

# Modeling imports for imputing
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

## Reading in all datasets

In [None]:
# Load the five source tables from data_inputs/, bound in the order listed below
df_sh, df_wp, df_un, df_fins, df_ed = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data_inputs/ed_socio_health.csv',      # -> df_sh
        'data_inputs/wage_poverty.csv',         # -> df_wp
        'data_inputs/unemployment_clean.csv',   # -> df_un
        'data_inputs/food_ins_18.csv',          # -> df_fins
        'data_inputs/education_stats_dsi.csv',  # -> df_ed
    )
)

In [None]:
# Cast the join key to int in every table before merging.
# astype returns a new Series, so the result has to be assigned back;
# calling it without assignment leaves the original column untouched.
df_sh['fips'] = df_sh['fips'].astype(int)
df_wp['fips'] = df_wp['fips'].astype(int)
df_un['fips'] = df_un['fips'].astype(int)
df_fins['fips'] = df_fins['fips'].astype(int)
df_ed['fips'] = df_ed['fips'].astype(int)

# Display a sample of the converted key as a sanity check
df_ed['fips'].head()

## Merge Dataframes
Merge each dataset into one main df

In [None]:
# Join all five tables on the shared 'fips' key (default inner join),
# in the order: socio/health -> wage/poverty -> unemployment -> food insecurity -> education
df_m = (
    df_sh
    .merge(df_wp, on='fips')
    .merge(df_un, on='fips')
    .merge(df_fins, on='fips')
    .merge(df_ed, on='fips')
)
df_m.shape

In [None]:
# Print the column dtypes and non-null counts of the merged frame
df_m.info()

In [None]:
# Total number of missing cells across the whole merged frame
df_m.isnull().sum().sum()

## Cleaning

#### Renaming state_x as state_name and state_y as state_abr

In [None]:
# Give the two suffixed state columns descriptive names
df_m = df_m.rename(
    {"state_x": "state_name", "state_y": "state_abr"},
    axis='columns',
)

In [None]:
# Preview the first rows to check the renamed state columns
df_m.head()

## Dropping unnecessary columns

Dropping columns that are unlikely to add explanatory power beyond the other variables, and columns that duplicate information from the 2016 data (e.g., 2019 unemployment data).

Dropping the number and percent food insecure from the 2016 dataset, as the 2018 data has both child and total figures.

In [3]:
# (rows, columns) of the merged frame before any columns are dropped
df_m.shape

NameError: name 'df_m' is not defined

In [None]:
# Preview the first rows before dropping columns
df_m.head()

In [None]:
# Columns judged unlikely to be the strongest predictors of food insecurity
# and that carry a fair amount of nulls; they are removed from df_m below.
drop_list = [
    'teen_birth_rate',
    'age_adjusted_death_rate',
    'child_mortality_rate',
    'infant_mortality_rate',
    'num_limited_access_to_healthy_foods',
    'segregation_index',
    'segregation_index_2',
    'homicide_rate',
    'suicide_rate_age_adjusted',
    'juvenile_arrest_rate',
    'area_name',
    'num_below_poverty',
    'percent_some_college',
    'labor_force',
    'percent_unemployed_CHR',
    'med_inc_19',
    'unemployment_rate_2019',
    'med_household_inc_19',
    'med_hh_income_percent_of_state_total_2019',
    'num_food_insecure',
    'percent_food_insecure',
    'less_than_high_school_diploma',
    'bachelor_degree_or_higher',
    'percent_less_than_18_years_of_age',
    'percent_65_and_over',
    'mental_health_provider_rate',
]

# Raises KeyError if any listed column is missing from df_m
df_m = df_m.drop(drop_list, axis='columns')
df_m.shape

In [None]:
# Preview the first rows after dropping the columns in drop_list
df_m.head()

## Drop % and US$ symbols and convert to float

In [None]:
# Remove the percent sign from the two rate columns and parse them as floats.
# regex=False makes '%' a literal on every pandas version; the default for
# `regex` changed between pandas 1.x and 2.x, and 1.x warns when it is omitted.
df_m['fi_rate_18'] = df_m['fi_rate_18'].str.replace('%', '', regex=False).astype(float)
df_m['ch_fi_rate_18'] = df_m['ch_fi_rate_18'].str.replace('%', '', regex=False).astype(float)

# str.strip('US$') removes any leading/trailing 'U', 'S' or '$' characters
# (it is a character set, not a prefix), leaving the numeric text to parse.
df_m['cpm_18'] = df_m['cpm_18'].str.strip('US$').astype(float)
df_m.head()

In [None]:
# One-column table ('type') listing the dtype of every column in df_m
types = df_m.dtypes.to_frame(name='type')
types

## Impute Missing Data

In [None]:
# Number of missing values in each remaining column
df_m.isnull().sum()

In [None]:
# Total number of missing values across df_m at this point - much less than the previous 9405
# (presumably the total from the earlier null check on the merged frame; that output is not shown here)
df_m.isnull().sum().sum()

In [None]:
# Names of the columns that contain at least one missing value, in column order
null_columns = df_m.columns[df_m.isna().any()].tolist()

# Independent working copies, one per imputation strategy.
# The simple strategies (mean / median / mode) work on the raw frame as-is.
df_m_mean = df_m.copy()
df_m_median = df_m.copy()
df_m_mode = df_m.copy()

# The model-based strategies need numeric input, so categorical columns are one-hot encoded.
df_m_knn = pd.get_dummies(df_m.copy())
df_m_lr = pd.get_dummies(df_m.copy())
df_m_rf = pd.get_dummies(df_m.copy())

In [None]:
# Fill each column that has nulls with its own mean, median, or most frequent value.
# Series.mean / median / mode already ignore NaN by default, so no explicit dropna is needed.
for col in null_columns:
    df_m_mean[col] = df_m[col].fillna(df_m[col].mean())
    df_m_median[col] = df_m[col].fillna(df_m[col].median())
    df_m_mode[col] = df_m[col].fillna(df_m[col].mode()[0])  # first mode when there are ties

# Report the number of nulls left in each imputed frame
print('Mean imputation nulls: ', df_m_mean.isna().sum().sum())
print('Median imputation nulls: ', df_m_median.isna().sum().sum())
print('Mode imputation nulls: ', df_m_mode.isna().sum().sum())

# Estimators used for model-based imputation; impute_missing_data compares its
# argument against these two objects to pick the frame to impute.
# NOTE(review): RandomForestRegressor is built without random_state, so the
# random-forest imputations presumably differ from run to run - confirm whether
# a fixed seed is wanted for reproducible exports.
lr = LinearRegression()
rf = RandomForestRegressor()

def impute_missing_data(model, df=None):
    """Impute every column in ``null_columns`` with predictions from ``model``.

    Columns are processed in the order they appear in ``null_columns``. For
    each column the model is fit on the rows of the working frame that
    currently contain no missing values at all, with that column as the target
    and every other column as a feature. The fitted model then predicts the
    rows where the column is missing. Columns imputed earlier in the loop are
    already filled when later columns are processed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    df : pandas.DataFrame, optional
        Frame to impute. When omitted the frame is chosen from ``model``:
        ``lr`` selects ``df_m_lr`` and ``rf`` selects ``df_m_rf``.

    Returns
    -------
    pandas.DataFrame
        An imputed copy; the input frame is not modified.

    Raises
    ------
    ValueError
        If ``df`` is omitted and ``model`` is neither ``lr`` nor ``rf``.
    """
    if df is None:
        if model is lr:
            df = df_m_lr
        elif model is rf:
            df = df_m_rf
        else:
            raise ValueError(
                "Unknown model: pass the frame to impute explicitly via `df`."
            )
    df = df.copy()  # never mutate the caller's frame

    for col in null_columns:
        missing = df[col].isna()
        if not missing.any():
            continue  # nothing to impute for this column

        # Fit on complete cases only
        complete = df.dropna()
        model.fit(complete.drop(columns=col), complete[col])

        # Features for prediction must not contain nulls. Columns still waiting
        # to be imputed are filled with their median for this step only; the
        # working frame itself keeps its nulls until their own turn comes.
        features = df.drop(columns=col)
        for other in features.columns[features.isna().any()]:
            features[other] = features[other].fillna(features[other].median())

        # Predict all missing rows in one call. A boolean mask selects by label,
        # so this is correct for any index, not just a default RangeIndex.
        df.loc[missing, col] = model.predict(features.loc[missing])

    return df

# Linear-regression imputation, then a check that no nulls remain
df_m_lr = impute_missing_data(lr)
print('Lr imputation nulls: ', df_m_lr.isna().sum().sum())

# Random-forest imputation, then the same check
df_m_rf = impute_missing_data(rf)
print('Rf imputation nulls: ', df_m_rf.isna().sum().sum())

# K-nearest-neighbours imputation using the 2 closest rows
imp_knn = KNNImputer(n_neighbors=2)

# fit_transform returns a bare array, so the labels are restored afterwards.
# They are taken from df_m_knn itself (the right-hand side is evaluated before
# the name is rebound) rather than from df_m_rf, so this step does not depend
# on the random-forest imputation having run first.
df_m_knn = pd.DataFrame(
    imp_knn.fit_transform(df_m_knn),
    columns=df_m_knn.columns,
    index=df_m_knn.index,
)
print('Knn imputation nulls: ', df_m_knn.isnull().sum().sum())

## Export to CSVs

In [None]:
from pathlib import Path

# to_csv does not create missing directories, so make sure the output folder exists
Path('./cleaned_dataframes').mkdir(parents=True, exist_ok=True)

# Write one CSV per imputation strategy; the row index is not exported
df_m_mean.to_csv('./cleaned_dataframes/df_m_mean.csv', index=False)
df_m_median.to_csv('./cleaned_dataframes/df_m_median.csv', index=False)
df_m_mode.to_csv('./cleaned_dataframes/df_m_mode.csv', index=False)
df_m_knn.to_csv('./cleaned_dataframes/df_m_knn.csv', index=False)
df_m_lr.to_csv('./cleaned_dataframes/df_m_lr.csv', index=False)
df_m_rf.to_csv('./cleaned_dataframes/df_m_rf.csv', index=False)