In [2]:
import pandas as pd
import numpy as np
from itertools import combinations
import statistics as stats
import scipy.stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import sklearn
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pickle
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

### Import cleaned dataset

In [3]:
df = pd.read_csv('../datasets/cleaned_data.csv',index_col='Unnamed: 0')

In [58]:
df.sample(5)

Unnamed: 0,FIPS,Rent,lat,lng,Year,coc_number,Houseless_rate,Sheltered_rate,Unsheltered_rate,FIPS_state,FIPS_county,State,County,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,TOT_WHITE,TOT_BLACK,TOT_NATIVE,TOT_ASIAN,TOT_PACIFIC,State/County,Total_workforce,Employed,Unemployed,Unemployment_rate,FI Rate,Number Food Insecure Individuals,Low Threshold Type,High Threshold Type,Weighted Annual Dollars,Cost Per Meal,Child FI Rate,Num_wholesale,Num_restaraunts,Num_grocery
560844,37087,,,,2016,NC-503,0.000581,0.000417,0.000164,37.0,87.0,North Carolina,Haywood County,Age 40 to 44 years,3478.0,1712.0,1766.0,3348.0,41.0,26.0,31.0,1.0,"Haywood County, NC",28486,27193,1293,4.5,0.127,7560.0,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",4097000.0,3.17,0.231,,,
690988,55139,,,,2018,WI-500,0.000763,0.000733,3.1e-05,55.0,139.0,Wisconsin,Winnebago County,Age 10 to 14 years,9926.0,5042.0,4884.0,8735.0,278.0,77.0,411.0,7.0,"Winnebago County, WI",92128,89613,2515,2.7,0.085,14460.0,"SNAP, Other Nutrition Programs","SNAP, Other Nutrition Programs",7514000.0,3.07,0.143,14.0,1390.0,251.0
857022,47115,,,,2019,TN-500,0.000655,0.000399,0.000256,47.0,115.0,Tennessee,Marion County,Age 35 to 39 years,1604.0,795.0,809.0,1498.0,68.0,6.0,13.0,0.0,"Marion County, TN",12600,12087,513,4.1,,,,,,,,,,
571429,45059,,,,2016,SC-501,0.001222,0.001048,0.000174,45.0,59.0,South Carolina,Laurens County,Age 50 to 54 years,4768.0,2248.0,2520.0,3467.0,1219.0,18.0,23.0,7.0,"Laurens County, SC",30085,28512,1573,5.2,0.145,9640.0,SNAP,Other Nutrition Program,4086000.0,2.48,0.214,,,
919056,48043,,,,2009,,,,,48.0,43.0,,,,,,,,,,,,"Brewster County, TX",5153,4892,261,5.1,0.155,1420.0,SNAP,other nutrition pgm,683950.0,2.8829,0.257,,,


In [20]:
df.Year.value_counts()

2019    243835
2010    185758
2013     61042
2018     61018
2017     61007
2016     60998
2014     60998
2015     60996
2012     60991
2011     60990
2020     32502
2009      3239
Name: Year, dtype: int64

# Baseline Model
### Predicting on 2014 Data

In [22]:
df_14 = df[df.Year==2014]

# Drop all null values for now
df_14 = df_14.dropna()

### Train/Test Split

In [72]:
y = df_14['FI Rate']
X = df_14.drop(['FIPS','lat', 'lng', 'Year', 'coc_number','Number Food Insecure Individuals','FIPS_state', 'FIPS_county','State', 'FI Rate',
                'County','AGEGRP','State/County','Child FI Rate','Low Threshold Type', 'High Threshold Type'],axis=1)

In [73]:
# split into train test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Model

In [74]:
# Instantiate Linear Regression model
model_1 = LinearRegression()
model_1.fit(X_train, y_train)
y_train_pred = model_1.predict(X_train)
y_test_pred = model_1.predict(X_test)

# R2 of training and test set
print('R2 Train:',model_1.score(X_train, y_train))
print('R2 Train:',model_1.score(X_test, y_test))

# RMSE of training and test set
print('RMSE Train:',np.sqrt(mean_squared_error(y_train, y_train_pred)))
print('RMSE Test:',np.sqrt(mean_squared_error(y_test, y_test_pred)))

R2 Train: 0.730017708981006
R2 Train: 0.7042209493112999
RMSE Train: 0.02105786565878052
RMSE Test: 0.021852482453572717


### Visually compare predictions

In [76]:
test_set = pd.concat([X_test, y_test],axis=1).reset_index()
df_preds = pd.concat([test_set, pd.Series(y_test_pred)],axis=1).rename(columns={0:'Y Test Preds'})
df_preds[['FI Rate','Y Test Preds']].sample(10)

Unnamed: 0,FI Rate,Y Test Preds
747,0.064,0.088926
499,0.087,0.085209
535,0.15,0.145623
326,0.113,0.115095
446,0.268,0.193299
658,0.068,0.117992
419,0.167,0.158816
696,0.153,0.156797
230,0.196,0.184432
685,0.14,0.138458


### Inspect Feature Importance

In [79]:
sorted(list(zip(model_1.coef_.tolist(), X_test.columns)), reverse=True)[:10]

[(2.772955546444385, 'Houseless_rate'),
 (1.866222175795785, 'Sheltered_rate'),
 (0.9067357677569068, 'Unsheltered_rate'),
 (0.011049119503063512, 'Unemployment_rate'),
 (1.112623289543138e-06, 'TOT_PACIFIC'),
 (7.754981790059753e-07, 'TOT_ASIAN'),
 (7.155199385976167e-07, 'TOT_BLACK'),
 (7.086248975066868e-07, 'Employed'),
 (6.168133773234779e-07, 'TOT_WHITE'),
 (4.1364443995484806e-08, 'Num_grocery')]