In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
import statistics as stats
import scipy.stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import sklearn
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pickle
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

### Import cleaned dataset

In [9]:
with open('../pickled/feature_engineered_data.pickle', "rb") as input_file:
    df = pickle.load(input_file) 

In [10]:
df.Year.value_counts()

2020    32502
2019    13129
2010     9709
2018     3328
2017     3317
2014     3308
2016     3308
2015     3306
2012     3298
2013     3297
2011     3296
2009     3239
Name: Year, dtype: int64

# Baseline Model
### Predicting on 2014 Data

In [15]:
df_14 = df[df.Year=='2014']

# Drop all null values for now
df_14 = df_14.dropna()

### Train/Test Split

In [16]:
y = df_14['FI Rate']
X = df_14.drop(['FIPS','Year', 'coc_number','FIPS_state', 'FIPS_county','State', 'FI Rate',
                'County','State/County','Low Threshold Type', 'High Threshold Type'],axis=1)

In [19]:
# split into train test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Model

In [24]:
# Instantiate Linear Regression model
model_1 = LinearRegression()
model_1.fit(X_train, y_train)
y_train_pred = model_1.predict(X_train)
y_test_pred = model_1.predict(X_test)

# R2 of training and test set
print('R2 Train:',model_1.score(X_train, y_train))
print('R2 Test:',model_1.score(X_test, y_test))

# RMSE of training and test set
print('RMSE Train:',np.sqrt(mean_squared_error(y_train, y_train_pred)))
print('RMSE Test:',np.sqrt(mean_squared_error(y_test, y_test_pred)))

R2 Train: 0.927860464426694
R2 Test: 0.4529248882866488
RMSE Train: 0.010871165062304948
RMSE Test: 0.029876548460015785


### Visually compare predictions

In [26]:
test_set = pd.concat([X_test, y_test],axis=1).reset_index()
df_preds = pd.concat([test_set,pd.Series(y_test_pred)],axis=1).rename(columns={0:'Y Test Preds'})
df_preds[['FI Rate','Y Test Preds']].sample(10)

Unnamed: 0,FI Rate,Y Test Preds
23,0.144,0.128115
22,0.076,0.111327
1,0.09,0.089784
6,0.125,0.113824
42,0.115,0.031562
32,0.068,0.099991
9,0.139,0.13906
28,0.16,0.152763
35,0.145,0.117281
15,0.106,0.086316


### Inspect Feature Importance

In [79]:
sorted(list(zip(model_1.coef_.tolist(), X_test.columns)), reverse=True)[:10]

[(2.772955546444385, 'Houseless_rate'),
 (1.866222175795785, 'Sheltered_rate'),
 (0.9067357677569068, 'Unsheltered_rate'),
 (0.011049119503063512, 'Unemployment_rate'),
 (1.112623289543138e-06, 'TOT_PACIFIC'),
 (7.754981790059753e-07, 'TOT_ASIAN'),
 (7.155199385976167e-07, 'TOT_BLACK'),
 (7.086248975066868e-07, 'Employed'),
 (6.168133773234779e-07, 'TOT_WHITE'),
 (4.1364443995484806e-08, 'Num_grocery')]