## KNN classifer - initial test on which variables might be important

In [76]:
import pandas as pd
import cvxpy as cp
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [77]:
#helper function
def read_file(filename):
    data = []
    for i in range(len(filename.sheet_names)):
            data.append(pd.read_excel(filename,i))
    return data

In [78]:
xls = pd.ExcelFile('Cleaned Data/PIT_State_vF.xlsx')
PIT_State_vF = read_file(xls)
#PIT_State_vF[0]

In [79]:
xls = pd.ExcelFile('Cleaned Data/HIC_State_vF.xlsx')
HIC_State_vF = read_file(xls)
#HIC_State_vF[0].head()
#HIC_State_vF[0]["State"].all() == PIT_State_vF[0]["State"].all()

In [80]:
xls = pd.ExcelFile('Cleaned Data/PIT_State_Change_vF.xlsx')
PIT_State_Change_vF = read_file(xls)

## Change in Homelessness vs. Inventory Counts

In [136]:
copy = pd.concat([PIT_State_Change_vF[0]["Change in Total Homelessness, 2021-2022"], HIC_State_vF[0]], axis = 1)
#copy.head()

In [137]:
corr_mat = copy.corr(numeric_only = True)
corr_mat["Change in Total Homelessness, 2021-2022"].head()

Change in Total Homelessness, 2021-2022                     1.000000
Total Year-Round Beds (ES, TH, SH)                         -0.035819
Total Non-DV Year-Round Beds (ES, TH, SH)                  -0.035741
Total HMIS Year-Round Beds (ES, TH, SH)                    -0.035757
HMIS Participation Rate for Year-Round Beds (ES, TH, SH)   -0.587368
Name: Change in Total Homelessness, 2021-2022, dtype: float64

In [138]:
#MP dropped - outlier?
copy = copy.drop([26])
X = copy.drop(["State", "Change in Total Homelessness, 2021-2022", 
               "HMIS Participation Rate for Year-Round Beds (RRH)",
               "HMIS Participation Rate for Year-Round Beds (PSH)",
               "HMIS Participation Rate for Year-Round Beds (OPH)",
               "HMIS Participation Rate for Year-Round Beds (SH)",
               "HMIS Participation Rate for Year-Round Beds (TH)",
               "HMIS Participation Rate for Year-Round Beds (ES)",
               "HMIS Participation Rate for Year-Round Beds (ES, TH, SH)"], axis=1)
X = X.values
y = copy["Change in Total Homelessness, 2021-2022"]
y = y.values

In [139]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)
parameters = {"n_neighbors": range(1, 25), 'weights': ['uniform', 'distance']}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train)
#gridsearch.best_params_

best_k = gridsearch.best_params_["n_neighbors"]
best_weights = gridsearch.best_params_["weights"]
bagged_knn = KNeighborsRegressor(n_neighbors=best_k, weights=best_weights)
bagging_model = BaggingRegressor(bagged_knn, n_estimators=100)

bagging_model.fit(X_train, y_train)
test_preds_grid = bagging_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds_grid)
test_rmse = sqrt(test_mse)
test_rmse

1.185543866664504

## Change in Homelessness vs Change in Inventory Counts

In [140]:
if HIC_State_vF[0].columns.all() == HIC_State_vF[1].columns.all():
    new1 = HIC_State_vF[0].drop(["State", 
                   "HMIS Participation Rate for Year-Round Beds (RRH)",
                   "HMIS Participation Rate for Year-Round Beds (PSH)",
                   "HMIS Participation Rate for Year-Round Beds (OPH)",
                   "HMIS Participation Rate for Year-Round Beds (SH)",
                   "HMIS Participation Rate for Year-Round Beds (TH)",
                   "HMIS Participation Rate for Year-Round Beds (ES)",
                   "HMIS Participation Rate for Year-Round Beds (ES, TH, SH)"], axis=1)

    new2 = HIC_State_vF[1].drop(["State", 
                   "HMIS Participation Rate for Year-Round Beds (RRH)",
                   "HMIS Participation Rate for Year-Round Beds (PSH)",
                   "HMIS Participation Rate for Year-Round Beds (OPH)",
                   "HMIS Participation Rate for Year-Round Beds (SH)",
                   "HMIS Participation Rate for Year-Round Beds (TH)",
                   "HMIS Participation Rate for Year-Round Beds (ES)",
                   "HMIS Participation Rate for Year-Round Beds (ES, TH, SH)"], axis=1)

In [141]:
copy = pd.concat([PIT_State_Change_vF[0]["Change in Total Homelessness, 2021-2022"], new1-new2], axis = 1)
copy = copy.drop([26])
X = copy.drop(["Change in Total Homelessness, 2021-2022"], axis=1).values
y = copy["Change in Total Homelessness, 2021-2022"].values

In [142]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1421)
parameters = {"n_neighbors": range(1, 30), 'weights': ['uniform', 'distance']}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train)
#gridsearch.best_params_

best_k = gridsearch.best_params_["n_neighbors"]
best_weights = gridsearch.best_params_["weights"]
bagged_knn = KNeighborsRegressor(n_neighbors=best_k, weights=best_weights)
bagging_model = BaggingRegressor(bagged_knn, n_estimators=100)

bagging_model.fit(X_train, y_train)
test_preds_grid = bagging_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds_grid)
test_rmse = sqrt(test_mse)
test_rmse

0.7753874526519906

In [143]:
#change in inventory seems to be the better predictor for change in homeless, although not "foolproof"

In [144]:
from sklearn.tree import DecisionTreeRegressor 
  
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 100) 
  
# fit the regressor with X and Y data
regressor.fit(X_train, y_train)

In [145]:
y_pred = regressor.predict(X_test)

In [146]:
y_pred

array([ 0.5003167,  0.1131387, -0.011299 ,  0.1131387,  0.0593077,
        0.1496026,  0.1233655,  0.2446725,  0.5003167, -0.060086 ,
       -0.119933 ])

In [147]:
y_test

array([ 2.1640280e-01,  7.6458800e-02,  1.2996530e-01,  7.0012240e-01,
        3.4605180e-01,  2.4598780e+00, -2.2440000e-03,  1.4822344e+00,
        1.5757730e-01,  2.1687730e-01, -4.5383000e-02])

In [148]:
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = sqrt(test_mse)
test_rmse

0.831926923042276