In [1]:
import pandas as pd
import cvxpy as cp
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
#helper function
def read_file(filename):
    data = []
    for i in range(len(filename.sheet_names)):
            data.append(pd.read_excel(filename,i))
    return data

In [3]:
xls = pd.ExcelFile('Cleaned Data/PIT_State_vF.xlsx')
PIT_State_vF = read_file(xls)
#PIT_State_vF[0]

In [4]:
xls = pd.ExcelFile('Cleaned Data/HIC_State_vF.xlsx')
HIC_State_vF = read_file(xls)
#HIC_State_vF[0].head()
#HIC_State_vF[0]["State"].all() == PIT_State_vF[0]["State"].all()

In [5]:
xls = pd.ExcelFile('Cleaned Data/PIT_State_Change_vF.xlsx')
PIT_State_Change_vF = read_file(xls)

## Change in Homelessness vs. Inventory Counts

In [6]:
#copy = pd.concat([PIT_State_vF[0]["Overall Homeless, 2022"], HIC_State_vF[0]], axis = 1)
copy = pd.concat([PIT_State_Change_vF[0]["Change in Total Homelessness, 2021-2022"], HIC_State_vF[0]], axis = 1)
#copy.head()

In [75]:
corr_mat = copy.corr(numeric_only = True)
corr_mat["Change in Total Homelessness, 2021-2022"].head()

Change in Total Homelessness, 2021-2022      1.000000
Total Year-Round Beds (ES, TH, SH)          -0.030940
Total Non-DV Year-Round Beds (ES, TH, SH)   -0.031155
Total HMIS Year-Round Beds (ES, TH, SH)     -0.030117
Total Year-Round Beds (ES)                  -0.031152
Name: Change in Total Homelessness, 2021-2022, dtype: float64

In [10]:
X = copy.drop(["State", "Change in Total Homelessness, 2021-2022", 
               "HMIS Participation Rate for Year-Round Beds (RRH)",
               "HMIS Participation Rate for Year-Round Beds (PSH)",
               "HMIS Participation Rate for Year-Round Beds (OPH)",
               "HMIS Participation Rate for Year-Round Beds (SH)",
               "HMIS Participation Rate for Year-Round Beds (TH)",
               "HMIS Participation Rate for Year-Round Beds (ES)",
               "HMIS Participation Rate for Year-Round Beds (ES, TH, SH)"], axis=1)
X = X.values
y = copy["Change in Total Homelessness, 2021-2022"]
y = y.values

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)

In [23]:
parameters = {"n_neighbors": range(1, 25), 'weights': ['uniform', 'distance']}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train)
#gridsearch.best_params_

best_k = gridsearch.best_params_["n_neighbors"]
best_weights = gridsearch.best_params_["weights"]
bagged_knn = KNeighborsRegressor(n_neighbors=best_k, weights=best_weights)
bagging_model = BaggingRegressor(bagged_knn, n_estimators=100)

bagging_model.fit(X_train, y_train)
test_preds_grid = bagging_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds_grid)
test_rmse = sqrt(test_mse)
test_rmse

173.6225341769681

## Change in Homelessness vs Change in Inventory Counts

In [60]:
if HIC_State_vF[0].columns.all() == HIC_State_vF[1].columns.all():
    new1 = HIC_State_vF[0].drop(["State", 
                   "HMIS Participation Rate for Year-Round Beds (RRH)",
                   "HMIS Participation Rate for Year-Round Beds (PSH)",
                   "HMIS Participation Rate for Year-Round Beds (OPH)",
                   "HMIS Participation Rate for Year-Round Beds (SH)",
                   "HMIS Participation Rate for Year-Round Beds (TH)",
                   "HMIS Participation Rate for Year-Round Beds (ES)",
                   "HMIS Participation Rate for Year-Round Beds (ES, TH, SH)"], axis=1)

    new2 = HIC_State_vF[1].drop(["State", 
                   "HMIS Participation Rate for Year-Round Beds (RRH)",
                   "HMIS Participation Rate for Year-Round Beds (PSH)",
                   "HMIS Participation Rate for Year-Round Beds (OPH)",
                   "HMIS Participation Rate for Year-Round Beds (SH)",
                   "HMIS Participation Rate for Year-Round Beds (TH)",
                   "HMIS Participation Rate for Year-Round Beds (ES)",
                   "HMIS Participation Rate for Year-Round Beds (ES, TH, SH)"], axis=1)

In [47]:
copy = pd.concat([PIT_State_Change_vF[0]["Change in Total Homelessness, 2021-2022"], new1-new2], axis = 1)

In [49]:
X = copy.drop(["Change in Total Homelessness, 2021-2022"], axis=1).values
y = copy["Change in Total Homelessness, 2021-2022"].values

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1421)

In [74]:
parameters = {"n_neighbors": range(1, 30), 'weights': ['uniform', 'distance']}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train)
#gridsearch.best_params_

best_k = gridsearch.best_params_["n_neighbors"]
best_weights = gridsearch.best_params_["weights"]
bagged_knn = KNeighborsRegressor(n_neighbors=best_k, weights=best_weights)
bagging_model = BaggingRegressor(bagged_knn, n_estimators=100)

bagging_model.fit(X_train, y_train)
test_preds_grid = bagging_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds_grid)
test_rmse = sqrt(test_mse)
test_rmse

19.287371997318097