In [172]:
import pandas as pd
import numpy as np


### Read Data

In [173]:
path = "C:/Users/katon/Documents/JHU/DataVisualization/final_project/data/final_data/"

# independent variables
population_df = pd.read_excel(path+"population.xlsx").set_index('State')
housing_df = pd.read_excel(path+"housing.xlsx").set_index('State')

# dependent variables
homeless_df = pd.read_excel(path+"homeless.xlsx").set_index('State')
hpi_df = pd.read_excel(path+"hpi.xlsx").set_index('State')



### Preprocessing

In [174]:
def common_indices(dataframes):
    common_idx = set(dataframes[0].index)
    for df in dataframes[1:]:
        common_idx = common_idx.intersection(df.index)
    return list(common_idx)

In [175]:
common_index = common_indices([homeless_df, housing_df, hpi_df, population_df])

In [176]:
housing_list, population_list, homeless_list, hpi_list, state_list = [], [], [], [], []
for state in common_index:
    for year in list(housing_df.columns):
        housing_list.append(housing_df.loc[state, year])
        population_list.append(population_df.loc[state, year])
        homeless_list.append(homeless_df.loc[state, year])
        hpi_list.append(hpi_df.loc[state, year])
        state_list.append(state)

In [177]:
homeless_input = pd.DataFrame([housing_list, population_list, hpi_list, state_list, homeless_list]).T
homeless_input.columns = ['housing', 'population', 'hpi', 'state', 'homeless']

hpi_input = pd.DataFrame([housing_list, population_list, state_list, hpi_list]).T
hpi_input.columns = ['housing', 'population', 'state', 'hpi']

In [178]:
# Prep hpi
hpi_ohe = pd.get_dummies(hpi_input['state'])
hpi_input_df = pd.concat([hpi_input[['housing', 'population', 'hpi']], hpi_ohe], axis=1)
hpi_input_df = hpi_input_df.astype(int)
hpiX, hpiY = hpi_input_df.drop('hpi', axis=1), hpi_input_df['hpi']


In [179]:
# prep homeless
homeless_ohe = pd.get_dummies(homeless_input['state'])
homeless_input_df = pd.concat([homeless_input[['housing', 'population', 'hpi', 'homeless']], homeless_ohe], axis=1)
homeless_input_df = homeless_input_df.astype(int)
homelessX, homelessY = homeless_input_df.drop('homeless', axis=1), homeless_input_df['homeless']

## Model Selection

Start with prediction model for HPI, use HPI prediction as feature for homeless prediction

In [180]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold

In [181]:
## Random Forest
hpi_model = RandomForestRegressor(n_estimators=100, random_state=42) # 0.94
homeless_model = RandomForestRegressor(n_estimators=100, random_state=42) # 0.98

In [182]:
# KNN
hpi_model = KNeighborsRegressor(n_neighbors=3) # 0.68
homeless_model = KNeighborsRegressor(n_neighbors=3) # 0.98

In [193]:
# XGBoost
hpi_model = XGBRegressor() # 0.93
homeless_model = XGBRegressor() # 0.99

### HPI Model

In [194]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(hpi_model, hpiX, hpiY, cv=cv, scoring='r2')
np.mean(cv_scores)

0.939423040244382

A simple RF Regression model using housing and population data achieved a very solid R-squared score of 0.94, indicating that 94% of the variability observed in the target variable (HPI) is explained by the regression model.

Next, the homelessness prediction model is developed using housing, population, and HPI as data points.

In [195]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(homeless_model, homelessX, homelessY, cv=cv, scoring='r2')
np.mean(cv_scores)

0.9905697873709001

The homelessness model achieved comparable accuracy to the hpi model

### Making Predictions

The purpose of these models is to make predictions of future HPI and homelessness numbers based on a user inputting the number of houses constructed per year. In order to have this input be done within a Tableau dashboard, the predictions will be premade. This way, the model with not be required each time to make predictions, the dashboard will simply pull from a dataframe

In [196]:
# First, fit both models
hpi_model.fit(hpiX, hpiY)
homeless_model.fit(homelessX, homelessY)


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [204]:
hpiX.columns

Index(['housing', 'population', 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC',
       'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA',
       'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ',
       'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'],
      dtype='object')

In [197]:
state = 'CA'

In [198]:
housing_construction = housing_df.copy()
for year in housing_construction.columns[1:]:
    housing_construction[year] = housing_df[year] - housing_df[year-1]
mean_units_per_year = np.mean(housing_construction.iloc[:, 1:], axis=1).astype(int)

In [209]:
percentages = [25, 50, 75, 100, 125, 150, 175, 200]
years = list(range(2023, 2030))

mean_state_units = mean_units_per_year[state]

state_hpi_df = pd.DataFrame(index=percentages, columns=years)
state_homeless_df = pd.DataFrame(index=percentages, columns=years)

for year in state_hpi_df.columns:
    for percent in state_hpi_df.index:
        housing_val = int(housing_df.loc[state, 2021] + ((year-2021) * ((percent/100) * mean_units_per_year[state]))) # housing var
        population_val = population_df.loc[state, year] # population var 
        
        # Predict HPI value
        hpi_in = pd.DataFrame(columns=hpiX.columns)
        hpi_in.loc[0, :] = 0
        hpi_in.loc[0, 'housing'] = housing_val
        hpi_in.loc[0, 'population'] = population_val
        hpi_in.loc[0, state] = 1
        # make prediction
        hpi_prediction = hpi_model.predict(hpi_in)
        state_hpi_df.loc[percent, year] = hpi_prediction
        
        # Predict homeless value
        homeless_in = pd.DataFrame(columns=homelessX.columns)
        homeless_in.loc[0,:] = 0
        homeless_in.loc[0, 'housing'] = housing_val
        homeless_in.loc[0, 'population'] = population_val
        homeless_in.loc[0, 'hpi'] = hpi_prediction
        homeless_in.loc[0, state] = 1
        # make prediction
        homeless_prediction = homeless_model.predict(homeless_in)
        state_homeless_df.loc[percent, year] = homeless_prediction


ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`. Invalid columns:housing, population, hpi, AK, AL, AR, AZ, CA, CO, CT, DC, DE, FL, GA, HI, IA, ID, IL, IN, KS, KY, LA, MA, MD, ME, MI, MN, MO, MS, MT, NC, ND, NE, NH, NJ, NM, NV, NY, OH, OK, OR, PA, RI, SC, SD, TN, TX, UT, VA, VT, WA, WI, WV, WY

In [136]:
state_homeless_df

Unnamed: 0,2023,2024,2025,2026,2027,2028,2029
25,[159786.66666666666],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334]
50,[159786.66666666666],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334]
75,[159786.66666666666],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334]
100,[159786.66666666666],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334]
125,[159786.66666666666],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334]
150,[159786.66666666666],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334]
175,[159786.66666666666],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334]
200,[159786.66666666666],[159786.66666666666],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334],[147599.33333333334]


In [208]:
population_df.loc[state,:]

2010    3.731950e+07
2011    3.763837e+07
2012    3.794880e+07
2013    3.826079e+07
2014    3.859697e+07
2015    3.891804e+07
2016    3.916712e+07
2017    3.935850e+07
2018    3.946159e+07
2019    3.951222e+07
2020    3.950165e+07
2021    3.914299e+07
2022    3.902934e+07
2023    3.936059e+07
2024    3.969184e+07
2025    4.002308e+07
2026    4.035433e+07
2027    4.068558e+07
2028    4.101683e+07
2029    4.134808e+07
2030    4.167932e+07
2031    4.195091e+07
2032    4.222250e+07
2033    4.249409e+07
2034    4.276568e+07
2035    4.303727e+07
2036    4.330885e+07
2037    4.358044e+07
2038    4.385203e+07
2039    4.412362e+07
2040    4.439521e+07
Name: CA, dtype: float64

In [None]:
state_hpi_df