In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

## Pull CSV Files

In [2]:
zori = pd.read_csv('./data/cleandata/clean_zori.csv').drop('Unnamed: 0', axis = 1)
hpi = pd.read_csv('./data/cleandata/clean_hpi.csv').drop('Unnamed: 0', axis = 1)
zhvi = pd.read_csv('./data/cleandata/clean_zhvi.csv').drop('Unnamed: 0', axis = 1)
airq = pd.read_csv('./data/cleandata/clean_airq.csv').drop('Unnamed: 0', axis = 1)
population = pd.read_csv('./data/cleandata/clean_population.csv').drop('Unnamed: 0', axis = 1)
unemployment = pd.read_csv('./data/cleandata/clean_unemployment.csv').drop('Unnamed: 0', axis = 1)
education = pd.read_csv('./data/cleandata/clean_education.csv').drop('Unnamed: 0', axis = 1)
permits = pd.read_csv('./data/cleandata/clean_permits.csv').drop('Unnamed: 0', axis = 1)
IandH = pd.read_csv('./data/cleandata/clean_IandH.csv').drop('Unnamed: 0', axis = 1)
pce = pd.read_csv('./data/cleandata/clean_pce.csv').drop('Unnamed: 0', axis = 1)
vacancy = pd.read_csv('./data/cleandata/clean_vacancy.csv').drop('Unnamed: 0', axis = 1)
jobs = pd.read_csv('./data/cleandata/clean_jos.csv').drop('Unnamed: 0', axis = 1)

## Intermediate Matrices for Merging Dataframes before Training

In [3]:
one_year_forecast = zori[['Year', 'Month', 'Year_Month']]
one_year_forecast = pd.concat([one_year_forecast, pd.DataFrame(one_year_forecast['Year'].map(lambda year: year - 1))], 
                             axis = 1)
one_year_forecast.columns = ['Year', 'Month', 'Year_Month', 'Year2']
one_year_forecast = pd.concat([one_year_forecast, pd.DataFrame(one_year_forecast['Year'].map(lambda year: year - 2))], 
                             axis = 1)
one_year_forecast.columns = ['Year', 'Month', 'Year_Month', 'Year2', 'Year3']
one_year_forecast['Year_Month2'] = one_year_forecast['Year2'].map(str) + '_' + one_year_forecast['Month'].map(str)
one_year_forecast['Year_Month3'] = one_year_forecast['Year3'].map(str) + '_' + one_year_forecast['Month'].map(str)
one_year_forecast = one_year_forecast[['Year_Month', 'Year_Month2', 'Year_Month3']]
one_year_forecast.drop_duplicates(inplace = True)

In [4]:
one_year_forecast

Unnamed: 0,Year_Month,Year_Month2,Year_Month3
0,2014_2,2013_2,2012_2
1,2014_3,2013_3,2012_3
2,2014_4,2013_4,2012_4
3,2014_5,2013_5,2012_5
4,2014_6,2013_6,2012_6
...,...,...,...
75,2020_5,2019_5,2018_5
76,2020_6,2019_6,2018_6
77,2020_7,2019_7,2018_7
78,2020_8,2019_8,2018_8


## Final Dataframe for Training

In [5]:
df = zori[zori['ZORI'] <= 4000]
df = pd.merge(df, one_year_forecast, on = 'Year_Month')
df = pd.merge(df, zori[['Year_Month', 'ZipCode', 'ZORI']], left_on = ['Year_Month2', 'ZipCode'],
              right_on = ['Year_Month', 'ZipCode'], how = 'left')
df.drop('Year_Month_y', axis = 1, inplace = True)

In [6]:
df = pd.merge(df, zhvi[['Year_Month', 'ZipCode', 'ZHVI']], left_on = ['Year_Month2', 'ZipCode'],
              right_on = ['Year_Month', 'ZipCode'], how = 'left')
df.drop('Year_Month', axis = 1, inplace = True)

In [7]:
df = pd.merge(df, hpi[['Year_Month', 'ZipCode', 'HPI']], left_on = ['Year_Month3', 'ZipCode'],
              right_on = ['Year_Month', 'ZipCode'], how = 'left')
df.drop('Year_Month', axis = 1, inplace = True)

In [8]:
df = pd.merge(df, airq[['County', 'State', 'Year_Month', 'AQI']], left_on = ['County', 'State', 'Year_Month2'],
              right_on = ['County', 'State', 'Year_Month'], how = 'left')
df.drop('Year_Month', axis = 1, inplace = True)

In [9]:
df = pd.merge(df, population[['County', 'State', 'Year_Month', 'Population']], 
              left_on = ['County', 'State', 'Year_Month3'], right_on = ['County', 'State', 'Year_Month'], how = 'left')
df.drop('Year_Month', axis = 1, inplace = True)

In [10]:
df = pd.merge(df, unemployment[['County', 'State', 'Year_Month', 'Unemployment']], 
             left_on = ['County', 'State', 'Year_Month2'], right_on = ['County', 'State', 'Year_Month'], how = 'left')
df.drop('Year_Month', axis = 1, inplace = True)

In [11]:
df['Year2'] = df['Year'] - 1
df['Year3'] = df['Year'] - 2
df = pd.merge(df, education, left_on = ['County', 'State', 'Year3'], right_on = ['County', 'State', 'Year'], how = 'left')
df.drop('Year_y', axis = 1, inplace = True)

In [12]:
df = pd.merge(df, permits[['State', 'Units', 'Year_Month']], left_on = ['State', 'Year_Month2'], 
              right_on = ['State', 'Year_Month'], how = 'left')
df.drop('Year_Month', axis = 1, inplace = True)

In [13]:
df = pd.merge(df, IandH[['County', 'State', 'Year_Month', 'Total_Households', 'Med_income']], 
             left_on = ['County', 'State', 'Year_Month3'], right_on = ['County', 'State', 'Year_Month'], how = 'left')
df.drop('Year_Month', axis = 1, inplace = True)

In [14]:
df = pd.merge(df, pce[['PCE', 'Year_Month']], left_on = 'Year_Month2', right_on = 'Year_Month')
df.drop('Year_Month', axis = 1, inplace = True)

In [15]:
df = pd.merge(df, vacancy, left_on = ['County', 'State', 'Year3'], right_on = ['County', 'State', 'Year'], how = 'left')
df.drop('Year', axis = 1, inplace = True)

In [16]:
df = pd.merge(df, jobs[['State', 'Job Openings', 'Year_Month']], left_on = ['State', 'Year_Month2'], 
              right_on = ['State', 'Year_Month'], how = 'left')
df.drop('Year_Month', axis = 1, inplace = True)

In [17]:
df = df[['Year_x', 'Month', 'ZipCode', 'ZORI_x', 'ZORI_y', 'ZHVI', 'HPI', 'AQI', 'Population', 'Unemployment', 
         'Percent Bachelors', 'Units', 'Total_Households', 'Med_income', 'PCE', 'Rental Vacancy Rate', 'Job Openings']]

In [18]:
df.columns = ['Year', 'Month', 'ZipCode', 'ZORI', 'ZORI_lagged_1', 'ZHVI_lagged_1', 'HPI_lagged_2', 'AQI_lagged_1',
              'Population_lagged_2', 'Unemployment_lagged_1', 'Percent Bachelors_lagged_2', 'Permits_lagged_1',
              'Total_Households_lagged_2', 'Med_Income_lagged_2', 'PCE_lagged_1', 'Rental Vacancy Rate_lagged_2',
              'Job Openings_lagged_1']

In [19]:
df_clean = df.drop('ZORI_lagged_1', axis = 1)

In [20]:
df_clean.isnull().sum(axis = 0) 

Year                                0
Month                               0
ZipCode                             0
ZORI                                0
ZHVI_lagged_1                    1054
HPI_lagged_2                    17192
AQI_lagged_1                    19188
Population_lagged_2              8690
Unemployment_lagged_1            7458
Percent Bachelors_lagged_2       8690
Permits_lagged_1                   80
Total_Households_lagged_2        8690
Med_Income_lagged_2              8690
PCE_lagged_1                        0
Rental Vacancy Rate_lagged_2     8943
Job Openings_lagged_1            1445
dtype: int64

In [21]:
# Dropping all NaN values instead of imputing for simplicity for now
df_clean = df_clean[~df_clean['Percent Bachelors_lagged_2'].isnull()]
df_clean = df_clean[~df_clean['HPI_lagged_2'].isnull()]
df_clean = df_clean[~df_clean['AQI_lagged_1'].isnull()]
df_clean = df_clean[~df_clean['ZHVI_lagged_1'].isnull()]
df_clean = df_clean[~df_clean['Rental Vacancy Rate_lagged_2'].isnull()]

In [22]:
df_clean.isnull().sum(axis = 0) 

Year                            0
Month                           0
ZipCode                         0
ZORI                            0
ZHVI_lagged_1                   0
HPI_lagged_2                    0
AQI_lagged_1                    0
Population_lagged_2             0
Unemployment_lagged_1           0
Percent Bachelors_lagged_2      0
Permits_lagged_1                0
Total_Households_lagged_2       0
Med_Income_lagged_2             0
PCE_lagged_1                    0
Rental Vacancy Rate_lagged_2    0
Job Openings_lagged_1           0
dtype: int64

In [23]:
df_clean

Unnamed: 0,Year,Month,ZipCode,ZORI,ZHVI_lagged_1,HPI_lagged_2,AQI_lagged_1,Population_lagged_2,Unemployment_lagged_1,Percent Bachelors_lagged_2,Permits_lagged_1,Total_Households_lagged_2,Med_Income_lagged_2,PCE_lagged_1,Rental Vacancy Rate_lagged_2,Job Openings_lagged_1
1,2014,2,60657,1583.0,393330.0,141.503229,53.928571,5.219456e+06,10.2,20.8,155.0,1.920079e+06,51037.342494,1.553716,6.7,149.0
4,2014,2,60614,1752.0,503651.0,130.588399,53.928571,5.219456e+06,10.2,20.8,155.0,1.920079e+06,51037.342494,1.553716,6.7,149.0
5,2014,2,77449,1288.0,117563.0,115.806934,48.071429,4.192941e+06,6.3,18.0,4566.0,1.421905e+06,50987.989729,1.553716,10.7,353.0
7,2014,2,77084,1123.0,120268.0,112.776241,48.071429,4.192941e+06,6.3,18.0,4566.0,1.421905e+06,50987.989729,1.553716,10.7,353.0
8,2014,2,79936,992.0,116709.0,143.575982,57.285714,8.218877e+05,8.3,14.3,4566.0,2.578420e+05,39700.633106,1.553716,7.0,353.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247035,2020,9,30288,1253.0,134851.0,88.079328,46.200000,2.823235e+05,3.8,14.4,1059.0,9.380716e+04,44035.999593,1.721905,9.4,229.0
247037,2020,9,2466,2164.0,960437.0,205.642620,34.066667,1.590017e+06,2.2,28.4,371.0,6.047694e+05,94031.051916,1.721905,2.8,173.0
247039,2020,9,7732,1607.0,307845.0,198.179098,36.600000,6.237565e+05,3.0,29.7,2475.0,2.350342e+05,94123.195905,1.721905,3.6,191.0
247041,2020,9,85701,1015.0,273317.0,198.579601,46.033333,1.026470e+06,4.3,18.5,667.0,3.965703e+05,50954.820177,1.721905,6.4,164.0


In [24]:
train_idx = np.random.choice(range(df_clean.shape[0]), round(df_clean.shape[0] * 0.7), replace = False)

In [25]:
train = df_clean.iloc[train_idx, :]
test = df_clean.iloc[list(set(range(df_clean.shape[0])) - set(train_idx)), :]

## Multiple Linear Regression on ZORI

In [26]:
regression = LinearRegression()
regression.fit(train.iloc[:, 4:], train.iloc[:, 3])
regression.score(train.iloc[:, 4:], train.iloc[:, 3])

0.7436702378242114

In [27]:
r2_score(test.iloc[:, 3], regression.predict(test.iloc[:, 4:]))

0.7423574003274593

## Random Forest Regressor on ZORI

In [28]:
randomForest = ensemble.RandomForestRegressor()

In [29]:
grid_para_forest = [{
    "n_estimators": [10],
    "criterion": ["mse"],
    "min_samples_leaf": [10],
    "min_samples_split": [30],
    "max_depth": [5],
    "random_state": [42]}]

grid_search_forest = GridSearchCV(randomForest, grid_para_forest, cv = 5, n_jobs = -1, return_train_score = True)
grid_search_forest.fit(train.iloc[:, 4:], train.iloc[:, 3])

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse'], 'max_depth': [5],
                          'min_samples_leaf': [10], 'min_samples_split': [30],
                          'n_estimators': [10], 'random_state': [42]}],
             return_train_score=True)

In [30]:
pd.DataFrame.from_dict(grid_search_forest.best_params_, orient = 'index', columns = ['hyperparameter'])

Unnamed: 0,hyperparameter
criterion,mse
max_depth,5
min_samples_leaf,10
min_samples_split,30
n_estimators,10
random_state,42


In [31]:
grid_search_forest.cv_results_['mean_train_score'][0]

0.7929299903552405

In [32]:
grid_search_forest.score(test.iloc[:, 4:], test.iloc[:, 3])

0.7880928961493116

## Multiple Linear Regression on % Change in ZORI

In [33]:
df_change = df.drop('ZORI_lagged_1', axis = 1).copy()
df_change['Year_Month'] = df_change['Year'].map(str) + '_' + df_change['Month'].map(str)
df_change = pd.merge(df_change, one_year_forecast, on = 'Year_Month')
df_change = pd.merge(df_change, df_change, left_on = ['Year_Month2', 'ZipCode'], right_on = ['Year_Month', 'ZipCode'],
                     how = 'left')

In [34]:
df_change['ZORI_x'] = (df_change['ZORI_x'] - df_change['ZORI_y']) / df_change['ZORI_y']
df_change['ZHVI_lagged_1_x'] = (df_change['ZHVI_lagged_1_x'] - df_change['ZHVI_lagged_1_y']) / df_change['ZHVI_lagged_1_y']
df_change['HPI_lagged_2_x'] = (df_change['HPI_lagged_2_x'] - df_change['HPI_lagged_2_y']) / df_change['HPI_lagged_2_y']
df_change['AQI_lagged_1_x'] = (df_change['AQI_lagged_1_x'] - df_change['AQI_lagged_1_y']) / df_change['AQI_lagged_1_y']
df_change['Population_lagged_2_x'] = (df_change['Population_lagged_2_x'] - df_change['Population_lagged_2_y']) /\
                                     df_change['Population_lagged_2_y']
df_change['Unemployment_lagged_1_x'] = (df_change['Unemployment_lagged_1_x'] - df_change['Unemployment_lagged_1_y']) /\
                                        df_change['Unemployment_lagged_1_y']
df_change['Percent Bacehlors_lagged_2_x'] = (df_change['Percent Bachelors_lagged_2_x'] -\
                                            df_change['Percent Bachelors_lagged_2_y']) /\
                                            df_change['Percent Bachelors_lagged_2_y']
df_change['Permits_lagged_1_x'] = (df_change['Permits_lagged_1_x'] - df_change['Permits_lagged_1_y']) /\
                                  df_change['Permits_lagged_1_y']
df_change['Total_Households_lagged_2_x'] = (df_change['Total_Households_lagged_2_x'] -\
                                           df_change['Total_Households_lagged_2_y']) /\
                                           df_change['Total_Households_lagged_2_y']
df_change['Med_Income_lagged_2_x'] = (df_change['Med_Income_lagged_2_x'] - df_change['Med_Income_lagged_2_y']) /\
                                      df_change['Med_Income_lagged_2_y']
df_change['PCE_lagged_1_x'] = (df_change['PCE_lagged_1_x'] - df_change['PCE_lagged_1_y']) / df_change['PCE_lagged_1_y'] 
df_change['Rental Vacancy Rate_lagged_2_x'] = (df_change['Rental Vacancy Rate_lagged_2_x'] -\
                                              df_change['Rental Vacancy Rate_lagged_2_y']) /\
                                              df_change['Rental Vacancy Rate_lagged_2_y']
df_change['Job Openings_lagged_1_x'] = (df_change['Job Openings_lagged_1_x'] - df_change['Job Openings_lagged_1_y']) /\
                                        df_change['Job Openings_lagged_1_y']

In [35]:
df_change = df_change[['Year_x', 'Month_x', 'ZipCode', 'ZORI_x', 'ZHVI_lagged_1_x', 'HPI_lagged_2_x', 'AQI_lagged_1_x', 
                      'Population_lagged_2_x', 'Unemployment_lagged_1_x', 'Percent Bachelors_lagged_2_x', 
                      'Permits_lagged_1_x', 'Total_Households_lagged_2_x', 'Med_Income_lagged_2_x', 'PCE_lagged_1_x', 
                      'Rental Vacancy Rate_lagged_2_x', 'Job Openings_lagged_1_x']]

In [36]:
df_change.columns = ['Year', 'Month', 'ZipCode', 'ZORI_delta', 'ZHVI_lagged_1_delta', 'HPI_lagged_2_delta', 
                     'AQI_lagged_1_delta', 'Population_lagged_2_delta', 'Unemployment_lagged_1_delta',
                     'Percent Bachelors_lagged_2_delta', 'Permits_lagged_1_delta', 'Total_Households_lagged_2_delta',
                     'Med_Income_lagged_2_delta', 'PCE_lagged_1_delta', 'Rental Vacancy Rate_lagged_2_delta',
                     'Job Openings_lagged_1_delta']

In [37]:
df_change = df_change[(df_change['Year'] != 2014) & ~((df_change['Year'] == 2015) & (df_change['Month'] == 1))]

In [38]:
df_change.isnull().sum(axis = 0)

Year                                      0
Month                                     0
ZipCode                                   0
ZORI_delta                             8666
ZHVI_lagged_1_delta                    9633
HPI_lagged_2_delta                    22958
AQI_lagged_1_delta                    24759
Population_lagged_2_delta             15937
Unemployment_lagged_1_delta           14914
Percent Bachelors_lagged_2_delta       7522
Permits_lagged_1_delta                 9265
Total_Households_lagged_2_delta       15937
Med_Income_lagged_2_delta             15937
PCE_lagged_1_delta                     8666
Rental Vacancy Rate_lagged_2_delta    16282
Job Openings_lagged_1_delta            9871
dtype: int64

In [39]:
# Dropping NaN values instead of imputing for now
df_change = df_change[~df_change['ZORI_delta'].isnull()]
df_change = df_change[~df_change['ZHVI_lagged_1_delta'].isnull()]
df_change = df_change[~df_change['HPI_lagged_2_delta'].isnull()]
df_change = df_change[~df_change['AQI_lagged_1_delta'].isnull()]
df_change = df_change[~df_change['Population_lagged_2_delta'].isnull()]
df_change = df_change[~df_change['Permits_lagged_1_delta'].isnull()]
df_change = df_change[~df_change['Rental Vacancy Rate_lagged_2_delta'].isnull()]

In [40]:
df_change.isnull().sum(axis = 0)

Year                                  0
Month                                 0
ZipCode                               0
ZORI_delta                            0
ZHVI_lagged_1_delta                   0
HPI_lagged_2_delta                    0
AQI_lagged_1_delta                    0
Population_lagged_2_delta             0
Unemployment_lagged_1_delta           0
Percent Bachelors_lagged_2_delta      0
Permits_lagged_1_delta                0
Total_Households_lagged_2_delta       0
Med_Income_lagged_2_delta             0
PCE_lagged_1_delta                    0
Rental Vacancy Rate_lagged_2_delta    0
Job Openings_lagged_1_delta           0
dtype: int64

In [41]:
(df_change == np.inf).sum(axis = 0)

Year                                    0
Month                                   0
ZipCode                                 0
ZORI_delta                              0
ZHVI_lagged_1_delta                     0
HPI_lagged_2_delta                      0
AQI_lagged_1_delta                      0
Population_lagged_2_delta               0
Unemployment_lagged_1_delta             0
Percent Bachelors_lagged_2_delta        0
Permits_lagged_1_delta                590
Total_Households_lagged_2_delta         0
Med_Income_lagged_2_delta               0
PCE_lagged_1_delta                      0
Rental Vacancy Rate_lagged_2_delta     99
Job Openings_lagged_1_delta             0
dtype: int64

In [42]:
# Drop np.inf values
df_change = df_change[df_change['Permits_lagged_1_delta'] != np.inf]
df_change = df_change[df_change['Rental Vacancy Rate_lagged_2_delta'] != np.inf]

In [43]:
train_change_idx = np.random.choice(range(df_change.shape[0]), round(df_change.shape[0] * 0.7), replace = False)

In [44]:
train_change = df_change.iloc[train_change_idx, :]
test_change = df_change.iloc[list(set(range(df_change.shape[0])) - set(train_change_idx)), :]

In [45]:
regression_change = LinearRegression()
regression.fit(train_change.iloc[:, 4:], train_change.iloc[:, 3])
regression.score(train_change.iloc[:, 4:], train_change.iloc[:, 3])

0.1556915452128823

In [46]:
r2_score(test_change.iloc[:, 3], regression.predict(test_change.iloc[:, 4:]))

0.15876474464239088