# Modeling
Goal: Use model to predict vacancy rates for 2019-2020

In [106]:
#imports
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV, learning_curve, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import cross_val_score
import datetime
from pandas_profiling import ProfileReport
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima_model import ARMA
from xgboost import XGBClassifier

## Introduction
In this notebook, we will make the final predictions for vacancy rate for the years 2019-2020 using the model created in notebook 4.3

In [107]:
#load data
path= '/Users/josephfrasca/Coding_Stuff/Springboard/Capstone_2/data/processed'
os.chdir(path) 
# subset data into 2014-2018 to create model and 2019-2020 to do final predictions of vacancy rates during that time
df = pd.read_csv('VacancyRate_Zillow_2014_2018')
df19_20 = pd.read_csv('VacancyRate_Zillow_2019_2020')

In [108]:
df

Unnamed: 0,Zipcode,RentPrice,Year,SizeRank,HomePrice,Vacancy_Rate%,State_AL,State_AR,State_AZ,State_CA,...,CountyName_Weber County,CountyName_Weld County,CountyName_Westchester County,CountyName_Will County,CountyName_Williamson County,CountyName_Wilson County,CountyName_Worcester County,CountyName_Yamhill County,CountyName_Yolo County,CountyName_York County
0,10025,3041.83,2014,0.0,968761.75,9.011810,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60657,1589.42,2014,1.0,450755.75,8.042922,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10023,3186.67,2014,2.0,1024543.17,19.964756,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,77494,1807.33,2014,3.0,322032.00,3.319292,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60614,1786.25,2014,4.0,580250.92,8.468203,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16143,2110,4643.58,2018,14752.0,1363870.08,17.412045,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16144,20004,2432.25,2018,15149.0,480942.83,21.036585,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16145,80951,1537.18,2018,15318.0,276619.83,1.084746,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16146,11964,20122.17,2018,17169.0,1000069.25,62.044105,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [109]:
#add leading zeros to zipcode
df['Zipcode'] = df['Zipcode'].apply(lambda x: '{0:0>5}'.format(x))
df19_20['Zipcode'] = df19_20['Zipcode'].apply(lambda x: '{0:0>5}'.format(x))

In [110]:
#change Zipcode from ojbect to integer
df.Zipcode = df.Zipcode.astype('int')
df19_20.Zipcode = df19_20.Zipcode.astype('int')
df.dtypes

Zipcode                          int64
RentPrice                      float64
Year                             int64
SizeRank                       float64
HomePrice                      float64
                                ...   
CountyName_Wilson County         int64
CountyName_Worcester County      int64
CountyName_Yamhill County        int64
CountyName_Yolo County           int64
CountyName_York County           int64
Length: 1754, dtype: object

In [111]:
df

Unnamed: 0,Zipcode,RentPrice,Year,SizeRank,HomePrice,Vacancy_Rate%,State_AL,State_AR,State_AZ,State_CA,...,CountyName_Weber County,CountyName_Weld County,CountyName_Westchester County,CountyName_Will County,CountyName_Williamson County,CountyName_Wilson County,CountyName_Worcester County,CountyName_Yamhill County,CountyName_Yolo County,CountyName_York County
0,10025,3041.83,2014,0.0,968761.75,9.011810,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60657,1589.42,2014,1.0,450755.75,8.042922,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10023,3186.67,2014,2.0,1024543.17,19.964756,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,77494,1807.33,2014,3.0,322032.00,3.319292,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60614,1786.25,2014,4.0,580250.92,8.468203,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16143,2110,4643.58,2018,14752.0,1363870.08,17.412045,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16144,20004,2432.25,2018,15149.0,480942.83,21.036585,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16145,80951,1537.18,2018,15318.0,276619.83,1.084746,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16146,11964,20122.17,2018,17169.0,1000069.25,62.044105,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Load Model

In [151]:
#define variable X, y
X = df.drop('Vacancy_Rate%', axis=1)
y = df['Vacancy_Rate%']

In [152]:
#split data for test purposes
tss = TimeSeriesSplit(n_splits = 5)
for train_index, test_index in tss.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [159]:
loaded_model.score?

In [153]:
#load model
path= '/Users/josephfrasca/Coding_Stuff/Springboard/Capstone_2/models'
os.chdir(path)
filename = 'random_forest_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))
#test it worked
result = loaded_model.score(X_test, y_test)
print(result)

0.922208934205951


## Refit Model On All Available Data

In [154]:
loaded_model.fit(X, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [160]:
cross_validate?

In [161]:
#perform 5 fold cross validation on all available data
cv_results = cross_validate(loaded_model, X, y, cv=5, n_jobs=-1)

In [162]:
#get R2 scores from cross validation
cv_scores = cv_results['test_score']
cv_scores

array([0.91043894, 0.96673606, 0.97035355, 0.96788022, 0.92274149])

In [163]:
#get mean/std of cv test scores
np.mean(cv_scores), np.std(cv_scores)

(0.9476300529044828, 0.025667430198232713)

In [164]:
#get mean/std of mae 
mae_mean, mae_std = np.mean(-1 * cv_results['test_score']), np.std(-1 * cv_results['test_score'])
mae_mean, mae_std

(-0.9476300529044828, 0.025667430198232713)

## Calculate Expected Vacacny Rate for 2019-2020 From The Model

In [112]:
df19_20.isna().sum()

Zipcode             0
RentPrice           0
Year                0
SizeRank            8
State               6
City                6
Metro               6
CountyName          6
HomePrice           8
Vacancy_Rate%    6486
dtype: int64

In [113]:
df19_20.dropna(subset=['SizeRank', 'State', 'City', 'Metro', 'CountyName', 'HomePrice'], inplace=True)

In [114]:
#create X and y variables for years 2019-2020
X_19_20 = df19_20.drop('Vacancy_Rate%', axis=1)
y_19_20 = df19_20['Vacancy_Rate%']

In [115]:
#turn categorical variables into binary for predicting with get dummies
X_19_20_dummy = pd.get_dummies(X_19_20)

In [116]:
#loaded_model.score(X_19_20, y_19_20)

In [117]:
pred_19_20 = loaded_model.predict(X_19_20_dummy)

In [118]:
pred_19_20

array([12.35378661,  8.10539127, 18.87223043, ...,  2.22375246,
       59.34922924, 15.03991302])

In [127]:
X_19_20['Vacancy_Rate%'] = pred_19_20
predictions_2019_20 = X_19_20
predictions_2019_20

Unnamed: 0,Zipcode,RentPrice,Year,SizeRank,State,City,Metro,CountyName,HomePrice,Vacancy_Rate%
0,10025,3241.00,2019,0.0,NY,New York,New York-Newark-Jersey City,New York County,1072056.17,12.353787
1,60657,1784.58,2019,1.0,IL,Chicago,Chicago-Naperville-Elgin,Cook County,490717.08,8.105391
2,10023,3354.83,2019,2.0,NY,New York,New York-Newark-Jersey City,New York County,1053399.33,18.872230
3,77494,1795.08,2019,3.0,TX,Katy,Houston-The Woodlands-Sugar Land,Harris County,341633.50,3.870010
4,60614,2026.42,2019,4.0,IL,Chicago,Chicago-Naperville-Elgin,Cook County,639591.75,8.117381
...,...,...,...,...,...,...,...,...,...,...
6481,2110,4408.57,2020,14752.0,MA,Boston,Boston-Cambridge-Newton,Suffolk County,1339232.44,19.755999
6482,20004,2505.56,2020,15149.0,DC,Washington,Washington-Arlington-Alexandria,District of Columbia,497022.00,25.782105
6483,80951,1647.88,2020,15318.0,CO,Colorado Springs,Colorado Springs,El Paso County,315486.22,2.223752
6484,11964,15800.50,2020,17169.0,NY,Town of Shelter Island,New York-Newark-Jersey City,Suffolk County,1015162.00,59.349229


## Explore 2019-2020 Vacancy Rate Data

In [130]:
predictions_2019_20.State.nunique()

43

In [136]:
predictions_2019 = predictions_2019_20[predictions_2019_20.Year == 2019]
predictions_2020 = predictions_2019_20[predictions_2019_20.Year == 2020]
predictions_2019

Unnamed: 0,Zipcode,RentPrice,Year,SizeRank,State,City,Metro,CountyName,HomePrice,Vacancy_Rate%
0,10025,3241.00,2019,0.0,NY,New York,New York-Newark-Jersey City,New York County,1072056.17,12.353787
1,60657,1784.58,2019,1.0,IL,Chicago,Chicago-Naperville-Elgin,Cook County,490717.08,8.105391
2,10023,3354.83,2019,2.0,NY,New York,New York-Newark-Jersey City,New York County,1053399.33,18.872230
3,77494,1795.08,2019,3.0,TX,Katy,Houston-The Woodlands-Sugar Land,Harris County,341633.50,3.870010
4,60614,2026.42,2019,4.0,IL,Chicago,Chicago-Naperville-Elgin,Cook County,639591.75,8.117381
...,...,...,...,...,...,...,...,...,...,...
3238,2110,4718.58,2019,14752.0,MA,Boston,Boston-Cambridge-Newton,Suffolk County,1364140.92,19.498783
3239,20004,2454.67,2019,15149.0,DC,Washington,Washington-Arlington-Alexandria,District of Columbia,483746.17,25.800959
3240,80951,1603.91,2019,15318.0,CO,Colorado Springs,Colorado Springs,El Paso County,292574.83,2.097933
3241,11964,17439.00,2019,17169.0,NY,Town of Shelter Island,New York-Newark-Jersey City,Suffolk County,1019944.42,59.349229


In [125]:
#need to go back and deal with NaNs better (drop only all NaN), groupby year, and then linear fill?
#need to get rental data from ACS...

In [138]:
predictions_2020.sort_values('Vacancy_Rate%', ascending=False)

Unnamed: 0,Zipcode,RentPrice,Year,SizeRank,State,City,Metro,CountyName,HomePrice,Vacancy_Rate%
5455,34747,1575.78,2020,4108.0,FL,Citrus Ridge,Orlando-Kissimmee-Sanford,Osceola County,277538.78,68.278107
6464,11978,40654.00,2020,11537.0,NY,Westhampton Beach,New York-Newark-Jersey City,Suffolk County,1171179.44,65.945675
6396,89109,2030.67,2020,9205.0,NV,Las Vegas,Las Vegas-Henderson-Paradise,Clark County,303027.67,64.274854
6213,34242,3119.20,2020,6962.0,FL,Siesta Key,North Port-Sarasota-Bradenton,Sarasota County,635396.78,63.762117
6484,11964,15800.50,2020,17169.0,NY,Town of Shelter Island,New York-Newark-Jersey City,Suffolk County,1015162.00,59.349229
...,...,...,...,...,...,...,...,...,...,...
3741,80504,1966.56,2020,653.0,CO,Longmont,Boulder,Boulder County,452064.56,2.587632
6280,55346,2002.75,2020,7483.0,MN,Eden Prairie,Minneapolis-St. Paul-Bloomington,Hennepin County,373124.00,2.518589
5991,75022,1988.44,2020,5835.0,TX,Flower Mound,Dallas-Fort Worth-Arlington,Denton County,491887.44,2.497193
4737,20120,2110.56,2020,2403.0,VA,Centreville,Washington-Arlington-Alexandria,Fairfax County,515180.00,2.461074


In [150]:
state_groups = predictions_2020.groupby(['State']).mean()
state_groups.sort_values('Vacancy_Rate%', ascending=False)

Unnamed: 0_level_0,Zipcode,RentPrice,Year,SizeRank,HomePrice,Vacancy_Rate%
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
FL,33397.909287,1714.403175,2020.0,3686.62419,282784.038531,17.378259
LA,70253.7,1452.822,2020.0,3564.0,335594.409,16.610986
NV,89108.153846,1416.171538,2020.0,3021.019231,295219.265,13.629311
RI,2880.3,1702.075,2020.0,3805.1,339608.178,12.638598
AZ,85278.268116,1480.093551,2020.0,2998.152174,318899.116739,12.470624
NY,11001.468085,4041.434894,2020.0,3378.06383,748244.463262,11.979085
DC,20013.555556,2241.463889,2020.0,3770.5,688318.988333,10.348554
OH,44176.980392,1137.472941,2020.0,2737.039216,178799.431373,10.109677
DE,19780.625,1413.08125,2020.0,3554.0,217653.16625,9.910733
PA,18388.402597,1499.048701,2020.0,3471.805195,288865.351818,9.908845


In [None]:
#get average and std by year
#reconstruct categorical variables from dummy variables
#compare to 2014-2019 (should concat two data frames)
#look at highest/lowest vacancy rate by state, county, city, metro area etc.
#.... maybe just list this in further work b/c time is short...find areas to invest (rent/price ratios adjusted for vacancy).

##NOTE THIS IS NOT REPRESENTATIVE OF ALL ZIPCODES B/C ZILLOW ONLY HAD ~3000 zipcodes with rent prices vs. ~33k with home prices

## Summary

## Further Work

In [None]:
#need to go back and deal with NaNs better (drop only all NaN), groupby year, and then linear fill?
#need to get rental data from ACS...

#add in other zipcodes (get rent data from ACS)
#find areas to invest (rent/price ratios adjusted for vacancy).
    #add other variables to find areas to invest (ie. crime rates, unemployment rates, etc.)
