In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.datasets import make_regression

import sklearn.metrics

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Show Plotly figures in the system's default web browser.
pio.renderers.default = 'browser'

from urllib.request import urlopen
import json

# GeoJSON file (geojson-counties-fips.json) from the plotly/datasets repository.
COUNTIES_GEOJSON_URL = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'

# Download and parse the GeoJSON; `counties` is handed to px.choropleth at the end
# of the notebook.
with urlopen(COUNTIES_GEOJSON_URL) as response:
    counties = json.load(response)


### Reading in data with raw values
This mainly applies to the Redfin data: use the raw numbers rather than percentages.

In [2]:
# Load the full dataset and order it by county, then year, with a fresh 0..n-1 index.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# 'fulldataset.pkl' from a trusted source.
df = (
    pd.read_pickle('fulldataset.pkl')
    .sort_values(by=['county_fips', 'year'])
    .reset_index(drop=True)
)


In [3]:
# Keep the county/year keys together with the median sale price so the price can be
# turned into the model target in the following cells.
# `.copy()` makes `medsale` an independent frame: it is modified in later cells, and
# modifying a plain column selection of `df` raises pandas' SettingWithCopyWarning.
medsale = df[['county_fips', 'year', 'median_sale_price']].copy()
medsale

Unnamed: 0,county_fips,year,median_sale_price
0,01007,2012,72500.0
1,01007,2015,180000.0
2,01007,2016,144486.5
3,01007,2017,99648.5
4,01007,2018,156000.0
...,...,...,...
15012,55141,2017,114050.0
15013,55141,2018,126250.0
15014,55141,2019,128200.0
15015,55141,2020,120000.0


In [4]:
# Latest year present in the dataset as loaded.
df['year'].max()

2021

In [5]:
# Columns removed from the feature frame. The sale price itself was already copied
# into `medsale` and comes back later as the log target.
price_cols = ['annual_change_pct', 'median_sale_price']

# Features too highly correlated to keep.
correlated_cols = ['home_value_median', 'median_ppsf', 'median_list_price', 'median_list_ppsf']

df = df.drop(columns=price_cols).drop(columns=correlated_cols)

In [6]:
# Build the prediction target on `medsale`:
#   * year - 1: after the later merge on (year, county_fips), a feature row for year t
#     is paired with the sale price recorded for year t + 1.
#   * median_sale_price_log: natural log of the median sale price.
# `assign` returns a new frame instead of writing columns into `medsale`, which was
# derived from `df` by column selection; this avoids SettingWithCopyWarning.
# NOTE: re-running this cell shifts `year` again; re-run from the cell that creates
# `medsale` instead.
medsale = medsale.assign(
    year=medsale['year'] - 1,
    median_sale_price_log=np.log(medsale['median_sale_price']),
)

In [7]:
# Only the merge keys and the log target are needed from here on, so drop the raw price.
# Rebinding (rather than `inplace=True`) avoids mutating a frame derived from `df`.
medsale = medsale.drop(columns=['median_sale_price'])

In [8]:
# Attach the log target to the feature rows. A left join keeps every feature row;
# rows without a matching (year, county_fips) entry in `medsale` get NaN in the
# target column.
join_keys = ['year', 'county_fips']
df = pd.merge(df, medsale, how='left', on=join_keys)

df['year'].max()

2021

In [9]:
# Remove every row that has a missing value in any column (including rows whose
# target was left as NaN by the left merge).
df = df.dropna(axis=0, how='any')

In [10]:
# Latest year still present after rows with missing values were dropped.
df['year'].max()

2019

In [11]:
# Read 'VAR_counties.txt' into a list with one entry per line, stripping surrounding
# whitespace and the trailing newline. The list is used to filter `county_fips`.
with open('VAR_counties.txt', 'r') as f:
    VAR_counties = [raw_line.strip() for raw_line in f]

In [12]:
# Keep only the rows whose county appears in VAR_counties.
in_var_counties = df['county_fips'].isin(VAR_counties)
df = df.loc[in_var_counties]

In [13]:
# Number of distinct counties left after filtering (a missing value would count once,
# as it does with len(unique())).
df['county_fips'].nunique(dropna=False)

621

### Train/Test/Val
Split the data through 2018 into X and y, then split it 75/25 into train/test sets. The 2019 data is used later as the validation set.

In [14]:
# Rows from every year except 2019 are used for training/testing; the 2019 rows are
# held out for validation.
data = df[df['year'] != 2019]
# `.copy()` because prediction columns are written into this frame later; writing
# into a plain boolean-mask selection of `df` raises SettingWithCopyWarning.
data2019 = df[df['year'] == 2019].copy()


# Positional split: the first three columns (state_fips, county_fips, year) are
# identifiers and are skipped; the last column (median_sale_price_log) is the target.
X = data.iloc[:, 3:-1]
y = data.iloc[:, -1]

In [15]:
# Standardisation (StandardScaler) is left disabled: the features are used unscaled,
# so `X_norm` is simply another name for `X`.
X_norm = X

# 75/25 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, y, random_state=42, test_size=0.25
)


In [16]:
# Display the held-out 2019 rows.
data2019

Unnamed: 0,state_fips,county_fips,year,debt_ratio_low,debt_ratio_high,corp_income_tax_low,corp_income_tax_high,income_tax_low,income_tax_high,11_avg_annual_employee_pct_chg,...,some_college_lessthan_1yr_pct,some_college_greaterthan_1yr_pct,bachelor_degree_pct,master_degree_pct,professional_degree_pct,doctorate_degree_pct,occupied_units_pct,vacant_units_pct,gdp,median_sale_price_log
24,01,01015,2019,1.2800,1.475,6.5,6.5,2.0,5.00,0.000000,...,0.036301,0.115479,0.079944,0.034479,0.009471,0.010404,0.829527,0.170473,21694.458,11.887588
70,01,01043,2019,1.6400,1.905,6.5,6.5,2.0,5.00,0.000000,...,0.055355,0.109779,0.070850,0.033366,0.004083,0.002017,0.838823,0.161177,21694.458,12.201060
80,01,01049,2019,1.1450,1.325,6.5,6.5,2.0,5.00,100.000000,...,0.048131,0.072840,0.051641,0.029072,0.003174,0.000000,0.787192,0.212808,21694.458,12.004568
90,01,01055,2019,1.5275,1.760,6.5,6.5,2.0,5.00,0.000000,...,0.061202,0.117769,0.073571,0.030176,0.009074,0.008126,0.836966,0.163034,21694.458,12.027843
110,01,01073,2019,0.8950,1.100,6.5,6.5,2.0,5.00,1.408451,...,0.038477,0.105712,0.139755,0.061027,0.023885,0.012866,0.854944,0.145056,21694.458,12.278393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14950,55,55127,2019,1.9900,2.395,7.9,7.9,4.0,7.65,0.000000,...,0.045779,0.100435,0.137193,0.053597,0.010667,0.004727,0.808459,0.191541,21694.458,12.496875
14968,55,55131,2019,1.8200,2.160,7.9,7.9,4.0,7.65,0.849257,...,0.050245,0.101607,0.165003,0.044834,0.005727,0.006175,0.960790,0.039210,21694.458,12.505807
14976,55,55133,2019,1.2800,1.475,7.9,7.9,4.0,7.65,-7.594937,...,0.039577,0.098922,0.214175,0.077932,0.022405,0.012261,0.956417,0.043583,21694.458,12.736701
15004,55,55139,2019,0.3900,0.895,7.9,7.9,4.0,7.65,0.000000,...,0.044332,0.084889,0.152222,0.042785,0.010692,0.007836,0.933522,0.066478,21694.458,12.072541


### Model Training and Selection
The Extra Trees Regressor returns the best R² on both the test and validation data.

In [17]:
# Baseline random forest: tree depth capped at 10, fixed seed for repeatable fits.
rf_baseline_params = {'max_depth': 10, 'random_state': 0}
regr = RandomForestRegressor(**rf_baseline_params)
regr.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, random_state=0)

In [18]:
# Score of the current `regr` model on the held-out test split.
score = regr.score(X_test, y_test)
print("Random Forest Regressor Test score: ", score, '\n', sep='')

## previously recorded score (differs from the output shown): 0.8264377043493462

Random Forest Regressor Test score: 0.8319356255114465



In [19]:
# Disabled hyper-parameter grid search for the random forest (kept for reference; not run).
# NOTE(review): if re-enabled, `grid.fit(X, y)` searches over all non-2019 rows, which
# includes the rows held out as X_test/y_test, so the test score would no longer be
# independent of the chosen hyper-parameters. Consider fitting on X_train/y_train.
# NOTE(review): 'auto' for max_features is not accepted by recent scikit-learn
# releases — confirm against the installed version before re-running.
# parameters = {'n_estimators': [100, 500, 1000],
#                 'max_depth':[5, 10, 15], 
#                 'max_features': ['auto', 'sqrt', 'log2'],
#                 'bootstrap': [True, False]}
# model = RandomForestRegressor()
# grid = GridSearchCV(model, parameters)
# grid.fit(X, y)

In [20]:
# grid.cv_results_

In [21]:
# grid.best_estimator_

## RandomForestRegressor(bootstrap=False, max_depth=15, max_features='sqrt',
 ##                     n_estimators=1000)

In [22]:
# Random forest refit with the settings noted for grid.best_estimator_
# (bootstrap=False, max_depth=15, max_features='sqrt', n_estimators=1000),
# plus a fixed seed. This replaces the baseline model stored in `regr`.
regr = RandomForestRegressor(
    n_estimators=1000,
    max_depth=15,
    max_features='sqrt',
    bootstrap=False,
    random_state=0,
)
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, max_depth=15, max_features='sqrt',
                      n_estimators=1000, random_state=0)

In [23]:
# Score of the current `regr` model on the held-out test split.
score = regr.score(X_test, y_test)
print("Random Forest Regressor Test score: ", score, '\n', sep='')

## previously recorded score (differs from the output shown): 0.8302582100477878

Random Forest Regressor Test score: 0.8530357171770785



In [24]:
## try extra trees regressor
# ExtraTreesRegressor is imported with the other scikit-learn imports at the top of
# the notebook, so every dependency is visible in one place.
# Baseline extra-trees model: 100 trees, fixed seed; fitted on the training split.
et = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(
       X_train, y_train)
# Score on the held-out test split (shown as the cell output).
et.score(X_test, y_test)

## previously recorded score (differs from the output shown): 0.8660575221930159

0.880779722438045

In [25]:
# Disabled hyper-parameter grid search for the extra-trees model (kept for reference; not run).
# NOTE(review): if re-enabled, `grid_et.fit(X, y)` searches over all non-2019 rows, which
# includes the rows held out as X_test/y_test. Consider fitting on X_train/y_train.
# NOTE(review): 'auto' for max_features is not accepted by recent scikit-learn
# releases — confirm against the installed version before re-running.
# parameters = {'n_estimators': [100, 500, 1000],
#                 'max_depth':[5, 10, 15], 
#                 'max_features': ['auto', 'sqrt', 'log2'],
#                 'bootstrap': [True, False]}
# model = ExtraTreesRegressor()
# grid_et = GridSearchCV(model, parameters)
# grid_et.fit(X, y)

In [26]:
#grid_et.best_estimator_

##ExtraTreesRegressor(max_depth=15, n_estimators=500)

In [27]:
# Extra-trees model with the settings noted for grid_et.best_estimator_
# (max_depth=15, n_estimators=500), plus a fixed seed. Replaces the baseline `et`.
et = ExtraTreesRegressor(n_estimators=500, max_depth=15, random_state=0)
et.fit(X_train, y_train)

# Score on the held-out test split (shown as the cell output).
et.score(X_test, y_test)

## previously recorded score (differs from the output shown): 0.8685945573030818

0.8817129565815965

In [28]:
# 2019 hold-out, using the same positional selection as the training X/y:
# features are columns 3 up to (not including) the last one, target is the last column.
# No scaling is applied.
X_val, y_val = data2019.iloc[:, 3:-1], data2019.iloc[:, -1]

In [29]:
# Score of the current `regr` model on the 2019 hold-out.
score = regr.score(X_val, y_val)
print("Random Forest Regressor Validation score: ", score, '\n', sep='')

## previously recorded score (differs from the output shown): 0.59241998024084

Random Forest Regressor Validation score: 0.7034226598961859



In [30]:
# Score of the extra-trees model on the 2019 hold-out (X_val / y_val).
et.score(X_val, y_val)


## previously recorded score with no scaler (differs from the output shown): 0.6888839595901836

0.7429970823137846

In [31]:
# Map each feature name to its importance in the random forest, then print the
# mapping ordered from least to most important.
d = dict(zip(X.columns, regr.feature_importances_))

print(dict(sorted(d.items(), key=lambda pair: pair[1])))
#print(et.feature_importances_)

{'11_avg_annual_employee_pct_chg': 0.0017402480618596392, '11_avg_annual_pay_pct_chg': 0.0018873290387985014, '21_avg_annual_pay_pct_chg': 0.0019460103260749368, '99_avg_annual_pay_pct_chg': 0.00198984088326672, '21_avg_annual_employee_pct_chg': 0.0020937396361952847, 'birth_45_50_pct': 0.002220831729175614, '71_avg_annual_employee_pct_chg': 0.0023852193536947433, '55_avg_annual_pay_pct_chg': 0.0023964019280715678, '54_avg_annual_pay_pct_chg': 0.002471768509801568, '71_avg_annual_pay_pct_chg': 0.0025025943619124916, '51_avg_annual_pay_pct_chg': 0.0025419818274458176, '81_avg_annual_pay_pct_chg': 0.0025702705072991813, '55_avg_annual_employee_pct_chg': 0.0025890401294522405, '52_avg_annual_pay_pct_chg': 0.002668401844586591, '53_avg_annual_pay_pct_chg': 0.0026793045895402057, '62_avg_annual_pay_pct_chg': 0.0026840182331661004, '42_avg_annual_employee_pct_chg': 0.0026875462595550357, '56_avg_annual_pay_pct_chg': 0.0026933085717212816, '22_avg_annual_pay_pct_chg': 0.002713341852493616, '2

In [32]:
# Map each feature name to its importance in the extra-trees model, then print the
# mapping ordered from least to most important.
d = {name: weight for name, weight in zip(X.columns, et.feature_importances_)}

ranked = sorted(d.items(), key=lambda kv: kv[1])
print(dict(ranked))

{'11_avg_annual_employee_pct_chg': 0.001240008293037977, '21_avg_annual_pay_pct_chg': 0.0012661791837130082, '99_avg_annual_pay_pct_chg': 0.0013758213813570362, '54_avg_annual_pay_pct_chg': 0.0013759314652552787, '71_avg_annual_employee_pct_chg': 0.0014164092224545908, '55_avg_annual_employee_pct_chg': 0.001459881293874061, '61_avg_annual_pay_pct_chg': 0.0014650074362497593, '51_avg_annual_employee_pct_chg': 0.001471371403921746, '11_avg_annual_pay_pct_chg': 0.001481725355915964, '61_avg_annual_employee_pct_chg': 0.0014908750210820732, '71_avg_annual_pay_pct_chg': 0.0014962241235766983, '51_avg_annual_pay_pct_chg': 0.0015318090615393843, '52_avg_annual_pay_pct_chg': 0.0015386297763355533, '81_avg_annual_pay_pct_chg': 0.001571280003839546, '53_avg_annual_employee_pct_chg': 0.0016066769727948419, '55_avg_annual_pay_pct_chg': 0.0017102358315032797, '21_avg_annual_employee_pct_chg': 0.0017126698678119282, '22_avg_annual_pay_pct_chg': 0.0017403166730041097, '23_avg_annual_pay_pct_chg': 0.00

In [33]:
# Table of extra-trees feature importances, most important first.
# NOTE: this rebinds `df`; from here on it is the importance table, no longer the
# modelling dataset.
d = {'Feature': X.columns, 'Importance': et.feature_importances_}
df = (
    pd.DataFrame(d)
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

# Polar bar chart of the 30 most important features.
top_features = df.head(30)
fig = px.bar_polar(
    top_features,
    r='Importance',
    theta='Feature',
    color='Feature',
    template='plotly_dark',
    color_discrete_sequence=px.colors.sequential.Plasma_r,
)
fig.show()

In [34]:
# Show only the features whose importance exceeds 0.0035.
is_notable = df['Importance'] > 0.0035
df.loc[is_notable]

Unnamed: 0,Feature,Importance
0,bachelor_degree_pct,0.20526
1,income_tax_high,0.095575
2,avg_sale_to_list,0.09377
3,debt_ratio_high,0.074762
4,hs_diploma_pct,0.066693
5,master_degree_pct,0.064972
6,professional_degree_pct,0.059322
7,debt_ratio_low,0.039577
8,vehicles_per_person,0.024566
9,months_of_supply,0.014962


In [35]:
# Predict the target (log median sale price) for the held-out 2019 rows.
pred2019 = et.predict(X_val)

# Convert both the actual and the predicted logs back to prices once.
actual_price = np.exp(data2019['median_sale_price_log'])
predicted_price = np.exp(pred2019)

# `assign` returns a new frame rather than writing columns into `data2019`, which was
# created as a row selection of another frame; this avoids SettingWithCopyWarning.
#   * Predicted_sale_price_change: the raw model output, i.e. the predicted LOG price
#     (despite the column name, it is not a change).
#   * Prediction_delta: signed error as a percentage of the actual price; positive
#     values mean the model under-predicted.
data2019 = data2019.assign(
    Predicted_sale_price_change=pred2019,
    Prediction_delta=((actual_price - predicted_price) / actual_price) * 100,
)
print(data2019['Prediction_delta'].mean())
# previously recorded mean (differs from the output shown): 13.758548163554817%
print(data2019['Prediction_delta'].median())
# previously recorded median (differs from the output shown): 15.523601710497283%

11.791401140478985
13.608489983189738


In [36]:

# County-level map of the 2019 prediction delta.
# NOTE(review): Prediction_delta is a signed percentage, but range_color starts at 0,
# so negative deltas all share the lowest colour — confirm this is intended.
# NOTE(review): the label mentions "HPI" while the modelled target is the median sale
# price — confirm the wording.
choropleth_kwargs = dict(
    geojson=counties,
    locations='county_fips',
    color='Prediction_delta',
    color_continuous_scale="Viridis",
    range_color=(0, 100),
    scope="usa",
    labels={'Prediction_delta': 'Prediction delta for 2019 HPI'},
)
fig = px.choropleth(data2019, **choropleth_kwargs)
# Remove the figure margins so the map fills the canvas.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()