In [None]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

import sklearn.metrics
from sklearn.model_selection import GridSearchCV

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Make plotly's default renderer 'browser' so figures displayed with
# fig.show() open in the default web browser rather than inline.
pio.renderers.default = 'browser'

from urllib.request import urlopen
import json
# Download the counties GeoJSON from the plotly datasets repository and keep
# the parsed object in `counties`; it is passed to px.choropleth further down.
with urlopen(
    'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
) as geojson_response:
    counties = json.load(geojson_response)


### Reading in data with raw values and adding GDP
This mainly applies to the Redfin data: the raw numbers are used rather than percentages.

In [None]:
# Load the prepared dataset from a local pickle file (presumably one row per
# county and year, judging by the file name and the columns used later).
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
df = pd.read_pickle('noACS_VARcounties_raw.pkl')


In [None]:
# Show (rows, columns) of the frame as loaded, before missing rows are dropped.
df.shape

In [None]:
# Keep only fully populated rows: any row with at least one missing value is
# removed (these are dropna's defaults, spelled out explicitly).
df = df.dropna(axis=0, how='any')

# Display the latest year that remains after cleaning.
df.loc[:, 'year'].max()

### Train/Test/Val
Split the data from every year except 2020 into X and y, then split it 75/25 into train/test. The 2020 data is used later as validation.

In [None]:
# Separate the 2020 rows (kept aside for validation) from all other years.
is_2020 = df['year'] == 2020
data = df[~is_2020]
data2020 = df[is_2020]

# Features are every column from position 3 up to (but excluding) the last
# one; the last column is the regression target.
X, y = data.iloc[:, 3:-1], data.iloc[:, -1]

In [None]:
# Feature scaling is currently disabled, so the raw features are used as-is.
#X_norm = StandardScaler().fit_transform(X)
X_norm = X

# 75/25 train/test split with a fixed seed so the partition is reproducible.
split_parts = train_test_split(X_norm, y, random_state=42, test_size=0.25)
X_train, X_test, y_train, y_test = split_parts


### Model Training and Selection
Extra Trees Regressor returns the best R2 on the test data; on the 2020 validation data both models score a negative R2 (see the validation cells below).

In [None]:
# Baseline random forest: trees capped at depth 10, seeded for repeatability.
regr = RandomForestRegressor(random_state=0, max_depth=10)
# Left as the final expression so the notebook shows the fitted estimator.
regr.fit(X=X_train, y=y_train)

In [None]:
# R^2 of the baseline random forest on the held-out test split.
score = regr.score(X_test, y_test)
print("Random Forest Regressor Test score: ", score, '\n', sep='')

## Result recorded from an earlier run: 0.3177081351943415

In [None]:
# parameters = {'n_estimators': [100, 500, 1000],
#                 'max_depth':[5, 10, 15], 
#                 'max_features': ['auto', 'sqrt', 'log2'],
#                 'bootstrap': [True, False]}
# model = RandomForestRegressor()
# grid = GridSearchCV(model, parameters)
# grid.fit(X, y)

In [None]:
# grid.cv_results_

In [None]:
# grid.best_estimator_

## RandomForestRegressor(bootstrap=False, max_depth=15, max_features='sqrt',
 ##                     n_estimators=1000)

In [None]:
# Refit the random forest with the hyper-parameters recorded from the earlier
# grid search; this rebinds `regr`, replacing the baseline model.
rf_params = dict(
    bootstrap=False,
    max_depth=15,
    max_features='sqrt',
    n_estimators=1000,
    random_state=0,
)
regr = RandomForestRegressor(**rf_params)
regr.fit(X_train, y_train)

In [None]:
# R^2 of the re-tuned random forest on the held-out test split.
score = regr.score(X_test, y_test)
print("Random Forest Regressor Test score: ", score, '\n', sep='')

## Result recorded from an earlier run: 0.31915245672817405

In [None]:
## Try an Extra Trees regressor as an alternative to the random forest.
from sklearn.ensemble import ExtraTreesRegressor

et = ExtraTreesRegressor(random_state=0, n_estimators=100)
et.fit(X_train, y_train)

# Final expression: R^2 on the held-out test split.
et.score(X_test, y_test)

## Result recorded from an earlier run: 0.3542700325259205

In [None]:
# parameters = {'n_estimators': [100, 500, 1000],
#                 'max_depth':[5, 10, 15], 
#                 'max_features': ['auto', 'sqrt', 'log2'],
#                 'bootstrap': [True, False]}
# model = ExtraTreesRegressor()
# grid_et = GridSearchCV(model, parameters)
# grid_et.fit(X, y)

In [None]:
#grid_et.best_estimator_

##ExtraTreesRegressor(max_depth=15, n_estimators=500)

In [None]:
# Refit Extra Trees with max_depth=15 and n_estimators=500, the settings
# recorded from the grid search; this rebinds `et`, replacing the baseline.
et = ExtraTreesRegressor(random_state=0, n_estimators=500, max_depth=15)
et.fit(X_train, y_train)

# Final expression: R^2 on the held-out test split.
et.score(X_test, y_test)

## Result recorded from an earlier run: 0.35258367700466553

In [None]:

# Preview the first rows of the held-out 2020 subset.
data2020.head()

In [None]:
# Validation features and target come from the held-out 2020 rows, sliced at
# the same column positions as the training X and y. No scaling is applied.
X_val, y_val = data2020.iloc[:, 3:-1], data2020.iloc[:, -1]

In [None]:
# R^2 of the random forest on the 2020 validation rows.
score = regr.score(X_val, y_val)
print("Random Forest Regressor Validation score: ", score, '\n', sep='')

## Result recorded from an earlier run: -2.6305878299387553

In [None]:
# R^2 of the Extra Trees model on the 2020 validation rows.
et.score(X_val, y_val)


## Result recorded from an earlier run without a scaler: -3.0302461349663883

In [None]:
# Map each feature name to its random-forest importance, then print the
# mapping ordered from least to most important.
d = dict(zip(X.columns, regr.feature_importances_))
print(dict(sorted(d.items(), key=lambda pair: pair[1])))
#print(et.feature_importances_)

In [None]:
# Same report for the Extra Trees model: feature name -> importance, printed
# in ascending order of importance.
d = {name: weight for name, weight in zip(X.columns, et.feature_importances_)}
ranked = sorted(d.items(), key=lambda pair: pair[1])
print(dict(ranked))

In [None]:
# Tabulate the Extra Trees importances, most important feature first.
# NOTE(review): this rebinds `df`, which until now held the dataset loaded
# from the pickle; later cells that read `df` see this importance table.
d = {'Feature': X.columns, 'Importance': et.feature_importances_}
df = (
    pd.DataFrame(d)
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

# Polar bar chart of the 30 highest-ranked features.
top_features = df.iloc[:30, :]
fig = px.bar_polar(
    top_features,
    r='Importance',
    theta='Feature',
    color='Feature',
    template='plotly_dark',
    color_discrete_sequence=px.colors.sequential.Plasma_r,
)
fig.show()

In [None]:
# Show only the features whose importance exceeds the 0.0035 cut-off.
above_cutoff = df['Importance'] > 0.0035
df[above_cutoff]

In [None]:
# Predict the 2020 target with the Extra Trees model.
pred2020 = et.predict(X_val)

# data2020 was produced by boolean-filtering the full frame, so adding columns
# to it in place triggers pandas' SettingWithCopyWarning. Build a new frame
# with .assign() and rebind the name instead.
data2020 = data2020.assign(Predicted_HPI_change=pred2020)

# Signed percentage error relative to the actual change. Rows whose actual
# change is exactly 0 would divide by zero (inf/NaN) and distort the mean, so
# their denominator is masked to NaN; mean() and median() then skip them.
actual_change = data2020['annual_change_pct']
nonzero_actual = actual_change.where(actual_change != 0)
data2020 = data2020.assign(
    Prediction_delta=((actual_change - data2020['Predicted_HPI_change']) / nonzero_actual) * 100
)
print(data2020['Prediction_delta'].mean())
#55.5449186899792%
print(data2020['Prediction_delta'].median())
#60.496949967476766%

In [None]:

# County-level map of the prediction error. The frame plotted is the 2020
# hold-out subset, so the colour-bar label says 2020 (it previously said 2019,
# which did not match the data being shown).
fig = px.choropleth(data2020, geojson=counties, locations='county_fips', color='Prediction_delta',
                           color_continuous_scale="Viridis",
                            range_color=(0, 100),
                           scope="usa",
                           labels={'Prediction_delta':'Prediction delta for 2020 HPI'}
                          )
# Remove the figure margins so the map fills the whole canvas.
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()