In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
filename = '../../data/processed/feature-eng-clustered.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,zipcode,start_year,avg_eqi_year_5-10,year_1_EQI_MSA,year_1_EQI_state,year_1_EQI_zip,year_1_RECPI_MSA,year_1_RECPI_state,year_1_RECPI_zip,year_2_EQI_MSA,...,recpi_state_change_2,recpi_state_change_3,recpi_state_change_4,avg_eqi_year_1-5,dataset_cluster,zip_cluster,msa_cluster,state_cluster,eqi_cluster,recpi_cluster
0,1001,1992,0.001287,0.001549,0.002111,0.002216,1.176876,28.052156,0.048744,0.001266,...,1.02056,1.03639,1.251689,0.00217,3,0,0,1,0,0
1,1001,1997,0.001853,0.001233,0.002386,0.00084,1.112023,41.1745,0.035283,0.001164,...,1.245044,1.448812,0.69367,0.001287,3,0,0,1,0,0
2,1001,2002,0.000602,0.001494,0.00245,0.001504,1.746593,50.71338,0.054131,0.001073,...,0.905151,1.007785,1.046863,0.001853,3,0,0,1,0,0
3,1001,2007,0.000898,0.000702,0.001817,0.000611,1.081845,45.171143,0.036063,0.000758,...,0.801518,1.08881,1.051707,0.000602,3,0,0,1,0,0
4,1002,1992,0.002931,0.001549,0.002111,0.000959,1.176876,28.052156,0.020148,0.001266,...,1.02056,1.03639,1.251689,0.003141,3,1,0,1,0,0


In [3]:
X = df['zipcode'].unique()
y = df['zipcode'].unique()

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
X = df.drop('avg_eqi_year_5-10', axis=1)
y = df[['zipcode','start_year','avg_eqi_year_5-10']]

In [6]:
X_train = pd.DataFrame(X_train)
X_train.columns = ['zipcode']

X_test = pd.DataFrame(X_test)
X_test.columns = ['zipcode']

y_train = pd.DataFrame(y_train)
y_train.columns = ['zipcode']

y_test = pd.DataFrame(y_test)
y_test.columns = ['zipcode']

In [7]:
X_train = X_train.merge(X, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)
X_test = X_test.merge(X, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)
y_train = y_train.merge(y, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)
y_test = y_test.merge(y, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)

In [8]:
X_test = X_test.drop(['zipcode','start_year'], axis=1)
X_train = X_train.drop(['zipcode','start_year'], axis=1)
y_test = y_test[['avg_eqi_year_5-10']].to_numpy().ravel()
y_train = y_train[['avg_eqi_year_5-10']].to_numpy().ravel()

## Modeling

In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import r2_score, mean_gamma_deviance
from skopt import BayesSearchCV

In [10]:
column_trans = ColumnTransformer([
    ('scale', StandardScaler(), make_column_selector(dtype_include=np.float64)),
    ('dataset_cluster', OneHotEncoder(dtype='int'), ['dataset_cluster']),
    ('zip_cluster', OneHotEncoder(dtype='int'), ['zip_cluster']),
    ('msa_cluster', OneHotEncoder(dtype='int'), ['msa_cluster']),
    ('state_cluster', OneHotEncoder(dtype='int'), ['state_cluster']),
    ('eqi_cluster', OneHotEncoder(dtype='int'), ['eqi_cluster']),
    ('recpi_cluster', OneHotEncoder(dtype='int'), ['recpi_cluster'])
    ], remainder='drop')
column_trans.fit(X_train)

ColumnTransformer(transformers=[('scale', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x0000021811C64970>),
                                ('dataset_cluster', OneHotEncoder(dtype='int'),
                                 ['dataset_cluster']),
                                ('zip_cluster', OneHotEncoder(dtype='int'),
                                 ['zip_cluster']),
                                ('msa_cluster', OneHotEncoder(dtype='int'),
                                 ['msa_cluster']),
                                ('state_cluster', OneHotEncoder(dtype='int'),
                                 ['state_cluster']),
                                ('eqi_cluster', OneHotEncoder(dtype='int'),
                                 ['eqi_cluster']),
                                ('recpi_cluster', OneHotEncoder(dtype='int'),
                                 ['recpi_cluster'])])

## KNN Regressor

In [11]:
from sklearn.neighbors import KNeighborsRegressor

In [12]:
parameters = {
    'knn__n_neighbors': (1,31),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

In [13]:
pipe = Pipeline([('transformer', column_trans), ('knn', KNeighborsRegressor())])

In [14]:
model1 = BayesSearchCV(pipe, parameters, n_iter=40, n_points=3, cv=3, n_jobs=-1)

In [15]:
model1.total_iterations

60

In [16]:
def on_step(optim_result):
    score = model1.best_score_
    print("best score: %s" % score)

In [17]:
model1.fit(X_train, y_train, callback=on_step)

best score: 0.34043062362836385
best score: 0.34194108639576154
best score: 0.3433660598575949




best score: 0.3433660598575949
best score: 0.3433660598575949
best score: 0.3433660598575949
best score: 0.3433660598575949


BayesSearchCV(cv=3,
              estimator=Pipeline(steps=[('transformer',
                                         ColumnTransformer(transformers=[('scale',
                                                                          StandardScaler(),
                                                                          <sklearn.compose._column_transformer.make_column_selector object at 0x000001F582CB9BE0>),
                                                                         ('dataset_cluster',
                                                                          OneHotEncoder(dtype='int'),
                                                                          ['dataset_cluster']),
                                                                         ('zip_cluster',
                                                                          OneHotEncoder(dtype='int'),
                                                                          ['zip_cluster']),
           

In [18]:
print("val. score: %s" % model1.best_score_)
print("test score: %s" % model1.score(X_test, y_test))
print("best params: %s" % str(model1.best_params_))

val. score: 0.3433660598575949
test score: 0.5284269787793014
best params: OrderedDict([('knn__metric', 'manhattan'), ('knn__n_neighbors', 11), ('knn__weights', 'distance')])


## Random Forest

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [14]:
# Number of trees in random forest
n_estimators = (200, 2000)
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = (10,110)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
parameters = {'rf__n_estimators': n_estimators,
               'rf__max_features': max_features,
               'rf__max_depth': max_depth,
               'rf__min_samples_split': min_samples_split,
               'rf__min_samples_leaf': min_samples_leaf,
               'rf__bootstrap': bootstrap}

In [15]:
pipe = Pipeline([('transformer', column_trans), ('rf', RandomForestRegressor(n_jobs=-1))])

In [16]:
model2 = BayesSearchCV(pipe, parameters, n_iter=10, n_points=3, cv=3, n_jobs=-1)

In [17]:
model2.total_iterations

60

In [18]:
def on_step(optim_result):
    score = model2.best_score_
    print("best score: %s" % score)

In [19]:
model2.fit(X_train, y_train, callback=on_step)

best score: 0.38811298834883523
best score: 0.3894891126961437
best score: 0.3894891126961437
best score: 0.3894891126961437


BayesSearchCV(cv=3,
              estimator=Pipeline(steps=[('transformer',
                                         ColumnTransformer(transformers=[('scale',
                                                                          StandardScaler(),
                                                                          <sklearn.compose._column_transformer.make_column_selector object at 0x0000021811C64970>),
                                                                         ('dataset_cluster',
                                                                          OneHotEncoder(dtype='int'),
                                                                          ['dataset_cluster']),
                                                                         ('zip_cluster',
                                                                          OneHotEncoder(dtype='int'),
                                                                          ['zip_cluster']),
           

In [20]:
print("val. score: %s" % model2.best_score_)
print("test score: %s" % model2.score(X_test, y_test))
print("best params: %s" % str(model2.best_params_))

val. score: 0.3894891126961437
test score: 0.6048412796479616
best params: OrderedDict([('rf__bootstrap', False), ('rf__max_depth', 72), ('rf__max_features', 'sqrt'), ('rf__min_samples_leaf', 4), ('rf__min_samples_split', 2), ('rf__n_estimators', 315)])


## Gradient Boosting

In [28]:
from sklearn.ensemble import GradientBoostingRegressor

In [29]:
parameters = {
    "gbr__learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "gbr__min_samples_split": (0.1, 0.5),
    "gbr__min_samples_leaf": (0.1, 0.5),
    "gbr__max_depth":[3,5,8],
    "gbr__max_features":["log2","sqrt"],
    "gbr__criterion": ["friedman_mse",  "mae"],
    "gbr__subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "gbr__n_estimators":[10]
    }

In [30]:
pipe = Pipeline([('transformer', column_trans), ('gbr', GradientBoostingRegressor())])

In [31]:
model3 = BayesSearchCV(pipe, parameters, n_iter=10, n_points=3, cv=3, n_jobs=-1)

In [32]:
model3.total_iterations

80

In [33]:
def on_step(optim_result):
    score = model3.best_score_
    print("best score: %s" % score)

In [34]:
model3.fit(X_train, y_train, callback=on_step)

best score: 0.009754694827970204
best score: 0.046440730882059286
best score: 0.1012259832335307
best score: 0.1012259832335307


BayesSearchCV(cv=3,
              estimator=Pipeline(steps=[('transformer',
                                         ColumnTransformer(transformers=[('scale',
                                                                          StandardScaler(),
                                                                          <sklearn.compose._column_transformer.make_column_selector object at 0x0000021811C64970>),
                                                                         ('dataset_cluster',
                                                                          OneHotEncoder(dtype='int'),
                                                                          ['dataset_cluster']),
                                                                         ('zip_cluster',
                                                                          OneHotEncoder(dtype='int'),
                                                                          ['zip_cluster']),
           

In [35]:
print("val. score: %s" % model3.best_score_)
print("test score: %s" % model3.score(X_test, y_test))
print("best params: %s" % str(model3.best_params_))

val. score: 0.1012259832335307
test score: 0.1263938831267737
best params: OrderedDict([('gbr__criterion', 'friedman_mse'), ('gbr__learning_rate', 0.2), ('gbr__max_depth', 3), ('gbr__max_features', 'sqrt'), ('gbr__min_samples_leaf', 0.2608960555840142), ('gbr__min_samples_split', 0.17133048643520515), ('gbr__n_estimators', 10), ('gbr__subsample', 0.85)])
