# Tree Models
Summary:
The code in this file tries out different tree methods, including XGB, Random Forest, LightGBM, and tree ensembles.

In [1]:
import numpy as np
import pandas as pd


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score as auc
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as rmse_score

from scipy.stats import randint
from scipy.stats import loguniform
from scipy.stats import uniform

from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor

from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample


In [3]:
# load data
X_train = pd.read_csv('../Data/X_train_model2.csv')
y_train = pd.read_csv('../Data/y_train_model2.csv')
X_validate = pd.read_csv('../Data/X_valid_model2.csv')
y_validate = pd.read_csv('../Data/y_valid_model2.csv')

X_train_all = pd.read_csv('../Data/X_train_all.csv')
y_train_all = pd.read_csv('../Data/y_train_all.csv')
X_test = pd.read_csv('../Data/X_test_all.csv')
y_test = pd.read_csv('../Data/y_test_all.csv')


In [4]:
X_train.drop(columns=['Unnamed: 0'], inplace=True)
y_train.drop(columns=['Unnamed: 0'], inplace=True)
X_validate.drop(columns=['Unnamed: 0'], inplace=True)
y_validate.drop(columns=['Unnamed: 0'], inplace=True)

X_train_all.drop(columns=['Unnamed: 0'], inplace=True)
y_train_all.drop(columns=['Unnamed: 0'], inplace=True)
X_test.drop(columns=['Unnamed: 0'], inplace=True)
y_test.drop(columns=['Unnamed: 0'], inplace=True)


In [5]:
# rename this column which gives an error in LGBM because its name has quotation marks
X_train.rename(columns={'city_"oneals"': 'city_oneals'}, inplace=True)
X_validate.rename(columns={'city_"oneals"': 'city_oneals'}, inplace=True)

X_train_all.rename(columns={'city_"oneals"': 'city_oneals'}, inplace=True)
X_test.rename(columns={'city_"oneals"': 'city_oneals'}, inplace=True)


In [6]:
# create a sample to train faster
X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, train_size=0.25, random_state=42)
X_train_sample, X_validate_sample, y_train_sample, y_validate_sample = train_test_split(X_train_sample, y_train_sample, train_size=0.8, random_state=33)

In [59]:
X_train.shape

(15251, 1724)

## XGB

In [26]:
from xgboost.sklearn import XGBRegressor

pipe_xgb = Pipeline(
    steps=[
        ("boost", XGBRegressor(random_state=42)),
    ]
)

param_distr = dict(
    # boost__n_estimators=list(range(10, 300, 50)),
    # boost__max_depth=[1,9,17,25,33],
    boost__learning_rate=list(np.arange(0.54, 0.58, 0.005)),
    # boost__subsample=[0.8],
    # boost__colsample_bytree=uniform(0.67,0.1),
    # boost__reg_alpha=[0.001, 0.1, 1, 5, 10,50,100],
    # boost__reg_lambda=[1],
    # boost__gamma=[i / 10.0 for i in range(0, 5)],
    # boost__tree_method=['gpu_hist']
)

# first use grid search to tune learning rate
scoring = {"rmse": "neg_root_mean_squared_error", "r2": "r2"}
search = GridSearchCV(
    pipe_xgb, param_grid=param_distr, scoring=scoring, refit="rmse", n_jobs=7
)
search.fit(X_train_sample, y_train_sample)
search.best_params_

# settle for learning_rate==0.55 for now


-1867331.6763641709

In [48]:
[search.score(X_train_sample, y_train_sample),search.score(X_validate_sample, y_validate_sample), r2_score(y_train_sample,search.predict(X_train_sample))]

[-181809.14251725803, -1867331.6763641709, 0.9927597923931099]

In [54]:
# use grid search to tune n_estimators and max_depth
param_distr = dict(
    boost__n_estimators=list(range(10, 300, 50)),
    boost__max_depth=[1, 9, 17, 25, 33],
    boost__learning_rate=[0.55],
    # boost__subsample=[0.8],
    # boost__colsample_bytree=uniform(0.67,0.1),
    # boost__reg_alpha=[0.001, 0.1, 1, 5, 10,50,100],
    # boost__reg_lambda=[1],
    # boost__gamma=[i / 10.0 for i in range(0, 5)],
    #boost__tree_method=['gpu_hist']
)

scoring = {"rmse": "neg_root_mean_squared_error", "r2": "r2"}
search = GridSearchCV(
    pipe_xgb, param_grid=param_distr, scoring=scoring, refit="rmse", n_jobs=7
)
search.fit(X_train_sample, y_train_sample)
search.best_params_

# n_estimators == 260
# max_depth==1  # ??
# learning_rate==0.55


{'boost__learning_rate': 0.55,
 'boost__max_depth': 1,
 'boost__n_estimators': 260}

In [57]:
[search.score(X_train_sample, y_train_sample),search.score(X_validate_sample, y_validate_sample), r2_score(y_train_sample,search.predict(X_train_sample))]

[-795025.2459974313, -1837020.7223167343, 0.8615536781671949]

In [60]:
# random search to fine tune n_estimators, max_depth, learning_rate
param_distr = dict(
    boost__n_estimators=list(range(220,241,2)),
    boost__max_depth=[3,4,5,6,7],
    boost__learning_rate=list(np.arange(0.55, 0.561, 0.001)),
    # boost__subsample=[0.8],
    # boost__colsample_bytree=uniform(0.67,0.1),
    # boost__reg_alpha=[0.001, 0.1, 1, 5, 10,50,100],
    # boost__reg_lambda=[1],
    # boost__gamma=[i / 10.0 for i in range(0, 5)],
    boost__tree_method=['gpu_hist']
)

scoring = {"rmse": "neg_root_mean_squared_error", "r2": "r2"}
search = RandomizedSearchCV(
    pipe_xgb, param_distributions=param_distr, n_iter=50, scoring=scoring, refit="rmse",
)
search.fit(X_train_sample, y_train_sample)
search.best_params_

# n_estimators == 232
# max_depth==4
# learning_rate==0.552


{'boost__tree_method': 'gpu_hist',
 'boost__n_estimators': 232,
 'boost__max_depth': 4,
 'boost__learning_rate': 0.552}

In [61]:
[search.score(X_train_sample, y_train_sample),search.score(X_validate_sample, y_validate_sample), r2_score(y_train_sample,search.predict(X_train_sample)), r2_score(y_validate_sample,search.predict(X_validate_sample))]

[-186746.997407232,
 -1738479.9506470044,
 0.9923611701248028,
 0.6240834140479761]

In [52]:
# random search to tune colsample_bytree, colsample_bylevel, colsample_bynode, subsample
param_distr = dict(
    boost__n_estimators=[232],
    boost__max_depth=[4],
    boost__learning_rate=[0.5539725958037197],
    boost__subsample=uniform(0.9,0.1),
    boost__colsample_bytree=uniform(0.85,0.15),
    boost__colsample_bylevel=uniform(0.85,0.15),
    boost__colsample_bynode=uniform(0.85,0.15),
    #boost__reg_alpha=[0.001, 0.1, 1, 5, 10,50,100],
    #boost__reg_lambda=[1],
    #boost__gamma=[i / 10.0 for i in range(0, 5)],
    boost__tree_method=['gpu_hist'],
)

scoring = {"rmse": "neg_root_mean_squared_error", "r2": "r2"}
search = RandomizedSearchCV(
    pipe_xgb, param_distributions=param_distr, n_iter=50, scoring=scoring, refit="rmse", n_jobs=-1
)
search.fit(X_train_sample, y_train_sample)
search.best_params_

# 'boost__colsample_bylevel': 0.9329875395758149,
# 'boost__colsample_bynode': 0.9121949417737552,
# 'boost__colsample_bytree': 0.9208525913050812,
# boost__subsample = 0.9316075499680158,


{'boost__colsample_bylevel': 0.9329875395758149,
 'boost__colsample_bynode': 0.9121949417737552,
 'boost__colsample_bytree': 0.9208525913050812,
 'boost__learning_rate': 0.5539725958037197,
 'boost__max_depth': 4,
 'boost__n_estimators': 232,
 'boost__subsample': 0.9316075499680158,
 'boost__tree_method': 'gpu_hist'}

In [54]:
search.cv_results_['mean_test_rmse'][search.cv_results_['rank_test_rmse'].argsort()][0:10]

array([-1247299.04831368, -1270176.18437785, -1316302.89111985,
       -1320558.94206085, -1333248.25000627, -1337789.05822611,
       -1367166.6120021 , -1370640.30192292, -1382565.55485372,
       -1397136.74650372])

In [55]:
start = 0
end = 10
[
    search.cv_results_["param_boost__colsample_bytree"][
        search.cv_results_["rank_test_rmse"].argsort()
    ][start:end],
    search.cv_results_["param_boost__colsample_bylevel"][
        search.cv_results_["rank_test_rmse"].argsort()
    ][start:end],
    search.cv_results_["param_boost__colsample_bynode"][
        search.cv_results_["rank_test_rmse"].argsort()
    ][start:end],
]


[masked_array(data=[0.9208525913050812, 0.9813607751546145,
                    0.8542935621606396, 0.9984150765247191,
                    0.9132944958902808, 0.9517186056579441,
                    0.9447189769947804, 0.8893801084539098,
                    0.8660859932261816, 0.9279220448464871],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 masked_array(data=[0.9329875395758149, 0.8843830436602977,
                    0.9846695217936348, 0.9706353153128668,
                    0.8551035727173427, 0.857017451660754,
                    0.8511293468363749, 0.9408573608640934,
                    0.9154869492143337, 0.9393630359568601],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 masked_array(data=[0.9121949417737552, 0.8964954097977099,
             

In [69]:
# grid search to find appropriate ranges for alpha
param_distr = dict(
    boost__n_estimators=[232],
    boost__max_depth=[4],
    boost__learning_rate=[0.5539725958037197],
    boost__subsample=[0.9316075499680158],
    boost__colsample_bytree=[0.9329875395758149],
    boost__colsample_bylevel=[0.9121949417737552],
    boost__colsample_bynode=[0.9121949417737552],
    boost__reg_alpha=[0.007],
    #boost__reg_lambda=[1e-5, 1e-2, 0.1, 1, 100],
    #boost__gamma=[ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
    boost__tree_method=['gpu_hist'],
)

scoring = {"rmse": "neg_root_mean_squared_error", "r2": "r2"}
search = RandomizedSearchCV(
    pipe_xgb, param_distributions=param_distr, n_iter=25, scoring=scoring, refit="rmse", n_jobs=-1
)
search.fit(X_train_sample, y_train_sample)
search.best_params_

# reg_alpha==0.007


{'boost__colsample_bylevel': 0.9121949417737552,
 'boost__colsample_bynode': 0.9121949417737552,
 'boost__colsample_bytree': 0.9329875395758149,
 'boost__learning_rate': 0.5539725958037197,
 'boost__max_depth': 4,
 'boost__n_estimators': 232,
 'boost__reg_alpha': 0.007097120378686548,
 'boost__subsample': 0.9316075499680158,
 'boost__tree_method': 'gpu_hist'}

In [72]:
search.cv_results_['mean_test_rmse'][search.cv_results_['rank_test_rmse'].argsort()][0:5]

array([-1402065.13114425, -1402065.1328874 , -1402065.13307297,
       -1402065.13350536, -1402065.13377996])

In [73]:
search.cv_results_['param_boost__reg_alpha'][search.cv_results_['rank_test_rmse'].argsort()][0:5]

masked_array(data=[0.007097120378686548, 0.006877459887905132,
                   0.0075955380568508, 0.0077153141352083,
                   0.0119056500606871],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object)

In [78]:
# grid search to find appropriate ranges for lambda
param_distr = dict(
    boost__n_estimators=[232],
    boost__max_depth=[4],
    boost__learning_rate=[0.5539725958037197],
    boost__subsample=[0.9316075499680158],
    boost__colsample_bytree=[0.9329875395758149],
    boost__colsample_bylevel=[0.9121949417737552],
    boost__colsample_bynode=[0.9121949417737552],
    boost__reg_alpha=[0.007],
    #boost__reg_lambda=[0.85, 0.9, 0.95, 1, 1.05, 1.1, 1.15, 20],
    boost__reg_lambda=[20,21,22,23,24,25,26,27,28],
    #boost__gamma=[ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
    boost__tree_method=['gpu_hist']
)

scoring = {"rmse": "neg_root_mean_squared_error", "r2": "r2"}
search = GridSearchCV(
    pipe_xgb, param_grid=param_distr, scoring=scoring, refit="rmse", n_jobs=-1
)
search.fit(X_train_sample, y_train_sample)
search.best_params_

# reg_lambda==22


{'boost__colsample_bylevel': 0.9121949417737552,
 'boost__colsample_bynode': 0.9121949417737552,
 'boost__colsample_bytree': 0.9329875395758149,
 'boost__learning_rate': 0.5539725958037197,
 'boost__max_depth': 4,
 'boost__n_estimators': 232,
 'boost__reg_alpha': 0.007,
 'boost__reg_lambda': 22,
 'boost__subsample': 0.9316075499680158,
 'boost__tree_method': 'gpu_hist'}

In [79]:
[search.score(X_train_sample, y_train_sample),search.score(X_validate_sample, y_validate_sample), r2_score(y_train_sample,search.predict(X_train_sample)), r2_score(y_validate_sample,search.predict(X_validate_sample))]

[-363738.5739433267,
 -915496.4817767616,
 0.9747345200859736,
 0.6103473308097953]

In [80]:
search.cv_results_['mean_test_rmse'][search.cv_results_['rank_test_rmse'].argsort()]

array([-1321826.60879358, -1326868.38084788, -1349326.03712791,
       -1371027.76186885, -1372477.55735449, -1379280.02623537,
       -1382366.69862287, -1384479.96406338, -1390818.0867575 ])

In [81]:
search.cv_results_['param_boost__reg_lambda'][search.cv_results_['rank_test_rmse'].argsort()]

masked_array(data=[22, 20, 21, 24, 27, 25, 23, 26, 28],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object)

In [None]:
# random search to tune lambda
param_distr = dict(
    boost__n_estimators=[232],
    boost__max_depth=[4],
    boost__learning_rate=[0.5539725958037197],
    boost__subsample=[0.9469164064652795],
    boost__colsample_bytree=[0.7453610814071551],
    boost__reg_alpha=[0],
    boost__reg_lambda=uniform(21.5,1)
    #boost__gamma=[ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
    boost__tree_method=['gpu_hist']
)

scoring = {"rmse": "neg_root_mean_squared_error", "r2": "r2"}
search = RandomizedSearchCV(
    pipe_xgb, param_distributions=param_distr, n_iter=15, scoring=scoring, refit="rmse", n_jobs=7
)
search.fit(X_train_sample, y_train_sample)
search.best_params_

# reg_labmda == 23.054589566725642


In [None]:
[search.score(X_train_sample, y_train_sample),search.score(X_validate_sample, y_validate_sample), r2_score(y_train_sample,search.predict(X_train_sample)), r2_score(y_validate_sample,search.predict(X_validate_sample))]

In [None]:
search.cv_results_['param_boost__reg_lambda'][search.cv_results_['rank_test_rmse'].argsort()]

In [None]:
search.cv_results_['mean_test_rmse'][search.cv_results_['rank_test_rmse'].argsort()]

In [166]:
# grid search to tune gamma
param_distr = dict(
    boost__n_estimators=[232],
    boost__max_depth=[4],
    boost__learning_rate=[0.5539725958037197],
    boost__subsample=[0.9469164064652795],
    boost__colsample_bytree=[0.7453610814071551],
    boost__reg_alpha=[0],
    boost__reg_lambda=[23.054589566725642],
    boost__gamma=[0,1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1,10],
    boost__tree_method=['gpu_hist']
)

scoring = {"rmse": "neg_root_mean_squared_error", "r2": "r2"}
search = GridSearchCV(
    pipe_xgb, param_grid=param_distr, scoring=scoring, refit="rmse", n_jobs=7
)
search.fit(X_train_sample, y_train_sample)
search.best_params_

# gamma doesn't change model fit, use default


{'boost__colsample_bytree': 0.7453610814071551,
 'boost__gamma': 0,
 'boost__learning_rate': 0.5539725958037197,
 'boost__max_depth': 4,
 'boost__n_estimators': 232,
 'boost__reg_alpha': 0,
 'boost__reg_lambda': 23.054589566725642,
 'boost__subsample': 0.9469164064652795,
 'boost__tree_method': 'gpu_hist'}

In [167]:
search.cv_results_['mean_test_r2'][search.cv_results_['rank_test_rmse'].argsort()]

array([0.62668411, 0.62668411, 0.62668411, 0.62668411, 0.62668411,
       0.62668411, 0.62668411, 0.62668411, 0.62668411])

In [31]:
# grid search to tune min_child_weight
pipe_xgb = Pipeline(
    steps=[
        ("boost", XGBRegressor(random_state=42)),
    ]
)

param_distr = dict(
    boost__n_estimators=[232],
    boost__max_depth=[4],
    boost__learning_rate=[0.5539725958037197],
    boost__subsample=[0.9469164064652795],
    boost__colsample_bytree=[0.7453610814071551],
    boost__reg_alpha=[0],
    boost__reg_lambda=[23.054589566725642],
    #boost__gamma=[0,1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1,10],
    boost__tree_method=['gpu_hist'],
)

scoring = {"rmse": "neg_root_mean_squared_error", "r2": "r2"}
search = GridSearchCV(
    pipe_xgb, param_grid=param_distr, scoring=scoring, refit="rmse", n_jobs=7
)
search.fit(X_train_sample, y_train_sample)
search.best_params_

# min_child_weight==1, default
# max_delta_step==0, default


{'boost__colsample_bytree': 0.7453610814071551,
 'boost__learning_rate': 0.5539725958037197,
 'boost__max_delta_step': 0,
 'boost__max_depth': 4,
 'boost__n_estimators': 232,
 'boost__reg_alpha': 0,
 'boost__reg_lambda': 23.054589566725642,
 'boost__subsample': 0.9469164064652795,
 'boost__tree_method': 'gpu_hist'}

In [33]:
search.cv_results_['mean_test_rmse'][search.cv_results_['rank_test_rmse'].argsort()]

array([-1296769.10640471, -2398492.1704085 , -2420965.27103071,
       -2423838.22728325, -2425280.10535191, -2426147.22964011])

In [32]:
search.cv_results_['param_boost__max_delta_step'][search.cv_results_['rank_test_rmse'].argsort()]

masked_array(data=[0, 500, 100, 50, 25, 10],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object)

In [85]:
xgb0 = XGBRegressor(
    random_state=42,
    tree_method="gpu_hist",
)
xgb0.fit(X_train, y_train)
r2_score(y_validate,xgb0.predict(X_validate))

0.5460490962488239

In [None]:
xgb1 = XGBRegressor(
    random_state=42,
    n_estimators=232,
    max_depth=4,
    learning_rate=0.5539725958037197,
    subsample=0.9469164064652795,
    colsample_bytree=0.7453610814071551,
    reg_alpha=0,
    #reg_lambda=1,
    #tree_method="gpu_hist",
)
xgb1.fit(X_train, y_train)
r2_score(y_validate,xgb1.predict(X_validate))

In [54]:
xgb2 = XGBRegressor(
    random_state=42,
    n_estimators=232,
    max_depth=4,
    learning_rate=0.5539725958037197,
    subsample=0.9316075499680158,
    colsample_bytree=0.9329875395758149,
    colsample_bylevel=0.9121949417737552,
    colsample_bynode=0.9121949417737552,
    reg_alpha=0.007,
    reg_lambda=22,
    tree_method="gpu_hist",
)
xgb2.fit(X_train, y_train)
r2_score(y_validate,xgb2.predict(X_validate))

0.5562318385762138

## Light GBM

### Not Using PCA

In [210]:
# tune learning rate

pipe_lgb = Pipeline(
    steps=[
        ("boost", LGBMRegressor(random_state=42)),
    ]
)

param_distr = dict(
    # boost__n_estimators=list(range(10, 300, 50)),
    # boost__max_depth=[1,9,17,25,33],
    boost__learning_rate=uniform(0.45, 0.1),
    # boost__subsample=[0.8],
    # boost__colsample_bytree=uniform(0.67,0.1),
    # boost__reg_alpha=[0.001, 0.1, 1, 5, 10,50,100],
    # boost__reg_lambda=[1],
    # boost__gamma=[i / 10.0 for i in range(0, 5)],
)

# first use grid search to tune learning rate
scoring = {"rmse": "neg_root_mean_squared_error", "r2": "r2"}
search = RandomizedSearchCV(
    pipe_lgb, param_distributions=param_distr, n_iter = 100, scoring=scoring, refit="rmse", n_jobs=7
)
search.fit(X_train_sample, y_train_sample)
search.best_params_

# settle for learning_rate==0.5103207351442216 for now


{'boost__learning_rate': 0.5103207351442216}

In [213]:
# sort learning rate by performance
search.cv_results_['param_boost__learning_rate'][search.cv_results_['rank_test_rmse'].argsort()][0:5]

masked_array(data=[0.5103207351442216, 0.5209454734897873,
                   0.508604386223031, 0.5142722083871047,
                   0.5145309786241092],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object)

In [214]:
# see performance of top 5
search.cv_results_['mean_test_rmse'][search.cv_results_['rank_test_rmse'].argsort()][0:5]

array([-1554598.83217997, -1560590.60751826, -1565645.95865904,
       -1567932.06177577, -1573734.45385374])

In [77]:
# tune with Hyperopt
space = {
    'boosting': 'dart',
    'n_estimators': scope.int(hp.quniform('n_estimators', 220,280,5)),
    'num_leaves': scope.int(hp.quniform("num_leaves", 100, 220, 20)),
    'max_depth': scope.int(hp.quniform("max_depth", 2, 7, 1)),
    'learning_rate': hp.loguniform('learning_rate', -5, 0.5),
    'min_data_in_leaf': scope.int(hp.qloguniform('min_data_in_leaf', 0, 5, 1)),
    'feature_fraction': hp.uniform('feature_fraction', 0.81, 0.87),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.57, 0.67),
    'lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_value', -13, -8)]),
    'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_value', -7, -3)]),
    'min_child_weight': hp.loguniform('min_child_weight', -15, -10),
}

def lgbm_tuning(params):
    clf=LGBMRegressor(**params)
    clf.fit(X_train_sample, y_train_sample)
    rmse = (rmse_score(y_true=y_validate_sample, y_pred=clf.predict(X_validate_sample)))**0.5
    print("RMSE:", rmse)
    return {'loss': rmse, 'status': STATUS_OK}
    
trials = Trials()


In [86]:
best = fmin(
    fn=lgbm_tuning,
    space = space,
    algo=tpe.suggest,
    max_evals=750,
    trials=trials
)

RMSE:                                                    
882772.3759703788                                        
RMSE:                                                                             
853557.2498354886                                                                 
RMSE:                                                                             
1057641.4346141862                                                                
RMSE:                                                                             
887039.9077815757                                                                 
RMSE:                                                                             
910678.5645174311                                                                 
RMSE:                                                                             
846596.2358331791                                                                 
RMSE:                                                 

In [117]:
trials2 = Trials()

In [118]:
# tune with hyperopt, compare results with above
space2 = {
    'boosting': 'dart',
    'n_estimators': scope.int(hp.quniform('n_estimators', 220,260,2)),
    'num_leaves': scope.int(hp.quniform("num_leaves", 150, 200, 2)),
    'max_depth': scope.int(hp.quniform("max_depth", 1, 6, 1)),
    'learning_rate': hp.loguniform('learning_rate', -2.5, -0.5),
    'min_data_in_leaf': scope.int(hp.qloguniform('min_data_in_leaf', 0, 5, 1)),
    'feature_fraction': hp.uniform('feature_fraction', 0.80, 0.84),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.57, 0.63),
    'lambda_l1': hp.choice('lambda_l1', [0]),
    'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_value', -6, -2)]),
    'min_child_weight': hp.loguniform('min_child_weight', -14, -11),
}

best2 = fmin(
    fn=lgbm_tuning,
    space = space2,
    algo=tpe.suggest,
    max_evals=250,
    trials=trials2
)


RMSE:                                                  
874857.3988090659                                      
RMSE:                                                                           
940515.5883157734                                                               
RMSE:                                                                           
1063289.423928752                                                               
RMSE:                                                                           
1030385.0258528952                                                              
RMSE:                                                                           
1036238.875151661                                                               
RMSE:                                                                           
1089647.3822697788                                                              
RMSE:                                                                         

In [116]:
best2

{'bagging_fraction': 0.5935028967044329,
 'feature_fraction': 0.8234547277320136,
 'lambda_l1': 0,
 'lambda_l2': 1,
 'lambda_l2_value': 0.003188254434880409,
 'learning_rate': 0.3624190675446414,
 'max_depth': 4.0,
 'min_child_weight': 5.261334196835273e-06,
 'min_data_in_leaf': 1.0,
 'n_estimators': 260.0,
 'num_leaves': 170.0}

In [192]:
# save models
lgbm0 = LGBMRegressor(
    random_state=42,
)
lgbm0.fit(X_train, y_train)
r2_score(y_validate,lgbm0.predict(X_validate))


0.563460879391384

In [50]:
# save models
lgbm1 = LGBMRegressor(
    boosting='DART',
    random_state=42,
    n_estimators=232,
    max_depth=4,
    learning_rate=0.5539725958037197,
    subsample=0.9469164064652795,
    colsample_bytree=0.7453610814071551,
    reg_alpha=0,
    reg_lambda=23.054589566725642,
)
lgbm1.fit(X_train, y_train)
r2_score(y_validate,lgbm1.predict(X_validate))



0.5863865128921038

In [87]:
# save models
lgbm2 = LGBMRegressor(
    boosting='DART',
    bagging_fraction=best['bagging_fraction'], 
    feature_fraction=best['feature_fraction'],
    reg_alpha=0,
    reg_lambda=0,
    max_depth=int(best['max_depth']),
    min_child_weight=best['min_child_weight'],
    min_data_in_leaf=int(best['min_data_in_leaf']),
    n_estimators=int(best['n_estimators']),
    num_leaves=int(best['num_leaves']),
    learning_rate=best['learning_rate'],
)
lgbm2.fit(X_train, y_train)
r2_score(y_validate,lgbm2.predict(X_validate))



0.6002258598584601

In [None]:
# save model
lgbm3 = LGBMRegressor(
    boosting='DART',
    bagging_fraction=best2['bagging_fraction'], 
    feature_fraction=best2['feature_fraction'],
    reg_alpha=0,
    reg_lambda=0,
    max_depth=int(best2['max_depth']),
    min_child_weight=best2['min_child_weight'],
    min_data_in_leaf=int(best2['min_data_in_leaf']),
    n_estimators=int(best2['n_estimators']),
    num_leaves=int(best2['num_leaves']),
    learning_rate=best2['learning_rate'],
)
lgbm3.fit(X_train, y_train)
r2_score(y_validate,lgbm3.predict(X_validate))

In [114]:
rmse_score(y_validate,lgbm3.predict(X_validate))

2235855226201.393

### Use PCA

In [55]:
# see if PCA helps
pipe_lgb = Pipeline(
    steps=[
        ("pca", PCA(random_state=42)),
        ("boost", LGBMRegressor(random_state=42)),
    ]
)

space_pca = {
    'pca__n_components': scope.int(hp.quniform('pca__n_components',240,300,10)),
    'boost__boosting': 'dart',
    'boost__n_estimators': scope.int(hp.quniform('n_estimators', 200,300,10)),
    'boost__num_leaves': scope.int(hp.quniform("num_leaves", 60, 120, 10)),
    'boost__max_depth': scope.int(hp.quniform("max_depth", 2, 15, 2)),
    'boost__learning_rate': hp.loguniform('learning_rate', -3, -0.5),
    'boost__min_data_in_leaf': scope.int(hp.qloguniform('min_data_in_leaf', 0, 6, 1)),
    'boost__feature_fraction': hp.uniform('feature_fraction', 0.75, 1),
    'boost__bagging_fraction': hp.uniform('bagging_fraction', 0.75, 1),
    'boost__lambda_l1': hp.choice('lambda_l1', [0]),
    'boost__lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_value', -6, -1)]),
    'boost__min_child_weight': hp.loguniform('min_child_weight', -14, -10),
    'boost__verbose':hp.choice('boost__verbose', [-1]),
}

def lgbm_tuning_pca(params):
    pipe_lgb.set_params(**params)
    pipe_lgb.fit(X_train_sample, y_train_sample.values.ravel())
    rmse = (rmse_score(y_true=y_validate_sample, y_pred=pipe_lgb.predict(X_validate_sample)))**0.5
    print("RMSE:", rmse)
    return {'loss': rmse, 'status': STATUS_OK}

trials_pca = Trials()


In [58]:
best3 = fmin(
    fn=lgbm_tuning_pca,
    space = space_pca,
    algo=tpe.suggest,
    max_evals=918,
    trials=trials_pca,
)


100%|█████████▉| 916/918 [00:00<?, ?trial/s, best loss=?]


In [45]:
trials_pca.best_trial['misc']['vals']

{'bagging_fraction': [0.9296748933903163],
 'boost__verbose': [0],
 'feature_fraction': [0.8523144224876821],
 'lambda_l1': [0],
 'lambda_l2': [0],
 'lambda_l2_value': [],
 'learning_rate': [0.19468798873080276],
 'max_depth': [10.0],
 'min_child_weight': [1.5141559432392004e-06],
 'min_data_in_leaf': [14.0],
 'n_estimators': [260.0],
 'num_leaves': [100.0],
 'pca__n_components': [250.0]}

In [59]:
pipe_lgb = Pipeline(
    steps=[
        ("pca", PCA(random_state=42)),
        ("boost", LGBMRegressor(random_state=42, verbosity=-10)),
    ]
)

pipe_lgb.set_params(
    pca__n_components=int(best3['pca__n_components']),
    boost__boosting_type='DART',
    boost__bagging_fraction=best3['bagging_fraction'], 
    boost__feature_fraction=best3['feature_fraction'],
    boost__reg_alpha=0,
    boost__reg_lambda=0,
    boost__max_depth=int(best3['max_depth']),
    boost__min_child_weight=best3['min_child_weight'],
    boost__min_data_in_leaf=int(best3['min_data_in_leaf']),
    boost__n_estimators=int(best3['n_estimators']),
    boost__num_leaves=int(best3['num_leaves']),
    boost__learning_rate=best3['learning_rate'],
)
pipe_lgb.fit(X_train_all, y_train_all)
r2_score(y_test,pipe_lgb.predict(X_test))

  y = column_or_1d(y, warn=True)




0.5176064032618453

## Random Forest

In [40]:
# random forest
from sklearn.ensemble import RandomForestRegressor as RF

# tune with Hyperopt
space_rf = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 186,200,1)),
    'max_depth': scope.int(hp.quniform("max_depth", 13, 20, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 10, 16, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 4, 7, 1)),
    'min_weight_fraction_leaf': hp.choice('min_weight_fraction_leaf', [0, hp.uniform('min_weight_fraction_leaf_value', 0, 0.3)]),
    'max_samples': hp.choice('max_samples', [None, hp.uniform('max_samples_value', 0.73, 0.76)]),
}

def rf_tuning(params):
    clf=RF(random_state=42, n_jobs=-1, **params)
    clf.fit(X_train_sample, y_train_sample)
    rmse = (rmse_score(y_true=y_validate_sample, y_pred=clf.predict(X_validate_sample)))**0.5
    print("RMSE:", rmse)
    return {'loss': rmse, 'status': STATUS_OK}
    
trials_rf = Trials()


In [None]:
best_rf = fmin(
    fn=rf_tuning,
    space = space_rf,
    algo=tpe.suggest,
    max_evals=280,
    trials=trials_rf
)

In [32]:
best_rf

{'max_depth': 18.0,
 'max_samples': 1,
 'max_samples_value': 0.7473640349917936,
 'min_samples_leaf': 6.0,
 'min_samples_split': 14.0,
 'min_weight_fraction_leaf': 0,
 'n_estimators': 192.0}

In [None]:
# save model
rf = RF(
    n_estimators=int(best_rf['n_estimators']), 
    max_depth=int(best_rf['max_depth']),
    min_samples_split=int(best_rf['min_samples_split']),
    min_samples_leaf=int(best_rf['min_samples_leaf']),
    min_weight_fraction_leaf=best_rf['min_weight_fraction_leaf'],
    max_samples=best_rf['max_samples_value'],
)
rf.fit(X_train, y_train)
r2_score(y_validate,rf.predict(X_validate))

In [23]:
rmse_score(y_validate, rf.predict(X_validate))

3001342415326.6753

## Ensemble

In [37]:
# try fitting a few ensemble models
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet

estimators = [
    ('xgb', xgb2),
    ('lgb2', lgbm2),
    ('lgb3', lgbm3),
    ('rf', rf),
]

ensemble = StackingRegressor(
    estimators=estimators, final_estimator=ElasticNetCV(random_state=42), n_jobs=7
)


In [38]:
ensemble.fit(X_train_sample, y_train_sample)

  y = column_or_1d(y, warn=True)


StackingRegressor(estimators=[('xgb',
                               XGBRegressor(base_score=None, booster=None,
                                            colsample_bylevel=0.9121949417737552,
                                            colsample_bynode=0.9121949417737552,
                                            colsample_bytree=0.9329875395758149,
                                            enable_categorical=False,
                                            gamma=None, gpu_id=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.5539725958037197,
                                            max_delta_step=None, max_depth=4...
                                             feature_fraction=0.8382067905272091,
                                             learning_rate=0.36928736490005915,
                                            

In [39]:
r2_score(y_validate_sample,ensemble.predict(X_validate_sample))

-9.070566892825012

## Test on test data

In [7]:
# load test data
X_train_all = pd.read_csv('../Data/X_train_all.csv')
y_train_all = pd.read_csv('../Data/y_train_all.csv')
X_test = pd.read_csv('../Data/X_test_all.csv')
y_test = pd.read_csv('../Data/y_test_all.csv')


In [8]:
# apply the same cleaning to test data, drop unneeded columns
X_train_all.drop(columns=['Unnamed: 0'], inplace=True)
y_train_all.drop(columns=['Unnamed: 0'], inplace=True)
X_test.drop(columns=['Unnamed: 0'], inplace=True)
y_test.drop(columns=['Unnamed: 0'], inplace=True)
X_train_all.rename(columns={'city_"ONeals"': 'city_ONeals'}, inplace=True)
X_test.rename(columns={'city_"ONeals"': 'city_ONeals'}, inplace=True)


In [None]:
# fit models again on all training data
xgb1.fit(X_train_all, y_train_all)
xgb2.fit(X_train_all, y_train_all)
lgbm1.fit(X_train_all, y_train_all)
lgbm2.fit(X_train_all, y_train_all)
lgbm3.fit(X_train_all, y_train_all)


In [152]:
# get R2 score on test data
[
    r2_score(y_test,xgb1.predict(X_test)),
    r2_score(y_test,xgb2.predict(X_test)),
    r2_score(y_test,lgbm1.predict(X_test)),
    r2_score(y_test,lgbm2.predict(X_test)),
    r2_score(y_test,lgbm3.predict(X_test)),
]


[0.5828178439753278,
 0.6347163088775949,
 0.578957619083049,
 0.5752404631284671,
 0.6169016826518074]

In [10]:
xgb2.fit(X_train_all, y_train_all)
r2_score(y_test,xgb2.predict(X_test))

0.5905424729402105

In [12]:
rmse_score(y_test,xgb2.predict(X_test))**0.5

1336536.0258482425

In [56]:
ensemble.fit(X_train_all, y_train_all)

  y = column_or_1d(y, warn=True)


StackingRegressor(estimators=[('xgb',
                               XGBRegressor(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=0.9121949417737552,
                                            colsample_bynode=0.9121949417737552,
                                            colsample_bytree=0.9329875395758149,
                                            enable_categorical=False, gamma=0,
                                            gpu_id=0, importance_type=None,
                                            interaction_constraints='',
                                            learning_rate=0.5539725958037197,
                                            max_delta_step=0, max_depth=4,
                                            min_chi...
                                             n_estimators=245, num_leaves=180,
                                             reg_alpha=0, reg_lambda=0)),
                              ('lgb3',
             

In [57]:
r2_score(y_test, ensemble.predict(X_test))

0.6185377523080829

## Save

In [1]:
# save models to local drive
import pickle

def save_obj(obj, filename):
    try:
        with open(filename, "wb") as f:
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as ex:
        print("Error:", ex)


def load_obj(filename):
    try:
        with open(filename, "rb") as f:
            return pickle.load(f)
    except Exception as ex:
        print("Error:", ex)


In [5]:
save_obj(xgb2, "../models/scratch_models/xgb2.pickle")
save_obj(lgbm2, "../models/scratch_models/lgbm2.pickle")
save_obj(lgbm3, "../models/scratch_models/lgbm3.pickle")
save_obj(rf, "../models/scratch_models/rf.pickle")
save_obj(ensemble, "../models/scratch_models/ensemble.pickle")

In [6]:
rf = load_obj('../models/scratch_models/rf.pickle')
xgb2 = load_obj('../models/scratch_models/xgb2.pickle')
lgbm2 = load_obj('../models/scratch_models/lgbm2.pickle')
lgbm3 = load_obj('../models/scratch_models/lgbm3.pickle')
ensemble = load_obj('../models/scratch_models/ensemble.pickle')