In [3]:
import os
import utils
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb

from bayes_opt import BayesianOptimization
from tqdm import tqdm_notebook as tqdm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from constants import DATA_DIR
from constants import PUNT_TYPES
warnings.filterwarnings('ignore')

In [14]:
cv = RepeatedKFold(n_splits=2, n_repeats=100)

### Ridge Regression

In [5]:
def ridge(X, y):
    parameters = {'alpha':[1, 10, 100]}
    Ridge_reg= GridSearchCV(Ridge(), parameters, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)
    Ridge_reg.fit(X, y)
    best_ridge = Ridge_reg.best_estimator_
    print(Ridge_reg.best_params_)
    errors = utils.cross_val(best_ridge, X, y, n_folds=3)
    #utils.summarize_errors(errors, verbose=0)
    return np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

### Lasso Regression

In [6]:
def lasso(X, y):
    parameters = {'alpha':[1e-3, 1e-2, 1e-1]}
    lasso_result = GridSearchCV(Lasso(), parameters, scoring='neg_root_mean_squared_error',cv=cv,n_jobs=-1)
    lasso_result.fit(X, y)
    best_model = lasso_result.best_estimator_
    print(lasso_result.best_params_)
    errors = utils.cross_val(best_model, X, y, n_folds=3)
    utils.summarize_errors(errors, verbose=0)
    return np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

### Vanilla XGBRegressor

In [7]:
def vanilla_xgb(X,y):
    vanilla_reg = XGBRegressor().fit(X, y)
    errors = utils.cross_val(vanilla_reg, X, y, n_folds=3)
    utils.summarize_errors(errors, verbose=0)
    return np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

###  XGBRegressor w/Bayesian Optimized Hyperparameters

In [25]:
def bo_tune_xgb(max_depth, gamma, n_estimators, learning_rate, subsample):
    params = {'gamma': gamma, 
              'learning_rate':learning_rate, 
              'max_depth': int(max_depth), 
              'n_estimators': int(n_estimators), 
              'subsample': subsample, 
              'eta': 0.1, 
              'eval_metric': 'rmse'}
    scores = cross_val_score(XGBRegressor(random_state=1, **params), X, y, scoring='neg_root_mean_squared_error', cv=3).mean()
    score = scores.mean()
    return score

In [26]:
def bo_xgb(X, y):
    params = {'max_depth': (3, 10), 
              'gamma': (0, 1), 
              'n_estimators':(100,120), 
              'learning_rate':(0,1), 
              'subsample': (0.8, 1)}
    xgb_bo = BayesianOptimization(bo_tune_xgb, params, random_state = 1)
    xgb_bo.maximize(n_iter=5, init_points=20)
    
    params = xgb_bo.max['params']
    params['max_depth']= int(params['max_depth'])
    params['n_estimators']= int(params['n_estimators'])
    print(params)
    clf = XGBRegressor(**params)
    errors = utils.cross_val(clf, X, y, n_folds=3)
    return np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

### Neural Network

In [27]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn import preprocessing
from keras import regularizers
from keras.callbacks import ModelCheckpoint
import math

In [28]:
def get_modelcheckpoint_path():
    #Create a file path for a model and save models in hdf5 files with datetime, validation losses and epochs
    parent = DATA_DIR+"/NN/"
    child = pd.Timestamp.now().strftime('%Y%m%d-%Hh%Mm') + "-model-epoch_{epoch:02d}-rmse_{val_loss:.5f}.hdf5"
    return parent + child 

def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(X.shape[1], input_dim=X.shape[1], kernel_initializer='normal', activation='relu'))
    model.add(Dense(15, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

def dropout_model():
    # create model
    model = Sequential()
    model.add(Dense(X.shape[1], input_dim=X.shape[1], kernel_initializer='normal', activation='relu'))
    model.add(Dense(15, kernel_initializer='normal', activation='relu'))
    #model.add(Dense(8, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

def create_regressor(model, verbose=0):
    reg = KerasRegressor(build_fn=model, epochs=100, batch_size=5, validation_split=0.2, shuffle=True, verbose=verbose)
    return reg

def plot_learning_process(history):
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    return None

def base_keras_model(X, y):
    base_keras = create_regressor(baseline_model, 0)
    #base_history = base_keras.fit(X, y)
    errors = utils.cross_val(base_keras, X, y, n_folds=5)
    #plot_learning_process(base_history)
    return np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

def dropout_keras_model(X, y):
    dropout_keras = create_regressor(dropout_model, 0)
    #dropout_history = base_keras.fit(X, y)
    #plot_learning_process(dropout_history)
    errors = utils.cross_val(dropout_keras, X, y, n_folds=5)
    #utils.summarize_errors(errors, verbose=1)
    return np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

In [29]:
rmse_table = pd.DataFrame()
mae_table = pd.DataFrame()
models = [bo_xgb]
#models=[ridge, lasso, vanilla_xgb, bo_xgb, base_keras_model]
for punt in tqdm(PUNT_TYPES):
    if punt == []:
        punt = 'Base'
    else:
        punt = '+'.join(punt)
    df = pd.read_csv(os.path.join(DATA_DIR,'ABT', punt+'.csv'))
    X = df.loc[:, df.columns != 'VALUE']
    y = df['VALUE'].values.reshape(-1,1).flatten()
    for model in tqdm(models):
        rmse_value, mae_value = model(X, y)
        rmse_table.loc[punt, model.__name__] = rmse_value
        mae_table.loc[punt, model.__name__] = mae_value


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-3.083   [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-2.663   [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-3.187   [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.878   [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-3.403   [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-2.559   [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.90437, SD: 0.04409
RMSE | Mean: 2.56093, SD: 0.10679




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.794   [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-2.392   [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-2.834   [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.526   [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-2.981   [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-2.27    [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.73231, SD: 0.04409
RMSE | Mean: 2.28917, SD: 0.1033




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.791   [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-2.393   [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-2.845   [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.54    [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-3.106   [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-2.311   [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.75584, SD: 0.03432
RMSE | Mean: 2.3311, SD: 0.09208




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.592   [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-2.269   [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-2.614   [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.438   [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-2.901   [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-2.161   [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.56619, SD: 0.04001
RMSE | Mean: 2.1514, SD: 0.09483




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.672   [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-2.34    [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-2.76    [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.532   [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-2.985   [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-2.217   [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.63646, SD: 0.03744
RMSE | Mean: 2.23172, SD: 0.10677




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.854   [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-2.444   [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-2.873   [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.597   [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-3.131   [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-2.31    [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.69125, SD: 0.03756
RMSE | Mean: 2.30091, SD: 0.09664




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.668   [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-2.283   [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-2.636   [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.468   [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-2.916   [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-2.195   [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.61147, SD: 0.03596
RMSE | Mean: 2.18406, SD: 0.08891




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.794   [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-2.405   [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-2.818   [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.549   [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-3.05    [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-2.307   [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.68853, SD: 0.0392
RMSE | Mean: 2.2847, SD: 0.10605




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.39    [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-2.025   [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-2.415   [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.203   [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-2.552   [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-1.945   [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.46493, SD: 0.0417
RMSE | Mean: 1.95559, SD: 0.09254




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.392   [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-2.116   [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-2.456   [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.264   [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-2.674   [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-2.021   [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.51352, SD: 0.0396
RMSE | Mean: 2.00438, SD: 0.0877




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.372   [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-2.065   [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-2.498   [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.264   [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-2.654   [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-1.978   [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.42438, SD: 0.0209
RMSE | Mean: 1.96015, SD: 0.0543




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.289   [0m | [0m 0.417   [0m | [0m 0.7203  [0m | [0m 3.001   [0m | [0m 106.0   [0m | [0m 0.8294  [0m |
| [95m 2       [0m | [95m-1.96    [0m | [95m 0.09234 [0m | [95m 0.1863  [0m | [95m 5.419   [0m | [95m 107.9   [0m | [95m 0.9078  [0m |
| [0m 3       [0m | [0m-2.373   [0m | [0m 0.4192  [0m | [0m 0.6852  [0m | [0m 4.431   [0m | [0m 117.6   [0m | [0m 0.8055  [0m |
| [0m 4       [0m | [0m-2.137   [0m | [0m 0.6705  [0m | [0m 0.4173  [0m | [0m 6.911   [0m | [0m 102.8   [0m | [0m 0.8396  [0m |
| [0m 5       [0m | [0m-2.516   [0m | [0m 0.8007  [0m | [0m 0.9683  [0m | [0m 5.194   [0m | [0m 113.8   [0m | [0m 0.9753  [0m |
| [95m 6       [0m | [95m-1.892   [0m | [95m 0.8946  [0m | [95m 0.08504 [0m | [95m 3.273   [0m | [95

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



   <--- Validation Errors --->
MAE  | Mean: 1.42561, SD: 0.02522
RMSE | Mean: 1.91278, SD: 0.0526





In [30]:
print(rmse_table)
print(mae_table)

           bo_xgb
Base     1.904370
FG%      1.732306
FT%      1.755836
PTS      1.566192
TRB      1.636463
AST      1.691248
STL      1.611466
BLK      1.688529
FG%+TRB  1.464932
BLK+FG%  1.513515
AST+STL  1.424376
PTS+FT%  1.425609
           bo_xgb
Base     2.560931
FG%      2.289170
FT%      2.331099
PTS      2.151405
TRB      2.231718
AST      2.300905
STL      2.184062
BLK      2.284696
FG%+TRB  1.955586
BLK+FG%  2.004376
AST+STL  1.960146
PTS+FT%  1.912780


### Making Predictions

In [24]:
df = pd.read_csv(DATA_DIR+'/Predictions/predictions.csv')
df.drop(columns=['PLAYER','TEAM','SEASON'], inplace=True)
df = pd.get_dummies(df)
X = df.loc[:, df.columns != 'VALUE']
y = df['VALUE'].values.reshape(-1,1).flatten()
values = clf.predict(X)

print(utils.calculate_MAE(values, y))
print(utils.calculate_RMSE(values, y))

1.7282533284491566
2.48234542403067


In [25]:
df = pd.read_csv(DATA_DIR+'/Predictions/predictions.csv')
df['PRED'] = values
df.to_csv(DATA_DIR+'/Predictions/end.csv')