<b>Imports</b>

In [None]:
import os
import time
import warnings

from IPython.display import HTML, display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

import lightgbm as lgb
from lightgbm.sklearn import LGBMRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor

from bayes_opt import BayesianOptimization

<b>Settings</b>

In [None]:
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# NOTE(review): threshold=None looks like it leaves NumPy's print threshold
# unchanged rather than disabling summarisation — confirm the intent.
np.set_printoptions(threshold=None)
# Remove pandas' row/column display limits and print floats with 3 decimals.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: "%.3f" % x)
pd.set_option("display.max_colwidth", 99999)
# IPython magic: render matplotlib figures inline (notebook-only syntax).
%matplotlib inline
# Inject CSS so the notebook container uses 90% of the page width.
display(HTML("<style>.container { width: 90% !important; }</style>"))

# Shared constants used by the model wrappers and the cross-validation below.
N_JOBS = -1  # forwarded as n_jobs / nthread to the estimators
SEED = 2021  # random seed for the estimators, KFold and the optimizer
FOLDS = 5  # number of KFold splits

<b>Functions</b>

In [None]:
class OptimBayes(object):
    """Bayesian hyper-parameter search helper around a few tree regressors.

    The instance only stores its inputs; the other methods read them.
    """

    def __init__(self, train, y_train, kind, n_estimators):
        """Store the data and settings used by the ``run_*`` objectives.

        Args:
            train: feature table used for cross-validation.
            y_train: target values matching ``train``.
            kind: problem type label (e.g. ``"regression"``).
            n_estimators: number of trees/boosting rounds for every model.
        """
        self.train, self.y_train = train, y_train
        self.kind, self.n_estimators = kind, n_estimators

    ###########################################################################################

    def init_bayes_opt(self, model_name, min_max_params):
        if (model_name == "rf"):
            bo = BayesianOptimization(
                f=self.run_rf,
                pbounds={
                    "max_depth": (min_max_params["max_depth_min"],
                                  min_max_params["max_depth_max"]),
                    "max_features": (min_max_params["max_features_min"],
                                     min_max_params["max_features_max"]),
                    "min_samples_leaf":
                    (min_max_params["min_samples_leaf_min"],
                     min_max_params["min_samples_leaf_max"]),
                    "min_samples_split":
                    (min_max_params["min_samples_split_min"],
                     min_max_params["min_samples_split_max"]),
                },
                random_state=SEED)
        elif (model_name == "et"):
            bo = BayesianOptimization(
                f=self.run_et,
                pbounds={
                    "max_depth": (min_max_params["max_depth_min"],
                                  min_max_params["max_depth_max"]),
                    "max_features": (min_max_params["max_features_min"],
                                     min_max_params["max_features_max"]),
                    "min_samples_leaf":
                    (min_max_params["min_samples_leaf_min"],
                     min_max_params["min_samples_leaf_max"]),
                    "min_samples_split":
                    (min_max_params["min_samples_split_min"],
                     min_max_params["min_samples_split_max"]),
                },
                random_state=SEED)
        elif (model_name == "xg"):
            bo = BayesianOptimization(
                f=self.run_xg,
                pbounds={
                    "colsample_bylevel":
                    (min_max_params["colsample_bylevel_min"],
                     min_max_params["colsample_bylevel_max"]),
                    "colsample_bytree":
                    (min_max_params["colsample_bytree_min"],
                     min_max_params["colsample_bytree_max"]),
                    "gamma": (min_max_params["gamma_min"],
                              min_max_params["gamma_max"]),
                    "max_depth": (min_max_params["max_depth_min"],
                                  min_max_params["max_depth_max"]),
                    "min_child_weight":
                    (min_max_params["min_child_weight_min"],
                     min_max_params["min_child_weight_max"]),
                    "subsample": (min_max_params["subsample_min"],
                                  min_max_params["subsample_max"]),
                },
                random_state=SEED)
        elif (model_name == "lg"):
            bo = BayesianOptimization(
                f=self.run_lg,
                pbounds={
                    "colsample_bytree":
                    (min_max_params["colsample_bytree_min"],
                     min_max_params["colsample_bytree_max"]),
                    "is_unbalance": (min_max_params["is_unbalance_min"],
                                     min_max_params["is_unbalance_max"]),
                    "max_depth": (min_max_params["max_depth_min"],
                                  min_max_params["max_depth_max"]),
                    "min_child_samples":
                    (min_max_params["min_child_samples_min"],
                     min_max_params["min_child_samples_max"]),
                    "min_split_gain": (min_max_params["min_split_gain_min"],
                                       min_max_params["min_split_gain_max"]),
                    "subsample": (min_max_params["subsample_min"],
                                  min_max_params["subsample_max"]),
                    "subsample_freq": (min_max_params["subsample_freq_min"],
                                       min_max_params["subsample_freq_max"]),
                },
                random_state=SEED)

        return (bo)

    ###########################################################################################

    def show_best_combos(self, bo, length=15):
        """Display the best and worst evaluated combinations.

        Args:
            bo: optimizer whose ``res`` attribute is a sequence of dicts with
                a ``"params"`` dict and a ``"target"`` score.
            length: number of rows shown from the top and from the bottom.

        Returns:
            DataFrame with one row per evaluation (parameter columns plus
            ``"target"``), sorted by decreasing target and indexed by the
            evaluation number ``"iter"``.
        """
        # Read bo.res once and build the frame in a single pass rather than
        # concatenating one single-row frame per evaluation.
        rows = []
        for iteration, result in enumerate(bo.res):
            row = dict(result["params"])
            row["target"] = result["target"]
            row["iter"] = iteration
            rows.append(row)

        if rows:
            results_df = pd.DataFrame(rows)
            results_df.sort_values(["target"], ascending=False, inplace=True)
        else:
            # Nothing evaluated yet: return an empty, correctly shaped frame
            # instead of failing on the missing "target" column.
            results_df = pd.DataFrame(columns=["target", "iter"])
        results_df.set_index("iter", inplace=True)

        display(results_df.head(length))
        display(results_df.tail(length))

        return results_df

    ###########################################################################################

    def pair_plots(self, history_df, min_max_params, param1, param2):
        """Plot the target as an interpolated heat map over two parameters.

        Args:
            history_df: frame with columns ``param1``, ``param2`` and
                ``"target"`` (one row per evaluated combination).
            min_max_params: dict with ``"<param>_min"`` / ``"<param>_max"``
                entries for both parameters; they define the plotted extent.
            param1: parameter drawn on the x axis.
            param2: parameter drawn on the y axis.
        """
        # Import the submodule explicitly: a bare ``import scipy`` is not
        # guaranteed to expose ``scipy.interpolate``.
        from scipy.interpolate import Rbf

        _x = history_df[param1].values
        _y = history_df[param2].values
        _z = history_df["target"].values

        # Set up a regular 100 x 100 grid of interpolation points
        param1min = min_max_params[param1 + "_min"]
        param1max = min_max_params[param1 + "_max"]
        param2min = min_max_params[param2 + "_min"]
        param2max = min_max_params[param2 + "_max"]
        xi, yi = np.meshgrid(
            np.linspace(param1min, param1max, 100),
            np.linspace(param2min, param2max, 100))

        # Interpolate the observed targets over the grid
        rbf = Rbf(_x, _y, _z, function="multiquadric", smooth=2)
        zi = rbf(xi, yi)

        plt.figure()
        plt.imshow(
            zi,
            cmap="plasma",
            # Rescale so both parameter ranges span the same on-screen length
            aspect=(param1max - param1min) / (param2max - param2min),
            vmin=_z.min(),
            vmax=_z.max(),
            origin="lower",
            extent=[param1min, param1max, param2min, param2max])
        # Overlay the evaluated points, coloured by their target
        q = plt.scatter(_x, _y, c=_z, cmap="plasma")
        plt.colorbar(q)
        plt.xlabel(param1)
        plt.ylabel(param2)
        plt.show(block=False)

    ###########################################################################################

    def fit_and_predict(self, model, model_name):
        """Cross-validate ``model`` and collect out-of-fold predictions.

        The data comes from ``self.train`` / ``self.y_train``; the
        ``"season"`` and ``"name"`` columns are dropped before fitting.

        Args:
            model: estimator exposing ``fit`` and ``predict``.
            model_name: model label; names containing ``"xg"`` or ``"lg"``
                are fitted with early stopping on the validation fold.

        Returns:
            Tuple ``(all_preds, all_targets)`` of lists holding the
            predictions and the true targets of every validation fold.
        """
        # Use the data given to the constructor rather than notebook globals.
        features = self.train.drop(["season", "name"], axis=1)
        targets = self.y_train

        all_preds = []
        all_targets = []

        # Iterate on each fold
        kf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
        for train_idx, valid_idx in kf.split(features, targets):
            # KFold yields row positions, so features and targets are both
            # indexed by position (index labels may not be 0..n-1).
            X_train = features.iloc[train_idx]
            X_valid = features.iloc[valid_idx]
            y_train = targets.iloc[train_idx]
            y_valid = targets.iloc[valid_idx]

            # OOF predictions
            if "xg" in model_name:
                model.fit(
                    X_train,
                    y_train,
                    eval_set=[(X_valid, y_valid)],
                    early_stopping_rounds=10,
                    verbose=0)
                preds_X_valid = model.predict(
                    X_valid, ntree_limit=model.best_ntree_limit)
            elif "lg" in model_name:
                model.fit(
                    X_train,
                    y_train,
                    eval_set=[(X_valid, y_valid)],
                    early_stopping_rounds=10,
                    verbose=0)
                # LightGBM's keyword for limiting the trees is num_iteration
                preds_X_valid = model.predict(
                    X_valid, num_iteration=model.best_iteration_)
            else:
                model.fit(X_train, y_train)
                preds_X_valid = model.predict(X_valid)

            all_preds.extend(preds_X_valid)
            all_targets.extend(y_valid.values)

        return (all_preds, all_targets)
    
    ###########################################################################################

    def run_model(self, model, model_name):
        """Cross-validate ``model`` and return its score.

        Predictions below zero are replaced by zero before scoring.
        """
        # Out-of-fold predictions and their matching targets
        oof_preds, oof_targets = self.fit_and_predict(model, model_name)

        # Post process: gather both in one frame and floor predictions at 0
        frame = pd.DataFrame({"targets": oof_targets, "preds": oof_preds})
        frame["preds"] = frame["preds"].mask(frame["preds"] < 0, 0)

        # Compute and return error
        return scoring(frame["targets"], frame["preds"])

    ###########################################################################################

    def run_rf(self, max_features, max_depth, min_samples_split,
               min_samples_leaf):
        """Objective for the optimizer: score a RandomForestRegressor.

        Depth and sample-count parameters arrive as floats and are
        truncated to integers; ``max_features`` is passed through unchanged.
        """
        forest_params = {
            "n_estimators": self.n_estimators,
            "max_features": max_features,
            "max_depth": int(max_depth),
            "min_samples_split": int(min_samples_split),
            "min_samples_leaf": int(min_samples_leaf),
            "random_state": SEED,
            "n_jobs": N_JOBS,
        }
        forest = RandomForestRegressor(**forest_params)
        return self.run_model(forest, "rf")

    ###########################################################################################

    def run_et(self, max_features, max_depth, min_samples_split,
               min_samples_leaf):
        """Objective for the optimizer: score an ExtraTreesRegressor.

        Depth and sample-count parameters arrive as floats and are
        truncated to integers; ``max_features`` is passed through unchanged.
        """
        trees_params = {
            "n_estimators": self.n_estimators,
            "max_features": max_features,
            "max_depth": int(max_depth),
            "min_samples_split": int(min_samples_split),
            "min_samples_leaf": int(min_samples_leaf),
            "random_state": SEED,
            "n_jobs": N_JOBS,
        }
        extra_trees = ExtraTreesRegressor(**trees_params)
        return self.run_model(extra_trees, "et")

    ###########################################################################################

    def run_xg(self, max_depth, min_child_weight, subsample,
               colsample_bytree, colsample_bylevel, gamma):
        """Objective for the optimizer: score an XGBRegressor.

        ``max_depth`` arrives as a float and is truncated to an integer;
        the remaining parameters are forwarded as given.
        """
        # NOTE(review): XGBRegressor is not among the imports visible in this
        # notebook; `from xgboost import XGBRegressor` is needed before the
        # "xg" model can be selected.
        xg = XGBRegressor(
            # Regression objective: the score is a mean squared error, and
            # "multi:softmax" is a multi-class classification objective.
            objective="reg:squarederror",
            learning_rate=0.1,
            n_estimators=self.n_estimators,
            max_depth=int(max_depth),
            # Tuned by the optimizer, so it must reach the model
            min_child_weight=min_child_weight,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            colsample_bylevel=colsample_bylevel,
            gamma=gamma,
            seed=SEED,
            nthread=N_JOBS)

        return (self.run_model(xg, "xg"))

    ###########################################################################################

    def run_lg(self, max_depth, min_child_samples, subsample,
               subsample_freq, colsample_bytree, min_split_gain, is_unbalance):
        """Objective for the optimizer: score an LGBMRegressor.

        Integer-valued parameters arrive as floats and are truncated.
        ``num_leaves`` is derived from the depth as ``2**depth - 1``, capped
        at 4095.

        ``is_unbalance`` is accepted because the optimizer samples it, but
        it is not forwarded to the regressor and does not affect the score.
        """
        depth = int(max_depth)
        num_leaves = min(2 ** depth - 1, 4095)

        lg = LGBMRegressor(
            boosting_type="gbdt",
            objective="regression",
            learning_rate=0.1,
            n_estimators=self.n_estimators,
            num_leaves=num_leaves,
            max_depth=depth,
            min_child_samples=int(min_child_samples),
            subsample=subsample,
            subsample_freq=int(subsample_freq),
            colsample_bytree=colsample_bytree,
            min_split_gain=min_split_gain,
            seed=SEED,
            n_jobs=N_JOBS)

        return (self.run_model(lg, "lg"))


In [None]:
def scoring(y_true, y_pred):
    """Return the negated mean squared error, rounded to 5 decimals.

    Rows whose target is missing (NaN/None) are ignored. The error is
    negated so that a larger score means a better fit.

    Args:
        y_true: true targets (Series, array or list).
        y_pred: predictions, aligned by position with ``y_true``.

    Returns:
        ``-round(MSE, 5)`` computed on the rows with a known target.

    Raises:
        ValueError: if ``y_true`` and ``y_pred`` have different shapes.
    """
    true_values = np.asarray(y_true)
    pred_values = np.asarray(y_pred)
    if true_values.shape != pred_values.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (true_values.shape, pred_values.shape))

    # Same positional mask for every input type (Series, ndarray or list)
    known = ~pd.isnull(true_values)

    return -np.round(
        mean_squared_error(true_values[known], pred_values[known]), 5)

<b>Script</b>

In [None]:
# Read the preprocessed table holding every season
init_train = pd.read_csv("../data/processed/all_seasons.csv")

# Keep every season except 2018 (the one we try to predict) and 2017 (which
# will be used for prediction); print the row count before and after.
print(len(init_train))
is_held_out = (init_train["season"] == 2018) | (init_train["season"] == 2017)
init_train = init_train.loc[~is_held_out, :]
print(len(init_train))
display(init_train.tail(2))

In [None]:
# Separate the target column from the feature columns
target_column = "playoff_wins"
y = init_train[target_column]
train = init_train.drop(columns=[target_column])

In [None]:
# Parameters ranges and initial exploration space.
# Each dict maps "<param>_min" / "<param>_max" to the bounds explored for
# that parameter.
min_max_params_rf = {
    "max_depth_min": 4,
    "max_depth_max": 12,
    "max_features_min": 0.2,
    "max_features_max": 0.9,
    "min_samples_leaf_min": 2,
    "min_samples_leaf_max": 10,
    "min_samples_split_min": 2,
    "min_samples_split_max": 10,
}

# ExtraTrees is tuned over the same four parameters as RandomForest; this
# name is used by the pair-plot cell and was previously undefined.
# NOTE(review): the ranges are copied from the RF bounds — adjust if
# ExtraTrees needs its own search space.
min_max_params_et = dict(min_max_params_rf)

min_max_params_xg = {
    "colsample_bylevel_min": 0.3,
    "colsample_bylevel_max": 1.0,
    "colsample_bytree_min": 0.3,
    "colsample_bytree_max": 1.0,
    "gamma_min": 1,
    "gamma_max": 50,
    "max_depth_min": 5,
    "max_depth_max": 15,
    "min_child_weight_min": 2,
    "min_child_weight_max": 30,
    "subsample_min": 0.3,
    "subsample_max": 1.0,
}

min_max_params_lg = {
    "colsample_bytree_min": 0.25,
    "colsample_bytree_max": 1.0,
    "is_unbalance_min": 1,
    "is_unbalance_max": 1.9,
    "max_depth_min": 4,
    "max_depth_max": 20,
    "min_child_samples_min": 10,
    "min_child_samples_max": 25,
    "min_split_gain_min": 0.001,
    "min_split_gain_max": 0.01,
    "subsample_min": 0.2,
    "subsample_max": 0.75,
    "subsample_freq_min": 1,
    "subsample_freq_max": 8,
}


In [None]:
# Initialisation
problem_type = "regression"
model_name = "lg"
n_estimators = 50

# Pick the bounds that match model_name instead of always passing the
# LightGBM ones. Only the selected branch is evaluated, so only the bounds
# dict of the chosen model has to be defined.
if model_name == "rf":
    min_max_params = min_max_params_rf
elif model_name == "et":
    min_max_params = min_max_params_et
elif model_name == "xg":
    min_max_params = min_max_params_xg
elif model_name == "lg":
    min_max_params = min_max_params_lg
else:
    raise ValueError("Unknown model_name: %r" % model_name)

print("MODEL : " + model_name)
o_b = OptimBayes(
    train,
    y,
    problem_type,
    n_estimators=n_estimators)
bo = o_b.init_bayes_opt(model_name, min_max_params)

In [None]:
# Maximization settings
init_points, n_iter = 100, 300
xi = 0.07  # between 0.0 (exploitation) and 0.1 (exploration)
maximize_options = dict(init_points=init_points, n_iter=n_iter, xi=xi, acq="ei")

# Run the search and report the wall-clock time it took
start = time.time()
bo.maximize(**maximize_options)
elapsed = time.time() - start
print("BayesianOptimization took %.2f seconds" % elapsed)

In [None]:
# Show results: display the 20 best and 20 worst evaluations and keep the
# whole history, sorted by decreasing target, in history_df.
history_df = o_b.show_best_combos(bo, length=20)

In [None]:
# Draw pair plots
if model_name in ("xg", "lg"):
    # Boosting models: one plot for every pair of tuned parameters, taken
    # in the order the parameters are listed below.
    if model_name == "xg":
        plot_bounds = min_max_params_xg
        plot_params = ["max_depth", "min_child_weight", "subsample",
                       "colsample_bytree", "colsample_bylevel", "gamma"]
    else:
        plot_bounds = min_max_params_lg
        plot_params = ["max_depth", "min_child_samples", "subsample",
                       "subsample_freq", "colsample_bytree", "min_split_gain",
                       "is_unbalance"]
    for position, first_param in enumerate(plot_params):
        for second_param in plot_params[position + 1:]:
            o_b.pair_plots(history_df, plot_bounds, first_param, second_param)
elif model_name in ("et", "rf"):
    # Tree ensembles: a fixed selection of four pairs.
    plot_bounds = min_max_params_et if model_name == "et" else min_max_params_rf
    forest_pairs = [
        ("max_depth", "max_features"),
        ("max_depth", "min_samples_leaf"),
        ("max_depth", "min_samples_split"),
        ("min_samples_leaf", "min_samples_split"),
    ]
    for first_param, second_param in forest_pairs:
        o_b.pair_plots(history_df, plot_bounds, first_param, second_param)
