# Preparation

In [None]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Directory holding the preprocessed CSV splits. Defaults to the original
# Colab/Drive location; set the ML_CCC_DIR environment variable to run the
# notebook against a different directory without editing this cell.
DIR_PATH = os.environ.get(
    "ML_CCC_DIR", "/content/drive/MyDrive/Colab Notebooks/ML CCC"
)

# Seed passed to the estimator and to every search below.
RANDOM_STATE = 42

In [None]:
def f2_score(y_test, y_pred):
    """Return the F-beta score of ``y_pred`` against ``y_test`` with beta=2.

    Parameters
    ----------
    y_test : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    The value returned by ``fbeta_score(y_test, y_pred, beta=2)``.
    """
    return fbeta_score(y_test, y_pred, beta=2)


# The same metric wrapped as a scorer object, for use as a ``scoring=`` argument.
f2_scorer = make_scorer(fbeta_score, beta=2)

In [None]:
# Read the four preprocessed splits from DIR_PATH, in the order
# train features, test features, train labels, test labels.
prep_files = (
    'X_train_prep.csv',
    'X_test_prep.csv',
    'y_train_prep.csv',
    'y_test_prep.csv',
)
X_train, X_test, y_train, y_test = [
    pd.read_csv(os.path.join(DIR_PATH, fname)) for fname in prep_files
]

In [None]:
# Flatten the label frames into 1-D NumPy arrays.
# np.ravel accepts a DataFrame as well as an ndarray, so this cell can be
# re-run safely; the previous `.to_numpy().ravel()` form raised
# AttributeError on a second run because an ndarray has no `.to_numpy()`.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

In [None]:
def plot_search_results(searcher):
    """Plot each candidate's mean test score across the search iterations.

    Adapted from the scikit-learn example:
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_successive_halving_iterations.html

    Parameters
    ----------
    searcher : fitted search object
        Must expose ``cv_results_`` (with ``params``, ``iter`` and
        ``mean_test_score`` entries), ``n_iterations_``, ``n_resources_``
        and ``n_candidates_``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Stringify the params dicts so each candidate can become a column label.
    cv_table = pd.DataFrame(searcher.cv_results_)
    cv_table = cv_table.assign(params_str=cv_table["params"].apply(str))
    cv_table = cv_table.drop_duplicates(subset=["params_str", "iter"])

    # Rows: iteration; columns: candidate; cells: mean test score.
    score_by_iter = cv_table.pivot(
        index="iter", columns="params_str", values="mean_test_score"
    )
    ax = score_by_iter.plot(legend=False, alpha=0.6)

    # One tick per iteration, annotated with its resources and candidate count.
    n_iter = searcher.n_iterations_
    tick_labels = []
    for step in range(n_iter):
        tick_labels.append(
            f"iter={step}\nn_samples={searcher.n_resources_[step]}\nn_candidates={searcher.n_candidates_[step]}"
        )
    ax.set_xticks(range(n_iter))
    ax.set_xticklabels(tick_labels, rotation=45, multialignment="left")

    ax.set_title("Scores of candidates over iterations")
    ax.set_ylabel("mean test score", fontsize=15)
    ax.set_xlabel("iterations", fontsize=15)

    plt.tight_layout()
    plt.show()


# Gradient Boosting tuning

## Initialization

In [None]:
# Base estimator handed to every search below; only the seed and the
# verbosity are set explicitly here.
gb = GradientBoostingClassifier(
    verbose=1,
    random_state=RANDOM_STATE,
)

## Phase 1

Joint search over `learning_rate`, `n_estimators` and `max_depth`.

In [None]:
# Phase 1 search space. The "=" notes are carried over from the original
# cell (presumably the estimator's default for each parameter).
param_grid = dict(
    learning_rate=[0.001, 0.01, 0.1, 1],           # =0.1
    n_estimators=[5, 10, 20, 50, 100, 200, 500],   # =100
    max_depth=np.arange(3, 34, 3, dtype=np.int64), # =3; depths 3, 6, ..., 33
)
param_grid

{'learning_rate': [0.001, 0.01, 0.1, 1],
 'max_depth': array([ 3,  6,  9, 12, 15, 18, 21, 24, 27, 30, 33]),
 'n_estimators': [5, 10, 20, 50, 100, 200, 500]}

In [None]:
# Phase 1 halving grid search over `param_grid`, scored with f2_scorer
# on 3-fold cross-validation.
# Left disabled, as in the original cell:
#   min_resources=500, aggressive_elimination=True
searcher = HalvingGridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    scoring=f2_scorer,
    cv=3,
    factor=3,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbose=2,
)
# fit() returns the searcher itself; re-binding keeps the cell free of a repr output.
searcher = searcher.fit(X_train, y_train)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 58
max_resources_: 14324
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 308
n_resources: 58
Fitting 3 folds for each of 308 candidates, totalling 924 fits


In [None]:
# Plot the mean CV score of every candidate across the iterations of the search above.
plot_search_results(searcher)

In [None]:
# Keep the estimator selected by the search; the bare name displays it as the cell output.
best = searcher.best_estimator_
best

In [None]:
# Predict labels for X_test with the selected estimator.
y_test_pred = best.predict(X_test)

In [None]:
# F2 score of the predictions against y_test.
# NOTE(review): if this number is used to decide the next phase's grid, the
# test split no longer gives an unbiased estimate — confirm how it is used.
f2_score(y_test, y_test_pred)

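## Phase 2

Search over `n_estimators` and `max_depth` only; `learning_rate` is no longer part of the grid and keeps the value set on the base estimator.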

In [None]:
# Phase 2 search space: learning_rate is no longer searched
# (it was [0.001, 0.01, 0.1, 1] in Phase 1), so the value on `gb` applies.
# The "=" notes are carried over from the original cell.
param_grid = dict(
    n_estimators=[5, 10, 20, 50, 100, 200, 500],   # =100
    max_depth=np.arange(3, 34, 3, dtype=np.int64), # =3; depths 3, 6, ..., 33
)
param_grid

In [None]:
# Phase 2 halving grid search over `param_grid`, scored with f2_scorer
# on 3-fold cross-validation.
# Left disabled, as in the original cell:
#   min_resources=500, aggressive_elimination=True
searcher = HalvingGridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    scoring=f2_scorer,
    cv=3,
    factor=3,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbose=2,
)
# fit() returns the searcher itself; re-binding keeps the cell free of a repr output.
searcher = searcher.fit(X_train, y_train)

In [None]:
# Plot the mean CV score of every candidate across the iterations of the search above.
plot_search_results(searcher)

In [None]:
# Keep the estimator selected by the search; the bare name displays it as the cell output.
best = searcher.best_estimator_
best

In [None]:
# Predict labels for X_test with the selected estimator.
y_test_pred = best.predict(X_test)

In [None]:
# F2 score of the predictions against y_test.
f2_score(y_test, y_test_pred)

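## Phase 3

Search over `n_estimators` only; `learning_rate` and `max_depth` are no longer part of the grid and keep the values set on the base estimator.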

In [None]:
# Phase 3 search space: only n_estimators is searched. learning_rate and
# max_depth (grids [0.001, 0.01, 0.1, 1] and 3..33 step 3 in earlier phases)
# are left out, so the values on `gb` apply.
# The "=" note is carried over from the original cell.
param_grid = dict(
    n_estimators=[5, 10, 20, 50, 100, 200, 500],   # =100
)
param_grid

In [None]:
# Phase 3 halving grid search over `param_grid`, scored with f2_scorer
# on 3-fold cross-validation.
# Left disabled, as in the original cell:
#   min_resources=500, aggressive_elimination=True
searcher = HalvingGridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    scoring=f2_scorer,
    cv=3,
    factor=3,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbose=2,
)
# fit() returns the searcher itself; re-binding keeps the cell free of a repr output.
searcher = searcher.fit(X_train, y_train)

In [None]:
# Plot the mean CV score of every candidate across the iterations of the search above.
plot_search_results(searcher)

In [None]:
# Keep the estimator selected by the search; the bare name displays it as the cell output.
best = searcher.best_estimator_
best

In [None]:
# Predict labels for X_test with the selected estimator.
y_test_pred = best.predict(X_test)

In [None]:
# F2 score of the predictions against y_test.
f2_score(y_test, y_test_pred)

In [None]:
# Fit the selected estimator on the full training split.
# NOTE(review): the searcher above was built without a `refit=` argument; with
# scikit-learn's default (refit=True) `best_estimator_` has already been fitted
# on all of X_train, so this extra fit looks redundant — confirm before removing.
best.fit(X_train, y_train)

In [None]:
# Predict labels for X_test again, now with the estimator fitted in the previous cell.
y_test_pred = best.predict(X_test)

In [None]:
# F2 score of the re-fitted estimator's predictions against y_test.
f2_score(y_test, y_test_pred)