# Part 04 - Hyperparameter Tuning

In [30]:
# import modules

from typing import Dict, List, Union
import time

import joblib
import pandas as pd
import numpy as np
import yaml
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

from src.commons.Utils import impute_scale_and_convert_to_numpy

In [31]:
ohe_train: pd.DataFrame
ohe_val: pd.DataFrame
ohe_test: pd.DataFrame

churn_train: pd.DataFrame
churn_val: pd.DataFrame
churn_test: pd.DataFrame

ohe_train = pd.read_csv(
    filepath_or_buffer="../../dataset/01_interim/ohe_train.csv"
)

churn_train = pd.read_csv(
    filepath_or_buffer="../../dataset/01_interim/churn_train.csv"
)

ohe_val = pd.read_csv(
    filepath_or_buffer="../../dataset/01_interim/ohe_val.csv"
)

churn_val = pd.read_csv(
    filepath_or_buffer="../../dataset/01_interim/churn_val.csv"
)

ohe_test = pd.read_csv(
    filepath_or_buffer="../../dataset/01_interim/ohe_test.csv"
)

churn_test = pd.read_csv(
    filepath_or_buffer="../../dataset/01_interim/churn_test.csv"
)

In [32]:
with open(file="../../config.yaml", mode="r") as file_stream: 
    yml = yaml.load(
        stream=file_stream,
        Loader=yaml.SafeLoader
    )
    mean_total_charges: float = yml["MEAN_TOTAL_CHARGES"]

In [33]:
scaler_folder: str = "../../models/scaler"

feature_train_np: np.ndarray 
churn_train_np: np.ndarray

feature_val_np: np.ndarray
churn_val_np: np.ndarray

feature_test_np: np.ndarray
churn_test_np: np.ndarray

feature_train_np, churn_train_np = impute_scale_and_convert_to_numpy(
    ohe_df=ohe_train,
    columns_with_nulls=["TotalCharges"],
    impute_val=[mean_total_charges],
    scaler_folder=scaler_folder,
    churn_df=churn_train
)

feature_val_np, churn_val_np = impute_scale_and_convert_to_numpy(
    ohe_df=ohe_val,
    columns_with_nulls=["TotalCharges"],
    impute_val=[mean_total_charges],
    scaler_folder=scaler_folder,
    churn_df=churn_val    
)

feature_test_np, churn_test_np = impute_scale_and_convert_to_numpy(
    ohe_df=ohe_test,
    columns_with_nulls=["TotalCharges"],
    impute_val=[mean_total_charges],
    scaler_folder=scaler_folder,
    churn_df=churn_test    
)


In [34]:
# Load Trained PCA
pca: PCA = joblib.load(
    filename="../../models/feature_pca.pkl"
)

# Convert feature_np to feature_pca
feature_train_pca: np.ndarray = pca.transform(X=feature_train_np)
feature_val_pca: np.ndarray = pca.transform(X=feature_val_np)
feature_test_pca: np.ndarray = pca.transform(X=feature_test_np)

In [35]:
# Load trained GradientBoostingClassifier
gb_classifier: GradientBoostingClassifier = joblib.load(
    filename="../../models/gb_classifier.pkl"
)

# Define parameter grid
param_grid: Dict[str, List] = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.1, 0.2, 0.3],
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 5, 10]
}

# Create GridSearchCV object
grid_search: GridSearchCV = GridSearchCV(
    estimator=gb_classifier,
    param_grid=param_grid,
    scoring="accuracy",
    verbose=1
)

# Measure the time taken for searching the grid
_tic: float = time.time()

grid_search.fit(
    X=feature_train_pca,
    y=churn_train_np.ravel()
)

_toc: float = time.time()

search_period: float = _toc - _tic

# Get the best parameters
best_params: Dict[str, Union[int, float]] = grid_search.best_params_

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [37]:
# Evaluate the best model
best_gb: GradientBoostingClassifier = GradientBoostingClassifier(**best_params)
best_gb.fit(X=feature_train_pca, y=churn_train_np.ravel())

joblib.dump(
    value=best_gb,
    filename="../../models/gb_classifier_best.pkl"
)

y_pred_train: np.ndarray = best_gb.predict(X=feature_train_pca)
y_pred_val: np.ndarray = best_gb.predict(X=feature_val_pca)
y_pred_test: np.ndarray = best_gb.predict(X=feature_test_pca)

accuracy_train: float = accuracy_score(
    y_true=churn_train_np.ravel(),
    y_pred=y_pred_train
)

accuracy_val: float = accuracy_score(
    y_true=churn_val_np.ravel(),
    y_pred=y_pred_val
)

accuracy_test: float = accuracy_score(
    y_true=churn_test_np.ravel(),
    y_pred=y_pred_test
)

print("Best GradientBoostingClassifier model")
print(f"search period {search_period} seconds")
print("----"*10)
print(f"Accuracy Score train {accuracy_train}")
print(f"Accuracy Score val {accuracy_val}")
print(f"Accuracy Score test {accuracy_test}")

Best GradientBoostingClassifier model
search period 776.6343929767609 seconds
----------------------------------------
Accuracy Score train 0.9942012719790497
Accuracy Score val 0.8126752664049355
Accuracy Score test 0.7986539540100953


In [38]:
display(best_params)

{'learning_rate': 0.2,
 'max_depth': 7,
 'min_samples_split': 2,
 'n_estimators': 200}