# Part03 Algorithm Implementation, Model Training, and Performance Measurement

In [1]:
# import modules

from typing import List, Dict, Union
import time

import joblib
import pandas as pd
import numpy as np
import yaml

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

from src.classes.NpData import NpData
from src.commons.Utils import impute_scale_and_convert_to_numpy

In [2]:
# Read the interim CSV files for all three splits.
# ohe_*   : feature tables (presumably one-hot encoded, going by the name)
# churn_* : the matching churn target tables

# training split
ohe_train: pd.DataFrame = pd.read_csv("../../dataset/01_interim/ohe_train.csv")
churn_train: pd.DataFrame = pd.read_csv("../../dataset/01_interim/churn_train.csv")

# validation split
ohe_val: pd.DataFrame = pd.read_csv("../../dataset/01_interim/ohe_val.csv")
churn_val: pd.DataFrame = pd.read_csv("../../dataset/01_interim/churn_val.csv")

# test split
ohe_test: pd.DataFrame = pd.read_csv("../../dataset/01_interim/ohe_test.csv")
churn_test: pd.DataFrame = pd.read_csv("../../dataset/01_interim/churn_test.csv")

In [3]:
# Parse the project configuration with the safe YAML loader, then pick out
# the value used to impute missing TotalCharges entries.
with open("../../config.yaml", "r") as config_file:
    stream_loader = yaml.load(config_file, yaml.SafeLoader)

mean_total_charges: float = stream_loader["MEAN_TOTAL_CHARGES"]

In [4]:
# Folder handed to the preprocessing helper as `scaler_folder`.
scaler_folder: str = "../../models/scaler"


def _prepare_split(ohe_df: pd.DataFrame, churn_df: pd.DataFrame):
    """Run the shared impute / scale / to-NumPy step on one split.

    Every split uses the same settings: missing `TotalCharges` values are
    imputed with `mean_total_charges` and the scaler location is
    `scaler_folder`. The helper's result is returned unchanged; callers
    unpack it as `(features, churn)`.
    """
    return impute_scale_and_convert_to_numpy(
        ohe_df=ohe_df,
        columns_with_nulls=["TotalCharges"],
        impute_val=[mean_total_charges],
        scaler_folder=scaler_folder,
        churn_df=churn_df,
    )


# Declared up front so the array types are visible to readers and type checkers.
feature_train_np: np.ndarray
churn_train_np: np.ndarray
feature_val_np: np.ndarray
churn_val_np: np.ndarray
feature_test_np: np.ndarray
churn_test_np: np.ndarray

# Splits are processed in the order train -> validation -> test.
feature_train_np, churn_train_np = _prepare_split(ohe_train, churn_train)
feature_val_np, churn_val_np = _prepare_split(ohe_val, churn_val)
feature_test_np, churn_test_np = _prepare_split(ohe_test, churn_test)

So far we have:
 (1) imported all necessary modules,
 (2) loaded the train, validation, and test datasets,
 (3) imputed, scaled, and converted the datasets to NumPy arrays.

Next, we apply PCA to every feature_* array so that we get a denser representation instead of a sparse matrix.

In [5]:
# Restore the already-trained PCA object from disk.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
pca: PCA = joblib.load("../../models/feature_pca.pkl")

# Project each split with the same PCA object.
feature_train_pca: np.ndarray
feature_val_pca: np.ndarray
feature_test_pca: np.ndarray
feature_train_pca, feature_val_pca, feature_test_pca = (
    pca.transform(split_features)
    for split_features in (feature_train_np, feature_val_np, feature_test_np)
)

In [6]:
# Instantiate the candidate models.

# Seed shared by the randomised estimators so that repeated runs of the notebook
# (Restart Kernel -> Run All) reproduce the same scores.
RANDOM_STATE: int = 42

# LogisticRegression is one of the simplest models for a binary classification task
# such as churn prediction. We use it as the baseline against which the other
# models are compared.
lr_classifier: LogisticRegression = LogisticRegression()

# RandomForestClassifier is a well-known ensemble algorithm for multi-class
# classification; it handles binary classification as well.
# It is built from decision trees as simple learners and classifies by taking a
# "vote" across all trained trees.
# Tree construction is randomised, hence the fixed random_state.
rf_classifier: RandomForestClassifier = RandomForestClassifier(random_state=RANDOM_STATE)

# GradientBoostingClassifier is another tree-based ensemble algorithm.
# Here the trees are arranged in sequence: each tree tries to correct the errors
# made by the learners that came before it.
# It also has randomised steps, hence the fixed random_state.
gb_classifier: GradientBoostingClassifier = GradientBoostingClassifier(random_state=RANDOM_STATE)

# Finally, the Support Vector Machine classifier (SVC) is another algorithm for
# classification tasks. It looks for a hyperplane that separates the classes,
# implicitly mapping the data points into a higher-dimensional space via a kernel.
sv_classifier: SVC = SVC()


# Apart from random_state, every model starts with its default hyperparameters.

In [7]:
# Wrap each candidate model together with its evaluation metrics, then collect all candidates in a dictionary
class ModelCandidate:
    """Hold one candidate classifier together with the metrics recorded for it.

    The instance starts with neutral placeholder metrics (zeros). The
    `calculate_*` and `log_*` methods overwrite those placeholders; none of
    them returns a value.
    """

    def __init__(
            self,
            classification_model: Union[LogisticRegression, RandomForestClassifier, GradientBoostingClassifier, SVC],
            n_class: int
    ) -> None:
        """Store the model and initialise all metrics to zero.

        Args:
            classification_model: the estimator this candidate wraps.
            n_class: number of target classes; sizes the placeholder
                confusion matrix as (n_class, n_class).
        """
        self.classification_model: Union[LogisticRegression, RandomForestClassifier, GradientBoostingClassifier, SVC] = classification_model
        # Zero-filled on purpose: np.empty would leave arbitrary uninitialised
        # values in the matrix until calculate_confusion_matrix() is called.
        self.confusion_matrix: np.ndarray = np.zeros(shape=(n_class, n_class), dtype=int)
        self.accuracy_score: float = 0.0
        self.f1_score: float = 0.0
        self.training_time: float = 0.0

    def save_trained_model(
            self,
            trained_model: Union[LogisticRegression, RandomForestClassifier, GradientBoostingClassifier, SVC]
    ) -> None:
        """Replace the stored model with `trained_model`."""
        self.classification_model = trained_model

    def log_training_time(
            self,
            training_time: float
    ) -> None:
        """Record `training_time` on the instance."""
        self.training_time = training_time

    def calculate_accuracy(
            self,
            y_true: np.ndarray,
            y_pred: np.ndarray
    ) -> None:
        """Compute accuracy for `y_pred` against `y_true` and store it in `accuracy_score`."""
        self.accuracy_score = accuracy_score(
            y_true=y_true,
            y_pred=y_pred
        )

    def calculate_confusion_matrix(
            self,
            y_true: np.ndarray,
            y_pred: np.ndarray
    ) -> None:
        """Compute the confusion matrix for `y_pred` against `y_true` and store it in `confusion_matrix`."""
        self.confusion_matrix = confusion_matrix(
            y_true=y_true,
            y_pred=y_pred
        )

    def calculate_f1_score(
            self,
            y_true: np.ndarray,
            y_pred: np.ndarray
    ) -> None:
        """Compute the F1 score (f1_score called with its default settings) and store it in `f1_score`."""
        self.f1_score = f1_score(
            y_true=y_true,
            y_pred=y_pred
        )
        


# Number of distinct values in the training target; identical for every candidate,
# so it is computed once.
_n_class: int = len(np.unique(churn_train_np))

# Candidates keyed by a short model name, in the order lr -> rf -> gb -> sv.
classifier_candidates: Dict[str, ModelCandidate] = {
    model_name: ModelCandidate(classification_model=model, n_class=_n_class)
    for model_name, model in (
        ("lr_classifier", lr_classifier),
        ("rf_classifier", rf_classifier),
        ("gb_classifier", gb_classifier),
        ("sv_classifier", sv_classifier),
    )
}

# Fit every candidate on the PCA-transformed training split, record how long the
# fit took, then score the fitted model on the validation split.

# Flattened validation target, shared by all three metric calls of every candidate.
y_true_val: np.ndarray = churn_val_np.ravel()

for each_candidate in classifier_candidates.values():
    _classifier_model = each_candidate.classification_model

    # time.perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() is wall-clock time and may jump.
    _tic: float = time.perf_counter()
    _classifier_model.fit(
        X=feature_train_pca,
        y=churn_train_np.ravel()
    )
    # Stop the clock straight after fit() so only training itself is timed.
    _toc: float = time.perf_counter()

    each_candidate.save_trained_model(trained_model=_classifier_model)

    training_time: float = _toc - _tic
    each_candidate.log_training_time(training_time=training_time)

    # Validation-set predictions feed all three metrics below.
    y_pred: np.ndarray = _classifier_model.predict(X=feature_val_pca)

    each_candidate.calculate_accuracy(
        y_true=y_true_val,
        y_pred=y_pred
    )

    each_candidate.calculate_f1_score(
        y_true=y_true_val,
        y_pred=y_pred
    )

    each_candidate.calculate_confusion_matrix(
        y_true=y_true_val,
        y_pred=y_pred
    )
    

In [8]:

# Print training time, accuracy and F1 score for every candidate,
# followed by a separator line.
for each_key, each_candidate in classifier_candidates.items():
    print(
        f"{each_key} training time is {each_candidate.training_time} seconds",
        f"{each_key} accuracy is {each_candidate.accuracy_score}",
        f"{each_key} f1_score is {each_candidate.f1_score}",
        "---"*10,
        sep="\n",
    )

lr_classifier training time is 0.008517742156982422 seconds
lr_classifier accuracy is 0.7420078519349411
lr_classifier f1_score is 0.6904441453566622
------------------------------
rf_classifier training time is 0.8806240558624268 seconds
rf_classifier accuracy is 0.807627593942793
rf_classifier f1_score is 0.788140827671402
------------------------------
gb_classifier training time is 0.8179631233215332 seconds
gb_classifier accuracy is 0.7453729669097028
gb_classifier f1_score is 0.6997354497354497
------------------------------
sv_classifier training time is 0.7209358215332031 seconds
sv_classifier accuracy is 0.7352776219854178
sv_classifier f1_score is 0.6890645586297759
------------------------------
