In [1]:
from pathlib import Path
import pandas as pd

pd.set_option("display.max_columns", None)  # Display all columns
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("ggplot")
import numpy as np
from sklearn.linear_model import LogisticRegression
from dataclasses import dataclass
from sklearn.metrics import f1_score, auc, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler
from collections import Counter
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler

In [2]:
save_data_dir = Path(Path.cwd().parent, "data", "interim")
X_train = pd.read_pickle(Path(save_data_dir, "X_train.pkl"))
y_train = pd.read_pickle(Path(save_data_dir, "y_train.pkl"))
X_test = pd.read_pickle(Path(save_data_dir, "X_test.pkl"))
y_test = pd.read_pickle(Path(save_data_dir, "y_test.pkl"))

In [3]:
print(X_train.shape, y_train.shape)

(80000, 115) (80000,)


# Initial Thoughts
## Potential models
This is a binary classification task, and generally simpler models are best to start with. Therefore I will try logistic regression and a boosted decision tree initially. 

## Metrics
This is an imbalanced dataset, so useful metrics will be F1 score and ROC/AUC, as well as precision and recall. 

In [4]:
class MLModel:
    """Class to hold info about different experiment metrics"""

    def __init__(self, name, clf, X_train, y_train):
        self.name = name
        self.clf = clf
        self.X_train = X_train
        self.y_train = y_train

    def calc_f1_score(self):
        self.f1 = np.mean(
            cross_val_score(self.clf, self.X_train, self.y_train, cv=5, scoring="f1")
        )

    def calc_precision(self):
        self.precision = np.mean(
            cross_val_score(
                self.clf, self.X_train, self.y_train, cv=5, scoring="precision"
            )
        )

    def calc_recall(self):
        self.recall = np.mean(
            cross_val_score(
                self.clf, self.X_train, self.y_train, cv=5, scoring="recall"
            )
        )

    def calc_auc(self):
        self.auc = np.mean(
            cross_val_score(
                self.clf, self.X_train, self.y_train, cv=5, scoring="roc_auc"
            )
        )

    def get_hyperparameters(self):
        self.hyperparameters = self.clf.get_params()

    def get_probabilities(self):
        self.probabilities = (
            self.clf.fit(self.X_train, self.y_train)
            .predict_proba(X_train)
            .argmax(axis=1)
        )

    def run_all(self):
        self.calc_f1_score()
        self.calc_precision()
        self.calc_recall()
        self.calc_auc()
        self.get_hyperparameters()
        self.get_probabilities()

In [5]:
models_dict = {}

# Logistic Regeression
## No tuning, default params

In [6]:
clf = LogisticRegression(max_iter=1000)

LogRegDefault = MLModel(
    name="Logistic Regression default params", clf=clf, X_train=X_train, y_train=y_train
)
LogRegDefault.run_all()
models_dict[LogRegDefault.name] = LogRegDefault

In [7]:
LogRegDefault.f1

0.19219009167682788

# Sampling. 

Since our F1 score is quite low I think hyperparameter tuning at this stage is pointless, as we would only expect a 5 - 10% increase in performance. 
At this point I'll try Under, over, and SMOTE sampling and check the results. 

## Under Sampling

In [8]:
cc = ClusterCentroids(random_state=0)
X_undersampled, y_undersampled = cc.fit_resample(X_train, y_train)

  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
print(sorted(Counter(y_train).items()))
print(sorted(Counter(y_undersampled).items()))

[(0, 79464), (1, 536)]
[(0, 536), (1, 536)]


In [10]:
LogRegDefaultUnderSampled = MLModel(
    name="Logistic Regression default params - undersampled",
    clf=clf,
    X_train=X_undersampled,
    y_train=y_undersampled,
)
LogRegDefaultUnderSampled.run_all()
models_dict[LogRegDefaultUnderSampled.name] = LogRegDefaultUnderSampled

In [11]:
LogRegDefaultUnderSampled.f1

0.9340085060100731

## Oversampling

In [12]:
ros = RandomOverSampler(random_state=0)
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

In [13]:
print(sorted(Counter(y_train).items()))
print(sorted(Counter(y_oversampled).items()))

[(0, 79464), (1, 536)]
[(0, 79464), (1, 79464)]


In [14]:
LogRegDefaultOverSampled = MLModel(
    name="Logistic Regression default params - oversampled",
    clf=clf,
    X_train=X_oversampled,
    y_train=y_oversampled,
)
LogRegDefaultOverSampled.run_all()
models_dict[LogRegDefaultOverSampled.name] = LogRegDefaultOverSampled

In [15]:
LogRegDefaultOverSampled.f1

0.8943985767606243

## SMOTE

In [16]:
smote_enn = SMOTEENN(random_state=0)
X_SMOTE, y_SMOTE = smote_enn.fit_resample(X_train, y_train)

In [17]:
print(sorted(Counter(y_train).items()))
print(sorted(Counter(y_oversampled).items()))

[(0, 79464), (1, 536)]
[(0, 79464), (1, 79464)]


In [18]:
LogRegDefaultSMOTE = MLModel(
    name="Logistic Regression default params - oversampled",
    clf=clf,
    X_train=X_SMOTE,
    y_train=y_SMOTE,
)
LogRegDefaultSMOTE.run_all()
models_dict[LogRegDefaultSMOTE.name] = LogRegDefaultSMOTE

In [19]:
LogRegDefaultOverSampled.f1

0.8943985767606243