In [None]:


import seaborn as sn
from dotenv import load_dotenv

from load_data import loading_and_pre_processing_pipeline

# Load variables from a .env file into the environment before the pipeline runs
# (presumably the pipeline reads its configuration from there — confirm in load_data).
load_dotenv()

feature_data = (
    loading_and_pre_processing_pipeline()
    .sort_values("user_id")
    # whole days between a record's `date` and its `next_date`
    .assign(
        date_diff=lambda frame: (frame['next_date'] - frame['date']).apply(lambda delta: delta.days)
    )
    .sort_values('date')
    # keep only records whose gap to the next date is at most 30 days
    .loc[lambda frame: frame['date_diff'] <= 30]
)

# Short summary of what survived the filter: row count, distinct users, and
# how many rows have no missing value in any column.
print(
    f'{len(feature_data)} records from {len(set(feature_data["user_id"].values))} users present. '
)
print(f"{len(feature_data.dropna(axis=0))} of which are complete.")
feature_data.head()

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import auc, precision_recall_curve, roc_auc_score
from autogluon.tabular import TabularPredictor
from imblearn.under_sampling import RandomUnderSampler
from log_reg_model import create_model
import pandas as pd
from tqdm.notebook import tqdm

SPLITS = 10
# Seed for the random under-sampling, so that re-running the cell draws the
# same balanced samples and the reported metrics are reproducible.
RANDOM_STATE = 42

TARGET = 'test_result'
# Labeled rows with every column kept: the per-split dates below are read from
# here, because `labeled_data` no longer carries the `date` column.
labeled_rows = feature_data[feature_data[TARGET].notna()]
labeled_data = labeled_rows.drop(columns=['next_test_result', 'test_week_start', 'next_week', 'next_date', 'date', 'user_id'])
features = labeled_data.drop(columns=TARGET).columns

# time series split for training and testing
# NOTE(review): the folds are only chronological if `feature_data` is sorted by date.
tsp = TimeSeriesSplit(n_splits=SPLITS)

# Per-split results; one entry is appended to each list per fold.
lr_pr_auc = []
gluon_pr_auc = []
lr_roc_auc = []
gluon_roc_auc = []
split_date = []
test_data_start = []
train_data_end = []


def _positive_class_probability(model, data):
    """Return the predicted probability of the positive class for `data`.

    `predict_proba` is called once. A DataFrame result (the AutoGluon case in
    this notebook) is read from its `True` column; any other result is treated
    as a 2-D array whose second column holds the positive class.
    """
    proba = model.predict_proba(data[features])
    if isinstance(proba, pd.DataFrame):
        return proba[True].values
    return proba[:, 1]


def calculate_precision_recall_area_under_curve(model, data, target):
    """Area under the precision-recall curve of `model` on `data`.

    `data[target]` is cast to bool and used as the ground truth.
    """
    proba = _positive_class_probability(model, data)
    precision, recall, thresholds = precision_recall_curve(data[target].astype(bool), proba)
    return auc(recall, precision)


def calculate_roc_area_under_curve(model, data, target):
    """Area under the ROC curve of `model` on `data`.

    `data[target]` is cast to bool and used as the ground truth.
    """
    proba = _positive_class_probability(model, data)
    return roc_auc_score(data[target].astype(bool), proba)


for train_index, test_index in tqdm(tsp.split(labeled_data), total=SPLITS):

    # save the start and end date of the train and test data
    # (`split_date` and `train_data_end` both hold the last training date)
    train_end_date = labeled_rows.iloc[train_index]['date'].max()
    split_date.append(train_end_date)
    test_data_start.append(labeled_rows.iloc[test_index]['date'].min())
    train_data_end.append(train_end_date)

    # train and test data
    train_data = labeled_data.iloc[train_index]
    test_data = labeled_data.iloc[test_index]

    # under sampling to balance training data; the full row (target included)
    # is resampled so the frame can be rebuilt with its original columns
    X_train_resampled, y_train_resampled = RandomUnderSampler(random_state=RANDOM_STATE).fit_resample(
        train_data.values, train_data[TARGET].astype(bool).values
    )
    train_data_resampled = pd.DataFrame(X_train_resampled, columns=train_data.columns)

    # under sampling to balance test data
    X_resampled, y_resampled = RandomUnderSampler(random_state=RANDOM_STATE).fit_resample(
        test_data.values, test_data[TARGET].astype(bool).values
    )
    test_data_resampled = pd.DataFrame(X_resampled, columns=test_data.columns)

    # training the logistic regression model
    # (fitted on the full training fold, not on the under-sampled one)
    lr_model = create_model()
    lr_model.fit(train_data[features], train_data[TARGET].astype(bool))

    # training the gluon model on the under-sampled training fold
    gluon_predictor = TabularPredictor(label=TARGET, eval_metric='balanced_accuracy', sample_weight='auto_weight').fit(train_data_resampled)

    # both models are scored on the same under-sampled test fold
    lr_pr_auc.append(calculate_precision_recall_area_under_curve(lr_model, test_data_resampled, TARGET))
    gluon_pr_auc.append(calculate_precision_recall_area_under_curve(gluon_predictor, test_data_resampled, TARGET))

    lr_roc_auc.append(calculate_roc_area_under_curve(lr_model, test_data_resampled, TARGET))
    gluon_roc_auc.append(calculate_roc_area_under_curve(gluon_predictor, test_data_resampled, TARGET))

In [None]:
# plot the auc curve for both models over time series splits
import matplotlib.pyplot as plt

# One row per time-series split, indexed by the last training date of that split.
pr_auc_by_split = pd.DataFrame(
    {'Logistic regression': lr_pr_auc, 'Autogluon best model': gluon_pr_auc, 'date': split_date}
).set_index('date')

fig, ax = plt.subplots(figsize=(10, 5))
pr_auc_by_split.plot(ax=ax, kind='line')
ax.set_title("Precision recall AUC for Logistic regression and Autogluon best model")

Judging by the precision-recall AUC across the time-series splits plotted above, the AutoGluon best model appears to perform better than the logistic regression model.

In [None]:
# One row per time-series split, indexed by the last training date of that split.
roc_auc_by_split = pd.DataFrame(
    {'Logistic regression': lr_roc_auc, 'Autogluon best model': gluon_roc_auc, 'date': split_date}
).set_index('date')

fig, ax = plt.subplots(figsize=(10, 5))
roc_auc_by_split.plot(ax=ax, kind='line')
ax.set_title("ROC AUC for Logistic regression and Autogluon best model")

In [None]:
# Feature importance as reported by AutoGluon. `gluon_predictor` and
# `test_data_resampled` are whatever the evaluation loop left behind, i.e. the
# predictor and the under-sampled test fold of the last time-series split only.
gluon_predictor.feature_importance(test_data_resampled)

In [None]:
# AutoGluon leaderboard for the last split's predictor, evaluated on that
# split's under-sampled test fold, with ROC AUC requested as an extra metric.
# NOTE(review): `silent=True` presumably suppresses the printed table so only
# the returned frame is displayed — confirm against the AutoGluon docs.
gluon_predictor.leaderboard(test_data_resampled, extra_metrics=['roc_auc'], silent=True)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List
from sklearn.inspection import permutation_importance

# calculate the confusion matrix and plot it

def plot_confusion_matrix_on_axis(ax, y_true, y_pred):
    """Draw the confusion matrix of `y_pred` against `y_true` on `ax`.

    Rows are true labels and columns are predicted labels. Each cell is
    annotated with its count, in white on dark cells and black on light ones.

    Args:
        ax: matplotlib axis to draw on.
        y_true: ground-truth labels.
        y_pred: predicted labels.
    """
    from sklearn.metrics import confusion_matrix
    from sklearn.utils.multiclass import unique_labels

    # One sorted label list drives both the matrix and the ticks, so the tick
    # text always lines up with the rows/columns — also when a class occurs
    # only in the predictions.
    labels = unique_labels(y_true, y_pred)
    tick_text = [str(label) for label in labels]

    cm = confusion_matrix(y_true, y_pred, labels=labels)
    ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)

    tick_marks = np.arange(len(labels))
    ax.set_xticks(tick_marks, tick_text, rotation=45)
    ax.set_yticks(tick_marks, tick_text)

    # Switch the text colour at half of the largest count to keep it readable.
    threshold = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, cm[i, j],
                    horizontalalignment="center",
                    color="white" if cm[i, j] > threshold else "black")

    # Explicitly turn the grid off: `grid(visible=None)` toggles it instead.
    ax.grid(False)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    # Lay out the figure this axis belongs to, not whichever figure is current.
    ax.figure.tight_layout()

# plot roc curve on axis

def plot_roc_curve_on_ax(ax, y_true, y_pred):
    """Draw the ROC curve of the scores `y_pred` against `y_true` on `ax`.

    The legend entry carries the area under the curve, and a dashed diagonal
    is drawn from (0, 0) to (1, 1) for reference.
    """
    from sklearn.metrics import roc_curve, auc

    false_positive_rate, true_positive_rate, _ = roc_curve(y_true, y_pred)
    area = auc(false_positive_rate, true_positive_rate)

    # model curve first, then the reference diagonal
    ax.plot(false_positive_rate, true_positive_rate, label='ROC curve (AUC = %0.2f)' % area)
    ax.plot([0, 1], [0, 1], 'k--')

    ax.set_title('Receiver operating characteristic example')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.legend(loc="lower right")

# plot precision recall curve on axis

def plot_precision_recall_curve_on_ax(ax, y_true, y_pred):
    """Draw the precision-recall curve of the scores `y_pred` against `y_true` on `ax`.

    Recall is on the x-axis and precision on the y-axis; the legend entry
    carries the area under the curve.
    """
    from sklearn.metrics import precision_recall_curve, auc

    precision_values, recall_values, _ = precision_recall_curve(y_true, y_pred)
    area = auc(recall_values, precision_values)

    ax.plot(recall_values, precision_values, label='Precision-Recall curve (AUC = %0.2f)' % area)

    ax.set_title('Precision-Recall example')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.legend(loc="lower left")

# plot feature importance on axis

def plot_permutation_feature_importance(ax, target, feature_names, test_data, classifier):
    """Draw a per-feature importance box plot for `classifier` on `ax`.

    scikit-learn's `permutation_importance` (2 repeats, scored by ROC AUC) is
    tried first, and the features are ordered by decreasing mean importance.
    If that call fails and the classifier offers its own `feature_importance`
    method (the AutoGluon predictor in this notebook), that method's result is
    plotted instead.

    Args:
        ax: matplotlib axis to draw on.
        target: name of the label column in `test_data`; it is cast to bool.
        feature_names: columns of `test_data` used as model input.
        test_data: frame holding both the features and the target.
        classifier: fitted model.

    Raises:
        Exception: whatever `permutation_importance` raised, when the
            classifier has no `feature_importance` method to fall back on.
    """
    y_test = test_data[target].astype(bool).values
    X_test = test_data[feature_names]

    # Only the permutation call itself is guarded. Errors while building or
    # drawing the plot are real bugs and must surface rather than silently
    # switch to the fallback.
    try:
        importance = permutation_importance(classifier, X_test, y_test, n_repeats=2, scoring="roc_auc")
    except Exception:
        if not hasattr(classifier, "feature_importance"):
            raise
        importance = None

    if importance is not None:
        decreasing_importance = (
            pd.DataFrame({"mean": importance["importances_mean"], "label": feature_names})
            .sort_values("mean", ascending=False)
            .label.values
        )

        # Long format: one row per (repeat, feature) pair.
        df = (
            pd.DataFrame(
                columns=pd.Index(data=feature_names, name="features"),
                data=importance["importances"].T,
            )
            .stack("features")
            .reset_index()
        )
        df.columns = ["iter", "features", "importance"]
        sns.boxplot(
            y="features",
            x="importance",
            data=df,
            order=decreasing_importance,
            ax=ax,
        )
        return

    # Fallback: the classifier's own importance table, with the feature names
    # moved from the index into a `features` column.
    gluon_feature_importance = classifier.feature_importance(test_data)
    columns = ['features'] + gluon_feature_importance.columns.tolist()
    df = gluon_feature_importance.reset_index()
    df.columns = columns
    sns.boxplot(
        y="features",
        x="importance",
        data=df,
        ax=ax,
    )



# plot the confusion matrix and roc curve for the autogluon best model
def plot_analysis(axes: List, target: str, features: List[str], test_data: pd.DataFrame, predictor):
    """Draw the four diagnostic plots for one fitted model.

    The plots are, in order: confusion matrix, ROC curve, precision-recall
    curve and feature importance.

    Args:
        axes: at least four matplotlib axes, filled in the order given above.
        target: name of the label column in `test_data`; it is cast to bool.
        features: columns of `test_data` used as model input.
        test_data: frame holding both the features and the target.
        predictor: fitted model exposing `predict` and `predict_proba`.
    """
    X_test = test_data[features]
    y_pred = predictor.predict(X_test)

    # Predict once and dispatch on the result type instead of on an exception.
    # A DataFrame (the AutoGluon case in this notebook) is read from its `True`
    # column; anything else is treated as a 2-D array whose second column is
    # the positive class.
    proba = predictor.predict_proba(X_test)
    if isinstance(proba, pd.DataFrame):
        y_proba = proba[True].values
    else:
        y_proba = proba[:, 1]

    y_true = test_data[target].astype(bool)
    plot_confusion_matrix_on_axis(axes[0], y_true, y_pred)
    plot_roc_curve_on_ax(axes[1], y_true, y_proba)
    plot_precision_recall_curve_on_ax(axes[2], y_true, y_proba)
    plot_permutation_feature_importance(axes[3], target, features, test_data, predictor)

# Side-by-side diagnostics for the models left over from the last time-series
# split: AutoGluon in the left column, logistic regression in the right one.
fig, axes = plt.subplots(4, 2, figsize=(10, 20))

lr_axes = axes[:,1]
plot_analysis(lr_axes, TARGET, features, test_data_resampled, lr_model)
# The top-row title doubles as the column header, so the reader can tell the
# two models apart.
lr_axes[0].set_title('Logistic regression')

gluon_axes = axes[:,0]
plot_analysis(gluon_axes, TARGET, features, test_data_resampled, gluon_predictor)
gluon_axes[0].set_title('Autogluon best model')

plt.tight_layout()