In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import average_precision_score, roc_auc_score, confusion_matrix, classification_report

# Load Dataset

In [12]:
X_train = pd.read_csv('../data/X_train_resampled.csv')
y_train = pd.read_csv('../data/y_train_resampled.csv')
X_val = pd.read_csv('../data/X_val.csv')
y_val = pd.read_csv('../data/y_val.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_test = pd.read_csv('../data/y_test.csv')

# Baseline

In [13]:
# Initialize Logistic Regression model
logreg = LogisticRegression(
    max_iter=5000,
    random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [14]:
# Train (fit) the model
logreg.fit(X_train_scaled, y_train.values.ravel())

# Evaluate on training data
y_train_pred = logreg.predict(X_train_scaled)
y_train_proba = logreg.predict_proba(X_train_scaled)[:, 1]
print("Training Classification Report:\n")
print(classification_report(y_train, y_train_pred, digits=4))
roc_auc_train = roc_auc_score(y_train, y_train_proba)
pr_auc_train = average_precision_score(y_train, y_train_proba)
print(f"Training ROC-AUC Score: {roc_auc_train:.4f}")
print(f"Training PR-AUC Score:  {pr_auc_train:.4f}")
cm_train = confusion_matrix(y_train, y_train_pred)
print("\n Training Confusion Matrix:")
print(cm_train)

Training Classification Report:

              precision    recall  f1-score   support

           0     0.9473    1.0000    0.9729    132938
           1     1.0000    0.9444    0.9714    132938

    accuracy                         0.9722    265876
   macro avg     0.9736    0.9722    0.9722    265876
weighted avg     0.9736    0.9722    0.9722    265876

Training ROC-AUC Score: 0.9759
Training PR-AUC Score:  0.9846

 Training Confusion Matrix:
[[132937      1]
 [  7395 125543]]


In [15]:
# Predict on validation data
y_pred = logreg.predict(X_val_scaled)
y_proba = logreg.predict_proba(X_val_scaled)[:, 1]

# Evaluate the model
print("Classification Report:\n")
print(classification_report(y_val, y_pred, digits=4))

roc_auc = roc_auc_score(y_val, y_proba)
pr_auc = average_precision_score(y_val, y_proba)
cm = confusion_matrix(y_val, y_pred)

print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"PR-AUC Score:  {pr_auc:.4f}")
print("\n Confusion Matrix:")
print(cm)

Classification Report:

              precision    recall  f1-score   support

           0     0.9496    1.0000    0.9741     18991
           1     0.0000    0.0000    0.0000      1009

    accuracy                         0.9496     20000
   macro avg     0.4748    0.5000    0.4871     20000
weighted avg     0.9016    0.9496    0.9250     20000

ROC-AUC Score: 0.5118
PR-AUC Score:  0.0522

 Confusion Matrix:
[[18991     0]
 [ 1009     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Add L2 Regularisation

In [16]:
# Regularized Logistic Regression
logreg_reg = LogisticRegression(
    max_iter=5000,
    penalty='l2',  # L2 regularization
    C=0.1,         # smaller C = stronger regularization
    random_state=42
)

# Fit the model
logreg_reg.fit(X_train_scaled, y_train.values.ravel())

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,5000


In [17]:
# Evaluate on training data
y_train_pred = logreg_reg.predict(X_train_scaled)
y_train_proba = logreg_reg.predict_proba(X_train_scaled)[:, 1]
print("Training Classification Report:\n")
print(classification_report(y_train, y_train_pred, digits=4))
roc_auc_train = roc_auc_score(y_train, y_train_proba)
pr_auc_train = average_precision_score(y_train, y_train_proba)
print(f"Training ROC-AUC Score: {roc_auc_train:.4f}")
print(f"Training PR-AUC Score:  {pr_auc_train:.4f}")
cm_train = confusion_matrix(y_train, y_train_pred)
print("\n Training Confusion Matrix:")
print(cm_train)

Training Classification Report:

              precision    recall  f1-score   support

           0     0.9473    1.0000    0.9729    132938
           1     1.0000    0.9444    0.9714    132938

    accuracy                         0.9722    265876
   macro avg     0.9736    0.9722    0.9722    265876
weighted avg     0.9736    0.9722    0.9722    265876

Training ROC-AUC Score: 0.9759
Training PR-AUC Score:  0.9846

 Training Confusion Matrix:
[[132937      1]
 [  7395 125543]]


In [18]:
# Evaluate on validation data
y_val_pred = logreg_reg.predict(X_val_scaled)
y_val_proba = logreg_reg.predict_proba(X_val_scaled)[:, 1]
print("Validation Classification Report:\n")
print(classification_report(y_val, y_val_pred, digits=4))  
roc_auc_val = roc_auc_score(y_val, y_val_proba)
pr_auc_val = average_precision_score(y_val, y_val_proba)
print(f"Validation ROC-AUC Score: {roc_auc_val:.4f}")
print(f"Validation PR-AUC Score:  {pr_auc_val:.4f}")
cm_val = confusion_matrix(y_val, y_val_pred)
print("\n Validation Confusion Matrix:")
print(cm_val)

Validation Classification Report:

              precision    recall  f1-score   support

           0     0.9496    1.0000    0.9741     18991
           1     0.0000    0.0000    0.0000      1009

    accuracy                         0.9496     20000
   macro avg     0.4748    0.5000    0.4871     20000
weighted avg     0.9016    0.9496    0.9250     20000

Validation ROC-AUC Score: 0.5128
Validation PR-AUC Score:  0.0523

 Validation Confusion Matrix:
[[18991     0]
 [ 1009     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Tune with GridSearch

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

logreg = LogisticRegression(
    max_iter=5000,
    penalty='l2',
    random_state=42
)

grid = GridSearchCV(logreg, param_grid, scoring='average_precision', cv=3, n_jobs=-1)
grid.fit(X_train_scaled, y_train.values.ravel())

print("Best C:", grid.best_params_)
logreg_tuned = grid.best_estimator_

Best C: {'C': 0.1}


In [20]:
# Evaluate on validation data
y_val_pred = logreg_tuned.predict(X_val_scaled)
y_val_proba = logreg_tuned.predict_proba(X_val_scaled)[:, 1]
print("Validation Classification Report:\n")
print(classification_report(y_val, y_val_pred, digits=4))  
roc_auc_val = roc_auc_score(y_val, y_val_proba)
pr_auc_val = average_precision_score(y_val, y_val_proba)
print(f"Validation ROC-AUC Score: {roc_auc_val:.4f}")
print(f"Validation PR-AUC Score:  {pr_auc_val:.4f}")
cm_val = confusion_matrix(y_val, y_val_pred)
print("\n Validation Confusion Matrix:")
print(cm_val)

Validation Classification Report:

              precision    recall  f1-score   support

           0     0.9496    1.0000    0.9741     18991
           1     0.0000    0.0000    0.0000      1009

    accuracy                         0.9496     20000
   macro avg     0.4748    0.5000    0.4871     20000
weighted avg     0.9016    0.9496    0.9250     20000

Validation ROC-AUC Score: 0.5128
Validation PR-AUC Score:  0.0523

 Validation Confusion Matrix:
[[18991     0]
 [ 1009     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
