In [1]:
# import libraries
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_recall_curve,
    average_precision_score
)


In [2]:
# Create an imbalanced synthetic binary-classification dataset.
# `weights` requests a 95% / 5% class split; the proportions displayed at the
# end of this cell are the realized ones and differ slightly from the request
# (presumably because of make_classification's default label noise, `flip_y`
# -- confirm against the scikit-learn docs).
dataset_params = dict(
    n_samples=10000,
    n_features=20,
    n_informative=5,
    n_redundant=5,
    weights=[0.95, 0.05],
    random_state=42,  # fixed seed so the dataset is reproducible
)
X, y = make_classification(**dataset_params)

# Show the realized share of each class as the cell's output.
class_share = pd.Series(y).value_counts(normalize=True)
class_share


0    0.9457
1    0.0543
Name: proportion, dtype: float64

In [3]:
# Train/test split: hold out 20% of the rows for evaluation.
# Passing `stratify=y` asks the splitter to preserve the class ratio in both
# partitions, which matters here because the positive class is rare.
split_options = {
    "test_size": 0.2,
    "stratify": y,
    "random_state": 42,  # fixed seed so the split is reproducible
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [4]:
# Baseline: plain logistic regression with no class-imbalance handling.
# fit() returns the estimator itself, so construction and training are chained.
baseline_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard labels come from predict(); column index 1 of predict_proba() holds the
# score for class 1 (the minority class in this dataset).
y_pred = baseline_model.predict(X_test)
y_prob = baseline_model.predict_proba(X_test)[:, 1]

# Label-based diagnostics use the hard predictions ...
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ... while the ranking metrics are computed from the class-1 scores.
for metric_label, metric_fn in (
    ("ROC-AUC:", roc_auc_score),
    ("PR-AUC:", average_precision_score),
):
    print(metric_label, metric_fn(y_test, y_prob))


[[1888    3]
 [  93   16]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1891
           1       0.84      0.15      0.25       109

    accuracy                           0.95      2000
   macro avg       0.90      0.57      0.61      2000
weighted avg       0.95      0.95      0.94      2000

ROC-AUC: 0.818823107040108
PR-AUC: 0.4231507441408628


In [5]:
# Logistic regression with class_weight="balanced": same estimator as the
# baseline, but scikit-learn reweights the classes during fitting so the rare
# class is not drowned out by the majority class.
weighted_model = LogisticRegression(max_iter=1000, class_weight="balanced")
weighted_model.fit(X_train, y_train)

# Class-1 scores (column index 1) and hard labels for the held-out rows.
y_prob_w = weighted_model.predict_proba(X_test)[:, 1]
y_pred_w = weighted_model.predict(X_test)

# Label-based diagnostics from the hard predictions.
print(confusion_matrix(y_test, y_pred_w))
print(classification_report(y_test, y_pred_w))

# Ranking metrics from the class-1 scores.
roc_auc_w = roc_auc_score(y_test, y_prob_w)
pr_auc_w = average_precision_score(y_test, y_prob_w)
print("ROC-AUC:", roc_auc_w)
print("PR-AUC:", pr_auc_w)


[[1387  504]
 [  18   91]]
              precision    recall  f1-score   support

           0       0.99      0.73      0.84      1891
           1       0.15      0.83      0.26       109

    accuracy                           0.74      2000
   macro avg       0.57      0.78      0.55      2000
weighted avg       0.94      0.74      0.81      2000

ROC-AUC: 0.8375210436689485
PR-AUC: 0.33532678444379754


In [6]:
# Threshold tuning: turn the weighted model's class-1 scores into hard labels
# with a custom cut-off.
# NOTE(review): 0.3 is a fixed constant and is assessed on the test split
# itself. If the intent is to *tune* the cut-off, consider choosing it on a
# separate validation split (e.g. via precision_recall_curve, which is
# imported at the top but not used in the cells shown here) -- confirm intent.
custom_threshold = 0.3

# A row is labelled 1 when its class-1 score is at or above the threshold.
y_pred_thresh = np.greater_equal(y_prob_w, custom_threshold).astype(int)

# Same label-based diagnostics as for the models above.
thresh_confusion = confusion_matrix(y_test, y_pred_thresh)
thresh_report = classification_report(y_test, y_pred_thresh)
print(thresh_confusion)
print(thresh_report)


[[ 872 1019]
 [   8  101]]
              precision    recall  f1-score   support

           0       0.99      0.46      0.63      1891
           1       0.09      0.93      0.16       109

    accuracy                           0.49      2000
   macro avg       0.54      0.69      0.40      2000
weighted avg       0.94      0.49      0.60      2000

