In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, confusion_matrix, classification_report

# Load Datasets

In [2]:
X_train = pd.read_csv('../data/X_train.csv')
y_train = pd.read_csv('../data/y_train.csv')
X_train_resampled = pd.read_csv('../data/X_train_resampled.csv')
y_train_resampled = pd.read_csv('../data/y_train_resampled.csv')
X_val = pd.read_csv('../data/X_val.csv')
y_val = pd.read_csv('../data/y_val.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_test = pd.read_csv('../data/y_test.csv')

# Use Log-Transformed Features

In [3]:
log_transformed_columns = ['days_since_request_log', 'intended_balcon_amount_log',
                           'zip_count_4w_log', 'velocity_24h_log', 'velocity_4w_log',
                           'date_of_birth_distinct_emails_4w_log','session_length_in_minutes_log']
original_columns = ['days_since_request', 'intended_balcon_amount_clean',
                    'zip_count_4w', 'velocity_24h', 'velocity_4w',
                    'date_of_birth_distinct_emails_4w', 'session_length_in_minutes_cleaned']

In [4]:
X_train_resampled_log = X_train_resampled.drop(columns=original_columns)
X_train_log = X_train.drop(columns=original_columns)
X_val_log = X_val.drop(columns=original_columns)
X_test_log = X_test.drop(columns=original_columns)

## Use SMOTE resampled Training Data

### Baseline

In [5]:
# Baseline XGBoost model
xgb_baseline = XGBClassifier(
    n_estimators=300,          # number of trees
    max_depth=6,               # tree depth
    learning_rate=0.2,         # step size shrinkage
    eval_metric='auc',     # evaluation metric
    random_state=42,
    n_jobs=-1                  # use all CPU cores
)

# Fit on training data
xgb_baseline.fit(X_train_resampled_log, y_train_resampled.values.ravel())

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [6]:
# Evaluate on Train Set
y_train_pred = xgb_baseline.predict(X_train_resampled_log)
y_train_proba = xgb_baseline.predict_proba(X_train_resampled_log)[:, 1]

# Classification report
print("Training Classification Report:\n")
print(classification_report(y_train_resampled, y_train_pred, digits=4))

# ROC-AUC
roc_auc_train = roc_auc_score(y_train_resampled, y_train_proba)
print(f"Training ROC-AUC Score: {roc_auc_train:.4f}")

# PR-AUC
pr_auc_train = average_precision_score(y_train_resampled, y_train_proba)
print(f"Training PR-AUC Score: {pr_auc_train:.4f}")

# Confusion matrix
print("Training Confusion Matrix:\n", confusion_matrix(y_train_resampled, y_train_pred))

Training Classification Report:

              precision    recall  f1-score   support

           0     0.9911    0.9997    0.9954    786838
           1     0.9997    0.9910    0.9954    786838

    accuracy                         0.9954   1573676
   macro avg     0.9954    0.9954    0.9954   1573676
weighted avg     0.9954    0.9954    0.9954   1573676

Training ROC-AUC Score: 0.9996
Training PR-AUC Score: 0.9996
Training Confusion Matrix:
 [[786630    208]
 [  7043 779795]]


In [7]:
# Evaluate on Validation Set
y_val_pred = xgb_baseline.predict(X_val_log)
y_val_proba = xgb_baseline.predict_proba(X_val_log)[:, 1]

# Classification report
print("\nValidation Classification Report:\n")
print(classification_report(y_val, y_val_pred, digits=4))

# ROC-AUC
roc_auc_val = roc_auc_score(y_val, y_val_proba)
print(f"Validation ROC-AUC Score: {roc_auc_val:.4f}")

# PR-AUC
pr_auc_val = average_precision_score(y_val, y_val_proba)
print(f"Validation PR-AUC Score: {pr_auc_val:.4f}")

# Confusion matrix
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))


Validation Classification Report:

              precision    recall  f1-score   support

           0     0.9875    0.9983    0.9929    106718
           1     0.3599    0.0717    0.1196      1450

    accuracy                         0.9858    108168
   macro avg     0.6737    0.5350    0.5562    108168
weighted avg     0.9791    0.9858    0.9812    108168

Validation ROC-AUC Score: 0.8789
Validation PR-AUC Score: 0.1490
Validation Confusion Matrix:
 [[106533    185]
 [  1346    104]]


### Hyperparameter Tuning

In [9]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 800),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 0.5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "eval_metric": "auc",
        "random_state": 42,
        "n_jobs": -1
    }

    model = XGBClassifier(**params)

    auc = cross_val_score(
        model,
        X_train_resampled_log,
        y_train_resampled.values.ravel(),
        scoring="roc_auc",
        cv=5,
        n_jobs=-1
    ).mean()

    return auc

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best params:", study.best_params)
print("Best AUC:", study.best_value)

[I 2025-11-07 13:21:43,052] A new study created in memory with name: no-name-861949dd-cb40-4589-848f-3b6cd3ddd198
[I 2025-11-07 13:22:09,098] Trial 0 finished with value: 0.9811942308008745 and parameters: {'n_estimators': 218, 'max_depth': 5, 'learning_rate': 0.1381595665075031, 'subsample': 0.6333449702682407, 'colsample_bytree': 0.9231590858482497, 'gamma': 0.2367807107309881, 'min_child_weight': 9}. Best is trial 0 with value: 0.9811942308008745.
[I 2025-11-07 13:23:48,946] Trial 1 finished with value: 0.9808178488112913 and parameters: {'n_estimators': 732, 'max_depth': 8, 'learning_rate': 0.043316151420247605, 'subsample': 0.9054730990298401, 'colsample_bytree': 0.8967971077161351, 'gamma': 0.2403394229617135, 'min_child_weight': 1}. Best is trial 0 with value: 0.9811942308008745.
[I 2025-11-07 13:25:19,526] Trial 2 finished with value: 0.9885897313826121 and parameters: {'n_estimators': 584, 'max_depth': 10, 'learning_rate': 0.18640620979411388, 'subsample': 0.9601154145663083, 

Best params: {'n_estimators': 481, 'max_depth': 10, 'learning_rate': 0.15792742525557002, 'subsample': 0.7497372319612025, 'colsample_bytree': 0.607044441007861, 'gamma': 0.40732611117875384, 'min_child_weight': 10}
Best AUC: 0.9906132480837473


In [10]:
# Extract best parameters
best_params = study.best_params  # from your Optuna study

# Optional: add fixed parameters
best_params.update({
    "eval_metric": "auc",
    "random_state": 42,
    "n_jobs": -1
})

# Train final model on full training set
final_model = XGBClassifier(**best_params)
final_model.fit(X_train_resampled_log, y_train_resampled.values.ravel())

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.607044441007861
,device,
,early_stopping_rounds,
,enable_categorical,False


In [11]:
# Evaluate final model on train set
y_train_final_pred = final_model.predict(X_train_resampled_log)
y_train_final_proba = final_model.predict_proba(X_train_resampled_log)[:, 1]

# classification report
print("Final Model Training Classification Report:\n")
print(classification_report(y_train_resampled, y_train_final_pred, digits=4))

# roc auc
roc_auc_train_final = roc_auc_score(y_train_resampled, y_train_final_proba)
print(f"Final Model Training ROC-AUC Score: {roc_auc_train_final:.4f}")

# pr auc
pr_auc_train_final = average_precision_score(y_train_resampled, y_train_final_proba)
print(f"Final Model Training PR-AUC Score: {pr_auc_train_final:.4f}")

# confusion matrix
print("Final Model Training Confusion Matrix:\n", confusion_matrix(y_train_resampled, y_train_final_pred))

Final Model Training Classification Report:

              precision    recall  f1-score   support

           0     0.9959    1.0000    0.9979    786838
           1     1.0000    0.9959    0.9979    786838

    accuracy                         0.9979   1573676
   macro avg     0.9979    0.9979    0.9979   1573676
weighted avg     0.9979    0.9979    0.9979   1573676

Final Model Training ROC-AUC Score: 1.0000
Final Model Training PR-AUC Score: 1.0000
Final Model Training Confusion Matrix:
 [[786828     10]
 [  3238 783600]]


In [12]:
# Evaluate final model on validation set
y_val_final_pred = final_model.predict(X_val_log)
y_val_final_proba = final_model.predict_proba(X_val_log)[:, 1]

# classification report
print("\nFinal Model Validation Classification Report:\n")
print(classification_report(y_val, y_val_final_pred, digits=4))

# roc auc
roc_auc_val_final = roc_auc_score(y_val, y_val_final_proba)
print(f"Final Model Validation ROC-AUC Score: {roc_auc_val_final:.4f}")

# pr auc
pr_auc_val_final = average_precision_score(y_val, y_val_final_proba)
print(f"Final Model Validation PR-AUC Score: {pr_auc_val_final:.4f}")

# confusion matrix
print("Final Model Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_final_pred))


Final Model Validation Classification Report:

              precision    recall  f1-score   support

           0     0.9874    0.9986    0.9929    106718
           1     0.3598    0.0593    0.1018      1450

    accuracy                         0.9860    108168
   macro avg     0.6736    0.5289    0.5474    108168
weighted avg     0.9789    0.9860    0.9810    108168

Final Model Validation ROC-AUC Score: 0.8655
Final Model Validation PR-AUC Score: 0.1282
Final Model Validation Confusion Matrix:
 [[106565    153]
 [  1364     86]]
