## Import Statements

In [17]:
import pandas as pd
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix,
    classification_report
)
import matplotlib.pyplot as plt

## Load Dataset

In [5]:
X_train = pd.read_csv("../Datasets/X_train.csv")
y_train = pd.read_csv("../Datasets/y_train.csv").values.ravel()

X_val = pd.read_csv("../Datasets/X_val.csv")
y_val = pd.read_csv("../Datasets/y_val.csv").values.ravel()

X_test = pd.read_csv("../Datasets/X_test.csv")
y_test = pd.read_csv("../Datasets/y_test.csv").values.ravel()

X_train_resampled = pd.read_csv("../Datasets/X_train_resampled.csv")
y_train_resampled = pd.read_csv("../Datasets/y_train_resampled.csv").values.ravel()

# Random Forest

## Use Log-Transformed Features

In [8]:
X_train_resampled.columns

Index(['income', 'name_email_similarity', 'prev_address_months_count',
       'current_address_months_count', 'days_since_request', 'zip_count_4w',
       'velocity_6h', 'velocity_24h', 'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'credit_risk_score',
       'email_is_free', 'phone_home_valid', 'phone_mobile_valid',
       'bank_months_count', 'has_other_cards', 'proposed_credit_limit',
       'foreign_request', 'keep_alive_session', 'device_distinct_emails_8w',
       'month', 'age_10', 'age_20', 'age_30', 'age_40', 'age_50', 'age_60',
       'age_70', 'age_80', 'age_90', 'payment_AA', 'payment_AB', 'payment_AC',
       'payment_AD', 'payment_AE', 'employment_CA', 'employment_CB',
       'employment_CC', 'employment_CD', 'employment_CE', 'employment_CF',
       'employment_CG', 'housing_BA', 'housing_BB', 'housing_BC', 'housing_BD',
       'housing_BE', 'housing_BF', 'housing_BG', 'source_INTERNET',
       'source_TELEAPP', 'device_linux', 'device_

In [9]:
log_transformed_columns = ['days_since_request_log', 'intended_balcon_amount_log',
                           'zip_count_4w_log', 'velocity_24h_log', 'velocity_4w_log',
                           'date_of_birth_distinct_emails_4w_log','session_length_in_minutes_log']
original_columns = ['days_since_request', 'intended_balcon_amount_clean',
                    'zip_count_4w', 'velocity_24h', 'velocity_4w',
                    'date_of_birth_distinct_emails_4w', 'session_length_in_minutes_cleaned']

In [10]:
X_train_resampled_log = X_train_resampled.drop(columns=original_columns)
X_train_log = X_train.drop(columns=original_columns)
X_val_log = X_val.drop(columns=original_columns)
X_test_log = X_test.drop(columns=original_columns)

### Use SMOTE Resampled Training Data

#### Baseline

In [11]:
# --- Baseline Random Forest Model ---
rf_baseline = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

rf_baseline.fit(X_train_resampled_log, y_train_resampled)

# --- Evaluate on training set ---
y_train_pred = rf_baseline.predict(X_train_resampled_log)
y_train_prob = rf_baseline.predict_proba(X_train_resampled_log)[:, 1]

print("\n=== Training Set Metrics ===")
print(classification_report(y_train_resampled, y_train_pred, digits=4))
roc_auc_train = roc_auc_score(y_train_resampled, y_train_prob)
pr_auc_train = average_precision_score(y_train_resampled, y_train_prob)
print(f"ROC-AUC Score: {roc_auc_train:.4f}")
print(f"PR-AUC Score:  {pr_auc_train:.4f}")

cm_train = confusion_matrix(y_train_resampled, y_train_pred)
print("\nConfusion Matrix (Training Set):")
print(cm_train)


=== Training Set Metrics ===
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000    786838
           1     1.0000    1.0000    1.0000    786838

    accuracy                         1.0000   1573676
   macro avg     1.0000    1.0000    1.0000   1573676
weighted avg     1.0000    1.0000    1.0000   1573676

ROC-AUC Score: 1.0000
PR-AUC Score:  1.0000

Confusion Matrix (Training Set):
[[786838      0]
 [     3 786835]]


In [12]:
# --- Evaluate on validation set ---
y_val_pred = rf_baseline.predict(X_val_log)
y_val_prob = rf_baseline.predict_proba(X_val_log)[:, 1]

print("\n=== Validation Set Metrics ===")
print(classification_report(y_val, y_val_pred, digits=4))
roc_auc_val = roc_auc_score(y_val, y_val_prob)
pr_auc_val = average_precision_score(y_val, y_val_prob)
print(f"ROC-AUC Score: {roc_auc_val:.4f}")
print(f"PR-AUC Score:  {pr_auc_val:.4f}")

cm_val = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix (Validation Set):")
print(cm_val)


=== Validation Set Metrics ===
              precision    recall  f1-score   support

           0     0.9870    0.9989    0.9929    106718
           1     0.2739    0.0297    0.0535      1450

    accuracy                         0.9859    108168
   macro avg     0.6304    0.5143    0.5232    108168
weighted avg     0.9774    0.9859    0.9803    108168

ROC-AUC Score: 0.8398
PR-AUC Score:  0.1097

Confusion Matrix (Validation Set):
[[106604    114]
 [  1407     43]]


In [13]:
# --- Evaluate on test set ---
y_test_pred = rf_baseline.predict(X_test_log)
y_test_prob = rf_baseline.predict_proba(X_test_log)[:, 1]

print("\n=== Test Set Metrics ===")
print(classification_report(y_test, y_test_pred, digits=4))
roc_auc_test = roc_auc_score(y_test, y_test_prob)
pr_auc_test = average_precision_score(y_test, y_test_prob)
print(f"ROC-AUC Score: {roc_auc_test:.4f}")
print(f"PR-AUC Score:  {pr_auc_test:.4f}")

cm_test = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (Test Set):")
print(cm_test)


=== Test Set Metrics ===
              precision    recall  f1-score   support

           0     0.9854    0.9995    0.9924     95415
           1     0.2500    0.0112    0.0214      1428

    accuracy                         0.9849     96843
   macro avg     0.6177    0.5054    0.5069     96843
weighted avg     0.9746    0.9849    0.9781     96843

ROC-AUC Score: 0.8434
PR-AUC Score:  0.1268

Confusion Matrix (Test Set):
[[95367    48]
 [ 1412    16]]


#### Hyperparameter Tuning

In [14]:
def objective(trial):
    # pick from your small grid
    n_estimators = trial.suggest_categorical("n_estimators", [100, 200])
    max_depth = trial.suggest_categorical("max_depth", [10, 12])
    min_samples_split = trial.suggest_categorical("min_samples_split", [2, 5])
    min_samples_leaf = trial.suggest_categorical("min_samples_leaf", [1, 3])
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])

    params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
        "bootstrap": True,      # fix for simplicity
        "random_state": 42,
        "n_jobs": -1
    }

    model = RandomForestClassifier(**params)
    model.fit(X_train_resampled_log, y_train_resampled)
    y_pred_proba = model.predict(X_val_log)
    auc = roc_auc_score(y_val, y_pred_proba)
    return auc

In [15]:
# --- Run Optuna study ---
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=12, show_progress_bar=True)  

print("\n=== Best Trial ===")
print(f"Best ROC-AUC Score: {study.best_value:.4f}")
print("Best Parameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")


[I 2025-11-18 12:10:35,233] A new study created in memory with name: no-name-8ec92083-4fb0-4773-986f-213c4c7b3298
Best trial: 0. Best value: 0.688424:   8%|▊         | 1/12 [01:59<21:59, 119.98s/it]

[I 2025-11-18 12:12:35,239] Trial 0 finished with value: 0.6884243811114177 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.6884243811114177.


Best trial: 0. Best value: 0.688424:  17%|█▋        | 2/12 [03:14<15:30, 93.01s/it] 

[I 2025-11-18 12:13:49,371] Trial 1 finished with value: 0.671964578253612 and parameters: {'n_estimators': 100, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.6884243811114177.


Best trial: 0. Best value: 0.688424:  25%|██▌       | 3/12 [06:33<21:13, 141.55s/it]

[I 2025-11-18 12:17:08,685] Trial 2 finished with value: 0.6703528862079952 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6884243811114177.


Best trial: 0. Best value: 0.688424:  33%|███▎      | 4/12 [09:29<20:39, 154.99s/it]

[I 2025-11-18 12:20:04,274] Trial 3 finished with value: 0.6866365303077203 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.6884243811114177.


Best trial: 0. Best value: 0.688424:  42%|████▏     | 5/12 [12:56<20:17, 173.87s/it]

[I 2025-11-18 12:23:31,594] Trial 4 finished with value: 0.6692059575639568 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6884243811114177.


Best trial: 0. Best value: 0.688424:  50%|█████     | 6/12 [14:12<14:03, 140.61s/it]

[I 2025-11-18 12:24:47,663] Trial 5 finished with value: 0.6878677933658219 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.6884243811114177.


Best trial: 0. Best value: 0.688424:  58%|█████▊    | 7/12 [16:58<12:24, 148.82s/it]

[I 2025-11-18 12:27:33,369] Trial 6 finished with value: 0.6756367894502495 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.6884243811114177.


Best trial: 0. Best value: 0.688424:  67%|██████▋   | 8/12 [18:50<09:08, 137.11s/it]

[I 2025-11-18 12:29:25,399] Trial 7 finished with value: 0.6831283026939836 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6884243811114177.


Best trial: 0. Best value: 0.688424:  75%|███████▌  | 9/12 [20:21<06:08, 122.79s/it]

[I 2025-11-18 12:30:56,723] Trial 8 finished with value: 0.6749021623860758 and parameters: {'n_estimators': 100, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.6884243811114177.


Best trial: 0. Best value: 0.688424:  83%|████████▎ | 10/12 [23:11<04:34, 137.34s/it]

[I 2025-11-18 12:33:46,616] Trial 9 finished with value: 0.6836783246338561 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6884243811114177.


Best trial: 0. Best value: 0.688424:  92%|█████████▏| 11/12 [25:20<02:14, 134.89s/it]

[I 2025-11-18 12:35:55,959] Trial 10 finished with value: 0.6884243811114177 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.6884243811114177.


Best trial: 0. Best value: 0.688424: 100%|██████████| 12/12 [27:19<00:00, 136.59s/it]

[I 2025-11-18 12:37:54,294] Trial 11 finished with value: 0.6884243811114177 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.6884243811114177.

=== Best Trial ===
Best ROC-AUC Score: 0.6884
Best Parameters:
  n_estimators: 200
  max_depth: 10
  min_samples_split: 5
  min_samples_leaf: 1
  max_features: log2





In [19]:
# --- Retrain best model ---
best_params = study.best_params
best_params["random_state"] = 42
best_params["n_jobs"] = -1

best_rf = RandomForestClassifier(**best_params)
best_rf.fit(X_train_resampled_log, y_train_resampled)

# --- Feature Importance ---
importances = best_rf.feature_importances_
features = X_train_resampled_log.columns
feat_imp = pd.DataFrame({'Feature': features, 'Importance': importances})
feat_imp = feat_imp.sort_values(by='Importance', ascending=False)

print("\n=== Feature Importances ===")
print(feat_imp.head(10))  # top 10 features

# --- Evaluate on training set ---
y_train_pred = best_rf.predict(X_train_resampled_log)
y_train_prob = best_rf.predict_proba(X_train_resampled_log)[:, 1]

print("\n=== Training Set Metrics ===")
print(classification_report(y_train_resampled, y_train_pred, digits=4))
roc_auc_train = roc_auc_score(y_train_resampled, y_train_prob)
pr_auc_train = average_precision_score(y_train_resampled, y_train_prob)
print(f"ROC-AUC Score: {roc_auc_train:.4f}")
print(f"PR-AUC Score:  {pr_auc_train:.4f}")

cm_train = confusion_matrix(y_train_resampled, y_train_pred)
print("\nConfusion Matrix (Training Set):")
print(cm_train)


=== Feature Importances ===
                         Feature  Importance
38                    housing_BA    0.175886
50                device_windows    0.136120
14            keep_alive_session    0.096623
8               phone_home_valid    0.083130
28                    payment_AC    0.057488
11               has_other_cards    0.047947
21                        age_50    0.040972
3   current_address_months_count    0.038797
31                 employment_CA    0.033641
0                         income    0.027534

=== Training Set Metrics ===
              precision    recall  f1-score   support

           0     0.9669    0.9549    0.9608    786838
           1     0.9554    0.9673    0.9613    786838

    accuracy                         0.9611   1573676
   macro avg     0.9612    0.9611    0.9611   1573676
weighted avg     0.9612    0.9611    0.9611   1573676

ROC-AUC Score: 0.9924
PR-AUC Score:  0.9921

Confusion Matrix (Training Set):
[[751326  35512]
 [ 25736 761102]]


In [20]:
# --- Evaluate on validation set ---
y_val_pred = best_rf.predict(X_val_log)
y_val_prob = best_rf.predict_proba(X_val_log)[:, 1]

print("\n=== Validation Set Metrics ===")
print(classification_report(y_val, y_val_pred, digits=4))
roc_auc_val = roc_auc_score(y_val, y_val_prob)
pr_auc_val = average_precision_score(y_val, y_val_prob)
print(f"ROC-AUC Score: {roc_auc_val:.4f}")
print(f"PR-AUC Score:  {pr_auc_val:.4f}")

cm_val = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix (Validation Set):")
print(cm_val)


=== Validation Set Metrics ===
              precision    recall  f1-score   support

           0     0.9919    0.9500    0.9705    106718
           1     0.1039    0.4269    0.1671      1450

    accuracy                         0.9429    108168
   macro avg     0.5479    0.6884    0.5688    108168
weighted avg     0.9800    0.9429    0.9597    108168

ROC-AUC Score: 0.8602
PR-AUC Score:  0.1169

Confusion Matrix (Validation Set):
[[101377   5341]
 [   831    619]]


In [21]:
# --- Evaluate on test set ---
y_test_pred = best_rf.predict(X_test_log)
y_test_prob = best_rf.predict_proba(X_test_log)[:, 1]

print("\n=== Test Set Metrics ===")
print(classification_report(y_test, y_test_pred, digits=4))
roc_auc_test = roc_auc_score(y_test, y_test_prob)
pr_auc_test = average_precision_score(y_test, y_test_prob)
print(f"ROC-AUC Score: {roc_auc_test:.4f}")
print(f"PR-AUC Score:  {pr_auc_test:.4f}")

cm_test = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (Test Set):")
print(cm_test)


=== Test Set Metrics ===
              precision    recall  f1-score   support

           0     0.9911    0.9589    0.9747     95415
           1     0.1344    0.4265    0.2044      1428

    accuracy                         0.9510     96843
   macro avg     0.5628    0.6927    0.5896     96843
weighted avg     0.9785    0.9510    0.9634     96843

ROC-AUC Score: 0.8629
PR-AUC Score:  0.1346

Confusion Matrix (Test Set):
[[91492  3923]
 [  819   609]]


### Use original train with class weighting

#### Baseline

In [22]:
# --- Baseline Random Forest Model ---
rf_baseline = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    class_weight = "balanced"
)

rf_baseline.fit(X_train_log, y_train)

# --- Evaluate on training set ---
y_train_pred = rf_baseline.predict(X_train_log)
y_train_prob = rf_baseline.predict_proba(X_train_log)[:, 1]

print("\n=== Training Set Metrics ===")
print(classification_report(y_train, y_train_pred, digits=4))
roc_auc_train = roc_auc_score(y_train, y_train_prob)
pr_auc_train = average_precision_score(y_train, y_train_prob)
print(f"ROC-AUC Score: {roc_auc_train:.4f}")
print(f"PR-AUC Score:  {pr_auc_train:.4f}")

cm_train = confusion_matrix(y_train, y_train_pred)
print("\nConfusion Matrix (Training Set):")
print(cm_train)


=== Training Set Metrics ===
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000    786838
           1     1.0000    0.9983    0.9991      8151

    accuracy                         1.0000    794989
   macro avg     1.0000    0.9991    0.9996    794989
weighted avg     1.0000    1.0000    1.0000    794989

ROC-AUC Score: 1.0000
PR-AUC Score:  1.0000

Confusion Matrix (Training Set):
[[786838      0]
 [    14   8137]]


In [23]:
# --- Evaluate on validation set ---
y_val_pred = rf_baseline.predict(X_val_log)
y_val_prob = rf_baseline.predict_proba(X_val_log)[:, 1]

print("\n=== Validation Set Metrics ===")
print(classification_report(y_val, y_val_pred, digits=4))
roc_auc_val = roc_auc_score(y_val, y_val_prob)
pr_auc_val = average_precision_score(y_val, y_val_prob)
print(f"ROC-AUC Score: {roc_auc_val:.4f}")
print(f"PR-AUC Score:  {pr_auc_val:.4f}")

cm_val = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix (Validation Set):")
print(cm_val)


=== Validation Set Metrics ===
              precision    recall  f1-score   support

           0     0.9866    1.0000    0.9933    106718
           1     0.0000    0.0000    0.0000      1450

    accuracy                         0.9866    108168
   macro avg     0.4933    0.5000    0.4966    108168
weighted avg     0.9734    0.9866    0.9799    108168

ROC-AUC Score: 0.8097
PR-AUC Score:  0.1092

Confusion Matrix (Validation Set):
[[106718      0]
 [  1450      0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [24]:
# --- Evaluate on test set ---
y_test_pred = rf_baseline.predict(X_test_log)
y_test_prob = rf_baseline.predict_proba(X_test_log)[:, 1]

print("\n=== Test Set Metrics ===")
print(classification_report(y_test, y_test_pred, digits=4))
roc_auc_test = roc_auc_score(y_test, y_test_prob)
pr_auc_test = average_precision_score(y_test, y_test_prob)
print(f"ROC-AUC Score: {roc_auc_test:.4f}")
print(f"PR-AUC Score:  {pr_auc_test:.4f}")

cm_test = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (Test Set):")
print(cm_test)


=== Test Set Metrics ===
              precision    recall  f1-score   support

           0     0.9853    1.0000    0.9926     95415
           1     0.0000    0.0000    0.0000      1428

    accuracy                         0.9853     96843
   macro avg     0.4926    0.5000    0.4963     96843
weighted avg     0.9707    0.9853    0.9779     96843

ROC-AUC Score: 0.7996
PR-AUC Score:  0.1081

Confusion Matrix (Test Set):
[[95415     0]
 [ 1428     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


#### Hyperparameter tuning

In [25]:
def objective(trial):
    # pick from your small grid
    n_estimators = trial.suggest_categorical("n_estimators", [100, 200])
    max_depth = trial.suggest_categorical("max_depth", [10, 12])
    min_samples_split = trial.suggest_categorical("min_samples_split", [2, 5])
    min_samples_leaf = trial.suggest_categorical("min_samples_leaf", [1, 3])
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])

    params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
        "bootstrap": True,      # fix for simplicity
        "random_state": 42,
        "n_jobs": -1,
        "class_weight": "balanced"
    }

    model = RandomForestClassifier(**params)
    model.fit(X_train_log, y_train)
    y_pred_proba = model.predict(X_val_log)
    auc = roc_auc_score(y_val, y_pred_proba)
    return auc

In [26]:
# --- Run Optuna study ---
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=12, show_progress_bar=True)  

print("\n=== Best Trial ===")
print(f"Best ROC-AUC Score: {study.best_value:.4f}")
print("Best Parameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

[I 2025-11-18 12:54:01,903] A new study created in memory with name: no-name-d1f04e38-1476-4c20-99f6-de61492dbeef
Best trial: 0. Best value: 0.791167:   8%|▊         | 1/12 [00:43<07:55, 43.22s/it]

[I 2025-11-18 12:54:45,153] Trial 0 finished with value: 0.7911670525800838 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.7911670525800838.


Best trial: 0. Best value: 0.791167:  17%|█▋        | 2/12 [01:44<08:58, 53.81s/it]

[I 2025-11-18 12:55:46,373] Trial 1 finished with value: 0.7842171472220374 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7911670525800838.


Best trial: 0. Best value: 0.791167:  25%|██▌       | 3/12 [02:09<06:04, 40.49s/it]

[I 2025-11-18 12:56:10,995] Trial 2 finished with value: 0.791073347675569 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.7911670525800838.


Best trial: 0. Best value: 0.791167:  33%|███▎      | 4/12 [02:37<04:44, 35.54s/it]

[I 2025-11-18 12:56:38,958] Trial 3 finished with value: 0.7897905469199844 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.7911670525800838.


Best trial: 0. Best value: 0.791167:  42%|████▏     | 5/12 [10:24<22:19, 191.41s/it]

[I 2025-11-18 13:04:26,742] Trial 4 finished with value: 0.7662284292925409 and parameters: {'n_estimators': 100, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7911670525800838.


Best trial: 0. Best value: 0.791167:  50%|█████     | 6/12 [10:46<13:21, 133.57s/it]

[I 2025-11-18 13:04:48,049] Trial 5 finished with value: 0.7897905469199844 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.7911670525800838.


Best trial: 6. Best value: 0.792929:  58%|█████▊    | 7/12 [11:28<08:38, 103.80s/it]

[I 2025-11-18 13:05:30,544] Trial 6 finished with value: 0.7929286724729241 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 6 with value: 0.7929286724729241.


Best trial: 6. Best value: 0.792929:  67%|██████▋   | 8/12 [11:57<05:19, 79.83s/it] 

[I 2025-11-18 13:05:59,056] Trial 7 finished with value: 0.7812879771437582 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 6 with value: 0.7929286724729241.


Best trial: 6. Best value: 0.792929:  75%|███████▌  | 9/12 [12:33<03:18, 66.19s/it]

[I 2025-11-18 13:06:35,234] Trial 8 finished with value: 0.7597273187278623 and parameters: {'n_estimators': 100, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 6 with value: 0.7929286724729241.


Best trial: 6. Best value: 0.792929:  83%|████████▎ | 10/12 [13:37<02:11, 65.53s/it]

[I 2025-11-18 13:07:39,296] Trial 9 finished with value: 0.7842171472220374 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 6 with value: 0.7929286724729241.


Best trial: 6. Best value: 0.792929:  92%|█████████▏| 11/12 [14:32<01:02, 62.39s/it]

[I 2025-11-18 13:08:34,561] Trial 10 finished with value: 0.7705939921585151 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 6 with value: 0.7929286724729241.


Best trial: 6. Best value: 0.792929: 100%|██████████| 12/12 [15:24<00:00, 77.07s/it]

[I 2025-11-18 13:09:26,778] Trial 11 finished with value: 0.7913816497362368 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 6 with value: 0.7929286724729241.

=== Best Trial ===
Best ROC-AUC Score: 0.7929
Best Parameters:
  n_estimators: 200
  max_depth: 10
  min_samples_split: 2
  min_samples_leaf: 1
  max_features: log2





In [27]:
# --- Retrain best model ---
best_params = study.best_params
best_params["random_state"] = 42
best_params["n_jobs"] = -1

best_rf = RandomForestClassifier(**best_params)
best_rf.fit(X_train_log, y_train)

# --- Evaluate on training set ---
y_train_pred = best_rf.predict(X_train_log)
y_train_prob = best_rf.predict_proba(X_train_log)[:, 1]

print("\n=== Training Set Metrics ===")
print(classification_report(y_train, y_train_pred, digits=4))
roc_auc_train = roc_auc_score(y_train, y_train_prob)
pr_auc_train = average_precision_score(y_train, y_train_prob)
print(f"ROC-AUC Score: {roc_auc_train:.4f}")
print(f"PR-AUC Score:  {pr_auc_train:.4f}")

cm_train = confusion_matrix(y_train, y_train_pred)
print("\nConfusion Matrix (Training Set):")
print(cm_train)


=== Training Set Metrics ===
              precision    recall  f1-score   support

           0     0.9897    1.0000    0.9948    786838
           1     0.0000    0.0000    0.0000      8151

    accuracy                         0.9897    794989
   macro avg     0.4949    0.5000    0.4974    794989
weighted avg     0.9796    0.9897    0.9846    794989



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


ROC-AUC Score: 0.9273
PR-AUC Score:  0.2913

Confusion Matrix (Training Set):
[[786838      0]
 [  8151      0]]


In [28]:
# --- Evaluate on validation set ---
y_val_pred = best_rf.predict(X_val_log)
y_val_prob = best_rf.predict_proba(X_val_log)[:, 1]

print("\n=== Validation Set Metrics ===")
print(classification_report(y_val, y_val_pred, digits=4))
roc_auc_val = roc_auc_score(y_val, y_val_prob)
pr_auc_val = average_precision_score(y_val, y_val_prob)
print(f"ROC-AUC Score: {roc_auc_val:.4f}")
print(f"PR-AUC Score:  {pr_auc_val:.4f}")

cm_val = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix (Validation Set):")
print(cm_val)


=== Validation Set Metrics ===
              precision    recall  f1-score   support

           0     0.9866    1.0000    0.9933    106718
           1     0.0000    0.0000    0.0000      1450

    accuracy                         0.9866    108168
   macro avg     0.4933    0.5000    0.4966    108168
weighted avg     0.9734    0.9866    0.9799    108168

ROC-AUC Score: 0.8746
PR-AUC Score:  0.1534

Confusion Matrix (Validation Set):
[[106718      0]
 [  1450      0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [29]:
# --- Evaluate on test set ---
y_test_pred = best_rf.predict(X_test_log)
y_test_prob = best_rf.predict_proba(X_test_log)[:, 1]

print("\n=== Test Set Metrics ===")
print(classification_report(y_test, y_test_pred, digits=4))
roc_auc_test = roc_auc_score(y_test, y_test_prob)
pr_auc_test = average_precision_score(y_test, y_test_prob)
print(f"ROC-AUC Score: {roc_auc_test:.4f}")
print(f"PR-AUC Score:  {pr_auc_test:.4f}")

cm_test = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (Test Set):")
print(cm_test)


=== Test Set Metrics ===
              precision    recall  f1-score   support

           0     0.9853    1.0000    0.9926     95415
           1     0.0000    0.0000    0.0000      1428

    accuracy                         0.9853     96843
   macro avg     0.4926    0.5000    0.4963     96843
weighted avg     0.9707    0.9853    0.9779     96843

ROC-AUC Score: 0.8765
PR-AUC Score:  0.1819

Confusion Matrix (Test Set):
[[95415     0]
 [ 1428     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Use Non Log-Transformed Features

In [31]:
X_train_resampled_nolog = X_train_resampled.drop(columns=log_transformed_columns)
X_train_nolog = X_train.drop(columns=log_transformed_columns)
X_val_nolog = X_val.drop(columns=log_transformed_columns)
X_test_nolog = X_test.drop(columns=log_transformed_columns)

### Use SMOTE Resampled Training Data

#### Baseline

In [33]:
# --- Baseline Random Forest Model ---
rf_baseline = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

rf_baseline.fit(X_train_resampled_nolog, y_train_resampled)


# --- Evaluate on training set ---
y_train_pred = rf_baseline.predict(X_train_resampled_nolog)
y_train_prob = rf_baseline.predict_proba(X_train_resampled_nolog)[:, 1]

print("\n=== Training Set Metrics ===")
print(classification_report(y_train_resampled, y_train_pred, digits=4))
roc_auc_train = roc_auc_score(y_train_resampled, y_train_prob)
pr_auc_train = average_precision_score(y_train_resampled, y_train_prob)
print(f"ROC-AUC Score: {roc_auc_train:.4f}")
print(f"PR-AUC Score:  {pr_auc_train:.4f}")

cm_train = confusion_matrix(y_train_resampled, y_train_pred)
print("\nConfusion Matrix (Training Set):")
print(cm_train)


=== Training Set Metrics ===
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000    786838
           1     1.0000    1.0000    1.0000    786838

    accuracy                         1.0000   1573676
   macro avg     1.0000    1.0000    1.0000   1573676
weighted avg     1.0000    1.0000    1.0000   1573676

ROC-AUC Score: 1.0000
PR-AUC Score:  1.0000

Confusion Matrix (Training Set):
[[786838      0]
 [     3 786835]]


In [34]:
# --- Evaluate on validation set ---
y_val_pred = rf_baseline.predict(X_val_nolog)
y_val_prob = rf_baseline.predict_proba(X_val_nolog)[:, 1]

print("\n=== Validation Set Metrics ===")
print(classification_report(y_val, y_val_pred, digits=4))
roc_auc_val = roc_auc_score(y_val, y_val_prob)
pr_auc_val = average_precision_score(y_val, y_val_prob)
print(f"ROC-AUC Score: {roc_auc_val:.4f}")
print(f"PR-AUC Score:  {pr_auc_val:.4f}")

cm_val = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix (Validation Set):")
print(cm_val)


=== Validation Set Metrics ===
              precision    recall  f1-score   support

           0     0.9871    0.9989    0.9929    106718
           1     0.3041    0.0359    0.0642      1450

    accuracy                         0.9860    108168
   macro avg     0.6456    0.5174    0.5285    108168
weighted avg     0.9779    0.9860    0.9805    108168

ROC-AUC Score: 0.8451
PR-AUC Score:  0.1130

Confusion Matrix (Validation Set):
[[106599    119]
 [  1398     52]]


In [35]:
# --- Evaluate on test set ---
y_test_pred = rf_baseline.predict(X_test_nolog)
y_test_prob = rf_baseline.predict_proba(X_test_nolog)[:, 1]

print("\n=== Test Set Metrics ===")
print(classification_report(y_test, y_test_pred, digits=4))
roc_auc_test = roc_auc_score(y_test, y_test_prob)
pr_auc_test = average_precision_score(y_test, y_test_prob)
print(f"ROC-AUC Score: {roc_auc_test:.4f}")
print(f"PR-AUC Score:  {pr_auc_test:.4f}")

cm_test = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (Test Set):")
print(cm_test)


=== Test Set Metrics ===
              precision    recall  f1-score   support

           0     0.9855    0.9995    0.9924     95415
           1     0.3151    0.0161    0.0306      1428

    accuracy                         0.9850     96843
   macro avg     0.6503    0.5078    0.5115     96843
weighted avg     0.9756    0.9850    0.9782     96843

ROC-AUC Score: 0.8507
PR-AUC Score:  0.1270

Confusion Matrix (Test Set):
[[95365    50]
 [ 1405    23]]


#### Hyperparameter tuning

In [36]:
def objective(trial):
    # pick from your small grid
    n_estimators = trial.suggest_categorical("n_estimators", [100, 200])
    max_depth = trial.suggest_categorical("max_depth", [10, 12])
    min_samples_split = trial.suggest_categorical("min_samples_split", [2, 5])
    min_samples_leaf = trial.suggest_categorical("min_samples_leaf", [1, 3])
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])

    params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
        "bootstrap": True,      # fix for simplicity
        "random_state": 42,
        "n_jobs": -1
    }

    model = RandomForestClassifier(**params)
    model.fit(X_train_resampled_nolog, y_train_resampled)
    y_pred_proba = model.predict(X_val_nolog)
    auc = roc_auc_score(y_val, y_pred_proba)
    return auc

In [37]:
# --- Run Optuna study ---
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=12, show_progress_bar=True)  

print("\n=== Best Trial ===")
print(f"Best ROC-AUC Score: {study.best_value:.4f}")
print("Best Parameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

[I 2025-11-18 13:17:20,534] A new study created in memory with name: no-name-87e946fe-0683-4c7e-b97a-f400832c622e
Best trial: 0. Best value: 0.690074:   8%|▊         | 1/12 [01:08<12:32, 68.42s/it]

[I 2025-11-18 13:18:28,955] Trial 0 finished with value: 0.6900744792430713 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.6900744792430713.


Best trial: 0. Best value: 0.690074:  17%|█▋        | 2/12 [02:06<10:23, 62.32s/it]

[I 2025-11-18 13:19:27,020] Trial 1 finished with value: 0.6900744792430713 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.6900744792430713.


Best trial: 0. Best value: 0.690074:  25%|██▌       | 3/12 [04:59<16:55, 112.87s/it]

[I 2025-11-18 13:22:20,023] Trial 2 finished with value: 0.6755074831444263 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.6900744792430713.


Best trial: 0. Best value: 0.690074:  33%|███▎      | 4/12 [08:27<20:02, 150.35s/it]

[I 2025-11-18 13:25:47,834] Trial 3 finished with value: 0.6835527471369921 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6900744792430713.


Best trial: 0. Best value: 0.690074:  42%|████▏     | 5/12 [11:53<19:53, 170.48s/it]

[I 2025-11-18 13:29:14,018] Trial 4 finished with value: 0.687300878693508 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6900744792430713.


Best trial: 0. Best value: 0.690074:  50%|█████     | 6/12 [14:36<16:47, 167.93s/it]

[I 2025-11-18 13:31:57,009] Trial 5 finished with value: 0.6790897505575442 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.6900744792430713.


Best trial: 0. Best value: 0.690074:  58%|█████▊    | 7/12 [15:59<11:41, 140.27s/it]

[I 2025-11-18 13:33:20,329] Trial 6 finished with value: 0.6779549970886856 and parameters: {'n_estimators': 100, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.6900744792430713.


Best trial: 0. Best value: 0.690074:  67%|██████▋   | 8/12 [19:28<10:48, 162.09s/it]

[I 2025-11-18 13:36:49,101] Trial 7 finished with value: 0.6741544037104558 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.6900744792430713.


Best trial: 0. Best value: 0.690074:  75%|███████▌  | 9/12 [22:25<08:20, 166.79s/it]

[I 2025-11-18 13:39:46,267] Trial 8 finished with value: 0.6790897505575442 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.6900744792430713.


Best trial: 0. Best value: 0.690074:  83%|████████▎ | 10/12 [24:23<05:03, 151.63s/it]

[I 2025-11-18 13:41:43,956] Trial 9 finished with value: 0.672120115470292 and parameters: {'n_estimators': 100, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6900744792430713.


Best trial: 0. Best value: 0.690074:  92%|█████████▏| 11/12 [26:22<02:21, 141.54s/it]

[I 2025-11-18 13:43:42,605] Trial 10 finished with value: 0.685982476536615 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6900744792430713.


Best trial: 0. Best value: 0.690074: 100%|██████████| 12/12 [27:39<00:00, 138.29s/it]

[I 2025-11-18 13:45:00,044] Trial 11 finished with value: 0.6900744792430713 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.6900744792430713.

=== Best Trial ===
Best ROC-AUC Score: 0.6901
Best Parameters:
  n_estimators: 100
  max_depth: 10
  min_samples_split: 2
  min_samples_leaf: 3
  max_features: log2





In [38]:
# --- Retrain best model ---
best_params = study.best_params
best_params["random_state"] = 42
best_params["n_jobs"] = -1

best_rf = RandomForestClassifier(**best_params)
best_rf.fit(X_train_resampled_nolog, y_train_resampled)

importances = best_rf.feature_importances_
features = X_train_resampled_log.columns
feat_imp = pd.DataFrame({'Feature': features, 'Importance': importances})
feat_imp = feat_imp.sort_values(by='Importance', ascending=False)

print("\n=== Feature Importances ===")
print(feat_imp.head(10))  # top 10 features

# --- Evaluate on training set ---
y_train_pred = best_rf.predict(X_train_resampled_nolog)
y_train_prob = best_rf.predict_proba(X_train_resampled_nolog)[:, 1]

print("\n=== Training Set Metrics ===")
print(classification_report(y_train_resampled, y_train_pred, digits=4))
roc_auc_train = roc_auc_score(y_train_resampled, y_train_prob)
pr_auc_train = average_precision_score(y_train_resampled, y_train_prob)
print(f"ROC-AUC Score: {roc_auc_train:.4f}")
print(f"PR-AUC Score:  {pr_auc_train:.4f}")

cm_train = confusion_matrix(y_train_resampled, y_train_pred)
print("\nConfusion Matrix (Training Set):")
print(cm_train)


=== Feature Importances ===
                         Feature  Importance
43                    housing_BF    0.203801
55              velocity_24h_log    0.137496
19                        age_30    0.084775
13               foreign_request    0.063663
16                         month    0.060994
33                 employment_CC    0.056072
26                    payment_AA    0.040614
3   current_address_months_count    0.036811
36                 employment_CF    0.031175
0                         income    0.025696

=== Training Set Metrics ===
              precision    recall  f1-score   support

           0     0.9658    0.9525    0.9591    786838
           1     0.9532    0.9663    0.9597    786838

    accuracy                         0.9594   1573676
   macro avg     0.9595    0.9594    0.9594   1573676
weighted avg     0.9595    0.9594    0.9594   1573676

ROC-AUC Score: 0.9922
PR-AUC Score:  0.9920

Confusion Matrix (Training Set):
[[749500  37338]
 [ 26551 760287]]


In [41]:
# --- Evaluate on validation set ---
y_val_pred = best_rf.predict(X_val_nolog)
y_val_prob = best_rf.predict_proba(X_val_nolog)[:, 1]

print("\n=== Validation Set Metrics ===")
print(classification_report(y_val, y_val_pred, digits=4))
roc_auc_val = roc_auc_score(y_val, y_val_prob)
pr_auc_val = average_precision_score(y_val, y_val_prob)
print(f"ROC-AUC Score: {roc_auc_val:.4f}")
print(f"PR-AUC Score:  {pr_auc_val:.4f}")

cm_val = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix (Validation Set):")
print(cm_val)


=== Validation Set Metrics ===
              precision    recall  f1-score   support

           0     0.9919    0.9484    0.9697    106718
           1     0.1021    0.4317    0.1652      1450

    accuracy                         0.9415    108168
   macro avg     0.5470    0.6901    0.5674    108168
weighted avg     0.9800    0.9415    0.9589    108168

ROC-AUC Score: 0.8613
PR-AUC Score:  0.1175

Confusion Matrix (Validation Set):
[[101214   5504]
 [   824    626]]


In [42]:
# --- Evaluate on test set ---
y_test_pred = best_rf.predict(X_test_nolog)
y_test_prob = best_rf.predict_proba(X_test_nolog)[:, 1]

print("\n=== Test Set Metrics ===")
print(classification_report(y_test, y_test_pred, digits=4))
roc_auc_test = roc_auc_score(y_test, y_test_prob)
pr_auc_test = average_precision_score(y_test, y_test_prob)
print(f"ROC-AUC Score: {roc_auc_test:.4f}")
print(f"PR-AUC Score:  {pr_auc_test:.4f}")

cm_test = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (Test Set):")
print(cm_test)


=== Test Set Metrics ===
              precision    recall  f1-score   support

           0     0.9913    0.9575    0.9741     95415
           1     0.1338    0.4384    0.2050      1428

    accuracy                         0.9499     96843
   macro avg     0.5625    0.6979    0.5896     96843
weighted avg     0.9787    0.9499    0.9628     96843

ROC-AUC Score: 0.8639
PR-AUC Score:  0.1354

Confusion Matrix (Test Set):
[[91362  4053]
 [  802   626]]


### Use original train with class weighting

#### Baseline

In [43]:
# --- Baseline Random Forest Model ---
rf_baseline = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced"
)

rf_baseline.fit(X_train_nolog, y_train)

# --- Evaluate on training set ---
y_train_pred = rf_baseline.predict(X_train_nolog)
y_train_prob = rf_baseline.predict_proba(X_train_nolog)[:, 1]

print("\n=== Training Set Metrics ===")
print(classification_report(y_train, y_train_pred, digits=4))
roc_auc_train = roc_auc_score(y_train, y_train_prob)
pr_auc_train = average_precision_score(y_train, y_train_prob)
print(f"ROC-AUC Score: {roc_auc_train:.4f}")
print(f"PR-AUC Score:  {pr_auc_train:.4f}")

cm_train = confusion_matrix(y_train, y_train_pred)
print("\nConfusion Matrix (Training Set):")
print(cm_train)


=== Training Set Metrics ===
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000    786838
           1     1.0000    0.9982    0.9991      8151

    accuracy                         1.0000    794989
   macro avg     1.0000    0.9991    0.9995    794989
weighted avg     1.0000    1.0000    1.0000    794989

ROC-AUC Score: 1.0000
PR-AUC Score:  1.0000

Confusion Matrix (Training Set):
[[786838      0]
 [    15   8136]]


In [44]:
# --- Evaluate on validation set ---
y_val_pred = rf_baseline.predict(X_val_nolog)
y_val_prob = rf_baseline.predict_proba(X_val_nolog)[:, 1]

print("\n=== Validation Set Metrics ===")
print(classification_report(y_val, y_val_pred, digits=4))
roc_auc_val = roc_auc_score(y_val, y_val_prob)
pr_auc_val = average_precision_score(y_val, y_val_prob)
print(f"ROC-AUC Score: {roc_auc_val:.4f}")
print(f"PR-AUC Score:  {pr_auc_val:.4f}")

cm_val = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix (Validation Set):")
print(cm_val)


=== Validation Set Metrics ===
              precision    recall  f1-score   support

           0     0.9866    1.0000    0.9932    106718
           1     0.0000    0.0000    0.0000      1450

    accuracy                         0.9866    108168
   macro avg     0.4933    0.5000    0.4966    108168
weighted avg     0.9734    0.9866    0.9799    108168

ROC-AUC Score: 0.8165
PR-AUC Score:  0.1088

Confusion Matrix (Validation Set):
[[106716      2]
 [  1450      0]]


In [45]:
# --- Evaluate on test set ---
y_test_pred = rf_baseline.predict(X_test_nolog)
y_test_prob = rf_baseline.predict_proba(X_test_nolog)[:, 1]

print("\n=== Test Set Metrics ===")
print(classification_report(y_test, y_test_pred, digits=4))
roc_auc_test = roc_auc_score(y_test, y_test_prob)
pr_auc_test = average_precision_score(y_test, y_test_prob)
print(f"ROC-AUC Score: {roc_auc_test:.4f}")
print(f"PR-AUC Score:  {pr_auc_test:.4f}")

cm_test = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (Test Set):")
print(cm_test)


=== Test Set Metrics ===
              precision    recall  f1-score   support

           0     0.9853    1.0000    0.9926     95415
           1     0.0000    0.0000    0.0000      1428

    accuracy                         0.9853     96843
   macro avg     0.4926    0.5000    0.4963     96843
weighted avg     0.9707    0.9853    0.9779     96843

ROC-AUC Score: 0.8003
PR-AUC Score:  0.1075

Confusion Matrix (Test Set):
[[95415     0]
 [ 1428     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


#### Hyperparameter tuning

In [46]:
def objective(trial):
    # pick from your small grid
    n_estimators = trial.suggest_categorical("n_estimators", [100, 200])
    max_depth = trial.suggest_categorical("max_depth", [10, 12])
    min_samples_split = trial.suggest_categorical("min_samples_split", [2, 5])
    min_samples_leaf = trial.suggest_categorical("min_samples_leaf", [1, 3])
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])

    params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
        "bootstrap": True,      # fix for simplicity
        "random_state": 42,
        "n_jobs": -1,
        "class_weight": "balanced"
    }

    model = RandomForestClassifier(**params)
    model.fit(X_train_nolog, y_train)
    y_pred_proba = model.predict(X_val_nolog)
    auc = roc_auc_score(y_val, y_pred_proba)
    return auc

In [47]:
# --- Run Optuna study ---
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=12, show_progress_bar=True)  

print("\n=== Best Trial ===")
print(f"Best ROC-AUC Score: {study.best_value:.4f}")
print("Best Parameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

[I 2025-11-18 13:49:29,374] A new study created in memory with name: no-name-a59808c4-127b-4e64-8dc6-8d7fa0f1fd48


Best trial: 0. Best value: 0.786326:   8%|▊         | 1/12 [00:44<08:07, 44.32s/it]

[I 2025-11-18 13:50:13,699] Trial 0 finished with value: 0.7863263993858127 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7863263993858127.


Best trial: 1. Best value: 0.786848:  17%|█▋        | 2/12 [02:33<13:43, 82.37s/it]

[I 2025-11-18 13:52:02,698] Trial 1 finished with value: 0.786848342166367 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.786848342166367.


Best trial: 1. Best value: 0.786848:  25%|██▌       | 3/12 [04:04<12:55, 86.21s/it]

[I 2025-11-18 13:53:33,478] Trial 2 finished with value: 0.7696260269572854 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 1 with value: 0.786848342166367.


Best trial: 3. Best value: 0.790218:  33%|███▎      | 4/12 [04:45<09:07, 68.47s/it]

[I 2025-11-18 13:54:14,770] Trial 3 finished with value: 0.790217860671793 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 3 with value: 0.790217860671793.


Best trial: 3. Best value: 0.790218:  42%|████▏     | 5/12 [06:36<09:48, 84.02s/it]

[I 2025-11-18 13:56:06,352] Trial 4 finished with value: 0.7637846441572408 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 3 with value: 0.790217860671793.


Best trial: 3. Best value: 0.790218:  50%|█████     | 6/12 [08:18<09:00, 90.08s/it]

[I 2025-11-18 13:57:48,198] Trial 5 finished with value: 0.7853012871176437 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 3 with value: 0.790217860671793.


Best trial: 3. Best value: 0.790218:  58%|█████▊    | 7/12 [09:57<07:45, 93.04s/it]

[I 2025-11-18 13:59:27,335] Trial 6 finished with value: 0.7853012871176437 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 3 with value: 0.790217860671793.


Best trial: 3. Best value: 0.790218:  67%|██████▋   | 8/12 [10:49<05:18, 79.70s/it]

[I 2025-11-18 14:00:18,481] Trial 7 finished with value: 0.7866524859911168 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 3 with value: 0.790217860671793.


Best trial: 3. Best value: 0.790218:  75%|███████▌  | 9/12 [11:57<03:48, 76.13s/it]

[I 2025-11-18 14:01:26,739] Trial 8 finished with value: 0.7899264190315307 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 3 with value: 0.790217860671793.


Best trial: 3. Best value: 0.790218:  83%|████████▎ | 10/12 [13:51<02:55, 87.95s/it]

[I 2025-11-18 14:03:21,156] Trial 9 finished with value: 0.7653419938206462 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 3 with value: 0.790217860671793.


Best trial: 3. Best value: 0.790218:  92%|█████████▏| 11/12 [14:36<01:14, 74.69s/it]

[I 2025-11-18 14:04:05,793] Trial 10 finished with value: 0.7706183425088744 and parameters: {'n_estimators': 100, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 3 with value: 0.790217860671793.


Best trial: 11. Best value: 0.793219: 100%|██████████| 12/12 [15:20<00:00, 76.70s/it]

[I 2025-11-18 14:04:49,766] Trial 11 finished with value: 0.7932191576769196 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 11 with value: 0.7932191576769196.

=== Best Trial ===
Best ROC-AUC Score: 0.7932
Best Parameters:
  n_estimators: 100
  max_depth: 10
  min_samples_split: 5
  min_samples_leaf: 1
  max_features: log2





In [48]:
# --- Retrain best model ---
best_params = study.best_params
best_params["random_state"] = 42
best_params["n_jobs"] = -1

best_rf = RandomForestClassifier(**best_params)
best_rf.fit(X_train_nolog, y_train)

# --- Evaluate on training set ---
y_train_pred = best_rf.predict(X_train_nolog)
y_train_prob = best_rf.predict_proba(X_train_nolog)[:, 1]

print("\n=== Training Set Metrics ===")
print(classification_report(y_train, y_train_pred, digits=4))
roc_auc_train = roc_auc_score(y_train, y_train_prob)
pr_auc_train = average_precision_score(y_train, y_train_prob)
print(f"ROC-AUC Score: {roc_auc_train:.4f}")
print(f"PR-AUC Score:  {pr_auc_train:.4f}")

cm_train = confusion_matrix(y_train, y_train_pred)
print("\nConfusion Matrix (Training Set):")
print(cm_train)


=== Training Set Metrics ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

           0     0.9897    1.0000    0.9948    786838
           1     0.0000    0.0000    0.0000      8151

    accuracy                         0.9897    794989
   macro avg     0.4949    0.5000    0.4974    794989
weighted avg     0.9796    0.9897    0.9846    794989

ROC-AUC Score: 0.9227
PR-AUC Score:  0.2706

Confusion Matrix (Training Set):
[[786838      0]
 [  8151      0]]


In [49]:
# --- Evaluate on validation set ---
y_val_pred = best_rf.predict(X_val_nolog)
y_val_prob = best_rf.predict_proba(X_val_nolog)[:, 1]

print("\n=== Validation Set Metrics ===")
print(classification_report(y_val, y_val_pred, digits=4))
roc_auc_val = roc_auc_score(y_val, y_val_prob)
pr_auc_val = average_precision_score(y_val, y_val_prob)
print(f"ROC-AUC Score: {roc_auc_val:.4f}")
print(f"PR-AUC Score:  {pr_auc_val:.4f}")

cm_val = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix (Validation Set):")
print(cm_val)


=== Validation Set Metrics ===
              precision    recall  f1-score   support

           0     0.9866    1.0000    0.9933    106718
           1     0.0000    0.0000    0.0000      1450

    accuracy                         0.9866    108168
   macro avg     0.4933    0.5000    0.4966    108168
weighted avg     0.9734    0.9866    0.9799    108168

ROC-AUC Score: 0.8722
PR-AUC Score:  0.1491

Confusion Matrix (Validation Set):
[[106718      0]
 [  1450      0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [50]:
# --- Evaluate on test set ---
y_test_pred = best_rf.predict(X_test_nolog)
y_test_prob = best_rf.predict_proba(X_test_nolog)[:, 1]

print("\n=== Test Set Metrics ===")
print(classification_report(y_test, y_test_pred, digits=4))
roc_auc_test = roc_auc_score(y_test, y_test_prob)
pr_auc_test = average_precision_score(y_test, y_test_prob)
print(f"ROC-AUC Score: {roc_auc_test:.4f}")
print(f"PR-AUC Score:  {pr_auc_test:.4f}")

cm_test = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (Test Set):")
print(cm_test)


=== Test Set Metrics ===
              precision    recall  f1-score   support

           0     0.9853    1.0000    0.9926     95415
           1     0.0000    0.0000    0.0000      1428

    accuracy                         0.9853     96843
   macro avg     0.4926    0.5000    0.4963     96843
weighted avg     0.9707    0.9853    0.9779     96843

ROC-AUC Score: 0.8739
PR-AUC Score:  0.1812

Confusion Matrix (Test Set):
[[95415     0]
 [ 1428     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
