In [1]:
import pickle
import joblib
import xgboost as xgb
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.calibration import CalibratedClassifierCV
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, precision_score, recall_score, average_precision_score
import gc
import optuna

In [2]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install optuna-integration[xgboost]

Collecting optuna-integration[xgboost]
  Downloading optuna_integration-4.2.1-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.2.1-py3-none-any.whl (97 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m97.6/97.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.2.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
with open("/kaggle/input/fraud-detection-procesed-data/XBG_FE_processed_data.pkl", "rb") as f:
    data = joblib.load(f)

X_train, X_test, y_train, y_test = data["X_train"], data["X_test"], data["y_train"], data["y_test"]

In [28]:
import shutil

# Define the paths
source_db_path = "/kaggle/input/optuna-final-db/optuna_fraud_detection_final 135.db"
destination_db_path = "/kaggle/working/optuna_fraud_detection_final 140.db"

# Copy the database to a writable location
shutil.copy(source_db_path, destination_db_path)

# ‚úÖ Update Optuna to use the new writable DB path
DB_PATH = f"sqlite:///{destination_db_path}"

In [30]:
import os
from optuna.integration import XGBoostPruningCallback
from optuna.samplers import TPESampler
from sqlalchemy import create_engine
from optuna.pruners import HyperbandPruner, MedianPruner


gc.collect()

sampler = TPESampler(multivariate=True, warn_independent_sampling=False)

# Load previous trials from SQLite
engine = create_engine(DB_PATH.replace("sqlite:///", "sqlite:///"))

# Create Optuna study with SQLite storage
study = optuna.create_study(
    direction="maximize",
    storage=DB_PATH,  # ‚úÖ Store in SQLite
    study_name="fraud_detection",
    sampler=sampler,
    load_if_exists=True  # ‚úÖ Continue previous search if trials exist
)
#pruner = optuna.pruners.HyperbandPruner()
pruner = MedianPruner(n_warmup_steps=150) if study.trials_dataframe().shape[0] < 150 else HyperbandPruner()
study.pruner = pruner


# Define custom F1-score evaluation function
def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    y_pred_binary = (y_pred > 0.5).astype(int)
    f1 = f1_score(y_true, y_pred_binary)
    return "f1", f1


# Define the Optuna objective function
def objective(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 11, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.03, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.8, step=0.1),
        "scale_pos_weight": trial.suggest_int("scale_pos_weight", 40, 90),
        "min_child_weight": trial.suggest_int("min_child_weight", 6, 10),
        "gamma": trial.suggest_float("gamma", 1e-5, 0.005, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-5, 0.95, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 0.4, log=True),
        "tree_method": "hist",
        "device": "cuda",
        "eval_metric": "aucpr",
        "random_state": 42,
        "nthread": -1,
    }
    
    # Define number of boosting rounds (dynamic based on trial number)
    #if trial.number < 20:
    #    num_boost_round = trial.suggest_int("n_estimators", 100, 500, step=50)  # Fewer rounds for early trials
    #else:
    #    num_boost_round = trial.suggest_int("n_estimators", 1500, 4000, step=500)  # More rounds for fine-tuning

    
    num_boost_round = trial.suggest_int("n_estimators", 1500, 4000, step=500)
    #num_boost_round = trial.suggest_int("n_estimators", 1, 2, step=1)

    print(f"üîç Trial {trial.number}: Testing Params: {params} | Boost Rounds: {num_boost_round}")

    # Convert dataset into DMatrix (XGBoost format)
    dtrain = xgb.DMatrix(X_train, label=y_train)

    # Set up pruning callback
    pruning_callback = XGBoostPruningCallback(trial, "test-aucpr")

    # Perform cross-validation
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        nfold=5,
        custom_metric=f1_eval,  # ‚úÖ Also track F1-score
        maximize=True,  # ‚úÖ Optimize both AUC-PR & F1
        stratified=True,
        seed=42,
        verbose_eval=100,
        callbacks=[pruning_callback],
    )

    # Get best results
    best_aucpr = cv_results["test-aucpr-mean"].max()
    best_f1 = cv_results["test-f1-mean"].max() 
    final_score = (best_aucpr + best_f1) / 2  # Optimize both metrics

    print(f"üî• Best AUC-PR: {best_aucpr:.4f}, Best F1: {best_f1:.4f}, Final Score: {final_score: .4f}")

    # ‚úÖ Save to SQLite
    trial_results = pd.DataFrame([{
        "Trial": trial.number,
        "max_depth": params["max_depth"],
        "learning_rate": params["learning_rate"],
        "subsample": params["subsample"],
        "colsample_bytree": params["colsample_bytree"],
        "scale_pos_weight": params["scale_pos_weight"],
        "min_child_weight": params["min_child_weight"],
        "gamma": params["gamma"],
        "reg_alpha": params["reg_alpha"],
        "reg_lambda": params["reg_lambda"],
        "Boost Rounds": num_boost_round,
        "Best AUC-PR": best_aucpr,
        "Best F1": best_f1,
        "Final Score": final_score
    }])


    return final_score 

# Run Optuna optimization

study.optimize(objective, n_trials=140)

# Print best hyperparameters
print("‚úÖ Best Hyperparameters:", study.best_params)

[I 2025-03-13 16:30:53,504] Using an existing study with name 'fraud_detection' instead of creating a new one.


üîç Trial 140: Testing Params: {'max_depth': 12, 'learning_rate': 0.027900974155015187, 'subsample': 0.7, 'colsample_bytree': 0.8, 'scale_pos_weight': 62, 'min_child_weight': 8, 'gamma': 0.0003407123923165338, 'reg_alpha': 0.04559318256480473, 'reg_lambda': 8.245848508665857e-05, 'tree_method': 'hist', 'device': 'cuda', 'eval_metric': 'aucpr', 'random_state': 42, 'nthread': -1} | Boost Rounds: 3500
[0]	train-aucpr:0.45063+0.01317	train-f1:0.06761+0.00000	test-aucpr:0.39870+0.01222	test-f1:0.06761+0.00000
[100]	train-aucpr:0.82801+0.00163	train-f1:0.39014+0.00258	test-aucpr:0.65577+0.01103	test-f1:0.33845+0.00502


[I 2025-03-13 16:35:33,062] Trial 140 pruned. Trial was pruned at iteration 162.


‚úÖ Best Hyperparameters: {'max_depth': 12, 'learning_rate': 0.017309377551183877, 'subsample': 0.8, 'colsample_bytree': 0.6, 'scale_pos_weight': 40, 'min_child_weight': 9, 'gamma': 0.0010464167006924024, 'reg_alpha': 0.3487091763789718, 'reg_lambda': 9.441098792689792e-05, 'n_estimators': 2500}


In [31]:
with open("optuna_best_params.pkl", "wb") as f:
    pickle.dump(study.best_params, f)

In [32]:
# ‚úÖ Train Final Model Using Best Iteration
xgb_model = xgb.XGBClassifier(**study.best_params)

# ‚úÖ Fit Model with Verbose
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=100,
)

print("‚úÖ XGBoost training complete!")

[0]	validation_0-logloss:0.86775
[100]	validation_0-logloss:0.36906
[200]	validation_0-logloss:0.25500
[300]	validation_0-logloss:0.21329
[400]	validation_0-logloss:0.17912
[500]	validation_0-logloss:0.15346
[600]	validation_0-logloss:0.13170
[700]	validation_0-logloss:0.11432
[800]	validation_0-logloss:0.10164
[900]	validation_0-logloss:0.09114
[1000]	validation_0-logloss:0.08299
[1100]	validation_0-logloss:0.07561
[1200]	validation_0-logloss:0.06959
[1300]	validation_0-logloss:0.06475
[1400]	validation_0-logloss:0.06047
[1500]	validation_0-logloss:0.05711
[1600]	validation_0-logloss:0.05424
[1700]	validation_0-logloss:0.05202
[1800]	validation_0-logloss:0.04990
[1900]	validation_0-logloss:0.04805
[2000]	validation_0-logloss:0.04663
[2100]	validation_0-logloss:0.04534
[2200]	validation_0-logloss:0.04429
[2300]	validation_0-logloss:0.04333
[2400]	validation_0-logloss:0.04258
[2499]	validation_0-logloss:0.04196
‚úÖ XGBoost training complete!


In [33]:
# Apply probability calibration
calibrator = CalibratedClassifierCV(xgb_model, method="sigmoid", cv="prefit")
with tqdm(total=1, desc="Calibrating Model", unit="step") as pbar:
    calibrator.fit(X_train, y_train)  # Train the calibration model
    pbar.update(1)  # Update progress after fit()

print("‚úÖ Calibration complete!")

# Get calibrated probabilities
y_proba_calibrated = calibrator.predict_proba(X_test)[:, 1]

print("‚úÖ Applied Post-Training Calibration!")


Calibrating Model: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:38<00:00, 38.99s/step]


‚úÖ Calibration complete!
‚úÖ Applied Post-Training Calibration!


In [35]:
# Get predictions using threshold 0.4 (adjust if needed)
optimal_threshold = 0.15
y_pred_adjusted = (y_proba_calibrated > optimal_threshold).astype(int)

# Compute Metrics
roc_auc = roc_auc_score(y_test, y_proba_calibrated)
pr_auc = average_precision_score(y_test, y_proba_calibrated)
f1 = f1_score(y_test, y_pred_adjusted)
precision = precision_score(y_test, y_pred_adjusted)
recall = recall_score(y_test, y_pred_adjusted)

# Print Results
print(f"üìä Model Evaluation:")
print(f"üîπ ROC-AUC: {roc_auc:.4f}")
print(f"üîπ Precision: {precision:.4f}")
print(f"üîπ Recall: {recall:.4f}")
print(f"üîπ F1 Score: {f1:.4f}")
print(f"üìå Precision-Recall AUC: {pr_auc:.4f}")

üìä Model Evaluation:
üîπ ROC-AUC: 0.9762
üîπ Precision: 0.9616
üîπ Recall: 0.7026
üîπ F1 Score: 0.8120
üìå Precision-Recall AUC: 0.8784


In [36]:
save_dict = {
    "model": xgb_model,  # Trained XGBoost model
    "calibrator": calibrator,  # ‚úÖ Save the probability calibrator
    "feature_names": X_train.columns.tolist(),  # Ensures correct input order
}

with open("xgb_fraud_detection_Malwi_GPU_Kaggle_13_03.pkl", "wb") as f:
    pickle.dump(save_dict, f)

print("‚úÖ Model saved!")

‚úÖ Model saved!
