In [12]:
import os
from datetime import datetime
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import mlflow
import mlflow.lightgbm
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, f1_score,
    accuracy_score, matthews_corrcoef, cohen_kappa_score, log_loss,
    confusion_matrix
)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler

## Load data

In [13]:
# Read the raw credit-card transactions CSV into `df` and preview the
# first three rows as the cell output.
df = pd.read_csv(filepath_or_buffer='../data/01_raw/creditcard.csv')
df.head(n=3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


## Standardizing the Time and Amount columns

In [14]:
# Standardise 'Amount' and 'Time', each with its own freshly fitted
# StandardScaler, and append the results as 'stdAmount' and 'stdTime'.
# NOTE: this cell rebinds `df` and removes the raw columns, so running it a
# second time without reloading the data fails on the missing columns.
for raw_col, scaled_col in (('Amount', 'stdAmount'), ('Time', 'stdTime')):
    # StandardScaler expects a 2-D input, hence the reshape to a single column.
    df[scaled_col] = StandardScaler().fit_transform(df[raw_col].values.reshape(-1, 1))

# The unscaled originals are no longer needed once the scaled copies exist.
df = df.drop(columns=['Time', 'Amount'])

df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,stdAmount,stdTime
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,0.244964,-1.996583
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.342475,-1.996583
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,1.160686,-1.996562
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,0.140534,-1.996562
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,-0.073403,-1.996541


## Balance classes by undersampling

In [15]:
# Split the rows by label: Class == 0 is treated as the majority class and
# Class == 1 as the minority class.
class_0 = df.loc[df['Class'].eq(0)]
class_1 = df.loc[df['Class'].eq(1)]

# Randomly keep only as many Class-0 rows as there are Class-1 rows
# (seeded so the draw is repeatable).
undersampled_class_0 = class_0.sample(n=class_1.shape[0], random_state=42)

# Stack both groups, then shuffle the rows (seeded) and rebuild a clean
# 0..n-1 index so the two classes are interleaved.
balanced_df = pd.concat([undersampled_class_0, class_1])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

## Define features and target

In [16]:
# The label column is 'Class'; every other column of the balanced frame is
# used as a model input.
target = balanced_df.loc[:, 'Class']
features = balanced_df.drop('Class', axis='columns')

## MLflow setup

In [17]:
# Absolute path of the local `mlruns` folder, resolved against the current
# working directory.
tracking_uri = os.path.abspath("mlruns")

# Point MLflow at that folder through a well-formed file URI.
# Path.as_uri() yields `file:///...` with forward slashes on every platform.
# The earlier hand-built f"file:\\{tracking_uri}" produced `file:\<path>`,
# which is not a valid URI and does not resolve to the intended directory on
# POSIX systems.
mlflow.set_tracking_uri(Path(tracking_uri).as_uri())

# Name of the experiment all runs of this notebook are grouped under.
experiment_name = "lightgbm-experiment"

# Make it the active experiment (MLflow creates it if it does not exist yet).
mlflow.set_experiment(experiment_name)

# Build a run name that is unique per execution by appending a
# second-resolution timestamp to the experiment name.
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
run_name = f"{experiment_name}_{current_time}"

## Cross-validation setup

In [18]:
# Stratified 5-fold splitter; shuffling is seeded so the fold assignment is
# the same on every run.
kf = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Buffer with one slot per row of the balanced frame for out-of-fold
# predictions, plus an (initially empty) list for per-fold results.
oof_preds = np.zeros(balanced_df.shape[0])
fold_results = []

## Train the model

In [19]:
# Hyperparameters explored by the grid search: 2 values for each of the
# 4 settings, i.e. 16 combinations in total.
param_grid = dict(
    learning_rate=[0.01, 0.1],
    num_leaves=[31, 63],
    max_depth=[-1, 5],
    n_estimators=[100, 200],
)

# LightGBM settings held fixed for every candidate (not part of the grid).
base_params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    random_state=42,
)

# Scikit-learn style estimator carrying the fixed settings; the grid search
# fills in the tuned ones.
lgb_estimator = lgb.LGBMClassifier(**base_params)

# Name used for the MLflow run.
# NOTE: this replaces the timestamped `run_name` built in the MLflow setup
# cell, so every execution is logged under the same fixed name.
run_name = "my_experiment_with_grid_search"

In [20]:
# Single MLflow run that (1) tunes LightGBM with a grid search, (2) re-trains
# one booster per CV fold with the winning hyperparameters to collect
# out-of-fold (OOF) predictions, (3) computes and logs overall metrics and a
# confusion-matrix image, and (4) logs the refitted best estimator.
with mlflow.start_run(run_name=run_name):
    print(f"Running experiment with hyperparameter tuning: {run_name}")

    # 1. Grid search over `param_grid`, scored by ROC AUC on the folds of `kf`.
    grid_search = GridSearchCV(
        estimator=lgb_estimator,
        param_grid=param_grid,
        scoring='roc_auc',
        cv=kf,
        n_jobs=-1,
        verbose=1,
        return_train_score=True
    )

    # 2. Fit on the entire (balanced) dataset.
    grid_search.fit(features, target)

    # 3. Retrieve and log the best hyperparameters and their mean CV score.
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print("Best parameters found by GridSearchCV:", best_params)
    print("Best CV AUC found by GridSearchCV:", best_score)

    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_auc", best_score)

    # 4. Estimator refitted by GridSearchCV with the best parameters; this is
    #    the object logged as the final model at the end of the run.
    best_model = grid_search.best_estimator_

    # Manual CV loop with the best parameters to obtain OOF predictions.
    # NOTE(review): each fold's validation split drives early stopping AND is
    # the data the fold is scored on, so the OOF metrics are likely somewhat
    # optimistic. They also come from these per-fold boosters, not from
    # `best_model`.
    oof_preds = np.zeros(len(features))

    for fold, (train_idx, val_idx) in enumerate(kf.split(features, target)):
        X_train, X_val = features.iloc[train_idx], features.iloc[val_idx]
        y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

        # Wrap the fold's data in LightGBM's native Dataset containers.
        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        # Fixed settings overlaid with the tuned ones (tuned values win on
        # any key collision).
        params = {**base_params, **best_params}

        model_fold = lgb.train(
            params,
            train_data,
            valid_sets=[train_data, val_data],
            callbacks=[lgb.early_stopping(stopping_rounds=10)]
        )

        # Predict with the iteration selected by early stopping and store the
        # scores at the positions of the held-out rows.
        val_preds = model_fold.predict(X_val, num_iteration=model_fold.best_iteration)
        oof_preds[val_idx] = val_preds

    # Overall metrics from the OOF predictions; hard labels use a 0.5 cut-off.
    overall_auc = roc_auc_score(target, oof_preds)
    y_pred_overall = (oof_preds > 0.5).astype(int)
    overall_precision = precision_score(target, y_pred_overall)
    overall_recall = recall_score(target, y_pred_overall)
    overall_f1 = f1_score(target, y_pred_overall)
    overall_accuracy = accuracy_score(target, y_pred_overall)
    overall_mcc = matthews_corrcoef(target, y_pred_overall)
    overall_cohen_kappa = cohen_kappa_score(target, y_pred_overall)
    overall_log_loss_ = log_loss(target, oof_preds)

    # (printed label, MLflow metric key, value) - one row per metric so the
    # console report and the MLflow log cannot drift apart.
    overall_metrics = [
        ("Overall AUC", "overall_auc", overall_auc),
        ("Overall Precision", "overall_precision", overall_precision),
        ("Overall Recall", "overall_recall", overall_recall),
        ("Overall F1 Score", "overall_f1", overall_f1),
        ("Overall Accuracy", "overall_accuracy", overall_accuracy),
        ("Overall MCC", "overall_mcc", overall_mcc),
        ("Overall Cohen Kappa", "overall_cohen_kappa", overall_cohen_kappa),
        ("Overall Log Loss", "overall_log_loss", overall_log_loss_),
    ]

    for label, _, value in overall_metrics:
        print(f"{label}: {value}")

    # Log these metrics to MLflow.
    for _, metric_key, value in overall_metrics:
        mlflow.log_metric(metric_key, value)

    # Confusion matrix of the thresholded OOF predictions, drawn on an
    # explicit figure/axes pair.
    cm_overall = confusion_matrix(target, y_pred_overall)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm_overall, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Class 0", "Class 1"],
                yticklabels=["Class 0", "Class 1"],
                ax=ax)
    ax.set_title("Confusion Matrix - Overall")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

    # Save and log the plot. The output folder is created first so the save
    # does not fail with FileNotFoundError when `extras/` is absent.
    cm_image_path = "extras/confusion_matrix_overall.png"
    os.makedirs(os.path.dirname(cm_image_path), exist_ok=True)
    fig.savefig(cm_image_path)
    plt.close(fig)
    mlflow.log_artifact(cm_image_path)

    # Log the final best model (the GridSearchCV-refitted estimator).
    mlflow.lightgbm.log_model(best_model, artifact_path="lightgbm_best_model")

print("Training completed with hyperparameter tuning and MLflow logging.")

Running experiment with hyperparameter tuning: my_experiment_with_grid_search
Fitting 5 folds for each of 16 candidates, totalling 80 fits




[LightGBM] [Info] Number of positive: 492, number of negative: 492
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000809 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 984, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best parameters found by GridSearchCV: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'num_leaves': 31}
Best CV AUC found by GridSearchCV: 0.9827937854176921
[LightGBM] [Info] Number of positive: 393, number of negative: 394
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000690 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7649
[LightGBM] [Info] Number of data points in the train set: 787, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]:



Early stopping, best iteration is:
[14]	training's auc: 0.995011	valid_1's auc: 0.982014
[LightGBM] [Info] Number of positive: 393, number of negative: 394
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 787, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499365 -> initscore=-0.002541
[LightGBM] [Info] Start training from score -0.002541
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[22]	training's auc: 0.997365	valid_1's auc: 0.990311
[LightGBM] [Info] Number of positive: 394, number of negative: 393
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000887 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM]



Training completed with hyperparameter tuning and MLflow logging.
