##1. Download and Import Packages

In [1]:
!pip install optuna skorch torch

import torch
import torch.nn as nn
from skorch import NeuralNetClassifier

import optuna
from optuna.samplers import TPESampler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting skorch
  Downloading skorch-1.2.0-py3-none-any.whl.metadata (11 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading skorch-1.2.0-py3-none-any.whl (263 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.1/263.1 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, skorch, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0 skorch-1.2.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##2. Import Dataset

In [3]:
#read csv
df = pd.read_csv("/content/drive/MyDrive/bt4012/final_dataset.csv")

In [4]:
df

Unnamed: 0,AccidentArea,Sex,Age,Fault,FraudFound_P,Deductible,DriverRating,PoliceReportFiled,WitnessPresent,AgentType,...,VehicleCategory_Sedan,VehicleCategory_Sport,VehicleCategory_Utility,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability,DaysClaimProcessingDelay,DaysAccidentToClaimDelay,InvalidClaimProcessingDelay,DeductibleVehiclePriceRatio
0,1,1,21.0,0,0,5.707110,1,0,0,0,...,0,1,0,0,0,1,0.000000,0.0,1,0.004000
1,1,0,34.0,0,0,5.993961,4,1,0,0,...,0,1,0,0,1,0,1.791759,0.0,0,0.005333
2,1,0,47.0,0,0,5.993961,3,0,0,0,...,0,1,0,0,1,0,2.564949,0.0,0,0.005333
3,0,0,65.0,1,0,5.993961,2,1,0,0,...,0,1,0,0,0,1,3.135494,0.0,0,0.016327
4,1,1,27.0,1,0,5.993961,1,0,0,0,...,0,1,0,0,1,0,2.484907,0.0,0,0.005333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15415,1,0,35.0,0,1,5.993961,4,0,0,0,...,1,0,0,0,1,0,1.386294,0.0,0,0.016327
15416,1,0,30.0,0,0,5.993961,3,0,0,0,...,0,1,0,0,0,1,1.945910,0.0,0,0.011594
15417,0,0,24.0,0,1,5.993961,4,0,0,0,...,1,0,0,0,1,0,1.945910,0.0,0,0.016327
15418,1,1,34.0,1,0,5.993961,4,0,0,0,...,1,0,0,1,0,0,2.397895,0.0,0,0.016327


## 3. Set random state and create train/validation/test split

In [5]:
random_state = 42

X = df.drop(columns=["FraudFound_P"])
y = df["FraudFound_P"]

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=random_state
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.125, stratify=y_temp, random_state=random_state
)

## 4. Hyperparameter Tuning with Optuna for Multilayer Perceptron (MLP) model

In [6]:
# Standardise numerical features
cols_to_standardise = [
    "Age", "Deductible", "DriverRating", "MappedVehiclePrice",
    "MappedDaysPolicyAccident", "MappedDaysPolicyClaim",
    "MappedPastNumberOfClaims", "MappedAgeOfVehicle",
    "MappedNumberOfSuppliments", "MappedAddressChangeClaim",
    "MappedNumberOfCars", "DaysClaimProcessingDelay", "DeductibleVehiclePriceRatio", "DaysAccidentToClaimDelay"
]

# Performance metrics
scoring = {
    'roc_auc': 'roc_auc',
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

# Preprocessor to standardise features in Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), cols_to_standardise)
    ],
    remainder="passthrough"
)

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [8]:
# Define PyTorch MLP Model
class FraudMLP(nn.Module):
    def __init__(self, input_dim, hidden1=64, hidden2=32, dropout=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden2, 2)  # Binary classification (2 logits)
        )

    def forward(self, x):
        x = x.float()
        return self.model(x)

In [9]:
def objective(trial):
    # Define parameters and range of values to test
    hidden1 = trial.suggest_int("hidden1", 32, 256, log=True)
    hidden2 = trial.suggest_int("hidden2", 16, 128, log=True)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    epochs = trial.suggest_int("epochs", 30, 80)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)

    input_dim = preprocessor.fit_transform(X_train).shape[1]

    # Compute class weights to address class imbalance
    n_pos = (y_train == 1).sum()
    n_neg = (y_train == 0).sum()
    scale_pos_weight = n_neg / n_pos
    class_weights = torch.tensor([1.0, scale_pos_weight], dtype=torch.float32).to(device)

    # Define neural network model wrapped in skorch for sklearn compatibility
    net = NeuralNetClassifier(
        module=FraudMLP,
        module__input_dim=input_dim,
        module__hidden1=hidden1,
        module__hidden2=hidden2,
        module__dropout=dropout,
        max_epochs=epochs,
        lr=lr,
        batch_size=batch_size,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=weight_decay,
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        device=device,
        train_split=None,
        iterator_train__shuffle=True,
        verbose=0
    )

    # Create a pipeline that includes preprocessing + neural network
    pipeline = ImbPipeline([
        ("preprocessor", preprocessor),
        ("mlp", net)
    ])

    # Evaluate model using stratified 5-fold cross-validation on ROC-AUC
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    cv_results = cross_validate(
        pipeline, X_train, y_train,
        cv=cv, scoring=scoring, n_jobs=-1
    )

    return np.mean(cv_results["test_roc_auc"])

study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=random_state)
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest ROC AUC:", study.best_value)
print("Best Parameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

[I 2025-11-10 14:06:32,587] A new study created in memory with name: no-name-fb0b27ef-0fbb-46a7-ad34-f77af3e32f1e


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-10 14:10:29,539] Trial 0 finished with value: 0.7668262320860275 and parameters: {'hidden1': 69, 'hidden2': 116, 'dropout': 0.39279757672456206, 'lr': 0.0015751320499779737, 'batch_size': 32, 'epochs': 74, 'weight_decay': 0.00025378155082656634}. Best is trial 0 with value: 0.7668262320860275.
[I 2025-11-10 14:12:44,348] Trial 1 finished with value: 0.7879183766670383 and parameters: {'hidden1': 139, 'hidden2': 16, 'dropout': 0.4879639408647978, 'lr': 0.004622589001020831, 'batch_size': 32, 'epochs': 45, 'weight_decay': 0.0001256104370001356}. Best is trial 1 with value: 0.7879183766670383.
[I 2025-11-10 14:14:03,017] Trial 2 finished with value: 0.7992127172730082 and parameters: {'hidden1': 78, 'hidden2': 29, 'dropout': 0.34474115788895177, 'lr': 0.00019010245319870352, 'batch_size': 128, 'epochs': 70, 'weight_decay': 6.290644294586145e-06}. Best is trial 2 with value: 0.7992127172730082.
[I 2025-11-10 14:15:30,997] Trial 3 finished with value: 0.7595246430857655 and param

In [11]:
# Extract best parameters
best_params = study.best_params
input_dim = preprocessor.fit_transform(X_train).shape[1]

n_pos = (y_train == 1).sum()
n_neg = (y_train == 0).sum()
scale_pos_weight = n_neg / n_pos
class_weights = torch.tensor([1.0, scale_pos_weight], dtype=torch.float32).to(device)

# Instantiate the MLP model using the best hyperparameters
best_net = NeuralNetClassifier(
    module=FraudMLP,
    module__input_dim=input_dim,
    module__hidden1=best_params["hidden1"],
    module__hidden2=best_params["hidden2"],
    module__dropout=best_params["dropout"],
    optimizer=torch.optim.Adam,
    lr=best_params["lr"],
    optimizer__weight_decay=best_params["weight_decay"],
    criterion=nn.CrossEntropyLoss,
    criterion__weight=class_weights,
    max_epochs=best_params["epochs"],
    batch_size=best_params["batch_size"],
    device=device,
    train_split=None,
    iterator_train__shuffle=True,
    verbose=1,
)

best_mlp_pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("mlp", best_net)
])

# Train the final model on the full training set
best_mlp_pipeline.fit(X_train, y_train)

  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.6840[0m  0.7004
      2        [36m0.6518[0m  1.0220
      3        [36m0.6197[0m  0.9887
      4        [36m0.5899[0m  1.5081
      5        [36m0.5600[0m  1.4410
      6        [36m0.5484[0m  1.2374
      7        [36m0.5308[0m  0.6150
      8        [36m0.5260[0m  0.4810
      9        [36m0.5239[0m  0.4759
     10        [36m0.5106[0m  0.4712
     11        0.5122  0.4629
     12        [36m0.4995[0m  0.4818
     13        [36m0.4965[0m  0.4703
     14        [36m0.4893[0m  0.4751
     15        [36m0.4885[0m  0.4662
     16        [36m0.4861[0m  0.4718
     17        [36m0.4817[0m  0.4605
     18        [36m0.4756[0m  0.4582
     19        [36m0.4715[0m  0.4737
     20        0.4728  0.4689
     21        [36m0.4652[0m  0.4868
     22        [36m0.4574[0m  0.4769
     23        0.4682  0.4754
     24        [36m0.4484[0m  0.4642
     25        0.4589  0.4702
 

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [12]:
import joblib
import os

save_dir = "/content/drive/MyDrive/bt4012"
save_path = os.path.join(save_dir, "best_mlp_pipeline.pkl")

# Save the pipeline
joblib.dump(best_mlp_pipeline, save_path)
print(f"Pipeline saved at: {save_path}")

Pipeline saved at: /content/drive/MyDrive/bt4012/best_mlp_pipeline.pkl


In [25]:
# Get performance metrics
thresh = 0.5

y_train_proba = best_mlp_pipeline.predict_proba(X_train)[:, 1]
y_train_pred = (y_train_proba >= thresh).astype(int)
y_val_proba = best_mlp_pipeline.predict_proba(X_val)[:, 1]
y_val_pred = (y_val_proba >= thresh).astype(int)

train_metrics = {
    "ROC-AUC": roc_auc_score(y_train, y_train_proba),
    "Precision": precision_score(y_train, y_train_pred),
    "Recall": recall_score(y_train, y_train_pred),
    "F1": f1_score(y_train, y_train_pred)
}
val_metrics = {
    "ROC-AUC": roc_auc_score(y_val, y_val_proba),
    "Precision": precision_score(y_val, y_val_pred),
    "Recall": recall_score(y_val, y_val_pred),
    "F1": f1_score(y_val, y_val_pred)
}

metrics_df = pd.DataFrame({
    "Metric": ["ROC-AUC", "Precision", "Recall", "F1"],
    "Train (mean)": list(train_metrics.values()),
    "Validation (mean)": list(val_metrics.values())
})

print("\nTrain & Validation metrics for best hyperparameters:")
print(metrics_df)


Train & Validation metrics for best hyperparameters:
      Metric  Train (mean)  Validation (mean)
0    ROC-AUC      0.899345           0.793493
1  Precision      0.180938           0.139151
2     Recall      0.925697           0.641304
3         F1      0.302708           0.228682


## 5. Conduct Business-Optimal Threshold Tuning

In [23]:
from sklearn.metrics import confusion_matrix
cost_per_false_positive = 1
cost_per_false_negative = 20

best_profit = -float('inf')
best_business_thresh = 0.5

thresholds = np.arange(0.1, 0.9, 0.05)

for thresh in thresholds:
    y_pred_class = (y_val_proba > thresh).astype(int)

    # Confusion matrix values
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred_class).ravel()

    # Calculate profit/loss
    investigation_cost = fp * cost_per_false_positive
    fraud_savings = tp * cost_per_false_negative
    fraud_losses = fn * cost_per_false_negative

    net_benefit = fraud_savings - investigation_cost - fraud_losses

    if net_benefit > best_profit:
        best_profit = net_benefit
        best_business_thresh = thresh

        best_metrics = {
            'Precision': precision_score(y_val, y_pred_class),
            'Recall': recall_score(y_val, y_pred_class),
            'F1': f1_score(y_val, y_pred_class)
        }

print(f"Business-Optimal Threshold: {best_business_thresh:.2f}")
print(f"Net Benefit: {best_profit:,.0f}")
print(f"Metrics: {best_metrics}")

Business-Optimal Threshold: 0.20
Net Benefit: 951
Metrics: {'Precision': 0.12247838616714697, 'Recall': 0.9239130434782609, 'F1': 0.21628498727735368}


In [16]:
save_dir = "/content/drive/MyDrive/bt4012"
save_path = os.path.join(save_dir, "best_mlp_pipeline.pkl")
best_mlp_pipeline = joblib.load(save_path)
print("Pipeline loaded successfully!")

Pipeline loaded successfully!


## 6. View Most/Least Influential Features Using SHAP

In [19]:
import shap
import torch

mlp_model = best_mlp_pipeline.named_steps["mlp"].module_
mlp_model.eval()
preprocessor = best_mlp_pipeline.named_steps["preprocessor"]

X_train_transformed = preprocessor.transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

num_features = cols_to_standardise
other_features = [c for c in X_train.columns if c not in cols_to_standardise]
feature_names = num_features + other_features

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mlp_model.to(device)

# use a manageable background sample for DeepExplainer
bg_idx = np.random.choice(X_train_transformed.shape[0], size=100, replace=False)
background_tensor = torch.tensor(
    X_train_transformed[bg_idx], dtype=torch.float32, device=device
)
# limit validation rows to 500
val_idx = np.random.choice(X_val_transformed.shape[0], size=500, replace=False)
X_val_tensor = torch.tensor(
    X_val_transformed[val_idx], dtype=torch.float32, device=device
)

print("Initializing SHAP DeepExplainer on background sample...")
explainer = shap.DeepExplainer(mlp_model, background_tensor)

print(f"Computing SHAP values for {X_val_tensor.shape[0]} validation samples...")
shap_values = explainer.shap_values(X_val_tensor)[1] #Keep class 1 (fraud)

# ensure shape is (n_samples, n_features)
if shap_values.shape[0] == len(feature_names):
    shap_values = shap_values.T

shap_df = pd.DataFrame(shap_values, columns=feature_names)
mean_abs_shap = shap_df.abs().mean().sort_values(ascending=False)
importance_df = pd.DataFrame({
    "Feature": mean_abs_shap.index,
    "Mean|SHAP|": mean_abs_shap.values
})

print("\nTop 10 Most Influential Features (MLP):")
print(importance_df.head(10))

print("\nBottom 10 Least Influential Features (MLP):")
print(importance_df.tail(10))

Initializing SHAP DeepExplainer on background sample...
Computing SHAP values for 500 validation samples...

Top 10 Most Influential Features (MLP):
                     Feature  Mean|SHAP|
0                      Fault    1.251106
1       BasePolicy_Liability    0.238180
2      BasePolicy_All Perils    0.224113
3   DaysClaimProcessingDelay    0.161288
4       WeekOfMonthClaimed_5    0.147697
5   MappedAddressChangeClaim    0.104445
6  MappedNumberOfSuppliments    0.092252
7    DayOfWeekClaimed_Monday    0.087723
8               Make_Pontiac    0.082429
9           MonthClaimed_Mar    0.080260

Bottom 10 Least Influential Features (MLP):
                    Feature  Mean|SHAP|
88               Make_Dodge         0.0
89             Make_Ferrari         0.0
90                 Make_BMW         0.0
91              Make_Saturn         0.0
92             Make_Mecedes         0.0
93              Make_Nisson         0.0
94  DayOfWeekClaimed_Sunday         0.0
95   MaritalStatus_Divorced        

In [20]:
mlp_val_pred = pd.DataFrame({"y_val_proba": y_val_proba})
mlp_val_pred.to_csv("/content/drive/MyDrive/bt4012/mlp_val_pred.csv", index=False)