## 1. Download and Import Packages

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_validate, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
# from imblearn.over_sampling import SMOTE
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
import optuna
from optuna.samplers import TPESampler


## 2. Import Dataset

In [None]:
# Mount Google Drive at /content/drive so the dataset path used below resolves.
# The `google.colab` module is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the dataset CSV from the mounted Drive folder into `df`.
df = pd.read_csv("/content/drive/MyDrive/bt4012/final_dataset.csv")

In [None]:
# Show the loaded frame as the cell output for a quick visual sanity check.
df

Unnamed: 0,AccidentArea,Sex,Age,Fault,FraudFound_P,Deductible,DriverRating,PoliceReportFiled,WitnessPresent,AgentType,...,VehicleCategory_Sedan,VehicleCategory_Sport,VehicleCategory_Utility,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability,DaysClaimProcessingDelay,DaysAccidentToClaimDelay,InvalidClaimProcessingDelay,DeductibleVehiclePriceRatio
0,1,1,21.0,0,0,5.707110,1,0,0,0,...,0,1,0,0,0,1,0.000000,0.0,1,0.004000
1,1,0,34.0,0,0,5.993961,4,1,0,0,...,0,1,0,0,1,0,1.791759,0.0,0,0.005333
2,1,0,47.0,0,0,5.993961,3,0,0,0,...,0,1,0,0,1,0,2.564949,0.0,0,0.005333
3,0,0,65.0,1,0,5.993961,2,1,0,0,...,0,1,0,0,0,1,3.135494,0.0,0,0.016327
4,1,1,27.0,1,0,5.993961,1,0,0,0,...,0,1,0,0,1,0,2.484907,0.0,0,0.005333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15415,1,0,35.0,0,1,5.993961,4,0,0,0,...,1,0,0,0,1,0,1.386294,0.0,0,0.016327
15416,1,0,30.0,0,0,5.993961,3,0,0,0,...,0,1,0,0,0,1,1.945910,0.0,0,0.011594
15417,0,0,24.0,0,1,5.993961,4,0,0,0,...,1,0,0,0,1,0,1.945910,0.0,0,0.016327
15418,1,1,34.0,1,0,5.993961,4,0,0,0,...,1,0,0,1,0,0,2.397895,0.0,0,0.016327


## 3. Set random state and create train/validation/test split

In [None]:
# One seed shared by every splitter, model and sampler in this notebook.
random_state = 42

# Columns scaled by StandardScaler; every other column is passed through as-is.
cols_to_standardise = [
    "Age",
    "Deductible",
    "DriverRating",
    "MappedVehiclePrice",
    "MappedDaysPolicyAccident",
    "MappedDaysPolicyClaim",
    "MappedPastNumberOfClaims",
    "MappedAgeOfVehicle",
    "MappedNumberOfSuppliments",
    "MappedAddressChangeClaim",
    "MappedNumberOfCars",
    "DaysClaimProcessingDelay",
    "DeductibleVehiclePriceRatio",
    "DaysAccidentToClaimDelay",
]

# Separate the label column from the feature matrix.
target_col = "FraudFound_P"
y = df[target_col]
X = df.drop(columns=[target_col])

# Stage 1: hold out 20% of all rows as the test set (stratified on the label).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=random_state
)
# Stage 2: carve 12.5% of the remaining 80% (10% of all rows) out as validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, stratify=y_temp, test_size=0.125, random_state=random_state
)

# Scorers for cross-validation: ROC-AUC by name, the rest wrapped via make_scorer.
scoring = {"roc_auc": "roc_auc"}
scoring.update(
    (metric_name, make_scorer(metric_fn))
    for metric_name, metric_fn in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
)

# Standardise the listed columns and leave the remaining ones untouched.
numeric_scaler = ("num", StandardScaler(), cols_to_standardise)
preprocessor = ColumnTransformer(
    transformers=[numeric_scaler],
    remainder="passthrough",
)

## 4. Baseline model with no tuning

In [None]:
# Untuned reference model: fixed Random Forest settings behind the shared preprocessor.
baseline_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    min_samples_split=15,
    min_samples_leaf=10,
    class_weight='balanced_subsample',
    random_state=random_state,
)
pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("rf", baseline_rf),
])

# Built-in scorer names, keyed by themselves.
scoring = {scorer_name: scorer_name for scorer_name in ("roc_auc", "precision", "recall", "f1")}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# 5-fold stratified CV on the training split, keeping the train-fold scores too.
cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    n_jobs=-1,
)

# Map each scorer key to the label shown in the summary table.
metric_labels = {
    "roc_auc": "ROC-AUC",
    "precision": "Precision",
    "recall": "Recall",
    "f1": "F1",
}
metrics_df = pd.DataFrame({
    "Metric": list(metric_labels.values()),
    "Train (mean)": [np.mean(cv_results[f"train_{key}"]) for key in metric_labels],
    "CV Validation (mean)": [np.mean(cv_results[f"test_{key}"]) for key in metric_labels],
})

print("\nCross-Validation Metrics:")
print(metrics_df)

# Refit on the full training split, then score the held-out validation split.
pipeline.fit(X_train, y_train)

y_val_proba = pipeline.predict_proba(X_val)[:, 1]
y_val_pred = pipeline.predict(X_val)

val_metrics = dict(
    [
        ("ROC-AUC", roc_auc_score(y_val, y_val_proba)),
        ("Precision", precision_score(y_val, y_val_pred)),
        ("Recall", recall_score(y_val, y_val_pred)),
        ("F1", f1_score(y_val, y_val_pred)),
    ]
)

print("\nValidation Metrics:")
print(pd.DataFrame(val_metrics, index=["Score"]).T)



Cross-Validation Metrics:
      Metric  Train (mean)  CV Validation (mean)
0    ROC-AUC      0.915198              0.804928
1  Precision      0.157752              0.138598
2     Recall      0.953170              0.840549
3         F1      0.270640              0.237935

Validation Metrics:
              Score
ROC-AUC    0.788486
Precision  0.133690
Recall     0.815217
F1         0.229709


## 5. Hyperparameter Tuning with Optuna for Random Forest model

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold stratified CV ROC-AUC of a candidate Random Forest.

    Reads the notebook-level `preprocessor`, `X_train`, `y_train` and
    `random_state`. Only ROC-AUC is scored, so the function no longer depends
    on the notebook-level `scoring` dict.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the forest hyperparameters.

    Returns
    -------
    float
        Mean ROC-AUC across the five validation folds (to be maximised).
    """
    # Keep the suggest_* calls in this order: the seeded sampler draws values
    # in call order, so reordering would change the sampled configurations.
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 2, 20),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "bootstrap": bootstrap,
        "class_weight": "balanced",
        "random_state": random_state,
        "n_jobs": -1
    }

    # `max_samples` is only sampled (and passed to the forest) when bootstrap=True.
    if bootstrap:
        params["max_samples"] = trial.suggest_float("max_samples", 0.6, 1.0)

    candidate = ImbPipeline([
        ("preprocessor", preprocessor),
        ("rf", RandomForestClassifier(**params))
    ])

    # Same seeded 5-fold stratified splitter for every trial, so trials are
    # compared on identical folds.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    # Score only the metric being optimised instead of a multi-metric dict.
    fold_auc = cross_val_score(
        candidate, X_train, y_train,
        cv=cv, scoring="roc_auc", n_jobs=-1
    )

    return float(np.mean(fold_auc))

# Optuna optimisation: maximise the objective over 50 trials with a seeded TPE sampler.
tpe_sampler = TPESampler(seed=random_state)
study = optuna.create_study(sampler=tpe_sampler, direction="maximize")
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best ROC AUC:", study.best_value)
print("Best Params:", study.best_params)

[I 2025-11-10 02:35:19,585] A new study created in memory with name: no-name-2a72efcc-4ff2-479d-a537-19751ad28da6


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-10 02:35:33,409] Trial 0 finished with value: 0.8034956087230268 and parameters: {'bootstrap': False, 'n_estimators': 393, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 0 with value: 0.8034956087230268.
[I 2025-11-10 02:35:37,670] Trial 1 finished with value: 0.8022368287702972 and parameters: {'bootstrap': False, 'n_estimators': 108, 'max_depth': 15, 'min_samples_split': 17, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 0 with value: 0.8034956087230268.
[I 2025-11-10 02:35:45,714] Trial 2 finished with value: 0.8053299763445001 and parameters: {'bootstrap': False, 'n_estimators': 273, 'max_depth': 6, 'min_samples_split': 13, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 2 with value: 0.8053299763445001.
[I 2025-11-10 02:35:52,204] Trial 3 finished with value: 0.8049204698258455 and parameters: {'bootstrap': False, 'n_estimators': 180, 'max_depth': 9, 'min_samples_split': 13, 'min_sample

In [None]:
# Start from the best trial's sampled values and add back the fixed settings
# that the objective always used but that are not part of the search space.
best_params = {
    **study.best_params,
    "class_weight": "balanced",
    "random_state": random_state,
    "n_jobs": -1,
}

tuned_rf = RandomForestClassifier(**best_params)
final_pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("rf", tuned_rf),
])

# Fit on the training split; the fitted pipeline is also the cell's displayed output.
final_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



## Predictions for Train and Validation Sets

In [None]:
# Score the tuned pipeline on the split it was fitted on (train) and on the
# held-out validation split. Class predictions feed precision/recall/F1;
# positive-class probabilities feed ROC-AUC.
y_train_pred = final_pipeline.predict(X_train)
y_train_proba = final_pipeline.predict_proba(X_train)[:, 1]

y_val_pred = final_pipeline.predict(X_val)
y_val_proba = final_pipeline.predict_proba(X_val)[:, 1]

train_roc_auc = roc_auc_score(y_train, y_train_proba)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

val_roc_auc = roc_auc_score(y_val, y_val_proba)
val_precision = precision_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

# These are scores from a single fit, not fold averages, so the columns are
# labelled without the "(mean)" suffix used by the cross-validation table.
metrics_df = pd.DataFrame({
    "Metric": ["ROC-AUC", "Precision", "Recall", "F1"],
    "Train": [train_roc_auc, train_precision, train_recall, train_f1],
    "Validation": [val_roc_auc, val_precision, val_recall, val_f1]
})

print("Best params:", study.best_params)
print("\nTrain & Validation metrics for best hyperparameters:")
print(metrics_df)

Best params: {'bootstrap': False, 'n_estimators': 369, 'max_depth': 7, 'min_samples_split': 14, 'min_samples_leaf': 14, 'max_features': 'sqrt'}

Train & Validation metrics for best hyperparameters:
      Metric  Train (mean)  Validation (mean)
0    ROC-AUC      0.890922           0.798396
1  Precision      0.136524           0.130081
2     Recall      0.958204           0.869565
3         F1      0.238996           0.226308


## 6. Business-Optimal Threshold Tuning

In [None]:
# Use the Optuna-trained Random Forest pipeline
best_pipeline = final_pipeline

# Predicted probabilities for the positive (fraud) class
y_pred_proba_val = best_pipeline.predict_proba(X_val)[:, 1]

# Candidate decision thresholds to evaluate
thresholds = np.arange(0.1, 0.9, 0.05)

# Relative unit costs used by the net-benefit formula below: a false positive
# costs 1 unit, and each fraud case is worth 20 units (gained when caught,
# lost when missed).
cost_per_false_positive = 1
cost_per_false_negative = 20

# ROC-AUC is computed from the probabilities, not the thresholded labels, so
# it is identical for every threshold: compute it once outside the loop.
current_auc = roc_auc_score(y_val, y_pred_proba_val)

# Running best for the business objective
best_profit = -float('inf')
best_business_thresh = 0.5
best_business_auc = 0
best_metrics = {}

for thresh in thresholds:
    # Convert probabilities to binary predictions at the current threshold
    y_pred_class = (y_pred_proba_val > thresh).astype(int)

    # Threshold-dependent performance metrics
    current_f1 = f1_score(y_val, y_pred_class, zero_division=0)
    current_precision = precision_score(y_val, y_pred_class, zero_division=0)
    current_recall = recall_score(y_val, y_pred_class, zero_division=0)

    # Pin the label order so the matrix is always 2x2 and unpacks to four
    # counts (assumes the target is encoded as 0/1, as in the data preview).
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred_class, labels=[0, 1]).ravel()

    investigation_cost = fp * cost_per_false_positive
    fraud_savings = tp * cost_per_false_negative
    fraud_losses = fn * cost_per_false_negative

    net_benefit = fraud_savings - investigation_cost - fraud_losses

    # Keep the first threshold that achieves the highest net benefit
    if net_benefit > best_profit:
        best_profit = net_benefit
        best_business_thresh = thresh
        best_business_auc = current_auc
        best_metrics = {
            'Precision': current_precision,
            'Recall': current_recall,
            'F1': current_f1,
            'ROC-AUC': current_auc
        }

print("\n--- Business Objective Results ---")
print(f"Business-Optimal Threshold: {best_business_thresh:.2f}")
print(f"Net Benefit: {best_profit:,.0f}")
print(f"ROC-AUC: {best_business_auc:.4f}")
print(f"Metrics: {best_metrics}")


Optimal Threshold for F1 Score: 0.60
Max F1 Score at this threshold: 0.2623
Precision at this threshold: 0.1752
Recall at this threshold: 0.5217
ROC-AUC: 0.7984

--- Business Objective Results ---
Business-Optimal Threshold: 0.40
Net Benefit: 1,045
ROC-AUC: 0.7984
Metrics: {'Precision': 0.12756598240469208, 'Recall': 0.9456521739130435, 'F1': 0.2248062015503876, 'ROC-AUC': np.float64(0.7983958020989504)}


## 7. View Most/Least Influential Features Using SHAP

In [None]:
import shap
import pandas as pd
import numpy as np

# Pull the fitted forest and preprocessor back out of the tuned pipeline.
fitted_steps = final_pipeline.named_steps
rf_model = fitted_steps["rf"]
preprocessor = fitted_steps["preprocessor"]

# Run the validation features through the fitted preprocessor and convert to a
# dense array when the result exposes `toarray`.
X_val_transformed = preprocessor.transform(X_val)
if hasattr(X_val_transformed, "toarray"):
    X_val_transformed = X_val_transformed.toarray()

# Output column names, in the order the preprocessor emits them.
feature_names = preprocessor.get_feature_names_out()

# Tree-specific explainer for the fitted forest.
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_val_transformed)

# Collapse per-class output to one (samples x features) matrix of magnitudes.
if isinstance(shap_values, list):
    # List form: stack the per-class arrays and average |SHAP| over the class axis.
    shap_values = np.abs(np.array(shap_values)).mean(axis=0)
elif shap_values.ndim == 3:
    # 3-D form, presumably (n_samples, n_features, n_classes): average |SHAP| over the last axis.
    shap_values = np.abs(shap_values).mean(axis=2)

# Mean absolute SHAP value per feature, largest first.
shap_df = pd.DataFrame(shap_values, columns=feature_names)
mean_abs_shap = shap_df.abs().mean().sort_values(ascending=False)

# Two-column ranking table.
importance_df = pd.DataFrame({
    "Feature": mean_abs_shap.index,
    "Mean|SHAP|": mean_abs_shap.to_numpy(),
})

for heading, ranked_rows in (
    ("\nTop 10 Most Influential Features (Random Forest):", importance_df.head(10)),
    ("\nBottom 10 Least Influential Features (Random Forest):", importance_df.tail(10)),
):
    print(heading)
    print(ranked_rows)



Top 10 Most Influential Features (Random Forest):
                            Feature  Mean|SHAP|
0                  remainder__Fault    0.076093
1   remainder__BasePolicy_Liability    0.052554
2  remainder__VehicleCategory_Sport    0.043062
3  remainder__VehicleCategory_Sedan    0.035162
4  remainder__BasePolicy_All Perils    0.027042
5     num__DaysClaimProcessingDelay    0.018381
6   remainder__BasePolicy_Collision    0.010816
7     num__MappedPastNumberOfClaims    0.007800
8    num__MappedNumberOfSuppliments    0.006884
9  num__DeductibleVehiclePriceRatio    0.006653

Bottom 10 Least Influential Features (Random Forest):
                               Feature  Mean|SHAP|
88  remainder__DayOfWeekClaimed_Sunday    0.000012
89          num__MappedDaysPolicyClaim    0.000008
90              remainder__Make_Porche    0.000000
91              remainder__Make_Jaguar    0.000000
92             remainder__Make_Mecedes    0.000000
93              remainder__Make_Nisson    0.000000
94       

In [None]:
# Persist the tuned model's validation-set fraud probabilities to Drive as a
# single-column CSV (no index column).
rf_val_pred_path = "/content/drive/MyDrive/bt4012/rf_val_pred.csv"
rf_val_pred = pd.DataFrame({"y_val_proba": y_val_proba})
rf_val_pred.to_csv(rf_val_pred_path, index=False)

print("Random Forest validation predictions saved to rf_val_pred.csv")

Random Forest validation predictions saved to rf_val_pred.csv
