<a href="https://colab.research.google.com/github/khuiqian21/BT4012-Group-7-Insurance-Fraud-Detection-with-Machine-Learning/blob/main/BT4012_Group07_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Download and Import Packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
!pip install optuna
import optuna
from optuna.samplers import TPESampler
from optuna.trial import Trial
import shap

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


## 2. Import Dataset

In [None]:
# Colab-only step: attach Google Drive to the notebook VM's filesystem.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Load the prepared claims dataset from the mounted Drive folder.
dataset_path = "/content/drive/MyDrive/bt4012/final_dataset.csv"
df = pd.read_csv(dataset_path)

In [None]:
# Report the frame's size and preview only the first rows rather than
# rendering the whole DataFrame into the notebook output.
print(df.shape)
df.head()

Unnamed: 0,AccidentArea,Sex,Age,Fault,FraudFound_P,Deductible,DriverRating,PoliceReportFiled,WitnessPresent,AgentType,...,VehicleCategory_Sedan,VehicleCategory_Sport,VehicleCategory_Utility,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability,DaysClaimProcessingDelay,DaysAccidentToClaimDelay,InvalidClaimProcessingDelay,DeductibleVehiclePriceRatio
0,1,1,21.0,0,0,5.707110,1,0,0,0,...,0,1,0,0,0,1,0.000000,0.0,1,0.004000
1,1,0,34.0,0,0,5.993961,4,1,0,0,...,0,1,0,0,1,0,1.791759,0.0,0,0.005333
2,1,0,47.0,0,0,5.993961,3,0,0,0,...,0,1,0,0,1,0,2.564949,0.0,0,0.005333
3,0,0,65.0,1,0,5.993961,2,1,0,0,...,0,1,0,0,0,1,3.135494,0.0,0,0.016327
4,1,1,27.0,1,0,5.993961,1,0,0,0,...,0,1,0,0,1,0,2.484907,0.0,0,0.005333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15415,1,0,35.0,0,1,5.993961,4,0,0,0,...,1,0,0,0,1,0,1.386294,0.0,0,0.016327
15416,1,0,30.0,0,0,5.993961,3,0,0,0,...,0,1,0,0,0,1,1.945910,0.0,0,0.011594
15417,0,0,24.0,0,1,5.993961,4,0,0,0,...,1,0,0,0,1,0,1.945910,0.0,0,0.016327
15418,1,1,34.0,1,0,5.993961,4,0,0,0,...,1,0,0,1,0,0,2.397895,0.0,0,0.016327


## 3. Set random state and create train/validation/test split

In [None]:
# One seed shared by every stochastic step in the notebook.
random_state = 42

# Separate the fraud label from the predictor columns.
y = df["FraudFound_P"]
X = df.drop(columns=["FraudFound_P"])

# Step 1: hold out a stratified 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
    stratify=y,
)

# Step 2: take 12.5% of the remaining 80% (i.e. 10% of all rows) as validation,
# which leaves 70% of the data for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.125,
    random_state=random_state,
    stratify=y_temp,
)

## 4. Hyperparameter Tuning with Optuna for Logistic Regression Model

In [None]:
# Columns that are passed through StandardScaler; all remaining columns are
# forwarded unchanged by the ColumnTransformer below.
cols_to_standardise = [
    "Age",
    "Deductible",
    "DriverRating",
    "MappedVehiclePrice",
    "MappedDaysPolicyAccident",
    "MappedDaysPolicyClaim",
    "MappedPastNumberOfClaims",
    "MappedAgeOfVehicle",
    "MappedNumberOfSuppliments",
    "MappedAddressChangeClaim",
    "MappedNumberOfCars",
    "DaysClaimProcessingDelay",
    "DeductibleVehiclePriceRatio",
    "DaysAccidentToClaimDelay",
]

# Named scorers for the four performance metrics of interest.
scoring = dict(
    roc_auc='roc_auc',
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1=make_scorer(f1_score),
)

# Scaling lives inside the pipeline so it is re-fitted on each training fold.
numeric_scaling_step = ("num", StandardScaler(), cols_to_standardise)
preprocessor = ColumnTransformer(
    transformers=[numeric_scaling_step],
    remainder="passthrough",
)

# Shuffled, seeded 5-fold stratified splitter used for cross-validation.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=random_state,
)

In [None]:
def logreg_objective(trial: Trial):
    """Optuna objective: mean cross-validated ROC-AUC for one sampled configuration.

    Samples the inverse regularisation strength ``C`` (log scale, 1e-2 to 1e2)
    and a penalty type, then a solver that is offered for that penalty (plus
    ``l1_ratio`` when the penalty is elastic-net). The resulting
    ``LogisticRegression`` is placed behind the shared ``preprocessor`` in a
    pipeline and scored on ``X_train``/``y_train`` with the shared ``cv`` splitter.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean ROC-AUC over the held-out cross-validation folds.
    """
    # Define parameters and range of values to test
    C = trial.suggest_float("C", 1e-2, 1e2, log=True)
    penalty_choice = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"])

    solver = None
    l1_ratio = None

    # The solver is sampled under a penalty-specific parameter name so that each
    # penalty gets its own list of candidate solvers.
    if penalty_choice == "l1":
        solver = trial.suggest_categorical("solver_l1", ["liblinear", "saga"])

    elif penalty_choice == "elasticnet":
        # Only saga is used for elastic-net here, so it is fixed rather than sampled.
        solver = "saga"
        l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)

    else:
        solver = trial.suggest_categorical("solver_l2", ["lbfgs", "newton-cholesky", "liblinear", "saga"])

    logreg = LogisticRegression(
        C=C,
        penalty=penalty_choice,
        solver=solver,
        l1_ratio=l1_ratio,
        class_weight="balanced",
        max_iter=1000,
        random_state=random_state
    )

    logreg_pipeline = ImbPipeline([
        ('preprocessor', preprocessor),
        ('logreg', logreg)
    ])

    # Only the held-out fold scores feed the objective, so training-fold scores
    # are not requested; this avoids an extra scoring pass per fold per trial.
    logreg_cv_results = cross_validate(
            logreg_pipeline,
            X_train,
            y_train,
            cv=cv,
            scoring='roc_auc',
            n_jobs=-1
    )

    return logreg_cv_results['test_score'].mean()

# Seed the TPE sampler with the notebook-wide random_state.
sampler = TPESampler(seed=random_state)

# In-memory study that maximises the objective's return value.
logreg_study = optuna.create_study(
    study_name="logreg_tuning",
    direction='maximize',
    sampler=sampler,
)

# Evaluate 50 sampled configurations, showing a progress bar while running.
logreg_study.optimize(logreg_objective, n_trials=50, show_progress_bar=True)

[I 2025-11-10 04:19:15,636] A new study created in memory with name: logreg_tuning


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-10 04:19:25,994] Trial 0 finished with value: 0.7990365984311433 and parameters: {'C': 0.31489116479568624, 'penalty': 'l1', 'solver_l1': 'liblinear'}. Best is trial 0 with value: 0.7990365984311433.
[I 2025-11-10 04:19:48,155] Trial 1 finished with value: 0.7956040026603624 and parameters: {'C': 0.017073967431528128, 'penalty': 'l1', 'solver_l1': 'saga'}. Best is trial 0 with value: 0.7990365984311433.
[I 2025-11-10 04:21:50,468] Trial 2 finished with value: 0.7937581739926964 and parameters: {'C': 21.368329072358772, 'penalty': 'l1', 'solver_l1': 'saga'}. Best is trial 0 with value: 0.7990365984311433.
[I 2025-11-10 04:23:01,985] Trial 3 finished with value: 0.7971896013550642 and parameters: {'C': 0.5342937261279778, 'penalty': 'l2', 'solver_l2': 'saga'}. Best is trial 0 with value: 0.7990365984311433.
[I 2025-11-10 04:23:49,540] Trial 4 finished with value: 0.7957539228976602 and parameters: {'C': 0.0629064429458615, 'penalty': 'l2', 'solver_l2': 'saga'}. Best is trial 0

In [None]:
# Extract best parameters
logreg_best_trial = logreg_study.best_trial
logreg_best_params = logreg_best_trial.params

# Tidy up the best parameters
cleaned_params = {}
penalty_choice = logreg_best_params['penalty']
cleaned_params['C'] = logreg_best_params['C']

cleaned_params['penalty'] = penalty_choice
final_solver = None
final_l1_ratio = None

if penalty_choice == "l1":
    final_solver = logreg_best_params['solver_l1']
elif penalty_choice == "elasticnet":
    final_solver = "saga"
    final_l1_ratio = logreg_best_params['l1_ratio']
else:
    final_solver = logreg_best_params['solver_l2']

cleaned_params['solver'] = final_solver
if final_l1_ratio is not None:
    cleaned_params['l1_ratio'] = final_l1_ratio

best_logreg = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=random_state,
    **cleaned_params
)

best_logreg_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('logreg', best_logreg)
])

best_logreg_pipeline.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Get performance metrics
train_y_pred_proba = best_logreg_pipeline.predict_proba(X_train)[:, 1]
train_y_pred = best_logreg_pipeline.predict(X_train)
train_roc_auc = roc_auc_score(y_train, train_y_pred_proba)
train_precision = precision_score(y_train, train_y_pred)
train_recall = recall_score(y_train, train_y_pred)
train_f1 = f1_score(y_train, train_y_pred)

val_y_pred_proba = best_logreg_pipeline.predict_proba(X_val)[:, 1]
val_y_pred = best_logreg_pipeline.predict(X_val)
val_roc_auc = roc_auc_score(y_val, val_y_pred_proba)
val_precision = precision_score(y_val, val_y_pred)
val_recall = recall_score(y_val, val_y_pred)
val_f1 = f1_score(y_val, val_y_pred)

metrics_df = pd.DataFrame({
    "Metric": ["ROC-AUC", "Precision", "Recall", "F1"],
    "Train": [train_roc_auc, train_precision, train_recall, train_f1],
    "Validation": [val_roc_auc, val_precision, val_recall, val_f1]
})

print("Best params:", logreg_best_params)
print("\nTrain & Validation metrics for best hyperparameters:")
print(metrics_df)

Best params: {'C': 0.08964281310219388, 'penalty': 'elasticnet', 'l1_ratio': 0.9465696788114886}

Train & Validation metrics for best hyperparameters:
      Metric     Train  Validation
0    ROC-AUC  0.821157    0.763433
1  Precision  0.132282    0.126280
2     Recall  0.896285    0.804348
3         F1  0.230540    0.218289


## 5. Conduct Business-Optimal Threshold Tuning

In [None]:
# Candidate decision thresholds from 0.10 in steps of 0.05; the stop value 0.9
# is excluded by np.arange.
thresholds = np.arange(0.1, 0.9, 0.05)

val_y_pred_proba = best_logreg_pipeline.predict_proba(X_val)[:, 1]

# Relative unit costs used by the net-benefit formula below.
cost_per_false_positive = 1
cost_per_false_negative = 20

# ROC-AUC is computed from the probabilities alone, so it is the same for every
# threshold; compute it once instead of inside the loop.
threshold_free_roc_auc = roc_auc_score(y_val, val_y_pred_proba)

best_profit = -float('inf')
best_business_thresh = 0.5
# Bound up-front so the final print cannot hit an undefined name.
best_metrics = {}

for thresh in thresholds:
    y_pred_class = (val_y_pred_proba > thresh).astype(int)

    # labels=[0, 1] pins the matrix to 2x2 even if only one class is present,
    # so the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred_class, labels=[0, 1]).ravel()

    investigation_cost = fp * cost_per_false_positive
    fraud_savings = tp * cost_per_false_negative
    fraud_losses = fn * cost_per_false_negative

    net_benefit = fraud_savings - investigation_cost - fraud_losses

    # Strict '>' keeps the lowest threshold when several tie on net benefit.
    if net_benefit > best_profit:
        best_profit = net_benefit
        best_business_thresh = thresh

        # zero_division=0 reports 0 (without a warning) when a threshold flags
        # no claim at all.
        best_metrics = {
            'Precision': precision_score(y_val, y_pred_class, zero_division=0),
            'Recall': recall_score(y_val, y_pred_class),
            'F1': f1_score(y_val, y_pred_class, zero_division=0),
            'ROC-AUC': threshold_free_roc_auc
        }

print(f"Business-Optimal Threshold: {best_business_thresh:.2f}")
print(f"Net Benefit: {best_profit:,.0f}")
print(f"Metrics: {best_metrics}")

Business-Optimal Threshold: 0.25
Net Benefit: 901
Metrics: {'Precision': 0.11948790896159317, 'Recall': 0.9130434782608695, 'F1': 0.21132075471698114, 'ROC-AUC': np.float64(0.7634332833583208)}


## 6. View Most/Least Influential Features Using SHAP

In [None]:
# Take the fitted steps out of the pipeline and apply the fitted preprocessing
# to both splits before handing them to SHAP.
fitted_preprocessor = best_logreg_pipeline.named_steps["preprocessor"]
logreg_model = best_logreg_pipeline.named_steps["logreg"]
X_train_transformed = fitted_preprocessor.transform(X_train)
X_val_transformed = fitted_preprocessor.transform(X_val)

# Column names are rebuilt as: standardised columns first, then every other
# X_train column in its existing order. This assumes the ColumnTransformer
# output follows that same layout.
num_features = cols_to_standardise
other_features = [col for col in X_train.columns if col not in num_features]
feature_names = [*num_features, *other_features]

# The transformed training data serves as the background for the explainer;
# SHAP values are then computed for the validation rows.
explainer = shap.LinearExplainer(
    logreg_model,
    X_train_transformed,
    feature_perturbation="interventional",
)
shap_values = explainer.shap_values(X_val_transformed)
shap_df = pd.DataFrame(shap_values, columns=feature_names)

# Rank features by mean absolute SHAP value, largest first.
mean_abs_shap = shap_df.abs().mean().sort_values(ascending=False)
importance_df = mean_abs_shap.rename_axis("Feature").reset_index(name="Mean|SHAP|")

top_10 = importance_df.head(10)
print("\nTop 10 Most Influential Features:")
print(top_10)

bottom_10 = importance_df.tail(10)
print("\nBottom 10 Least Influential Features:")
print(bottom_10)


Top 10 Most Influential Features:
                       Feature  Mean|SHAP|
0         BasePolicy_Liability    1.232154
1                        Fault    1.058284
2        BasePolicy_All Perils    0.187977
3     DaysClaimProcessingDelay    0.177389
4        VehicleCategory_Sport    0.168231
5                          Age    0.103502
6    MappedNumberOfSuppliments    0.102387
7                      Make_VW    0.077528
8                   Deductible    0.069303
9  DeductibleVehiclePriceRatio    0.067836

Bottom 10 Least Influential Features:
                        Feature  Mean|SHAP|
88         WeekOfMonthClaimed_2         0.0
89         WeekOfMonthClaimed_3         0.0
90     DayOfWeekClaimed_Tuesday         0.0
91         MaritalStatus_Single         0.0
92             MonthClaimed_Sep         0.0
93       MaritalStatus_Divorced         0.0
94          MaritalStatus_Widow         0.0
95        VehicleCategory_Sedan         0.0
96         BasePolicy_Collision         0.0
97  InvalidCl



In [None]:
# Persist the validation-set positive-class probabilities to Drive as a
# single-column CSV without the row index.
val_pred_path = "/content/drive/MyDrive/bt4012/logreg_val_pred.csv"
logreg_val_pred = pd.DataFrame({"y_val_proba": val_y_pred_proba})
logreg_val_pred.to_csv(val_pred_path, index=False)