In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import shap
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Load dataset
file_path = r"C:\Users\Kaleb\OneDrive\Desktop\insurance-risk-modeling\acis-insurance-risk-modeling\data\cleaned_insurance_data.csv"
df = pd.read_csv(file_path)



In [28]:
# 📌 Step 3: Create Binary Target
df["HadClaim"] = df["TotalClaims"] > 0

In [29]:
# 🧼 Step 4: Define Feature Columns
feature_cols = [
    'IsVATRegistered', 'LegalType', 'Language', 'Bank', 'AccountType',
    'MaritalStatus', 'Gender', 'Province', 'ItemType', 'VehicleType',
    'RegistrationYear', 'make', 'Model', 'Cylinders', 'cubiccapacity',
    'NumberOfDoors', 'VehicleIntroDate', 'CustomValueEstimate',
    'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding',
    'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'SumInsured',
    'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected'
]

In [30]:
# 🧹 Step 5: Drop Missing Values
df = df.dropna(subset=feature_cols + ['HadClaim'])

In [31]:
# 🔄 Step 6: One-Hot Encode Categorical Features
df_encoded = pd.get_dummies(df[feature_cols], drop_first=True)

In [32]:
# 🎯 Step 7: Prepare X and y
X = df_encoded.astype(float)
y = df["HadClaim"].astype(int)

In [33]:
# ✂️ Step 8: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
}


In [39]:
# 📈 Step 10: Train & Evaluate Models
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, preds)
    precision = precision_score(y_test, preds, zero_division=0)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'AUC-ROC': auc
    }

    print(f"\n🔍 Model: {name}")
    print(f"✅ Accuracy: {accuracy:.4f}")
    print(f"🎯 Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    print(f"📊 AUC-ROC: {auc:.4f}")
    print(classification_report(y_test, preds))
    print("-" * 60)


🔍 Model: LogisticRegression
✅ Accuracy: 0.7878
🎯 Precision: 0.0055, Recall: 0.7925, F1-score: 0.0110
📊 AUC-ROC: 0.8436
              precision    recall  f1-score   support

           0       1.00      0.79      0.88     71289
           1       0.01      0.79      0.01       106

    accuracy                           0.79     71395
   macro avg       0.50      0.79      0.45     71395
weighted avg       1.00      0.79      0.88     71395

------------------------------------------------------------

🔍 Model: RandomForest
✅ Accuracy: 0.9841
🎯 Precision: 0.0132, Recall: 0.1321, F1-score: 0.0240
📊 AUC-ROC: 0.5876
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     71289
           1       0.01      0.13      0.02       106

    accuracy                           0.98     71395
   macro avg       0.51      0.56      0.51     71395
weighted avg       1.00      0.98      0.99     71395

---------------------------------------------------

In [None]:
new_threshold = 0.3 
adjusted_preds = (probs >= new_threshold).astype(int)

# Evaluates again
print(classification_report(y_test, adjusted_preds))
print("AUC-ROC:", roc_auc_score(y_test, probs))


              precision    recall  f1-score   support

           0       1.00      0.99      0.99     71289
           1       0.01      0.13      0.02       106

    accuracy                           0.98     71395
   macro avg       0.51      0.56      0.51     71395
weighted avg       1.00      0.98      0.99     71395

AUC-ROC: 0.5876247281527729
