# Insurance Fraud Detection – DS3000/DS9000 Project
**Goal:** Exploring Machine Learning Techniques for Insurance Fraud Detection



In [1]:
# Install the packages listed in requirements.txt into the kernel's environment.
# The path is relative to the kernel's working directory; pip fails with
# "Could not open requirements file" when the file is not found there.
%pip install -r requirements.txt

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
You should consider upgrading via the '/Users/ashleychen/Documents/western_MDA/DS9000/DS9000-Project/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


## Import & Preprocess the dataset
#### Please check the detailed explanation in [other] files

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
df = pd.read_excel('Worksheet in Case Study question 2.xlsx', sheet_name=0)
# '?' is treated as a missing-value marker and converted to NaN.
df = df.replace('?', np.nan)

# Fill null values with 'Unknown' since having a missing value carries information in this context
unknown_fill_cols = [
    'collision_type',
    'property_damage',
    'police_report_available',
    'authorities_contacted',
]
df[unknown_fill_cols] = df[unknown_fill_cols].fillna('Unknown')

## Checking class balance
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print(df['fraud_reported'].value_counts(normalize=True))

# Identifier, date and location columns that are removed before modelling.
drop_cols = [
        "policy_number", "policy_bind_date", "incident_date",
        "incident_location", "insured_zip"
    ]
df = df.drop(columns=drop_cols)

# No one-hot encoding here: the object-typed columns are left as-is and are
# handed to CatBoost as categorical features in the training cell.

## Split and Scale
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

y = df['fraud_reported']
X = df.drop(columns=['fraud_reported'])
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y            # maintain class balance
)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. Fitting on the full dataset before splitting would leak
# test-set means/variances into the training features.
# The copies make the splits independent frames so the column assignment below
# does not write into (or warn about) a slice of X.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


## Build and Tune the CatBoost Classifier

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from catboost import CatBoostClassifier, Pool

# Base estimator shared by every grid candidate.
# thread_count=1: GridSearchCV below already parallelises across candidates and
# folds with n_jobs=-1. Leaving CatBoost at its default (all cores per model)
# would start cores x cores threads and oversubscribe the CPU.
base = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    verbose=False,
    random_seed=42,
    thread_count=1
)

# Positional indices of the object/category columns, passed to CatBoost as
# categorical features.
cat_features = [
    i
    for i, c in enumerate(X.columns)
    if (X[c].dtype == "object" or str(X[c].dtype) == "category")
]

# 3 * 3 * 3 * 2 * 3 * 2 = 324 candidates; with 5 folds that is 1620 fits,
# so expect this cell to run for a long time.
param_grid = {
    "depth": [4, 6, 8],
    "learning_rate": [0.03, 0.06, 0.1],
    "l2_leaf_reg": [1, 3, 7],
    "iterations": [300, 600],
    "bagging_temperature": [0, 0.5, 1.0],
    "random_strength": [1, 2]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are ranked by mean cross-validated ROC AUC.
grid = GridSearchCV(
    estimator=base,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

# cat_features is forwarded by GridSearchCV to CatBoostClassifier.fit.
grid.fit(X_train, y_train, cat_features=cat_features)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


KeyboardInterrupt: 

## Evaluate on test set

In [None]:
# Predict
from sklearn.metrics import accuracy_score
# Best estimator found by the grid search.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

mapping = {"Y": 1, "N": 0, "y": 1, "n": 0}


def _encode_labels(labels):
    """Return ``labels`` as an integer Series, mapping Y/N (any case) to 1/0.

    Values that are not keys of ``mapping`` (for example labels that are
    already 0/1) are passed through unchanged, so re-running this cell after
    ``y_test`` has been encoded does not turn it into NaN and fail.
    A value that is neither Y/N nor integer-like makes ``astype(int)`` raise.
    """
    return pd.Series(labels).map(lambda value: mapping.get(value, value)).astype(int)


y_test = _encode_labels(y_test)
# np.ravel flattens the prediction array in case it comes back as a column vector.
y_pred = _encode_labels(np.ravel(y_pred))

print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix
# Positive-class scores must be computed first: ROC AUC and average precision
# are ranking metrics and need continuous scores, not thresholded 0/1 labels.
# Column 1 is the probability of the second entry of best_model.classes_
# (assumed to be the fraud label "Y" — confirm against best_model.classes_).
y_prob = best_model.predict_proba(X_test)[:, 1]

# These variables hold the CatBoost metrics despite their `_rf` suffix.
# Threshold-based metrics use the hard predictions.
acc_rf = accuracy_score(y_test, y_pred)
precision_rf = precision_score(y_test, y_pred, zero_division=0)
recall_rf = recall_score(y_test, y_pred, zero_division=0)
f1_score_rf = f1_score(y_test, y_pred, zero_division=0)

# Ranking metrics use the predicted probabilities.
auc_rf = roc_auc_score(y_test, y_prob)
average_precision_score_rf = average_precision_score(y_test, y_prob)

print(f'Accuracy for CatBoost: {acc_rf:.2%}')
print(f'AUC for CatBoost: {auc_rf:.2f}')
print(f'Precision for CatBoost: {precision_rf:.2%}')
print(f'Recall for CatBoost: {recall_rf:.2%}')
print(f'F1 Score for CatBoost: {f1_score_rf:.2%}')
print(f'Average Precision Score for CatBoost: {average_precision_score_rf:.2f}')

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
# Confusion matrix of test labels vs. predictions, drawn with the model's
# class names as tick labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=best_model.classes_)
disp.plot(cmap="Blues")
disp.ax_.set_title('CatBoost Confusion Matrix')

In [None]:
from __future__ import annotations
import json
from pathlib import Path
import joblib
from typing import Any, Dict, Optional

# Model I/O
def save_model(model: Any, path: str) -> str:
    """Dump ``model`` to ``path`` with joblib.

    Missing parent directories of ``path`` are created first.

    Returns:
        The destination path as a string.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, destination)
    return str(destination)


# Metrics storage
def append_metrics_jsonl(record: Dict[str, Any], path: str = "models/metrics.jsonl") -> str:
    """Append ``record`` as one JSON line to the file at ``path``.

    Missing parent directories are created. The file is opened in append mode
    (UTF-8), so existing lines are kept.

    Returns:
        The file path as a string.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("a", encoding="utf-8") as sink:
        sink.write(f"{json.dumps(record)}\n")
    return str(target)


def load_metrics_jsonl(path: str = "models/metrics.jsonl") -> list[Dict[str, Any]]:
    """Read every JSON record from the JSON-lines file at ``path``.

    Lines that are empty after stripping whitespace are skipped.

    Returns:
        The decoded records in file order, or an empty list when the file
        does not exist.
    """
    source = Path(path)
    if not source.exists():
        return []
    with source.open("r", encoding="utf-8") as handle:
        return [json.loads(row) for row in map(str.strip, handle) if row]


from datetime import datetime, timezone

# Threshold-based metrics use the hard predictions.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Ranking metrics need probabilities and both classes present in y_test.
can_rank = y_prob is not None and y_test.nunique() == 2
roc_auc = roc_auc_score(y_test, y_prob) if can_rank else None
ap = average_precision_score(y_test, y_prob) if can_rank else None
cm = confusion_matrix(y_test, y_pred)

metrics_record = {
        # Naive UTC timestamp in the same format datetime.utcnow().isoformat()
        # produced, without calling the deprecated utcnow().
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "model": "CatBoost",
        "best_params": grid.best_params_,
        # Key name kept so existing readers of metrics.jsonl keep working, but
        # the value is the best mean CV score under `cv_scoring` (ROC AUC for
        # this grid), not accuracy.
        "cv_best_accuracy": float(grid.best_score_),
        "cv_scoring": str(grid.scoring),
        "test_metrics": {
            "accuracy": float(acc),
            "precision": float(prec),
            "recall": float(rec),
            "f1": float(f1),
            "roc_auc": float(roc_auc) if roc_auc is not None else None,
            "average_precision": float(ap) if ap is not None else None,
            "confusion_matrix": cm.tolist(),
        },
    }

# Save under a CatBoost-specific name: the previous "models/nn.joblib" target
# mislabels this model and would overwrite any artifact stored at that path.
save_model(best_model, "models/catboost.joblib")
append_metrics_jsonl(metrics_record, "models/metrics.jsonl")