# Insurance Fraud Detection – DS3000/DS9000 Project
**Goal:** Exploring Machine Learning Techniques for Insurance Fraud Detection



In [None]:
# Install the packages listed in requirements.txt into the running kernel's environment.
%pip install -r requirements.txt

## Import the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a wide frame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Load the first worksheet of the workbook into the working frame.
df = pd.read_excel('Worksheet in Case Study question 2.xlsx', sheet_name=0)
# Preview the first rows.
df.head()

## Cleaning missing values

Missing values in this dataset are encoded as the string `?` rather than as null values.

In [None]:
# Turn the '?' placeholder into a real NaN so pandas' null-handling tools can see it.
df = df.replace(to_replace='?', value=np.nan)

df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# A missing entry is itself informative for these fields, so it becomes its own
# 'Unknown' category instead of being imputed or dropped.
unknown_fill_cols = [
    'collision_type',
    'property_damage',
    'police_report_available',
    'authorities_contacted',
]
for col in unknown_fill_cols:
    df[col] = df[col].fillna('Unknown')

In [None]:
# Re-count missing values per column to see what remains after the fill above.
df.isnull().sum()

## Checking class balance

In [None]:
# Relative frequency (fractions summing to 1) of each value of the target column.
df['fraud_reported'].value_counts(normalize=True)

The classes are imbalanced: roughly 75% of claims are not reported as fraud and 25% are.

## Drop identifiers, dates, and free-text location fields (high-cardinality / leak risk)

In [None]:
# Identifier, date and location columns removed before modelling.
drop_cols = [
    "policy_number", "policy_bind_date", "incident_date",
    "incident_location", "insured_zip",
]

# Drop everything in one call; dropping inside a loop copies the whole frame
# once per column for no benefit.
df = df.drop(columns=drop_cols)


## Check data types

In [None]:
# Column dtypes and non-null counts after dropping columns.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

## Categorical Encoding

In [None]:
# Preview the first 20 rows before the categorical columns are encoded.
df.head(20)

In [None]:
# NOTE(review): LabelEncoder is imported but not used in this cell.
from sklearn.preprocessing import LabelEncoder

# One-hot encode all categorical columns
# (every object/category-dtype column is selected, so the string target
# `fraud_reported` is encoded too; with drop_first=True it is left as the single
# indicator column `fraud_reported_Y` that the modelling cells use as the label).
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# drop_first=True drops the first level of each categorical to avoid redundant columns.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True) 

df.head(20)

In [None]:
# Column dtypes and non-null counts after one-hot encoding.
df.info()

## Look at distribution and correlation

In [None]:
# Overview: one histogram per numeric column on a single pandas-managed figure.
df.hist(figsize=(24, 20))

# Detailed view: histogram + KDE for each numeric column, each on its OWN axes.
# Calling sns.histplot in a loop without `ax` draws every column onto the same
# axes, which produces one unreadable overlay. The one-hot indicator columns are
# skipped: a KDE of a 0/1 flag carries no information.
hist_cols = df.select_dtypes(include=['int64', 'float64']).columns
n_plot_cols = 4
n_plot_rows = max(1, -(-len(hist_cols) // n_plot_cols))  # ceiling division

fig, axes = plt.subplots(
    n_plot_rows, n_plot_cols,
    figsize=(6 * n_plot_cols, 4 * n_plot_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, hist_cols):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(col)

# Hide the unused cells of the grid.
for ax in flat_axes[len(hist_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()


In [None]:
# Pairwise correlation matrix over the numeric columns of the encoded frame.
corr = df.corr(numeric_only=True)
# Diverging colour map; cell annotations are off.
sns.heatmap(corr, cmap='coolwarm', annot=False)

## Split and Scale

In [81]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Label: the one-hot indicator for fraud_reported == 'Y'. Features: everything else.
y = df['fraud_reported_Y']
X = df.drop(columns=['fraud_reported_Y'])

# Only the original integer/float columns are standardised; the one-hot
# indicator columns are left untouched.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Fit the scaler on TRAINING rows only. Fitting on the full dataset would leak the
# mean/std of the validation and test rows into the training features.
# The training rows are recovered with the same test_size / random_state / stratify
# settings as the train/test split cell below, so both calls select the same rows.
# Keep the two in sync if those settings ever change.
train_pos, _ = train_test_split(
    np.arange(len(X)),
    test_size=0.3,
    random_state=42,
    stratify=y,
)

scaler = StandardScaler()
scaler.fit(X.iloc[train_pos][numeric_cols])

# Apply the training-set statistics to every row (train, validation and test).
X[numeric_cols] = scaler.transform(X[numeric_cols])

In [None]:
# Preview the feature matrix after scaling.
X.head()

In [None]:
# Preview the target vector.
y.head()

### Train/test split

In [None]:
# Three-way split (train / validation / test) because hyperparameters are tuned later.
from sklearn.model_selection import train_test_split, GridSearchCV

# Stage 1: keep 70% for training and set 30% aside, preserving the class ratio.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Stage 2: cut the held-out 30% in half -> validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

## Check class balance of sets

In [None]:
# Percentage of positive (fraud) labels in each split. The three values should be
# close to each other because the splits are stratified on the label.
fraud_rates = {
    "Train": y_train.mean() * 100,
    "Validation": y_val.mean() * 100,
    "Test": y_test.mean() * 100,
}

for split_name, rate in fraud_rates.items():
    print(f"{split_name} set fraud rate: {rate:.2f}%")

# The original variable names ("survival rate" was a leftover label from another
# dataset) are kept as aliases so any cell that still refers to them keeps working.
train_survival_rate = fraud_rates["Train"]
val_survival_rate = fraud_rates["Validation"]
test_survival_rate = fraud_rates["Test"]

## Build XGB

In [None]:
# Hyperparameter search space for the gradient-boosted classifier.
param_grid = dict(
    # number of boosting rounds
    n_estimators=[100, 200, 300],
    # depth of each tree
    max_depth=[3, 4, 5, 6],
    # step size shrinkage
    learning_rate=[0.01, 0.05, 0.1],
    # fraction of samples used per tree
    subsample=[0.8, 1.0],
    # fraction of features used per tree
    colsample_bytree=[0.8, 1.0],
    # minimum loss reduction to make a split
    gamma=[0, 0.5, 1],
    # L2 regularization strength
    reg_lambda=[1, 5, 10],
)

In [None]:
from xgboost import XGBClassifier

# Base estimator; the seed is fixed so repeated runs are comparable.
xgb = XGBClassifier(random_state=42)

# Exhaustive search over `param_grid` with 3-fold cross-validation, using every
# available core.
# NOTE(review): accuracy is the selection metric although the classes are roughly
# 75/25, so a majority-class predictor already scores about 0.75. Confirm this is
# the intended metric.
grid = GridSearchCV(xgb, param_grid, scoring='accuracy', cv=3, n_jobs=-1)

In [80]:
# Run the full grid search on the training split only.
grid.fit(X_train, y_train)
# Mean cross-validated score of the best parameter combination...
print(grid.best_score_)
# ...and the combination itself.
print(grid.best_params_)

0.8328564616118265
{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 0.8}
