In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import joblib
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [2]:
# Load Data
df = pd.read_csv('data/fraud_oracle_processed.csv')
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Define Categorical Columns for Encoding
categorical_cols = [
    'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed',
    'Fault', 'PolicyType', 'VehicleCategory', 'BasePolicy'
]

# One-Hot Encode Categorical Columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Convert Boolean Columns to Integers
boolean_cols = ['MaritalStatus_Married', 'MaritalStatus_Single', 'MaritalStatus_Widow']
df_encoded[boolean_cols] = df_encoded[boolean_cols].astype(int)

# Separate Fraud and Non-Fraud Data
df_fraud = df_encoded[df_encoded["FraudFound_P"] == 1]
df_no_fraud = df_encoded[df_encoded["FraudFound_P"] == 0]

# Split 15% Holdout for Final Evaluation
df_fraud_train, df_fraud_holdout = train_test_split(
    df_fraud, test_size=0.15, random_state=42, stratify=df_fraud["FraudFound_P"]
)

df_no_fraud_train, df_no_fraud_holdout = train_test_split(
    df_no_fraud, test_size=0.15, random_state=42, stratify=df_no_fraud["FraudFound_P"]
)

# Combine Holdout Data for Final Evaluation
df_holdout = pd.concat([df_fraud_holdout, df_no_fraud_holdout]).sample(frac=1, random_state=42).reset_index(drop=True)

# Standardize Entire Dataset Before Splitting
scaler = StandardScaler()
X = df_encoded.drop(columns=['FraudFound_P'])
y = df_encoded['FraudFound_P']

X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=X.columns)
df_scaled['FraudFound_P'] = y.values

# Split Data for Hyperparameter Tuning (85% Train, 15% Holdout)
df_train, df_holdout = train_test_split(df_scaled, test_size=0.15, random_state=42, stratify=df_scaled['FraudFound_P'])

# Split df_train into X and y for Grid Search
X_train_grid = df_train.drop(columns=['FraudFound_P'])
y_train_grid = df_train['FraudFound_P']

# Grid Search

In [3]:
# # Logistic Regression Hyperparameters
# lr_param_grid = {
#     'C': [0.01, 0.1, 1, 10],
#     'solver': ['saga', 'liblinear'],
#     'max_iter': [1000, 5000]
# }

# lr_grid = GridSearchCV(LogisticRegression(random_state=42), param_grid=lr_param_grid, cv=5, scoring='f1', n_jobs=-1)
# lr_grid.fit(X_train_grid, y_train_grid)

# # Best Parameters for Logistic Regression
# best_lr_params = lr_grid.best_params_
# print(f"Best Logistic Regression Params: {best_lr_params}")

# Random Forest Hyperparameters
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=rf_param_grid, cv=5, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train_grid, y_train_grid)

# Best Parameters for Random Forest
best_rf_params = rf_grid.best_params_
print(f"Best Random Forest Params: {best_rf_params}")

# XGBoost Hyperparameters
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0.01, 0.1, 1]
}

xgb_grid = GridSearchCV(XGBClassifier(eval_metric='logloss', random_state=42), param_grid=xgb_param_grid, cv=5, scoring='f1', n_jobs=-1)
xgb_grid.fit(X_train_grid, y_train_grid)

# Best Parameters for XGBoost
best_xgb_params = xgb_grid.best_params_
print(f"Best XGBoost Params: {best_xgb_params}")

Best Random Forest Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best XGBoost Params: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 0.1}


In [4]:
# Generate 10 Balanced Datasets for Training
datasets = []
for i in range(10):
    df_no_fraud_sampled = df_no_fraud_train.sample(n=len(df_fraud_train), random_state=i, replace=False)
    df_combined = pd.concat([df_fraud_train, df_no_fraud_sampled]).sample(frac=1, random_state=i).reset_index(drop=True)
    datasets.append(df_combined)

# Standardize the Datasets
scaled_datasets = []
for df_combined in datasets:
    X = df_combined.drop(columns=['FraudFound_P'])
    y = df_combined['FraudFound_P']
    
    X_scaled = scaler.transform(X)
    df_combined_scaled = pd.DataFrame(X_scaled, columns=X.columns)
    df_combined_scaled['FraudFound_P'] = y.values
    
    scaled_datasets.append(df_combined_scaled)

# Standardize Holdout Set
X_holdout = df_holdout.drop(columns=['FraudFound_P'])
y_holdout = df_holdout['FraudFound_P']

X_holdout_scaled = scaler.transform(X_holdout)
df_holdout_scaled = pd.DataFrame(X_holdout_scaled, columns=X_holdout.columns)
df_holdout_scaled['FraudFound_P'] = y_holdout.values

# Train model using chosen Hyperparams

In [5]:
rf_models, xgb_models = [], []

for i, df_combined_scaled in enumerate(scaled_datasets):
    X = df_combined_scaled.drop(columns=['FraudFound_P'])
    y = df_combined_scaled['FraudFound_P']

    
    rf_model = RandomForestClassifier(**best_rf_params, random_state=42)
    rf_model.fit(X, y)
    rf_models.append(rf_model)
    
    xgb_model = XGBClassifier(**best_xgb_params, eval_metric='logloss', random_state=42)
    xgb_model.fit(X, y)
    xgb_models.append(xgb_model)


In [6]:
# Create Voting Ensembles for Each Model Type
rf_model_list = [(f'rf_model_{i+1}', rf_models[i]) for i in range(10)]
xgb_model_list = [(f'xgb_model_{i+1}', xgb_models[i]) for i in range(10)]

# Voting Classifiers for Each Model Type
voting_model_rf = VotingClassifier(estimators=rf_model_list, voting='soft')
voting_model_xgb = VotingClassifier(estimators=xgb_model_list, voting='soft')

# Prepare Final Dataset for Voting Models
final_X = scaled_datasets[-1].drop(columns=['FraudFound_P'])
final_y = scaled_datasets[-1]['FraudFound_P']

# Fit Voting Models
voting_model_rf.fit(final_X, final_y)
voting_model_xgb.fit(final_X, final_y)


In [7]:
# Evaluate Individual Voting Models on Holdout
X_holdout_final = df_holdout_scaled.drop(columns=['FraudFound_P'])
y_holdout_final = df_holdout_scaled['FraudFound_P']

y_pred_rf = voting_model_rf.predict(X_holdout_final)
print("Random Forest Ensemble Classification Report on Holdout Data:")
print(classification_report(y_holdout_final, y_pred_rf))

y_pred_xgb = voting_model_xgb.predict(X_holdout_final)
print("XGBoost Ensemble Classification Report on Holdout Data:")
print(classification_report(y_holdout_final, y_pred_xgb))

Random Forest Ensemble Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       0.98      0.64      0.77      2175
           1       0.12      0.82      0.22       138

    accuracy                           0.65      2313
   macro avg       0.55      0.73      0.49      2313
weighted avg       0.93      0.65      0.74      2313

XGBoost Ensemble Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       1.00      0.55      0.71      2175
           1       0.12      0.96      0.21       138

    accuracy                           0.58      2313
   macro avg       0.56      0.76      0.46      2313
weighted avg       0.94      0.58      0.68      2313



In [8]:
# Final Ensemble of All Models (LR + RF + XGB)
all_models_list =  rf_model_list + xgb_model_list
final_voting_model = VotingClassifier(estimators=all_models_list, voting='soft')

# Fit Final Voting Model on Full 85% Data
final_voting_model.fit(final_X, final_y)

# Evaluate Final Voting Model on Holdout Data
y_pred_final = final_voting_model.predict(X_holdout_final)
print("Final Ensemble Model (LR + RF + XGB) Classification Report on Holdout Data:")
print(classification_report(y_holdout_final, y_pred_final))

Final Ensemble Model (LR + RF + XGB) Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       1.00      0.56      0.72      2175
           1       0.12      0.96      0.22       138

    accuracy                           0.58      2313
   macro avg       0.56      0.76      0.47      2313
weighted avg       0.94      0.58      0.69      2313



# Top Features 

In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV


# Load Data
df = pd.read_csv('data/fraud_oracle_processed.csv')
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Define Top Selected Features
top_features = [
    'Fault_Third Party',                  # Encoded version of 'Fault'
    'VehicleCategory_Sport',              # Encoded version of 'VehicleCategory'
    'PolicyType_Sedan - Collision',       # Example of PolicyType (you may want to select more)
    'PriceToAgeRatio',                     # Derived feature added manually
    'AgeOfVehicle',                        # Numeric feature
    'AccidentArea_Urban',                  # Encoded version of 'AccidentArea'
    'MonthClaimed',                        # Kept as original (not one-hot encoded)
    'AddressChange_Claim',                 # Kept as original (not one-hot encoded)
    'VehiclePrice',                        # Numeric feature
    'BasePolicy_Collision'                 # Encoded version of 'BasePolicy'
]


# One-Hot Encode Categorical Columns
df_encoded = pd.get_dummies(df, columns=['Month', 'DayOfWeek', 'Make', 'AccidentArea',
                                          'DayOfWeekClaimed', 'Fault', 'PolicyType',
                                          'VehicleCategory', 'BasePolicy'], drop_first=True)

# Convert Boolean Columns to Integers
boolean_cols = ['MaritalStatus_Married', 'MaritalStatus_Single', 'MaritalStatus_Widow']
df_encoded[boolean_cols] = df_encoded[boolean_cols].astype(int)

# Add PriceToAgeRatio Feature
df_encoded['PriceToAgeRatio'] = df_encoded['VehiclePrice'] / (df_encoded['AgeOfVehicle'] + 1)

# Filter Selected Features + Target Column
df_selected = df_encoded[top_features + ['FraudFound_P']]

# Split Fraud and Non-Fraud Data
df_fraud = df_selected[df_selected["FraudFound_P"] == 1]
df_no_fraud = df_selected[df_selected["FraudFound_P"] == 0]

# Split 15% Holdout for Final Evaluation
df_fraud_train, df_fraud_holdout = train_test_split(
    df_fraud, test_size=0.15, random_state=42, stratify=df_fraud["FraudFound_P"])

df_no_fraud_train, df_no_fraud_holdout = train_test_split(
    df_no_fraud, test_size=0.15, random_state=42, stratify=df_no_fraud["FraudFound_P"])

# Combine Holdout Data for Final Evaluation
df_holdout = pd.concat([df_fraud_holdout, df_no_fraud_holdout]).sample(frac=1, random_state=42).reset_index(drop=True)

# Standardize Selected Features Before Splitting
scaler = StandardScaler()
X = df_selected.drop(columns=['FraudFound_P'])
y = df_selected['FraudFound_P']

X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=X.columns)
df_scaled['FraudFound_P'] = y.values

# Split Data for Hyperparameter Tuning (85% Train, 15% Holdout)
df_train, df_holdout = train_test_split(df_scaled, test_size=0.15, random_state=42, stratify=df_scaled['FraudFound_P'])

# Split df_train into X and y for Grid Search
X_train_grid = df_train.drop(columns=['FraudFound_P'])
y_train_grid = df_train['FraudFound_P']

# Logistic Regression Hyperparameters
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['saga', 'liblinear'],
    'max_iter': [1000, 5000]
}

lr_grid = GridSearchCV(LogisticRegression(random_state=42), param_grid=lr_param_grid, cv=5, scoring='f1', n_jobs=-1)
lr_grid.fit(X_train_grid, y_train_grid)
best_lr_params = lr_grid.best_params_

# Random Forest Hyperparameters
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=rf_param_grid, cv=5, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train_grid, y_train_grid)
best_rf_params = rf_grid.best_params_

# XGBoost Hyperparameters
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0.01, 0.1, 1]
}

xgb_grid = GridSearchCV(XGBClassifier(eval_metric='logloss', random_state=42), param_grid=xgb_param_grid, cv=5, scoring='f1', n_jobs=-1)
xgb_grid.fit(X_train_grid, y_train_grid)
best_xgb_params = xgb_grid.best_params_

# Generate 10 Balanced Datasets for Training
datasets = []
for i in range(10):
    df_no_fraud_sampled = df_no_fraud_train.sample(n=len(df_fraud_train), random_state=i, replace=False)
    df_combined = pd.concat([df_fraud_train, df_no_fraud_sampled]).sample(frac=1, random_state=i).reset_index(drop=True)
    datasets.append(df_combined)

# Standardize the Datasets
scaled_datasets = []
for df_combined in datasets:
    X = df_combined.drop(columns=['FraudFound_P'])
    y = df_combined['FraudFound_P']
    X_scaled = scaler.transform(X)
    df_combined_scaled = pd.DataFrame(X_scaled, columns=X.columns)
    df_combined_scaled['FraudFound_P'] = y.values
    scaled_datasets.append(df_combined_scaled)

# Standardize Holdout Set
X_holdout = df_holdout.drop(columns=['FraudFound_P'])
y_holdout = df_holdout['FraudFound_P']

X_holdout_scaled = scaler.transform(X_holdout)
df_holdout_scaled = pd.DataFrame(X_holdout_scaled, columns=X_holdout.columns)
df_holdout_scaled['FraudFound_P'] = y_holdout.values

# Train Models and Create Voting Ensembles
rf_models, xgb_models = [], []
for i, df_combined_scaled in enumerate(scaled_datasets):
    X = df_combined_scaled.drop(columns=['FraudFound_P'])
    y = df_combined_scaled['FraudFound_P']
    
    rf_model = RandomForestClassifier(**best_rf_params, random_state=42)
    rf_model.fit(X, y)
    rf_models.append(rf_model)
    
    xgb_model = XGBClassifier(**best_xgb_params, eval_metric='logloss', random_state=42)
    xgb_model.fit(X, y)
    xgb_models.append(xgb_model)

rf_model_list = [(f'rf_model_{i+1}', rf_models[i]) for i in range(10)]
xgb_model_list = [(f'xgb_model_{i+1}', xgb_models[i]) for i in range(10)]

voting_model_rf = VotingClassifier(estimators=rf_model_list, voting='soft')
voting_model_xgb = VotingClassifier(estimators=xgb_model_list, voting='soft')

final_X = scaled_datasets[-1].drop(columns=['FraudFound_P'])
final_y = scaled_datasets[-1]['FraudFound_P']

voting_model_rf.fit(final_X, final_y)
voting_model_xgb.fit(final_X, final_y)

# Evaluate Individual Voting Models on Holdout
X_holdout_final = df_holdout_scaled.drop(columns=['FraudFound_P'])
y_holdout_final = df_holdout_scaled['FraudFound_P']


y_pred_rf = voting_model_rf.predict(X_holdout_final)
print("Random Forest Ensemble Classification Report on Holdout Data:")
print(classification_report(y_holdout_final, y_pred_rf))

y_pred_xgb = voting_model_xgb.predict(X_holdout_final)
print("XGBoost Ensemble Classification Report on Holdout Data:")
print(classification_report(y_holdout_final, y_pred_xgb))

# Final Ensemble of All Models (LR + RF + XGB)
all_models_list =  rf_model_list + xgb_model_list
final_voting_model = VotingClassifier(estimators=all_models_list, voting='soft')

# Fit Final Voting Model on Full 85% Data
final_voting_model.fit(final_X, final_y)

# Evaluate Final Voting Model on Holdout Data
y_pred_final = final_voting_model.predict(X_holdout_final)
print("Final Ensemble Model (RF + XGB) Classification Report on Holdout Data:")
print(classification_report(y_holdout_final, y_pred_final))


Random Forest Ensemble Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       0.98      0.59      0.74      2175
           1       0.11      0.83      0.20       138

    accuracy                           0.61      2313
   macro avg       0.55      0.71      0.47      2313
weighted avg       0.93      0.61      0.71      2313

XGBoost Ensemble Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       0.99      0.58      0.73      2175
           1       0.12      0.91      0.21       138

    accuracy                           0.60      2313
   macro avg       0.56      0.75      0.47      2313
weighted avg       0.94      0.60      0.70      2313

Final Ensemble Model (RF + XGB) Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       0.99      0.57      0.73      2175
           1       0.12      0.92      0.21     

In [12]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, precision_recall_curve
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.combine import SMOTETomek
from sklearn.ensemble import IsolationForest
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score

# Load Data
df = pd.read_csv('data/fraud_oracle_processed.csv')
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Define Top Selected Features
top_features = [
    'Fault_Third Party', 'VehicleCategory_Sport', 'PolicyType_Sedan - Collision',
    'PriceToAgeRatio', 'AgeOfVehicle', 'AccidentArea_Urban',
    'MonthClaimed', 'AddressChange_Claim', 'VehiclePrice', 'BasePolicy_Collision'
]

# One-Hot Encode Categorical Columns
df_encoded = pd.get_dummies(df, columns=['Month', 'DayOfWeek', 'Make', 'AccidentArea',
                                          'DayOfWeekClaimed', 'Fault', 'PolicyType',
                                          'VehicleCategory', 'BasePolicy'], drop_first=True)

# Convert Boolean Columns to Integers
boolean_cols = ['MaritalStatus_Married', 'MaritalStatus_Single', 'MaritalStatus_Widow']
df_encoded[boolean_cols] = df_encoded[boolean_cols].astype(int)

# Add PriceToAgeRatio Feature
df_encoded['PriceToAgeRatio'] = df_encoded['VehiclePrice'] / (df_encoded['AgeOfVehicle'] + 1)

# Filter Selected Features + Target Column
df_selected = df_encoded[top_features + ['FraudFound_P']]

# Split Data for Resampling
X = df_selected.drop(columns=['FraudFound_P'])
y = df_selected['FraudFound_P']

# Apply SMOTE + Tomek Links
tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = tomek.fit_resample(X, y)

# Split Data for Training and Holdout
X_train, X_holdout, y_train, y_holdout = train_test_split(X_resampled, y_resampled, test_size=0.15, random_state=42, stratify=y_resampled)

# Standardize the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)

# Train Isolation Forest for Anomaly Detection
iso_forest = IsolationForest(contamination=0.02, random_state=42)
iso_forest.fit(X_train_scaled)

# Generate Anomaly Scores
X_train['anomaly_score'] = iso_forest.decision_function(X_train_scaled)
X_holdout['anomaly_score'] = iso_forest.decision_function(X_holdout_scaled)

# Prepare Final Features After Adding Anomaly Score
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)

# Train Base Models
lr_model = LogisticRegression(max_iter=1000, random_state=42)
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, eval_metric='logloss', random_state=42)

# Fit Models
lr_model.fit(X_train_scaled, y_train)
rf_model.fit(X_train_scaled, y_train)
xgb_model.fit(X_train_scaled, y_train)

# Create Stacking Model
from sklearn.ensemble import StackingClassifier
stacked_model = StackingClassifier(
    estimators=[
        ('lr', lr_model),
        ('rf', rf_model),
        ('xgb', xgb_model)
    ],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42)
)

# Fit Stacking Model
stacked_model.fit(X_train_scaled, y_train)

# Fine-Tune Classification Threshold
y_probs = stacked_model.predict_proba(X_holdout_scaled)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_holdout, y_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

# Apply Optimal Threshold
y_pred_final = (y_probs >= optimal_threshold).astype(int)
print(f"Optimal Threshold: {optimal_threshold}")
print("Final Classification Report on Holdout Data:")
print(classification_report(y_holdout, y_pred_final))

# Calibrate Model Probabilities
calibrated_model = CalibratedClassifierCV(stacked_model, method='isotonic', cv=5)
calibrated_model.fit(X_train_scaled, y_train)
y_probs_calibrated = calibrated_model.predict_proba(X_holdout_scaled)[:, 1]

# Final Predictions with Calibrated Model
y_pred_calibrated = (y_probs_calibrated >= optimal_threshold).astype(int)
print("Calibrated Model Classification Report on Holdout Data:")
print(classification_report(y_holdout, y_pred_calibrated))


Optimal Threshold: 0.4088134862627753
Final Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       0.91      0.74      0.82      2175
           1       0.78      0.92      0.85      2174

    accuracy                           0.83      4349
   macro avg       0.84      0.83      0.83      4349
weighted avg       0.84      0.83      0.83      4349

Calibrated Model Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       0.91      0.74      0.82      2175
           1       0.78      0.92      0.85      2174

    accuracy                           0.83      4349
   macro avg       0.84      0.83      0.83      4349
weighted avg       0.84      0.83      0.83      4349



# Apply SMOTE and Tomek to balance the classes as  form or resampling

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, precision_recall_curve
from xgboost import XGBClassifier
from imblearn.combine import SMOTETomek
from sklearn.ensemble import IsolationForest
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
from collections import Counter

In [26]:
'''
Apply SMOTE and Tomek to balance classes as resampling methods.
Use PyTorch for autoencoder to detect anomalies.
Train base models (Logistic Regression, Random Forest, XGBoost) and stack them using LGBM as the meta-learner.
Fine-tune classification threshold based on F1 score.
'''

# Load Data
df = pd.read_csv('data/fraud_oracle_processed.csv')
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Define Top Selected Features
top_features = [
    'Fault_Third Party', 'VehicleCategory_Sport', 'PolicyType_Sedan - Collision',
    'PriceToAgeRatio', 'AgeOfVehicle', 'AccidentArea_Urban',
    'MonthClaimed', 'AddressChange_Claim', 'VehiclePrice', 'BasePolicy_Collision'
]

# One-Hot Encode Categorical Columns
df_encoded = pd.get_dummies(df, columns=['Month', 'DayOfWeek', 'Make', 'AccidentArea',
                                          'DayOfWeekClaimed', 'Fault', 'PolicyType',
                                          'VehicleCategory', 'BasePolicy'], drop_first=True)

# Convert Boolean Columns to Integers
boolean_cols = ['MaritalStatus_Married', 'MaritalStatus_Single', 'MaritalStatus_Widow']
df_encoded[boolean_cols] = df_encoded[boolean_cols].astype(int)

# Add PriceToAgeRatio Feature
df_encoded['PriceToAgeRatio'] = df_encoded['VehiclePrice'] / (df_encoded['AgeOfVehicle'] + 1)

# Filter Selected Features + Target Column
df_selected = df_encoded[top_features + ['FraudFound_P']]

# Split Data for Resampling
X = df_selected.drop(columns=['FraudFound_P'])
y = df_selected['FraudFound_P']

# Apply SMOTE + Tomek Links
tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = tomek.fit_resample(X, y)

# Check class balance after resampling
print(f"Class Distribution After Resampling: {Counter(y_resampled)}")

# Split Data for Training and Holdout
X_train, X_holdout, y_train, y_holdout = train_test_split(X_resampled, y_resampled, test_size=0.15, random_state=42, stratify=y_resampled)

# Standardize the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)



Class Distribution After Resampling: Counter({0: 14495, 1: 14495})


In [27]:
# Define PyTorch Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, input_dim)
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Prepare Data for PyTorch
def to_tensor(data):
    return torch.tensor(data, dtype=torch.float32)

X_train_tensor = to_tensor(X_train_scaled)
X_holdout_tensor = to_tensor(X_holdout_scaled)

# Convert y_train to NumPy Array for Indexing
y_train_np = y_train.to_numpy()

# Train PyTorch Autoencoder
input_dim = X_train_scaled.shape[1]
autoencoder = Autoencoder(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

# Train Autoencoder on Normal Data Only
normal_data = X_train_tensor[y_train_np == 0]
for epoch in range(10):
    optimizer.zero_grad()
    outputs = autoencoder(normal_data)
    loss = criterion(outputs, normal_data)
    loss.backward()
    optimizer.step()

# Reconstruction Error as Anomaly Score
def compute_reconstruction_error(autoencoder, data):
    with torch.no_grad():
        reconstructed = autoencoder(data)
        mse = torch.mean((data - reconstructed) ** 2, axis=1)
    return mse.numpy()

reconstruction_error_train = compute_reconstruction_error(autoencoder, X_train_tensor)
reconstruction_error_holdout = compute_reconstruction_error(autoencoder, X_holdout_tensor)

# Add Anomaly Scores
X_train_scaled = np.hstack((X_train_scaled, reconstruction_error_train.reshape(-1, 1)))
X_holdout_scaled = np.hstack((X_holdout_scaled, reconstruction_error_holdout.reshape(-1, 1)))

# Define Updated Feature Names
feature_names = list(X_train.columns) + ['anomaly_score']

# Convert Scaled Arrays Back to DataFrame
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_holdout_scaled_df = pd.DataFrame(X_holdout_scaled, columns=feature_names)

In [28]:

# Train Base Models
lr_model = LogisticRegression(max_iter=1000, random_state=42)
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, eval_metric='logloss', random_state=42)

# Check Feature Names Before Fitting
print(f"Feature Names Before Training:\n{X_train_scaled_df.columns.tolist()}")

# Check Feature Names Before Prediction
print(f"Feature Names Before Prediction:\n{X_holdout_scaled_df.columns.tolist()}")

# Fit Models
lr_model.fit(X_train_scaled_df, y_train)
rf_model.fit(X_train_scaled_df, y_train)
xgb_model.fit(X_train_scaled_df, y_train)

# Create Stacking Model with LGBM as Meta-Learner
stacked_model = StackingClassifier(
    estimators=[
        ('lr', lr_model),
        ('rf', rf_model),
        ('xgb', xgb_model)
    ],
    final_estimator=LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42, verbose=-1)
)

# Fit Stacking Model
stacked_model.fit(X_train_scaled_df, y_train)

# Fine-Tune Classification Threshold
y_probs = stacked_model.predict_proba(X_holdout_scaled_df)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_holdout, y_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

# Apply Optimal Threshold
y_pred_final = (y_probs >= optimal_threshold).astype(int)
print(f"Optimal Threshold: {optimal_threshold}")
print("Final Classification Report on Holdout Data:")
print(classification_report(y_holdout, y_pred_final))

# Calibrate Model Probabilities
calibrated_model = CalibratedClassifierCV(stacked_model, method='isotonic', cv=5)
calibrated_model.fit(X_train_scaled_df, y_train)
y_probs_calibrated = calibrated_model.predict_proba(X_holdout_scaled_df)[:, 1]

# Recalculate Optimal Threshold for Calibrated Model
precisions_cal, recalls_cal, thresholds_cal = precision_recall_curve(y_holdout, y_probs_calibrated)
f1_scores_cal = 2 * (precisions_cal * recalls_cal) / (precisions_cal + recalls_cal)
optimal_idx_cal = np.argmax(f1_scores_cal)
optimal_threshold_cal = thresholds_cal[optimal_idx_cal]

# Apply New Optimal Threshold for Calibrated Model
y_pred_calibrated = (y_probs_calibrated >= optimal_threshold_cal).astype(int)
print(f"Optimal Threshold for Calibrated Model: {optimal_threshold_cal}")
print("Calibrated Model Classification Report on Holdout Data:")
print(classification_report(y_holdout, y_pred_calibrated))

Feature Names Before Training:
['Fault_Third Party', 'VehicleCategory_Sport', 'PolicyType_Sedan - Collision', 'PriceToAgeRatio', 'AgeOfVehicle', 'AccidentArea_Urban', 'MonthClaimed', 'AddressChange_Claim', 'VehiclePrice', 'BasePolicy_Collision', 'anomaly_score']
Feature Names Before Prediction:
['Fault_Third Party', 'VehicleCategory_Sport', 'PolicyType_Sedan - Collision', 'PriceToAgeRatio', 'AgeOfVehicle', 'AccidentArea_Urban', 'MonthClaimed', 'AddressChange_Claim', 'VehiclePrice', 'BasePolicy_Collision', 'anomaly_score']




Optimal Threshold: 0.4744091071907198
Final Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       0.89      0.77      0.83      2175
           1       0.80      0.91      0.85      2174

    accuracy                           0.84      4349
   macro avg       0.85      0.84      0.84      4349
weighted avg       0.85      0.84      0.84      4349





Optimal Threshold for Calibrated Model: 0.41052093245125787
Calibrated Model Classification Report on Holdout Data:
              precision    recall  f1-score   support

           0       0.92      0.74      0.82      2175
           1       0.78      0.94      0.85      2174

    accuracy                           0.84      4349
   macro avg       0.85      0.84      0.84      4349
weighted avg       0.85      0.84      0.84      4349



