In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

# Load the dataset you downloaded
df = pd.read_csv("../data/insurance_claims.csv") # Adjust path

# --- Feature Engineering ---
# For this dataset, let's predict 'fraud_reported' as our "eligibility"
# In a real case, this might be a different target.
df = df.replace('?', pd.NA)

# Simple preprocessing
# Convert categorical variables to numbers
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].astype(str) # Handle mixed types
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Fill missing values (simple)
df = df.fillna(0)

# Define features (X) and target (y)
TARGET = 'fraud_reported' # 0 = Eligible (No Fraud), 1 = Not Eligible (Fraud)
features = [col for col in df.columns if col not in [TARGET, 'policy_number']]

X = df[features]
y = df[TARGET]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train LightGBM model
d_train = lgb.Dataset(X_train, label=y_train)
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'seed': 42
}
model = lgb.train(params, d_train, 100)

# Save model to disk
joblib.dump(model, '../backend/eligibility_model.pkl')

print("Eligibility model trained and saved to backend/eligibility_model.pkl")

[LightGBM] [Info] Number of positive: 192, number of negative: 608
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001529 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2880
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.240000 -> initscore=-1.152680
[LightGBM] [Info] Start training from score -1.152680
Eligibility model trained and saved to backend/eligibility_model.pkl


In [2]:
# ... (at the end of the notebook)
from sklearn.ensemble import IsolationForest

# Use the same features X from before
# Isolation Forest doesn't need a target (y)
anomaly_model = IsolationForest(contamination=0.05, random_state=42) # Assume 5% of data are anomalies
anomaly_model.fit(X)

# Save the model
joblib.dump(anomaly_model, '../backend/fraud_model.pkl')
print("Fraud/Anomaly model trained and saved to backend/fraud_model.pkl")

Fraud/Anomaly model trained and saved to backend/fraud_model.pkl
