In [None]:
import joblib
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Train a LightGBM binary classifier on the insurance-claims table and save it
# for the backend. The steps are: load -> encode text columns -> split ->
# train -> evaluate on the hold-out split -> persist model and encoders.

df = pd.read_csv("../data/insurance_claims.csv")

# The CSV uses a literal '?' for unknown values; mark them as missing.
df = df.replace('?', pd.NA)

# Integer-encode every text (object-dtype) column with its own LabelEncoder.
# The fitted encoders are kept, keyed by column name, because the saved model
# only understands these integer codes: whoever scores new rows must re-apply
# exactly the same mapping, and it cannot be recovered from the model file.
label_encoders = {}
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    # Casting to str first turns missing entries into an ordinary string, so
    # they end up as one more category instead of breaking the encoder.
    df[col] = df[col].astype(str)
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Text columns hold integers by now, so this only fills the gaps that remain
# in the non-text columns.
df = df.fillna(0)

TARGET = 'fraud_reported'
# 'policy_number' is excluded from the inputs along with the target itself.
features = [col for col in df.columns if col not in [TARGET, 'policy_number']]

X = df[features]
# NOTE(review): assumes the target was a text column and is therefore already
# integer-encoded by the loop above -- confirm against the CSV.
y = df[TARGET]

# 80/20 split with a fixed seed so the notebook is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

d_train = lgb.Dataset(X_train, label=y_train)
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'seed': 42
}
model = lgb.train(params, d_train, 100)  # 100 boosting rounds

# Score the hold-out rows, which were previously split off but never used, so
# the quality of the model being shipped is visible in the notebook output.
# ROC AUC is undefined when the hold-out labels contain a single class.
if y_test.nunique() > 1:
    test_auc = roc_auc_score(y_test, model.predict(X_test))
    print(f"Hold-out ROC AUC: {test_auc:.3f}")
else:
    print("Hold-out ROC AUC not computed: test labels contain a single class")

joblib.dump(model, '../backend/eligibility_model.pkl')
# Saved alongside the model so inference code can encode raw text values the
# same way they were encoded for training.
joblib.dump(label_encoders, '../backend/label_encoders.pkl')

print("Eligibility model trained and saved to backend/eligibility_model.pkl")

[LightGBM] [Info] Number of positive: 192, number of negative: 608
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001529 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2880
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.240000 -> initscore=-1.152680
[LightGBM] [Info] Start training from score -1.152680
Eligibility model trained and saved to backend/eligibility_model.pkl


In [None]:

from sklearn.ensemble import IsolationForest


# Unsupervised anomaly detector fitted on the full feature matrix X built in
# the previous cell. `contamination` is the share of rows the forest should
# treat as outliers; the fixed seed keeps the fit reproducible.
anomaly_model = IsolationForest(
    random_state=42,
    contamination=0.05,
).fit(X)

# Persist the fitted detector for the backend.
joblib.dump(value=anomaly_model, filename='../backend/fraud_model.pkl')
print("Fraud/Anomaly model trained and saved to backend/fraud_model.pkl")

Fraud/Anomaly model trained and saved to backend/fraud_model.pkl
