In [1]:
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression

# Train and persist a logistic-regression model on the car-insurance claims data.
# Steps: pick feature columns -> read inputs -> build labels -> transform -> fit -> save -> report.

# Feature columns, grouped by how they are treated. The combined order
# (categorical first, then numerical) is the order handed to the preprocessor.
categorical_cols = [
    'AGE', 'GENDER', 'RACE', 'DRIVING_EXPERIENCE',
    'EDUCATION', 'INCOME', 'VEHICLE_YEAR', 'VEHICLE_TYPE',
]
numerical_cols = [
    'CREDIT_SCORE', 'VEHICLE_OWNERSHIP', 'MARRIED', 'CHILDREN',
    'ANNUAL_MILEAGE', 'SPEEDING_VIOLATIONS', 'DUIS', 'PAST_ACCIDENTS',
]

# Raw claims table.
df = pd.read_csv('../data/raw/Car_Insurance_Claim.csv')

# Preprocessor persisted earlier; it is only applied here (transform), never fitted.
preprocessor = joblib.load('../models/preprocessor.pkl')

# Feature frame restricted to the columns listed above.
X = df[[*categorical_cols, *numerical_cols]]

# Target, flipped so that it follows the SOW convention:
#   1 = low-risk (approved), 0 = high-risk (denied).
# rsub(1) computes 1 - OUTCOME element-wise.
# NOTE(review): assumes OUTCOME == 1 marks the high-risk case in the raw file — confirm.
y = df['OUTCOME'].rsub(1)

# Apply the loaded preprocessor to get the model-ready feature matrix.
X_transformed = preprocessor.transform(X)

# Fit with scikit-learn's default LogisticRegression settings (fit returns the estimator).
model = LogisticRegression().fit(X_transformed, y)

# Persist the fitted estimator alongside the preprocessor.
joblib.dump(model, '../models/logistic_regression_model.pkl')

# Quick sanity report: width of the transformed matrix and class balance after inversion.
print(f"Model trained on {X_transformed.shape[1]} features")
print("Label distribution:", y.value_counts())

Model trained on 31 features
Label distribution: OUTCOME
1.0    6867
0.0    3133
Name: count, dtype: int64
