In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import os

# Read the raw claims table and discard the two columns that are not part of
# any feature group below.
df = pd.read_csv('../data/raw/Car_Insurance_Claim.csv').drop(columns=['ID', 'POSTAL_CODE'])

# Column groups; each group is routed to its own branch of the preprocessor.
categorical_features = [
    'AGE',
    'GENDER',
    'RACE',
    'DRIVING_EXPERIENCE',
    'EDUCATION',
    'INCOME',
    'VEHICLE_YEAR',
    'VEHICLE_TYPE',
]
numerical_features = [
    'CREDIT_SCORE',
    'ANNUAL_MILEAGE',
    'SPEEDING_VIOLATIONS',
    'DUIS',
    'PAST_ACCIDENTS',
]
binary_features = [
    'VEHICLE_OWNERSHIP',
    'MARRIED',
    'CHILDREN',
]
# Name of the label column that is separated from the features later on.
target = 'OUTCOME'

# Preprocessing pipeline
# Numerical branch: fill missing values with the column mean, then standardize.
num_imputer = SimpleImputer(strategy='mean')
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
cat_imputer = SimpleImputer(strategy='most_frequent')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('imputer', num_imputer), ('scaler', StandardScaler())]), numerical_features),
        # handle_unknown='ignore': a category not seen during fit does not raise at transform time.
        ('cat', Pipeline(steps=[('imputer', cat_imputer), ('encoder', OneHotEncoder(handle_unknown='ignore'))]), categorical_features),
        # NOTE(review): passthrough columns are neither imputed nor scaled;
        # assumes the binary columns contain no missing values - confirm.
        ('bin', 'passthrough', binary_features),
    ],
    # OneHotEncoder emits a sparse matrix, and ColumnTransformer keeps the stacked
    # result sparse whenever its overall density falls below `sparse_threshold`
    # (default 0.3). The output is later wrapped in a pandas DataFrame, which needs
    # a dense 2-D array, so force dense output instead of relying on the heuristic.
    sparse_threshold=0,
)

# Separate the label column from the feature columns, then fit the transformer
# on the features and transform them in a single pass.
# NOTE(review): the transformer is fitted on the full dataset; if a train/test
# split happens downstream, the imputation/scaling statistics will have seen
# the test rows - confirm this is intended.
y = df[target]
X = df.drop(columns=target)
X_preprocessed = preprocessor.fit_transform(X)

# Recover column labels in the order the transformer stacks its branches:
# numerical columns, one-hot encoded categorical columns, passthrough binaries.
cat_branch = preprocessor.named_transformers_['cat']
cat_encoder = cat_branch.named_steps['encoder']
cat_feature_names = cat_encoder.get_feature_names_out(categorical_features)
feature_names = [*numerical_features, *cat_feature_names, *binary_features]

# Wrap the transformed matrix in a labelled frame and append the target as the
# last column. `.values` is used so the labels are attached positionally
# rather than aligned on the index.
preprocessed_df = pd.DataFrame(
    data=X_preprocessed,
    columns=feature_names,
).assign(OUTCOME=y.values)

# Save preprocessed data
# Neither DataFrame.to_csv nor joblib.dump creates missing parent directories,
# so make sure both output folders exist before writing (no-op if they already do).
os.makedirs('../data/processed', exist_ok=True)
preprocessed_df.to_csv('../data/processed/preprocessed_data.csv', index=False)

# Save preprocessor
# The fitted ColumnTransformer is persisted so the same transformation can be
# re-applied later without refitting.
os.makedirs('../models', exist_ok=True)
joblib.dump(preprocessor, '../models/preprocessor.pkl')

# Show the first rows of the transformed frame, then the full list of output
# column names. Each call prints a heading followed by the value on its own line.
print("Preprocessed Data Sample (First 5 Rows):", preprocessed_df.head(), sep="\n")
print("\nFeature Names:", feature_names, sep="\n")

Preprocessed Data Sample (First 5 Rows):
   CREDIT_SCORE  ANNUAL_MILEAGE  SPEEDING_VIOLATIONS     DUIS  PAST_ACCIDENTS  \
0      0.865914        0.113057            -0.661462 -0.43102       -0.639263   
1     -1.208879        1.605576            -0.661462 -0.43102       -0.639263   
2     -0.173367       -0.260073            -0.661462 -0.43102       -0.639263   
3     -2.369485       -0.260073            -0.661462 -0.43102       -0.639263   
4     -0.974770        0.113057             0.230657 -0.43102       -0.034072   

   AGE_16-25  AGE_26-39  AGE_40-64  AGE_65+  GENDER_female  ...  \
0        0.0        0.0        0.0      1.0            1.0  ...   
1        1.0        0.0        0.0      0.0            0.0  ...   
2        1.0        0.0        0.0      0.0            1.0  ...   
3        1.0        0.0        0.0      0.0            0.0  ...   
4        0.0        1.0        0.0      0.0            0.0  ...   

   INCOME_upper class  INCOME_working class  VEHICLE_YEAR_after 2015 