## Step 1.1 — Load Cleaned Dataset

We'll start by loading the processed dataset created after data cleaning and EDA.
This dataset has no missing values and is ready for transformation.


In [3]:
import pandas as pd

# Load cleaned data
df = pd.read_csv("../data/processed/car_insurance_cleaned.csv")

print("Shape:", df.shape)
df.head()


Shape: (10302, 27)


Unnamed: 0,ID,KIDSDRIV,BIRTH,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,63581743,0,16MAR39,60.0,0,11.0,67349.0,0,0.0,no,...,minivan,1,4461.0,2,0,3.0,0.0,18.0,0,highly urban/ urban
1,132761049,0,21JAN56,43.0,0,11.0,91449.0,0,257252.0,no,...,minivan,1,0.0,0,0,0.0,0.0,1.0,0,highly urban/ urban
2,921317019,0,18NOV51,48.0,0,11.0,52881.0,0,0.0,no,...,van,1,0.0,0,0,2.0,0.0,10.0,0,highly urban/ urban
3,727598473,0,05MAR64,35.0,1,10.0,16039.0,0,124191.0,yes,...,suv,0,11618.75,2,0,3.0,0.0,10.0,0,highly urban/ urban
4,450221861,0,05JUN48,51.0,0,14.0,53529.0,0,306251.0,yes,...,minivan,1,0.0,0,0,0.0,0.0,6.0,0,highly urban/ urban


## Step 1.2 — Drop Target Leakage and Irrelevant Columns

The column `CLM_AMT` directly leaks target information and must be excluded.
We'll also remove `ID` or other purely identifier columns.


In [6]:
# Drop leakage and ID columns
df.drop(columns=["CLM_AMT", "ID"], inplace=True, errors="ignore")
df.shape


(10302, 25)

## Step 1.3 — Separate Features (X) and Target (y)
We'll split the dataset into:
- `X` → independent variables
- `y` → dependent variable (`CLAIM_FLAG`)


In [11]:
X = df.drop(columns=["CLAIM_FLAG"])
y = df["CLAIM_FLAG"]

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (10302, 24)
y shape: (10302,)


## Step 1.4 — Identify Categorical and Numerical Features

We'll categorize features based on their data types for encoding and scaling.


In [17]:
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=["number"]).columns.tolist()

print("Categorical columns:", cat_cols)
print("Numerical columns:", num_cols)


Categorical columns: ['BIRTH', 'MSTATUS', 'GENDER', 'EDUCATION', 'OCCUPATION', 'CAR_USE', 'CAR_TYPE', 'URBANICITY']
Numerical columns: ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'INCOME', 'PARENT1', 'HOME_VAL', 'TRAVTIME', 'BLUEBOOK', 'TIF', 'RED_CAR', 'OLDCLAIM', 'CLM_FREQ', 'REVOKED', 'MVR_PTS', 'CAR_AGE']


## Step 1.5 — Encode Categorical Variables

We’ll apply One-Hot Encoding to categorical columns using `ColumnTransformer`
to convert them into numeric format suitable for machine learning models.


In [22]:
# after X_encoded = encoder.fit_transform(X)
import pandas as pd
import numpy as np

# get the fitted OneHotEncoder
ohe = encoder.named_transformers_['cat']

# 1) encoded cat feature names (safe method)
try:
    ohe_feature_names = ohe.get_feature_names_out(cat_cols).tolist()
except Exception:
    # fallback for old sklearn versions (build names manually)
    categories = ohe.categories_
    ohe_feature_names = []
    for col, cats in zip(cat_cols, categories):
        ohe_feature_names += [f"{col}__{str(c)}" for c in cats]

# 2) passthrough columns (original numeric features)
passthrough_cols = [c for c in X.columns if c not in cat_cols]

# 3) final feature name order: transformers order in ColumnTransformer
#    by default columntransformer applies transformers in the order defined,
#    then appends remainder (passthrough) columns.
feature_names = ohe_feature_names + passthrough_cols

# 4) convert array -> DataFrame with these column names
X_encoded = pd.DataFrame(X_encoded, columns=feature_names, index=X.index)

print("Encoded feature shape:", X_encoded.shape)
X_encoded.head()


Encoded feature shape: (10302, 6604)


Unnamed: 0,BIRTH_01APR39,BIRTH_01APR40,BIRTH_01APR41,BIRTH_01APR44,BIRTH_01APR45,BIRTH_01APR46,BIRTH_01APR47,BIRTH_01APR50,BIRTH_01APR51,BIRTH_01APR52,...,HOME_VAL,TRAVTIME,BLUEBOOK,TIF,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CAR_AGE
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,14.0,14230.0,11.0,1.0,4461.0,2.0,0.0,3.0,18.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,257252.0,22.0,14940.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,26.0,21970.0,1.0,1.0,0.0,0.0,0.0,2.0,10.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,124191.0,5.0,4010.0,4.0,0.0,11618.75,2.0,0.0,3.0,10.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,306251.0,32.0,15440.0,7.0,1.0,0.0,0.0,0.0,0.0,6.0


In [None]:
# -------------------------
# Scale numeric columns in X_encoded (only original numeric features)
# -------------------------

from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Ensure X_encoded is a DataFrame (built earlier)
# X_encoded = pd.DataFrame(...)

# `num_cols` should be the list of original numeric columns from X before encoding,
# e.g. num_cols = X.select_dtypes(include=["number"]).columns.tolist()
# If you don't have it, you can compute it again from original X (if still available).
try:
    num_cols  # noqa: F821
except NameError:
    # If X (original features DataFrame) still exists in scope, recompute
    if 'X' in globals():
        num_cols = X.select_dtypes(include=["number"]).columns.tolist()
    else:
        num_cols = []  # fallback empty list

# Determine which of those original numeric columns are present in X_encoded
# (they are the passthrough columns added after one-hot encoding).
numeric_to_scale = [c for c in X_encoded.columns if c in num_cols]

print("Numeric columns detected for scaling:", numeric_to_scale)

# If there are no numeric columns to scale, skip scaling
if len(numeric_to_scale) == 0:
    print("No numeric columns to scale. Skipping StandardScaler.")
    X_scaled = X_encoded.copy()
else:
    scaler = StandardScaler()

    # Fit scaler on numeric columns and transform
    X_scaled = X_encoded.copy()
    X_scaled[numeric_to_scale] = scaler.fit_transform(X_encoded[numeric_to_scale])

    # Check results
    print("Scaled numeric columns — new means (approx):")
    print(pd.DataFrame(X_scaled[numeric_to_scale].mean().round(6), columns=["mean"]))
    print("\nScaled numeric columns — new stds (approx):")
    print(pd.DataFrame(X_scaled[numeric_to_scale].std().round(6), columns=["std"]))

# -------------------------
# Optional: drop highly correlated features (absolute corr > 0.90)
# -------------------------
# This step reduces multicollinearity by removing one of highly correlated pairs.
# Set threshold variable to change sensitivity if desired.
corr_threshold = 0.90

# Compute correlation matrix on numeric columns only (this includes scaled numeric + one-hot numeric)
corr_matrix = X_scaled.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find columns with correlation greater than threshold
to_drop = [col for col in upper.columns if any(upper[col] > corr_threshold)]

print("\nHighly correlated features to drop (abs(corr) > {:.2f}):".format(corr_threshold))
print(to_drop)

# Create final dataframe after dropping (do not modify X_scaled in-place unless you want to)
X_final = X_scaled.drop(columns=to_drop, errors="ignore")
print("\nX_final shape after dropping correlated features:", X_final.shape)

# -------------------------
# Save final features + target (y must exist in scope)
# -------------------------
if 'y' in globals():
    out_df = X_final.copy()
    out_df["CLAIM_FLAG"] = y.values  # append target
    output_path = "../data/processed/car_insurance_final_features.csv"
    out_df.to_csv(output_path, index=False)
    print(f"\n✅ Final dataset saved to: {output_path}")
else:
    print("\n⚠️ 'y' (target) not found in scope. Skipping save. If you want to save, ensure y exists and rerun.")


Numeric columns detected for scaling: ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'INCOME', 'PARENT1', 'HOME_VAL', 'TRAVTIME', 'BLUEBOOK', 'TIF', 'RED_CAR', 'OLDCLAIM', 'CLM_FREQ', 'REVOKED', 'MVR_PTS', 'CAR_AGE']
Scaled numeric columns — new means (approx):
          mean
KIDSDRIV   0.0
AGE        0.0
HOMEKIDS  -0.0
YOJ       -0.0
INCOME    -0.0
PARENT1    0.0
HOME_VAL   0.0
TRAVTIME   0.0
BLUEBOOK   0.0
TIF        0.0
RED_CAR    0.0
OLDCLAIM   0.0
CLM_FREQ  -0.0
REVOKED    0.0
MVR_PTS    0.0
CAR_AGE    0.0

Scaled numeric columns — new stds (approx):
               std
KIDSDRIV  1.000049
AGE       1.000049
HOMEKIDS  1.000049
YOJ       1.000049
INCOME    1.000049
PARENT1   1.000049
HOME_VAL  1.000049
TRAVTIME  1.000049
BLUEBOOK  1.000049
TIF       1.000049
RED_CAR   1.000049
OLDCLAIM  1.000049
CLM_FREQ  1.000049
REVOKED   1.000049
MVR_PTS   1.000049
CAR_AGE   1.000049
