# 3. Ingeniería de Características y Selección Estadística Avanzada

En esta fase, transformaremos los hallazgos visuales del EDA en decisiones matemáticas, preparando el dataset para el modelado. A diferencia de enfoques estándar, priorizaremos la **integridad de la distribución original** de los datos, evitando técnicas de sobremuestreo sintético que puedan alterar la realidad del negocio.

## Objetivos Técnicos del Notebook:

1.  **Saneamiento y Reducción de Redundancia:**
    * Ejecución de la "Lista de Eliminación" definida en el EDA (borrado de variables con correlación > 0.90 y variables huérfanas como `torque_rpm`).

2.  **Escalado:**
    * Escalado estandar para evitar darle un peso mayor a alguna caracteristica.

3.  **Selección Estadística de Características (Feature Selection):**
    * Aplicaremos pruebas estadísticas de dependencia no lineal (**Mutual Information Classification**) para rankear matemáticamente las variables.
    * Seleccionaremos el subconjunto óptimo de columnas que maximizan la señal de información con el target `claim_status`.

4.  **Codificación (Encoding):**
    * Transformación de variables categóricas a numéricas. Priorizaremos *One-Hot Encoding* para cardinalidades bajas y *Frequency Encoding* para altas.

In [13]:
# importaciones
import os
from pathlib import Path

import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [2]:
# Routes
DATA_PATH = Path("../data")
PROCESSED_DATA_PATH = DATA_PATH / "processed"
CLEANED_DATA_PATH = DATA_PATH / "cleaned"

In [3]:
# Load dataset
try:
    df = pd.read_csv(PROCESSED_DATA_PATH / "train_folds.csv", sep=",")
    print(f"Dataset loaded successfully. Dimensions: {df.shape}")
except FileNotFoundError:
    print("Error! The file does not exist. Check the path.")

# Initial overview
df.head()

Dataset loaded successfully. Dimensions: (45358, 41)


Unnamed: 0,subscription_length,vehicle_age,customer_age,region_density,segment,model,fuel_type,airbags,is_esc,is_adjustable_steering,...,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,torque_nm,torque_rpm,power_bhp,power_rpm,claim_status,kfold
0,5.7,0.2,48,8794,C2,M4,Diesel,6,1,1,...,0,1,1,3,250.0,2750.0,113.45,4000.0,0,2
1,0.7,0.0,44,73430,A,M1,CNG,2,0,0,...,0,0,1,0,60.0,3500.0,40.36,6000.0,0,1
2,3.8,1.6,38,6112,C2,M4,Diesel,6,1,1,...,0,1,1,3,250.0,2750.0,113.45,4000.0,0,3
3,1.1,1.4,52,8794,C2,M4,Diesel,6,1,1,...,0,1,1,3,250.0,2750.0,113.45,4000.0,0,3
4,10.1,0.4,52,8794,B2,M6,Petrol,2,0,1,...,1,1,1,2,113.0,4400.0,88.5,6000.0,0,2


In [5]:
# Feature Selection - Drop Redundant and Low-Variance Columns
cols_to_drop = [
    # High correlation with other features
    "displacement",
    "cylinder",
    "width",
    "turning_radius",
    "torque_rpm",
    "torque_nm",
    # Simiular binary features
    "is_ecw",
    "is_power_door_locks",
    "is_rear_window_washer",
    "is_rear_window_wiper",
    "is_tpms",
    "is_driver_seat_height_adjustable",
    # null variance
    "is_speed_alert",
]

print(f"Dimensions before reduction: {df.shape}")
df_clean = df.drop(
    columns=[c for c in cols_to_drop if c in df.columns], errors="ignore"
)
print(f"Dimensions after reduction: {df_clean.shape}")
df_clean.head()

Dimensions before reduction: (45358, 41)
Dimensions after reduction: (45358, 28)


Unnamed: 0,subscription_length,vehicle_age,customer_age,region_density,segment,model,fuel_type,airbags,is_esc,is_adjustable_steering,...,is_rear_window_defogger,is_brake_assist,is_central_locking,is_power_steering,is_day_night_rear_view_mirror,ncap_rating,power_bhp,power_rpm,claim_status,kfold
0,5.7,0.2,48,8794,C2,M4,Diesel,6,1,1,...,1,1,1,1,0,3,113.45,4000.0,0,2
1,0.7,0.0,44,73430,A,M1,CNG,2,0,0,...,0,0,0,1,0,0,40.36,6000.0,0,1
2,3.8,1.6,38,6112,C2,M4,Diesel,6,1,1,...,1,1,1,1,0,3,113.45,4000.0,0,3
3,1.1,1.4,52,8794,C2,M4,Diesel,6,1,1,...,1,1,1,1,0,3,113.45,4000.0,0,3
4,10.1,0.4,52,8794,B2,M6,Petrol,2,0,1,...,0,1,1,1,1,2,88.5,6000.0,0,2


In [6]:
# Standar scale numerical features

continuous_vars = [
    "subscription_length",
    "vehicle_age",
    "power_bhp",
    "power_rpm",
    "customer_age",
    "region_density",
    "gross_weight",
    "length",
]

scaler = StandardScaler()
df_clean[continuous_vars] = scaler.fit_transform(df_clean[continuous_vars])
df_clean.head()

Unnamed: 0,subscription_length,vehicle_age,customer_age,region_density,segment,model,fuel_type,airbags,is_esc,is_adjustable_steering,...,is_rear_window_defogger,is_brake_assist,is_central_locking,is_power_steering,is_day_night_rear_view_mirror,ncap_rating,power_bhp,power_rpm,claim_status,kfold
0,-0.109439,-1.061265,0.446136,-0.570611,C2,M4,Diesel,6,1,1,...,1,1,1,1,0,3,1.239095,-1.4199,0,2
1,-1.321181,-1.237486,-0.127073,3.093352,A,M1,CNG,2,0,0,...,0,0,0,1,0,0,-1.41792,0.760612,0,1
2,-0.569901,0.172286,-0.986886,-0.722643,C2,M4,Diesel,6,1,1,...,1,1,1,1,0,3,1.239095,-1.4199,0,3
3,-1.224242,-0.003936,1.019345,-0.570611,C2,M4,Diesel,6,1,1,...,1,1,1,1,0,3,1.239095,-1.4199,0,3
4,0.956893,-0.885043,1.019345,-0.570611,B2,M6,Petrol,2,0,1,...,0,1,1,1,1,2,0.332096,0.760612,0,2


In [7]:
df_clean.dtypes

subscription_length              float64
vehicle_age                      float64
customer_age                     float64
region_density                   float64
segment                              str
model                                str
fuel_type                            str
airbags                            int64
is_esc                             int64
is_adjustable_steering             int64
is_parking_sensors                 int64
is_parking_camera                  int64
rear_brakes_type                     str
transmission_type                    str
steering_type                        str
length                           float64
gross_weight                     float64
is_front_fog_lights                int64
is_rear_window_defogger            int64
is_brake_assist                    int64
is_central_locking                 int64
is_power_steering                  int64
is_day_night_rear_view_mirror      int64
ncap_rating                        int64
power_bhp       

In [8]:
# Mutual Information Classification for Feature Selection
X = df_clean.drop(columns=["claim_status", "kfold"])
y = df_clean["claim_status"]

# ordinal encoding for categorical variables
categorical_cols = X.select_dtypes(
    include=["object", "category", "str"]
).columns.tolist()
encoder = OrdinalEncoder()
X[categorical_cols] = encoder.fit_transform(X[categorical_cols])

# Calculate Mutual Information Scores
discrete_features = [True if str(dtype) == "int64" else False for dtype in X.dtypes]
mi_scores = mutual_info_classif(
    X, y, discrete_features=discrete_features, random_state=42
)
mi_series = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

# Select top N features
N = 20
print(f"Top {N} features by Mutual Information Score:")
top_features = mi_series.head(N).index.tolist()
print(top_features)
df_final = df_clean[top_features + ["claim_status", "kfold"]]
print(f"Dimensions after feature selection: {df_final.shape}")
df_final.head()

Top 20 features by Mutual Information Score:
['rear_brakes_type', 'transmission_type', 'steering_type', 'fuel_type', 'subscription_length', 'power_bhp', 'vehicle_age', 'segment', 'customer_age', 'model', 'region_density', 'gross_weight', 'is_adjustable_steering', 'is_parking_sensors', 'power_rpm', 'is_front_fog_lights', 'is_brake_assist', 'is_day_night_rear_view_mirror', 'airbags', 'ncap_rating']
Dimensions after feature selection: (45358, 22)


Unnamed: 0,rear_brakes_type,transmission_type,steering_type,fuel_type,subscription_length,power_bhp,vehicle_age,segment,customer_age,model,...,is_adjustable_steering,is_parking_sensors,power_rpm,is_front_fog_lights,is_brake_assist,is_day_night_rear_view_mirror,airbags,ncap_rating,claim_status,kfold
0,Disc,Automatic,Power,Diesel,-0.109439,1.239095,-1.061265,C2,0.446136,M4,...,1,1,-1.4199,1,1,0,6,3,0,2
1,Drum,Manual,Power,CNG,-1.321181,-1.41792,-1.237486,A,-0.127073,M1,...,0,1,0.760612,0,0,0,2,0,0,1
2,Disc,Automatic,Power,Diesel,-0.569901,1.239095,0.172286,C2,-0.986886,M4,...,1,1,-1.4199,1,1,0,6,3,0,3
3,Disc,Automatic,Power,Diesel,-1.224242,1.239095,-0.003936,C2,1.019345,M4,...,1,1,-1.4199,1,1,0,6,3,0,3
4,Drum,Manual,Electric,Petrol,0.956893,0.332096,-0.885043,B2,1.019345,M6,...,1,1,0.760612,1,1,1,2,2,0,2


In [9]:
importance_df = mi_series.head(N).to_frame(name="MI_Score")

print("--- Importance ranking ---")
display(importance_df)

--- Importance ranking ---


Unnamed: 0,MI_Score
rear_brakes_type,0.008515
transmission_type,0.007622
steering_type,0.006931
fuel_type,0.004039
subscription_length,0.003925
power_bhp,0.003394
vehicle_age,0.002808
segment,0.002179
customer_age,0.000956
model,0.000867


In [10]:
THRESHOLD = 0.00002

# Filter features based on threshold
selected_features_names = mi_series[mi_series > THRESHOLD].index.tolist()

print(f"Automatically selected variables (Score > {THRESHOLD}):")
print(selected_features_names)
print(f"Automatically discarded variables (Score <= {THRESHOLD}):")
print(mi_series[mi_series <= THRESHOLD].index.tolist())

# Create the base dataset with the selected features + target
df_final = df_final[selected_features_names + ["claim_status", "kfold"]].copy()
df_final.head()

Automatically selected variables (Score > 2e-05):
['rear_brakes_type', 'transmission_type', 'steering_type', 'fuel_type', 'subscription_length', 'power_bhp', 'vehicle_age', 'segment', 'customer_age', 'model', 'region_density', 'gross_weight', 'is_adjustable_steering', 'is_parking_sensors', 'power_rpm', 'is_front_fog_lights', 'is_brake_assist']
Automatically discarded variables (Score <= 2e-05):
['is_day_night_rear_view_mirror', 'airbags', 'ncap_rating', 'is_esc', 'is_central_locking', 'is_rear_window_defogger', 'is_parking_camera', 'is_power_steering', 'length']


Unnamed: 0,rear_brakes_type,transmission_type,steering_type,fuel_type,subscription_length,power_bhp,vehicle_age,segment,customer_age,model,region_density,gross_weight,is_adjustable_steering,is_parking_sensors,power_rpm,is_front_fog_lights,is_brake_assist,claim_status,kfold
0,Disc,Automatic,Power,Diesel,-0.109439,1.239095,-1.061265,C2,0.446136,M4,-0.570611,1.569567,1,1,-1.4199,1,1,0,2
1,Drum,Manual,Power,CNG,-1.321181,-1.41792,-1.237486,A,-0.127073,M1,3.093352,-0.952365,0,1,0.760612,0,0,0,1
2,Disc,Automatic,Power,Diesel,-0.569901,1.239095,0.172286,C2,-0.986886,M4,-0.722643,1.569567,1,1,-1.4199,1,1,0,3
3,Disc,Automatic,Power,Diesel,-1.224242,1.239095,-0.003936,C2,1.019345,M4,-0.570611,1.569567,1,1,-1.4199,1,1,0,3
4,Drum,Manual,Electric,Petrol,0.956893,0.332096,-0.885043,B2,1.019345,M6,-0.570611,-0.245281,1,1,0.760612,1,1,0,2


In [11]:
# ordinal Encoding for High Cardinality Categorical Features
segment_map = {"A": 0, "B1": 1, "B2": 2, "C1": 3, "C2": 4, "Utility": 5}
df_final["segment"] = df_final["segment"].map(segment_map)

# one-hot encoding for Low Cardinality Categorical Features
nominal_cols = [
    "fuel_type",
    "model",
    "steering_type",
    "rear_brakes_type",
    "transmission_type",
]
df_final = pd.get_dummies(df_final, columns=nominal_cols, drop_first=True, dtype=int)
print(f"Dimensions after encoding: {df_final.shape}")
df_final.head()

Dimensions after encoding: (45358, 30)


Unnamed: 0,subscription_length,power_bhp,vehicle_age,segment,customer_age,region_density,gross_weight,is_adjustable_steering,is_parking_sensors,power_rpm,...,model_M4,model_M5,model_M6,model_M7,model_M8,model_M9,steering_type_Manual,steering_type_Power,rear_brakes_type_Drum,transmission_type_Manual
0,-0.109439,1.239095,-1.061265,4,0.446136,-0.570611,1.569567,1,1,-1.4199,...,1,0,0,0,0,0,0,1,0,0
1,-1.321181,-1.41792,-1.237486,0,-0.127073,3.093352,-0.952365,0,1,0.760612,...,0,0,0,0,0,0,0,1,1,1
2,-0.569901,1.239095,0.172286,4,-0.986886,-0.722643,1.569567,1,1,-1.4199,...,1,0,0,0,0,0,0,1,0,0
3,-1.224242,1.239095,-0.003936,4,1.019345,-0.570611,1.569567,1,1,-1.4199,...,1,0,0,0,0,0,0,1,0,0
4,0.956893,0.332096,-0.885043,2,1.019345,-0.570611,-0.245281,1,1,0.760612,...,0,0,1,0,0,0,0,0,1,1


In [14]:
# save cleaned dataset
os.makedirs(CLEANED_DATA_PATH, exist_ok=True)
df_final.to_csv(CLEANED_DATA_PATH / "cleaned_train_folds.csv", index=False)