In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from utils.transformations import FlaggedImputerTransformer
from utils.filters import ZScoreOutlierFilter


### Cargar datos

In [None]:
# Read the preprocessed, combined dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/preprocessed/combined_data.csv")
df.head(n=5)


### Separar features y variable objetivo

In [8]:
# The target is the "Price" column; every remaining column is kept as a feature.
y = df.loc[:, "Price"]
X = df.drop(labels=["Price"], axis="columns")


### Aplicar FlaggedImputerTransformer

In [10]:
# Build the project's FlaggedImputerTransformer with its default settings and fit it
# on the full feature matrix and target. According to this notebook's conclusion, the
# transformer performs median imputation, adds missing-value indicator columns and adds
# a square-root expansion of 'Area' (the implementation lives in utils.transformations
# and is not visible here).
# NOTE(review): the fit runs before the train/test split performed further down, so the
# learned statistics presumably include rows that later land in the test set — confirm
# that this is intended.
transformer = FlaggedImputerTransformer()
transformer.fit(X, y)
# transform() receives both X and y and its result is unpacked into a
# (features, target) pair; presumably y is passed so both stay row-aligned — TODO confirm.
X_transformed, y_transformed = transformer.transform(X, y)


### Filtrar outliers con ZScoreOutlierFilter

In [11]:
# Outlier filtering step (per the section heading) using the project's
# ZScoreOutlierFilter with z_thresh=3.0 — presumably rows whose z-score exceeds 3 are
# removed; confirm against utils.filters.
# transform() is called directly, without a prior fit() call in this notebook, and its
# result is unpacked into a (features, target) pair.
# NOTE(review): the filter is applied to the whole dataset before the train/test split,
# so rows that end up in the test set are filtered as well.
zfilter = ZScoreOutlierFilter(z_thresh=3.0)
X_filtered, y_filtered = zfilter.transform(X_transformed, y_transformed)


In [12]:
# One-hot encode the categorical variables once the outlier filtering is done.
# drop_first=True omits the first level of each encoded variable.
X_encoded = pd.get_dummies(data=X_filtered, drop_first=True)


### Entrenar modelo con HistGradientBoostingRegressor

In [13]:
# Hold out 20% of the encoded, filtered rows for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_filtered, test_size=0.2, random_state=42)

# Seed the model as well: HistGradientBoostingRegressor uses random_state for bin
# subsampling and for the internal validation split when early stopping is active, so an
# unseeded model can give different metrics on every run.
model = HistGradientBoostingRegressor(random_state=42)

# np.ravel() returns a 1-D ndarray for a Series and an ndarray alike, which avoids
# calling the deprecated pandas Series.ravel() method.
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)


  model.fit(X_train, y_train.ravel())


### Evaluación del modelo

In [14]:
# Flatten the held-out targets to a 1-D ndarray once, so every element-wise operation
# against the ndarray of predictions is positional and cannot broadcast to 2-D.
y_true = np.ravel(y_test)

mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

# MAPE is undefined where the true value is 0: dividing there yields inf/NaN and ruins
# the mean. Those rows are excluded; the result is NaN if no non-zero target remains.
nonzero = y_true != 0
if nonzero.any():
    mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
else:
    mape = float("nan")

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAPE: {mape:.2f}%")


MAE: 6187120.66
RMSE: 17904240.58
MAPE: 78.73%


### Conclusión

Este segundo experimento aplica el preprocesador `FlaggedImputerTransformer`, que realiza:
- Imputación con la mediana
- Creación de indicadores de missing
- Expansión no lineal con raíz cuadrada de 'Area'

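Combinado con `ZScoreOutlierFilter` y el modelo `HistGradientBoostingRegressor`, este experimento obtiene, en la ejecución registrada en este notebook, MAE = 6187120.66, RMSE = 17904240.58 y MAPE = 78.73%. Estos valores permiten comparar si esta estrategia funciona mejor o peor que la anterior.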
