In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from utils.transformations import EncodedKNNTransformer, FlaggedImputerTransformer
from utils.filters import ZScoreOutlierFilter


### Cargar datos

In [2]:
# Load the preprocessed, combined dataset.
# The CSV carries a leftover row-index column that pandas names "Unnamed: 0".
# It is only a row counter, and since only `Price` is removed when the feature
# matrix is built, it would otherwise be fed to the model as a feature.
# `errors="ignore"` keeps this cell working if the file is re-exported without it.
df = pd.read_csv("data/preprocessed/combined_data.csv").drop(
    columns=["Unnamed: 0"], errors="ignore"
)
df.head()


Unnamed: 0,Price,city,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Stadium
0,8500000,Mumbai,1614,Kandivali West,3,0,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
1,3700000,Mumbai,440,Mira Road East,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2,3500000,Mumbai,890,Kalyan West,2,1,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
3,8400000,Bangalore,3138,Whitefield Hope Farm Junction,4,0,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
4,56000000,Mumbai,2200,Wadala East Wadala,3,1,1,0,0,1,...,1,1,1,0,0,0,0,0,0,0


### Separar features y variable objetivo

In [3]:
# The target vector is the `Price` column; the feature matrix is every other column.
y = df.loc[:, "Price"]
X = df.drop("Price", axis=1)


### Aplicar EncodedKNNTransformer

In [4]:
# Fit the project's EncodedKNNTransformer on the features and target, then apply it
# to both; transform() returns a (features, target) pair used by the following cells.
# NOTE(review): the transformer is fitted on the full dataset, before the train/test
# split done later in the notebook, so whatever it learns also sees the held-out rows.
# Confirm whether this leaks into the evaluation; fitting on the training split only
# would avoid it.
encoded_knn = EncodedKNNTransformer()
encoded_knn.fit(X, y)
X_encoded, y_encoded = encoded_knn.transform(X, y)


### Filtrar outliers con ZScoreOutlierFilter

In [5]:
# Apply the project's ZScoreOutlierFilter (threshold 3.0) to the encoded features and
# target; presumably it drops rows whose z-score exceeds the threshold — TODO confirm.
# NOTE(review): transform() is called without a prior fit(); presumably the filter is
# stateless — verify against utils.filters.
# NOTE(review): filtering happens before the train/test split, so the evaluation set
# is filtered as well.
zfilter = ZScoreOutlierFilter(z_thresh=3.0)
X_filtered, y_filtered = zfilter.transform(X_encoded, y_encoded)

### Entrenar modelo con HistGradientBoostingRegressor

In [6]:
# Hold out 20% of the filtered data for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42
)

# Seed the estimator too: HistGradientBoostingRegressor uses randomness for bin
# subsampling and for the internal validation split when early stopping is active
# (enabled automatically on larger datasets), so an unseeded model can give different
# metrics on every "Restart & Run All".
model = HistGradientBoostingRegressor(random_state=42)

# np.ravel works for both NumPy arrays and pandas objects and yields the 1-D target
# that fit() expects; it avoids relying on the deprecated Series.ravel method.
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)


### Evaluación del modelo

In [7]:
# Flatten both vectors before doing arithmetic on them. If the target is a column
# vector of shape (n, 1), `y_test - y_pred` would broadcast against the (n,) predictions
# into an (n, n) matrix and silently produce a meaningless MAPE.
y_true = np.ravel(y_test)
y_hat = np.ravel(y_pred)

mae = mean_absolute_error(y_true, y_hat)
rmse = np.sqrt(mean_squared_error(y_true, y_hat))

# Clamp the denominator to machine epsilon so an exact-zero target cannot yield
# inf/NaN (same convention as scikit-learn's mean_absolute_percentage_error).
mape_denominator = np.maximum(np.abs(y_true), np.finfo(np.float64).eps)
mape = np.mean(np.abs(y_true - y_hat) / mape_denominator) * 100

# NOTE(review): these metrics are computed on the target as returned by the
# preprocessing steps, which looks rescaled relative to the raw `Price` column —
# confirm the scale before comparing against other models.
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAPE: {mape:.2f}%")


MAE: 0.26
RMSE: 0.76
MAPE: 232.20%


### Conclusión

El modelo `HistGradientBoostingRegressor` ha sido entrenado utilizando las nuevas clases:
- `EncodedKNNTransformer`: para preprocesamiento avanzado.
- `ZScoreOutlierFilter`: para detección y filtrado de outliers.

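Se ha evaluado con las métricas MAE, RMSE y MAPE. Este enfoque permite comparar fácilmente el rendimiento respecto a modelos anteriores implementados en el proyecto. Nota: el MAE (0.26) y el RMSE (0.76) están expresados en la escala transformada de la variable objetivo, no en la escala original de `Price`; además, el MAPE (232.20%) probablemente se dispara porque en esa escala hay valores reales próximos a cero, por lo que no debe interpretarse como un error porcentual sobre el precio.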
