José Delgado

## Save Models

In [None]:
import dill

In [232]:
import os
import traceback

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [233]:
def y_transform(y):
    """
    Coerce the target to numbers and return it as a column vector.

    Accepts y shaped (n,) or (n, 1). Entries that cannot be parsed as numbers
    become NaN and are then replaced by the mean of the parseable entries.

    Returns a NumPy array of shape (n, 1).
    """
    # Flatten first: pd.Series only accepts 1-D data.
    numeric = pd.to_numeric(pd.Series(np.ravel(y)), errors="coerce")
    filled = numeric.fillna(numeric.mean())

    # Hand back a 2-D column (n, 1) rather than a flat (n,) vector.
    return filled.to_numpy().reshape(-1, 1)

def y_inverse_transform(y):
    """
    Shape-only inverse used alongside ``y_transform``.

    There is nothing to undo numerically, so the values are left as they are:
    a 1-D array (n,) is reshaped to a column (n, 1) and any other input is
    returned untouched.
    """
    return y.reshape(-1, 1) if y.ndim == 1 else y

In [234]:
class Preprocessor(BaseEstimator, TransformerMixin):
    """
    Turn the raw house-price columns into a fully numeric feature frame.

    Steps applied by ``transform``:
      * ``TransactionDate`` (text of the form "<year>.<month>") is split into
        integer ``TransactionYear`` / ``TransactionMonth`` columns and dropped.
      * ``HouseAge``, ``DistanceToStation`` and ``NumberOfPubs`` are imputed
        with a default ``SimpleImputer`` and then standardised.
      * ``PostCode`` is cast to text and one-hot encoded; categories that were
        not seen during ``fit`` are ignored by the encoder.

    Rows are never dropped here: a transformer only returns ``X``, so removing
    rows would leave it misaligned with ``y``.
    """

    # Numeric columns that are imputed and then scaled.
    _NUMERIC_COLUMNS = ["HouseAge", "DistanceToStation", "NumberOfPubs"]
    # Categorical columns that are one-hot encoded.
    _CATEGORICAL_COLUMNS = ["PostCode"]

    def _categorical_frame(self, X):
        """Return the categorical columns cast to pandas' ``string`` dtype."""
        # fit() and transform() must both go through this cast. If the encoder
        # learned e.g. float categories while transform() supplied strings,
        # every value would count as "unknown" and encode to all zeros.
        return X[self._CATEGORICAL_COLUMNS].astype("string")

    def fit(self, X, y=None):
        """
        Learn the imputation, scaling and one-hot statistics from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the numeric columns and ``PostCode``.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Create and fit the imputer.
        self.imputer = SimpleImputer()
        imputed = self.imputer.fit_transform(X[self._NUMERIC_COLUMNS])

        # Fit the scaler on the imputed values, because that is exactly what
        # transform() passes to it.
        self.scaler = StandardScaler()
        self.scaler.fit(imputed)

        # Create and fit the one-hot encoder on the string-cast categories.
        self.onehot = OneHotEncoder(handle_unknown="ignore")
        self.onehot.fit(self._categorical_frame(X))

        return self

    def transform(self, X):
        """
        Apply the fitted preprocessing to ``X`` and return a new DataFrame.

        The input frame is not modified. The result keeps the index of ``X``,
        has ``TransactionDate`` and ``PostCode`` removed, and gains
        ``TransactionYear``, ``TransactionMonth`` and one integer 0/1 column
        per encoder feature name.
        """
        # astype returns a new frame, so the caller's DataFrame is left intact.
        X = X.astype({"PostCode": "string", "TransactionDate": "string"})

        # Split "TransactionDate" into year and month columns.
        # NOTE(review): if read_csv parsed this column as a float, a value such
        # as 2020.10 would already have become 2020.1 and be read as month 1 -
        # confirm the column is loaded as text.
        transaction_date = X["TransactionDate"]
        X["TransactionYear"] = transaction_date.apply(lambda value: int(value.split(".")[0]))
        X["TransactionMonth"] = transaction_date.apply(lambda value: int(value.split(".")[1]))
        X = X.drop(columns=["TransactionDate"])

        # Impute, then scale, the numeric columns.
        numeric = self.imputer.transform(X[self._NUMERIC_COLUMNS])
        X[self._NUMERIC_COLUMNS] = self.scaler.transform(numeric)

        # One-hot encode the categories and append them as integer columns,
        # aligned on the original index.
        encoded = self.onehot.transform(self._categorical_frame(X))
        encoded_frame = pd.DataFrame(
            encoded.toarray().astype(int),
            columns=self.onehot.get_feature_names_out(),
            index=X.index,
        )
        X = X.drop(columns=self._CATEGORICAL_COLUMNS)

        return pd.concat([X, encoded_frame], axis=1)

# 5. Modelling

In [235]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [236]:
# Load the records from disk, using the "ID" column as the row index.
data = pd.read_csv(
    "./data/regressiondata.csv",
    index_col="ID",
)

Limpiamos los datos de forma explícita antes de entrenar (aquí se descartan las filas cuyo `HousePrice` vale `??`); no es elegante, pero es lo que hay.

In [237]:
# Rows whose HousePrice is the literal "??" carry no usable target. Parse the
# column as numbers ("??" and anything else unparseable becomes NaN) and drop
# those rows, so the target is numeric rather than a column of strings.
data = data.assign(
    HousePrice=pd.to_numeric(data["HousePrice"], errors="coerce")
).dropna(subset=["HousePrice"])

In [238]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how="any")

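Todo lo que sea eliminar filas hay que hacerlo antes del pipeline: un transformador solo devuelve `X`, así que si descartara filas dentro de `transform`, `X` quedaría desalineado con `y`.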

In [239]:
# Remove extreme values: keep only rows with at most 20 pubs.
data = data.loc[data["NumberOfPubs"].le(20)]

In [240]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.3,
    random_state=1234,
)

## Create X and y values

In [241]:
# Separate the target column from the predictors.
y_train = train["HousePrice"]
X_train = train.drop(columns="HousePrice")

In [242]:
# Report the shapes of the predictors and of the target.
for label, values in (("Forma de X:", X_train), ("Forma de y:", y_train)):
    print(label, values.shape)

Forma de X: (6539, 5)
Forma de y: (6539,)


In [243]:
# Replace the index of both objects with a fresh 0..n-1 range, discarding the old labels.
X_train, y_train = (
    part.reset_index(drop=True) for part in (X_train, y_train)
)

## Import ML Dependencies

In [244]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor

## Create Pipelines

In [245]:
# One regressor per algorithm. The stochastic ones get a fixed seed so that
# re-running the notebook reproduces the same fitted models.
regressors = {
    "ridge": Ridge(),
    "random_forest": RandomForestRegressor(random_state=1234),
    "gradient_boosting": GradientBoostingRegressor(random_state=1234),
    "xgboost": XGBRegressor(random_state=1234),
}

# Every algorithm shares the same structure: the custom Preprocessor followed
# by the regressor wrapped in a TransformedTargetRegressor that applies
# y_transform / y_inverse_transform to the target.
pipelines = {
    name: make_pipeline(
        Preprocessor(),
        TransformedTargetRegressor(
            regressor=regressor,
            func=y_transform,
            inverse_func=y_inverse_transform,
        ),
    )
    for name, regressor in regressors.items()
}

In [246]:
# Display the ridge pipeline.
pipelines["ridge"]

In [247]:
# Step names are generated by make_pipeline; they are the prefixes used in the tuning grid.
pipelines["ridge"].named_steps

{'preprocessor': Preprocessor(),
 'transformedtargetregressor': TransformedTargetRegressor(func=<function y_transform at 0x12d750fe0>,
                            inverse_func=<function y_inverse_transform at 0x12d751260>,
                            regressor=Ridge())}

In [248]:
# List every tunable parameter name exposed by the ridge pipeline.
pipelines["ridge"].get_params()

{'memory': None,
 'steps': [('preprocessor', Preprocessor()),
  ('transformedtargetregressor',
   TransformedTargetRegressor(func=<function y_transform at 0x12d750fe0>,
                              inverse_func=<function y_inverse_transform at 0x12d751260>,
                              regressor=Ridge()))],
 'verbose': False,
 'preprocessor': Preprocessor(),
 'transformedtargetregressor': TransformedTargetRegressor(func=<function y_transform at 0x12d750fe0>,
                            inverse_func=<function y_inverse_transform at 0x12d751260>,
                            regressor=Ridge()),
 'transformedtargetregressor__check_inverse': True,
 'transformedtargetregressor__func': <function __main__.y_transform(y)>,
 'transformedtargetregressor__inverse_func': <function __main__.y_inverse_transform(y)>,
 'transformedtargetregressor__regressor__alpha': 1.0,
 'transformedtargetregressor__regressor__copy_X': True,
 'transformedtargetregressor__regressor__fit_intercept': True,
 'transforme

In [249]:
# Reach the wrapped Ridge regressor through get_params() and read its alpha.
(
    pipelines["ridge"]["transformedtargetregressor"]
    .get_params()["regressor"]
    .alpha
)

1.0

In [250]:
# Same value, reached through named_steps and the regressor attribute.
(
    pipelines["ridge"]
    .named_steps["transformedtargetregressor"]
    .regressor
    .alpha
)

1.0

## Create Tuning Grids

In [251]:
# Hyper-parameter candidates per algorithm. Each name is prefixed with
# "transformedtargetregressor__regressor__" so that GridSearchCV routes it to
# the regressor wrapped inside the pipeline's TransformedTargetRegressor step.
grid = {
    algorithm: {
        f"transformedtargetregressor__regressor__{param}": candidates
        for param, candidates in search_space.items()
    }
    for algorithm, search_space in {
        "ridge": {
            "alpha": [0.05, 0.1, 0.25, 0.5, 1, 5, 10],
        },
        "random_forest": {
            "n_estimators": [100, 200, 400, 500, 600],
            "max_features": ["sqrt"],
            "max_depth": [5, 6, 7, None],
        },
        "gradient_boosting": {
            "n_estimators": [100, 200, 400, 500, 600],
            "learning_rate": [0.05, 0.1],
            "max_depth": [5, 6, 7, None],
        },
        "xgboost": {
            "n_estimators": [100, 200, 400, 500, 600],
            "learning_rate": [0.05, 0.1],
            "max_depth": [5, 6, 7, None],
        },
    }.items()
}

## Train Models and Perform HPO

In [252]:
from sklearn.model_selection import GridSearchCV

In [253]:
# Fitted grid searches are collected here, keyed by algorithm name.
fit_models = dict()

In [254]:
# Grid-search every pipeline (10-fold CV, scored by R^2). A failure in one
# algorithm is reported and skipped so the remaining ones still get trained.
for algorithm, pipeline in pipelines.items():
    print(f"Entrenando {algorithm}")
    try:
        model = GridSearchCV(pipeline, grid[algorithm], cv=10, n_jobs=3, scoring="r2")
        model.fit(X_train, y_train)
    except Exception as e:
        print(f"Error al entrenar {algorithm}: {e}")
        # The message alone hides the exception type and where it was raised;
        # print the full traceback so the failure can actually be diagnosed.
        traceback.print_exc()
    else:
        # Only store models whose search completed successfully.
        fit_models[algorithm] = model

Entrenando ridge
Entrenando random_forest




Entrenando gradient_boosting




Entrenando xgboost


In [256]:
# Show the fitted grid-search objects collected by the training loop, keyed by algorithm name.
fit_models

{'ridge': GridSearchCV(cv=10,
              estimator=Pipeline(steps=[('preprocessor', Preprocessor()),
                                        ('transformedtargetregressor',
                                         TransformedTargetRegressor(func=<function y_transform at 0x12d750fe0>,
                                                                    inverse_func=<function y_inverse_transform at 0x12d751260>,
                                                                    regressor=Ridge()))]),
              n_jobs=3,
              param_grid={'transformedtargetregressor__regressor__alpha': [0.05,
                                                                           0.1,
                                                                           0.25,
                                                                           0.5,
                                                                           1, 5,
                                                                   

## Save Models

In [257]:
import dill

In [261]:
# open(..., "wb") does not create missing directories, so make sure the
# output folder exists before writing into it.
os.makedirs("./models", exist_ok=True)

# Serialise each fitted grid search to its own file with dill.
# NOTE: these files are pickles; only load them back from trusted sources.
for model_name, model_gs in fit_models.items():
    filename = f"./models/{model_name}_gridsearch.pkl"
    with open(filename, "wb") as f:
        dill.dump(model_gs, f)
    print(f"Modelo {model_name} guardado en {filename}")

Modelo ridge guardado en ./models/ridge_gridsearch.pkl
Modelo random_forest guardado en ./models/random_forest_gridsearch.pkl
Modelo gradient_boosting guardado en ./models/gradient_boosting_gridsearch.pkl
Modelo xgboost guardado en ./models/xgboost_gridsearch.pkl
