José Delgado

In [72]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import TransformedTargetRegressor
import pandas as pd

In [73]:
def y_transform(y):
    """
    Coerce the target to numeric values and replace invalid entries by the mean.

    Works for a pandas Series as well as for lists and NumPy arrays of any
    shape. scikit-learn's ``TransformedTargetRegressor`` passes the target to
    ``func`` as a NumPy array, which has no ``fillna`` method and, when 2-D,
    is rejected by ``pd.to_numeric``; such input is therefore flattened,
    cleaned and reshaped back.

    Parameters
    ----------
    y : pandas.Series or array-like
        Target values. Entries may be numbers, numeric strings, or values
        that cannot be parsed as numbers.

    Returns
    -------
    pandas.Series or numpy.ndarray
        For Series input, a Series with the same index. For any other input,
        a float ndarray with the same shape as the input. Entries that could
        not be parsed (or were NaN) are replaced by the mean of the valid
        entries; if there are no valid entries they stay NaN.
    """
    if isinstance(y, pd.Series):
        # Invalid values become NaN, then NaN is filled with the mean.
        numeric = pd.to_numeric(y, errors="coerce")
        return numeric.fillna(numeric.mean())

    # Local import so this cell does not depend on a notebook-level numpy import.
    import numpy as np

    values = np.asarray(y)
    # pd.to_numeric only accepts 1-D input, so work on a flattened view.
    flat = pd.to_numeric(pd.Series(values.ravel()), errors="coerce")
    flat = flat.fillna(flat.mean())
    return flat.to_numpy(dtype=float).reshape(values.shape)

def y_inverse_transform(y):
    """
    Identity inverse of the target transformation.

    No reversible change needs to be undone, so the argument is handed back
    exactly as it was received.
    """
    unchanged = y
    return unchanged

In [74]:
class Preprocessor(BaseEstimator, TransformerMixin):
    """
    Feature preprocessing for the house-price data frame.

    ``fit`` learns an imputer and a scaler on the numeric columns and a
    one-hot encoder on "PostCode". ``transform`` drops incomplete and
    outlier rows, splits "TransactionDate" into year and month columns and
    applies the fitted imputer, scaler and encoder.

    NOTE(review): ``transform`` removes rows from X (``dropna`` and the
    "NumberOfPubs" filter). When this transformer is used inside a Pipeline,
    the ``y`` handed to the final estimator is not filtered accordingly --
    confirm that row counts still match for the data being used.
    """

    # Numeric columns handled by the imputer and the scaler.
    _NUMERIC_COLUMNS = ("HouseAge", "DistanceToStation", "NumberOfPubs")

    # Dtypes enforced in ``transform``. Entries whose column is absent from X
    # are skipped: "HousePrice" is the target and is normally not part of X,
    # and ``DataFrame.astype`` raises KeyError for unknown columns.
    _COLUMN_DTYPES = {
        "PostCode": "string",
        "TransactionDate": "string",
        "HousePrice": "float",
    }

    def fit(self, X, y=None):
        """
        Fit the imputer, scaler and one-hot encoder on X.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the numeric columns "HouseAge", "DistanceToStation",
            "NumberOfPubs" and the categorical column "PostCode".
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        Preprocessor
            The fitted instance (``self``).
        """
        numeric = list(self._NUMERIC_COLUMNS)

        # Imputer with default settings, fitted on the numeric columns.
        self.imputer = SimpleImputer()
        self.imputer.fit(X[numeric])

        # Scaler fitted on the same (raw, un-imputed) numeric columns.
        self.scaler = StandardScaler()
        self.scaler.fit(X[numeric])

        # Categories unseen during fit are ignored at transform time.
        self.onehot = OneHotEncoder(handle_unknown="ignore")
        self.onehot.fit(X[["PostCode"]])

        return self

    def transform(self, X):
        """
        Clean X and apply the fitted imputer, scaler and one-hot encoder.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain "HouseAge", "DistanceToStation", "NumberOfPubs",
            "PostCode" and "TransactionDate". "HousePrice" is optional.

        Returns
        -------
        pandas.DataFrame
            A new frame (the input is not modified) without "TransactionDate"
            and "PostCode", with "TransactionYear", "TransactionMonth" and
            one integer column per one-hot encoded post code added.
        """
        numeric = list(self._NUMERIC_COLUMNS)

        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()

        # Drop every row that has a null in any column.
        # NOTE(review): because of this the imputer below never sees a NaN.
        X = X.dropna()

        # Enforce consistent dtypes, but only for the columns actually present.
        present_dtypes = {
            column: dtype
            for column, dtype in self._COLUMN_DTYPES.items()
            if column in X.columns
        }
        X = X.astype(present_dtypes)

        # Remove extreme values; copy so the assignments below write to an
        # independent frame rather than to a slice of the previous one.
        X = X[X["NumberOfPubs"] <= 20].copy()

        # Split "TransactionDate" on "." into integer year and month columns.
        X.loc[:, "TransactionYear"] = X["TransactionDate"].apply(lambda x: int(x.split(".")[0]))
        X.loc[:, "TransactionMonth"] = X["TransactionDate"].apply(lambda x: int(x.split(".")[1]))
        X = X.drop(columns=["TransactionDate"])

        # Apply the fitted imputer, then the fitted scaler, to the numeric columns.
        X[numeric] = self.imputer.transform(X[numeric])
        X[numeric] = self.scaler.transform(X[numeric])

        # Replace "PostCode" by its one-hot encoded columns.
        onehot_encoded = self.onehot.transform(X[["PostCode"]])
        X = X.drop(columns=["PostCode"])
        X[self.onehot.get_feature_names_out()] = onehot_encoded.toarray().astype(int)

        return X

# 5. Modelling

In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [76]:
# Load the regression dataset; the "ID" column becomes the row index.
data = pd.read_csv(
    "./data/regressiondata.csv",
    index_col="ID",
)

In [77]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.3,
    random_state=1234,
)

## Create X and y values

In [78]:
# Separate the "HousePrice" target from the remaining feature columns.
y_train = train["HousePrice"]
X_train = train.drop("HousePrice", axis="columns")

## Import ML Dependencies

In [79]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor

## Create Pipelines

In [80]:
# Stand-alone pipeline: custom preprocessing followed by a random forest whose
# target is cleaned with y_transform. The named identity function is used as
# the inverse (instead of an anonymous lambda) so this pipeline matches the
# entries of `pipelines` and can be serialised with the standard pickle module.
pipeline = make_pipeline(
    Preprocessor(),
    TransformedTargetRegressor(
        regressor=RandomForestRegressor(),
        func=y_transform,
        inverse_func=y_inverse_transform,
    ),
)

In [81]:
# One pipeline per candidate algorithm, all sharing the same structure:
# Preprocessor -> TransformedTargetRegressor(<regressor>).
# The stochastic regressors are seeded (same seed as the train/test split) so
# that repeated runs of the notebook produce the same models.
pipelines = {
    name: make_pipeline(
        Preprocessor(),
        TransformedTargetRegressor(
            regressor=regressor,
            func=y_transform,
            inverse_func=y_inverse_transform,
        ),
    )
    for name, regressor in {
        "ridge": Ridge(),
        "random_forest": RandomForestRegressor(random_state=1234),
        "gradient_boosting": GradientBoostingRegressor(random_state=1234),
        "xgboost": XGBRegressor(random_state=1234),
    }.items()
}

In [82]:
# Display the ridge pipeline.
pipelines["ridge"]

In [83]:
# Display the ridge pipeline's steps as a name -> estimator mapping.
pipelines["ridge"].named_steps

{'preprocessor': Preprocessor(),
 'transformedtargetregressor': TransformedTargetRegressor(func=<function y_transform at 0x120a8ad40>,
                            inverse_func=<function y_inverse_transform at 0x121a0d580>,
                            regressor=Ridge())}

In [84]:
# List every parameter of the ridge pipeline; the double-underscore names
# shown here are the keys a tuning grid has to use.
pipelines["ridge"].get_params()

{'memory': None,
 'steps': [('preprocessor', Preprocessor()),
  ('transformedtargetregressor',
   TransformedTargetRegressor(func=<function y_transform at 0x120a8ad40>,
                              inverse_func=<function y_inverse_transform at 0x121a0d580>,
                              regressor=Ridge()))],
 'verbose': False,
 'preprocessor': Preprocessor(),
 'transformedtargetregressor': TransformedTargetRegressor(func=<function y_transform at 0x120a8ad40>,
                            inverse_func=<function y_inverse_transform at 0x121a0d580>,
                            regressor=Ridge()),
 'transformedtargetregressor__check_inverse': True,
 'transformedtargetregressor__func': <function __main__.y_transform(y)>,
 'transformedtargetregressor__inverse_func': <function __main__.y_inverse_transform(y)>,
 'transformedtargetregressor__regressor__alpha': 1.0,
 'transformedtargetregressor__regressor__copy_X': True,
 'transformedtargetregressor__regressor__fit_intercept': True,
 'transforme

In [85]:
# Read Ridge's alpha by indexing the pipeline step and going through get_params().
(
    pipelines["ridge"]["transformedtargetregressor"]
    .get_params()["regressor"]
    .alpha
)

1.0

In [90]:
# Read the same alpha through named_steps and the wrapper's regressor attribute.
(
    pipelines["ridge"]
    .named_steps["transformedtargetregressor"]
    .regressor
    .alpha
)

1.0

## Create Tuning Grids

In [86]:
# Hyper-parameter grids, keyed by the same algorithm names as `pipelines`.
# Every parameter is addressed through the pipeline step and the wrapped
# regressor: "transformedtargetregressor__regressor__<param>".
grid = {
    "ridge": {
        "transformedtargetregressor__regressor__alpha": [0.05, 0.1, 0.25, 0.5, 1, 5, 10]
    },
    "random_forest": {
        "transformedtargetregressor__regressor__n_estimators": [100, 200, 400, 500, 600],
        # 1.0 (use all features) replaces "auto", which meant the same thing
        # for regressors but is no longer accepted by recent scikit-learn
        # releases (deprecated in 1.1, removed in 1.3).
        "transformedtargetregressor__regressor__max_features": [1.0, "sqrt"],
        "transformedtargetregressor__regressor__max_depth": [5, 6, 7, None],
    },
    "gradient_boosting": {
        "transformedtargetregressor__regressor__n_estimators": [100, 200, 400, 500, 600],
        "transformedtargetregressor__regressor__learning_rate": [.05, .1],
        "transformedtargetregressor__regressor__max_depth": [5, 6, 7, None]
    },
    "xgboost": {
        "transformedtargetregressor__regressor__n_estimators": [100, 200, 400, 500, 600],
        "transformedtargetregressor__regressor__learning_rate": [.05, .1],
        "transformedtargetregressor__regressor__max_depth": [5, 6, 7, None]
    }
}

## Train Models and Perform HPO

In [87]:
from sklearn.model_selection import GridSearchCV

In [88]:
# Empty registry for the fitted grid-search objects, filled by the training loop.
fit_models = dict()

In [89]:
# Tune every candidate with a 10-fold grid search scored by R^2, using all
# available cores. The loop variable is deliberately not named `pipeline`:
# that name is already bound to the stand-alone pipeline defined earlier and
# would otherwise be silently overwritten by the last candidate.
for algorithm, candidate in pipelines.items():
    try:
        print(f"Entrenando {algorithm}")
        model = GridSearchCV(candidate, grid[algorithm], cv=10, n_jobs=-1, scoring="r2")
        model.fit(X_train, y_train)
        fit_models[algorithm] = model
    except Exception as e:
        # Best effort: report the failure and carry on with the next
        # algorithm; failed algorithms are simply absent from fit_models.
        print(f"Error al entrenar {algorithm}: {e}")

Entrenando ridge
Error al entrenar ridge: Invalid parameter 'ridge' for estimator Pipeline(steps=[('preprocessor', Preprocessor()),
                ('transformedtargetregressor',
                 TransformedTargetRegressor(func=<function y_transform at 0x12411dc60>,
                                            inverse_func=<function y_inverse_transform at 0x12413f880>,
                                            regressor=Ridge()))]). Valid parameters are: ['memory', 'steps', 'verbose'].
Entrenando random_forest
Error al entrenar random_forest: Invalid parameter 'randomforestregressor' for estimator Pipeline(steps=[('preprocessor', Preprocessor()),
                ('transformedtargetregressor',
                 TransformedTargetRegressor(func=<function y_transform at 0x1202a9c60>,
                                            inverse_func=<function y_inverse_transform at 0x1202f6020>,
                                            regressor=RandomForestRegressor()))]). Valid parameters are: 