José Delgado

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class Preprocessor(BaseEstimator, TransformerMixin):
    """Clean and encode the house-price DataFrame for modelling.

    ``fit`` learns a ``SimpleImputer`` and a ``StandardScaler`` (both with
    default settings) on the numeric columns and a ``OneHotEncoder`` on
    ``PostCode``. ``transform`` drops incomplete or invalid rows, removes
    extreme values, splits ``TransactionDate`` into year and month columns
    and applies the fitted transformers.

    Note that ``transform`` removes rows and reads the ``HousePrice``
    column, so that column must be present in the frame it receives.
    """

    # Numeric columns that are imputed and scaled.
    _NUMERIC_COLUMNS = ["HouseAge", "DistanceToStation", "NumberOfPubs"]

    def fit(self, X, y=None):
        """Fit the imputer, scaler and one-hot encoder on ``X``.

        Parameters
        ----------
        X : DataFrame
            Must contain the columns ``HouseAge``, ``DistanceToStation``,
            ``NumberOfPubs`` and ``PostCode``.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        Preprocessor
            The fitted instance, so that ``fit(X).transform(X)`` and the
            inherited ``fit_transform`` work.
        """
        numeric = X[self._NUMERIC_COLUMNS]

        # Create and fit the imputer.
        self.imputer = SimpleImputer()
        self.imputer.fit(numeric)

        # Create and fit the scaler (fitted on the raw, un-imputed columns).
        self.scaler = StandardScaler()
        self.scaler.fit(numeric)

        # Create and fit the one-hot encoder; categories unseen during fit
        # are encoded as all zeros at transform time instead of raising.
        self.onehot = OneHotEncoder(handle_unknown="ignore")
        self.onehot.fit(X[["PostCode"]])

        # scikit-learn estimators must return self from fit.
        return self

    def transform(self, X):
        """Return a cleaned, scaled and one-hot encoded copy of ``X``.

        Parameters
        ----------
        X : DataFrame
            Must contain ``HousePrice`` (as text), ``PostCode``,
            ``TransactionDate`` and the numeric columns used in ``fit``.

        Returns
        -------
        DataFrame
            A new frame; the input is not modified. Rows with nulls,
            non-numeric prices or extreme values are removed,
            ``TransactionDate`` is replaced by ``TransactionYear`` and
            ``TransactionMonth``, and ``PostCode`` is replaced by its
            one-hot columns.
        """
        # Work on a copy so the caller's DataFrame is never modified.
        X = X.copy()

        # Drop every row that has a null in any column.
        X = X.dropna()

        # A price is valid when, after removing at most one literal ".",
        # only digits remain. regex=False makes the "." a literal dot on
        # every pandas version instead of relying on the default.
        is_valid_price = (
            X["HousePrice"].str.replace(".", "", n=1, regex=False).str.isdigit()
        )
        invalid_house_prices = X[~is_valid_price]
        if not invalid_house_prices.empty:
            print(f"Valores no válidos encontrados en 'HousePrice': {invalid_house_prices}")
            # Remove the rows whose price cannot be converted to float.
            X = X[is_valid_price]

        # Explicit type conversion so downstream steps see consistent dtypes.
        X = X.astype({"PostCode": "string", "TransactionDate": "string", "HousePrice": "float"})

        # Remove extreme values. The copy ensures the column assignments
        # below write to an independent frame rather than a filtered slice.
        within_limits = (X["HousePrice"] <= 3000000) & (X["NumberOfPubs"] <= 20)
        X = X[within_limits].copy()

        # Split "TransactionDate" ("<year>.<month>") into two integer columns.
        # NOTE(review): if the column was parsed as a float upstream, trailing
        # zeros are lost (e.g. 2020.10 -> "2020.1"); confirm it is read as text.
        X.loc[:, "TransactionYear"] = X["TransactionDate"].apply(lambda x: int(x.split(".")[0]))
        X.loc[:, "TransactionMonth"] = X["TransactionDate"].apply(lambda x: int(x.split(".")[1]))
        X = X.drop(columns=["TransactionDate"])

        # Apply the imputer.
        X[self._NUMERIC_COLUMNS] = self.imputer.transform(X[self._NUMERIC_COLUMNS])

        # Apply the scaler.
        X[self._NUMERIC_COLUMNS] = self.scaler.transform(X[self._NUMERIC_COLUMNS])

        # Apply the one-hot encoder and replace "PostCode" by its indicator
        # columns (densified and stored as integers).
        onehot_encoded = self.onehot.transform(X[["PostCode"]])
        X = X.drop(columns=["PostCode"])
        X[self.onehot.get_feature_names_out()] = onehot_encoded.toarray().astype(int)

        return X

# 5. Modelling

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the regression dataset; the "ID" column becomes the DataFrame index.
data = pd.read_csv(
    "./data/regressiondata.csv",
    index_col="ID",
)

In [3]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
train, test = train_test_split(
    data,
    test_size=0.3,
    random_state=1234,
)

## Create X and y values

In [4]:
# Separate the target column from the remaining feature columns.
y_train = train["HousePrice"]
X_train = train.drop(columns="HousePrice")

## Import ML Dependencies

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor

## Create Pipelines

### Training Outside of a Pipeline

## Create Tuning Grids

## Train Models and Perform HPO