José Delgado

In [64]:
import pandas as pd

In [65]:
data = pd.read_csv("./data/regressiondata.csv", index_col="ID")
data

Unnamed: 0_level_0,TransactionDate,HouseAge,DistanceToStation,NumberOfPubs,PostCode,HousePrice
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2020.12,17.0,467.644775,4.0,5222.0,467104
1,2021.04,36.0,659.924963,3.0,5222.0,547714
2,2019.04,38.0,305.475941,7.0,5213.0,277232
3,2021.10,11.0,607.034754,5.0,5213.0,295958
4,2021.02,14.0,378.827222,5.0,5614.0,439963
...,...,...,...,...,...,...
9351,2019.07,36.0,554.324820,3.0,5217.0,420246
9352,2021.02,21.0,2296.349397,4.0,5614.0,256087
9353,2020.11,18.0,856.174897,0.0,5614.0,257663
9354,2021.10,6.0,87.260667,9.0,5614.0,681072


# 4. Data Preprocessing

In [66]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [67]:
imputer = SimpleImputer()

In [68]:
imputer.fit(data[["HouseAge", "DistanceToStation", "NumberOfPubs"]])

In [69]:
imputer.transform(data[["HouseAge", "DistanceToStation", "NumberOfPubs"]])

array([[ 17.        , 467.6447748 ,   4.        ],
       [ 36.        , 659.9249634 ,   3.        ],
       [ 38.        , 305.4759413 ,   7.        ],
       ...,
       [ 18.        , 856.1748968 ,   0.        ],
       [  6.        ,  87.26066662,   9.        ],
       [ 20.        , 584.0071457 ,   4.        ]])

In [70]:
data[data["HouseAge"].isnull()][["HouseAge", "DistanceToStation", "NumberOfPubs"]]

Unnamed: 0_level_0,HouseAge,DistanceToStation,NumberOfPubs
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24,,,4.0
2416,,,
6168,,568.369197,8.0
7673,,3830.892098,0.0


In [71]:
imputer.transform(data[data["HouseAge"].isnull()][["HouseAge", "DistanceToStation", "NumberOfPubs"]])

array([[  17.43402481, 1099.93412927,    4.        ],
       [  17.43402481, 1099.93412927,  538.58499038],
       [  17.43402481,  568.3691972 ,    8.        ],
       [  17.43402481, 3830.892098  ,    0.        ]])

In [72]:
data["HouseAge"].mean()

np.float64(17.434024807527802)

In [73]:
imputer.statistics_

array([  17.43402481, 1099.93412927,  538.58499038])

In [74]:
(data.iloc[120]["HouseAge"]-data["HouseAge"].mean()) / data["HouseAge"].std()

np.float64(-1.2658835273650424)

In [75]:
onehot = OneHotEncoder()
onehot.fit(data[["PostCode"]])

In [76]:
onehot.categories_

[array([5212., 5213., 5217., 5222., 5614.,   nan])]

In [77]:
onehot.transform(data[["PostCode"]]).toarray()

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.]])

In [78]:
onehot.get_feature_names_out()

array(['PostCode_5212.0', 'PostCode_5213.0', 'PostCode_5217.0',
       'PostCode_5222.0', 'PostCode_5614.0', 'PostCode_nan'], dtype=object)

## Build Preprocessing Function

In [84]:
class Preprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Crear y ajustar el imputer
        self.imputer = SimpleImputer()
        self.imputer.fit(X[["HouseAge", "DistanceToStation", "NumberOfPubs"]])

        # Crear y ajustar el scaler
        self.scaler = StandardScaler()
        self.scaler.fit(X[["HouseAge", "DistanceToStation", "NumberOfPubs"]])

        # Crear y ajustar el onehot encoder
        self.onehot = OneHotEncoder(handle_unknown="ignore")
        self.onehot.fit(X[["PostCode"]])

    def transform(self, X):
        # Hacer una copia del DataFrame original para evitar modificarlo
        X = X.copy()

        # Eliminar filas con valores nulos en las columnas objetivo
        X = X.dropna()

        # **Manejo de errores en conversiones**
        # Detectar valores problemáticos antes de convertir a float
        invalid_house_prices = X[~X["HousePrice"].str.replace(".", "", 1).str.isdigit()]
        if not invalid_house_prices.empty:
            print(f"Valores no válidos encontrados en 'HousePrice': {invalid_house_prices}")
            # Eliminar las filas con valores no válidos
            X = X[X["HousePrice"].str.replace(".", "", 1).str.isdigit()]

        # Conversión explícita de tipos, asegurándose de consistencia
        X = X.astype({"PostCode": "string", "TransactionDate": "string", "HousePrice": "float"})

        # Eliminar valores extremos
        X = X[X["HousePrice"] <= 3000000]
        X = X[X["NumberOfPubs"] <= 20]

        # Transformar "TransactionDate" en columnas de año y mes
        X.loc[:, "TransactionYear"] = X["TransactionDate"].apply(lambda x: int(x.split(".")[0]))
        X.loc[:, "TransactionMonth"] = X["TransactionDate"].apply(lambda x: int(x.split(".")[1]))
        X = X.drop(columns=["TransactionDate"])

        # Aplicar el imputer
        X[["HouseAge", "DistanceToStation", "NumberOfPubs"]] = self.imputer.transform(
            X[["HouseAge", "DistanceToStation", "NumberOfPubs"]]
        )

        # Aplicar el escalador
        X[["HouseAge", "DistanceToStation", "NumberOfPubs"]] = self.scaler.transform(
            X[["HouseAge", "DistanceToStation", "NumberOfPubs"]]
        )

        # Aplicar el onehot encoder
        onehot_encoded = self.onehot.transform(X[["PostCode"]])
        X = X.drop(columns=["PostCode"])
        X[self.onehot.get_feature_names_out()] = onehot_encoded.toarray().astype(int)

        return X

## Preview Preprocessed Data

In [80]:
from sklearn.model_selection import train_test_split

In [81]:
train, test = train_test_split(data, test_size=.3, random_state=1234)

In [85]:
preproccesor = Preprocessor()
preproccesor.fit(train)

In [86]:
preproccesor.transform(train)

Valores no válidos encontrados en 'HousePrice':       TransactionDate  HouseAge  DistanceToStation  NumberOfPubs  PostCode  \
ID                                                                           
8718          2020.08      16.0         122.432508           5.0    5213.0   

     HousePrice  
ID               
8718         ??  


Unnamed: 0_level_0,HouseAge,DistanceToStation,NumberOfPubs,HousePrice,TransactionYear,TransactionMonth,PostCode_5212.0,PostCode_5213.0,PostCode_5217.0,PostCode_5222.0,PostCode_5614.0,PostCode_nan
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
8450,1.449606,-0.496688,-0.012296,460286.0,2021,2,0,0,0,0,0,0
5758,-0.032729,-0.375535,-0.012377,253332.0,2020,1,0,0,0,0,0,0
4030,1.449606,-0.634570,-0.012280,486532.0,2021,6,0,0,0,0,0,0
8447,-1.515063,-0.705974,-0.012329,595168.0,2020,12,0,0,0,0,0,0
4370,1.449606,-0.761986,-0.012296,250495.0,2019,3,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
664,1.711195,-0.564067,-0.012410,301492.0,2020,8,0,0,0,0,0,0
7540,1.449606,-0.769640,-0.012345,174239.0,2020,8,0,0,0,0,0,0
7221,-0.119925,-0.382636,-0.012393,327666.0,2020,2,0,0,0,0,0,0
1318,1.623998,-0.832461,-0.012312,392231.0,2020,11,0,0,0,0,0,0


## Clean up Analysis Features

## Create X and y values