José Delgado

In [4]:
import pandas as pd

In [5]:
data = pd.read_csv("./data/regressiondata.csv", index_col="ID")
data

Unnamed: 0_level_0,TransactionDate,HouseAge,DistanceToStation,NumberOfPubs,PostCode,HousePrice
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2020.12,17.0,467.644775,4.0,5222.0,467104
1,2021.04,36.0,659.924963,3.0,5222.0,547714
2,2019.04,38.0,305.475941,7.0,5213.0,277232
3,2021.10,11.0,607.034754,5.0,5213.0,295958
4,2021.02,14.0,378.827222,5.0,5614.0,439963
...,...,...,...,...,...,...
9351,2019.07,36.0,554.324820,3.0,5217.0,420246
9352,2021.02,21.0,2296.349397,4.0,5614.0,256087
9353,2020.11,18.0,856.174897,0.0,5614.0,257663
9354,2021.10,6.0,87.260667,9.0,5614.0,681072


# 4. Data Preprocessing

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
imputer = SimpleImputer()

In [6]:
imputer.fit(data[["HouseAge", "DistanceToStation", "NumberOfPubs"]])

In [7]:
imputer.transform(data[["HouseAge", "DistanceToStation", "NumberOfPubs"]])

array([[ 17.        , 467.6447748 ,   4.        ],
       [ 36.        , 659.9249634 ,   3.        ],
       [ 38.        , 305.4759413 ,   7.        ],
       ...,
       [ 18.        , 856.1748968 ,   0.        ],
       [  6.        ,  87.26066662,   9.        ],
       [ 20.        , 584.0071457 ,   4.        ]])

In [21]:
data[data["HouseAge"].isnull()][["HouseAge", "DistanceToStation", "NumberOfPubs"]]

Unnamed: 0_level_0,HouseAge,DistanceToStation,NumberOfPubs
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24,,,4.0
2416,,,
6168,,568.369197,8.0
7673,,3830.892098,0.0


In [22]:
imputer.transform(data[data["HouseAge"].isnull()][["HouseAge", "DistanceToStation", "NumberOfPubs"]])

array([[  17.43402481, 1099.93412927,    4.        ],
       [  17.43402481, 1099.93412927,  538.58499038],
       [  17.43402481,  568.3691972 ,    8.        ],
       [  17.43402481, 3830.892098  ,    0.        ]])

In [28]:
data["HouseAge"].mean()

np.float64(17.434024807527802)

In [29]:
imputer.statistics_

array([  17.43402481, 1099.93412927,  538.58499038])

In [36]:
(data.iloc[120]["HouseAge"]-data["HouseAge"].mean()) / data["HouseAge"].std()

np.float64(-1.2658835273650424)

In [37]:
onehot = OneHotEncoder()
onehot.fit(data[["PostCode"]])

In [39]:
onehot.categories_

[array([5212., 5213., 5217., 5222., 5614.,   nan])]

In [43]:
onehot.transform(data[["PostCode"]]).toarray()

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.]])

In [45]:
onehot.get_feature_names_out()

array(['PostCode_5212.0', 'PostCode_5213.0', 'PostCode_5217.0',
       'PostCode_5222.0', 'PostCode_5614.0', 'PostCode_nan'], dtype=object)

## Build Preprocessing Function

In [58]:
class Preprocessor(BaseEstimator, TransformerMixin):
    # Train the imputer and the scaler
    def fit(self, X, y=None):

        # Create and fit the imputer
        self.imputer = SimpleImputer()
        self.imputer.fit(X[["HouseAge", "DistanceToStation", "NumberOfPubs"]])

        # Create and fit the scaler
        self.scaler = StandardScaler()
        self.scaler.fit(X[["HouseAge", "DistanceToStation", "NumberOfPubs"]])

        # Create and fit the onehot encoder
        self.onehot = OneHotEncoder(handle_unknown="ignore")
        self.onehot.fit(X[["PostCode"]])

    # Apply the imputer and the scaler
    def transform(self, X):

        # Drop rows with missing target values
        X.dropna(inplace=True)

        X = X[X["HousePrice"] != "??"]

        # Type cast the columns
        X["PostCode"] = X["PostCode"].astype("str")
        X["TransactionDate"] = X["TransactionDate"].astype("str")
        X["HousePrice"] = X["HousePrice"].astype("float")


        X = X.drop(X[X["HousePrice"] > 3000000].index)

        X.drop(X[X["NumberOfPubs"] > 20].index, inplace=True)

        # Apply year and month transformation
        X["TransactionYear"] = X["TransactionDate"].apply(lambda x: int(x.split(".")[0])).astype(int)
        X["TransactionMonth"] = X["TransactionDate"].apply(lambda x: int(x.split(".")[1])).astype(int)
        X = X.drop(columns=["TransactionDate"])
        
        # Apply the imputer
        X[["HouseAge", "DistanceToStation", "NumberOfPubs"]] = self.imputer.transform(X[["HouseAge", "DistanceToStation", "NumberOfPubs"]])

        # Apply the scaler
        X[['HouseAge', 'DistanceToStation', 'NumberOfPubs']] = self.scaler.transform(X[['HouseAge', 'DistanceToStation', 'NumberOfPubs']])

        # Apply the onehot encoder
        onehot_encoded = self.onehot.transform(X[["PostCode"]])
        X = X.drop(columns=["PostCode"])
        X[self.onehot.get_feature_names_out()] = onehot_encoded.toarray().astype(int)
        
        return X

## Preview Preprocessed Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train, test = train_test_split(data, test_size=.3, random_state=1234)

In [59]:
preproccesor = Preprocessor()
preproccesor.fit(data)

In [60]:
preproccesor.transform(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["PostCode"] = X["PostCode"].astype("str")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["TransactionDate"] = X["TransactionDate"].astype("str")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["HousePrice"] = X["HousePrice"].astype("float")


Unnamed: 0_level_0,HouseAge,DistanceToStation,NumberOfPubs,HousePrice,TransactionYear,TransactionMonth,PostCode_5212.0,PostCode_5213.0,PostCode_5217.0,PostCode_5222.0,PostCode_5614.0
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,-0.038159,-0.498767,-0.010346,467104.0,2020,12,0,0,0,1,0
1,1.627939,-0.347078,-0.010365,547714.0,2021,4,0,0,0,1,0
2,1.803318,-0.626702,-0.010288,277232.0,2019,4,0,1,0,0,0
3,-0.564295,-0.388803,-0.010326,295958.0,2021,1,0,1,0,0,0
4,-0.301227,-0.568835,-0.010326,439963.0,2021,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
9351,1.627939,-0.430385,-0.010365,420246.0,2019,7,0,0,1,0,0
9352,0.312599,0.943896,-0.010346,256087.0,2021,2,0,0,0,0,1
9353,0.049531,-0.192256,-0.010423,257663.0,2020,11,0,0,0,0,1
9354,-1.002742,-0.798852,-0.010249,681072.0,2021,1,0,0,0,0,1


## Clean up Analysis Features

## Create X and y values