# Preprocessing. `sklearn` Pipeline

## Libraries Import

In [1]:
import math

import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

## Configuration

In [2]:
DATASET_FILENAME = "datasets/housing.csv"
TRAIN_SPLIT_RATE = 0.8

## Imputation of Missing Values

In [3]:
housing_df = pd.read_csv(DATASET_FILENAME)

In [4]:
print("The missing values are:\n\n{}".format(housing_df.isnull().sum()))

The missing values are:

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [5]:
total_bedrooms_median = housing_df["total_bedrooms"].median()
housing_df["total_bedrooms"].fillna(total_bedrooms_median, inplace=True)

## Creation of New Features

In [6]:
housing_df["total_rooms_per_housefolds"] = housing_df["total_rooms"] / housing_df["households"]
housing_df["total_bedrooms_per_total_rooms"] = housing_df["total_bedrooms"] / housing_df["total_rooms"]
housing_df["population_per_households"] = housing_df["population"] / housing_df["households"]

In [7]:
correlation_df = housing_df.corr()
print("The correlation table is:\n\n{}".format(correlation_df))

The correlation table is:

                                longitude  latitude  housing_median_age  \
longitude                        1.000000 -0.924664           -0.108197   
latitude                        -0.924664  1.000000            0.011173   
housing_median_age              -0.108197  0.011173            1.000000   
total_rooms                      0.044568 -0.036100           -0.361262   
total_bedrooms                   0.069120 -0.066484           -0.319026   
population                       0.099773 -0.108785           -0.296244   
households                       0.055310 -0.071035           -0.302916   
median_income                   -0.015176 -0.079809           -0.119034   
median_house_value              -0.045967 -0.144160            0.105623   
total_rooms_per_housefolds      -0.027540  0.106389           -0.153277   
total_bedrooms_per_total_rooms   0.081205 -0.098619            0.135622   
population_per_households        0.002476  0.002366            0.013191  

In [8]:
house_value_corr_df = correlation_df["median_house_value"].sort_values(ascending=False, key=abs)
print("The correlations between the target feature and the others are:\n\n{}".format(house_value_corr_df))

The correlations between the target feature and the others are:

median_house_value                1.000000
median_income                     0.688075
total_bedrooms_per_total_rooms   -0.233303
total_rooms_per_housefolds        0.151948
latitude                         -0.144160
total_rooms                       0.134153
housing_median_age                0.105623
households                        0.065843
total_bedrooms                    0.049457
longitude                        -0.045967
population                       -0.024650
population_per_households        -0.023737
Name: median_house_value, dtype: float64


## Pipeline Creation for Further Preprocessing and Model Training

In [9]:
median_house_value_df = housing_df["median_house_value"]
housing_df.drop(columns=["median_house_value"], inplace=True)

In [10]:
class FixedLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        encoder = LabelBinarizer(sparse_output=self.sparse_output)

        return encoder.fit_transform(X)

In [11]:
numeric_features = housing_df.select_dtypes(include=["int64", "float64"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("ocean_proximity_encoder", FixedLabelBinarizer(), "ocean_proximity"),
        ("numeric_scalar", StandardScaler(), numeric_features),
    ],
    remainder ="passthrough"
)
regressor = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

In [12]:
X = housing_df
y = median_house_value_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TRAIN_SPLIT_RATE)

In [13]:
regressor.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ocean_proximity_encoder',
                                                  FixedLabelBinarizer(),
                                                  'ocean_proximity'),
                                                 ('numeric_scalar',
                                                  StandardScaler(),
                                                  Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'total_rooms_per_housefolds', 'total_bedrooms_per_total_rooms',
       'population_per_households'],
      dtype='object'))])),
                ('regressor', LinearRegression())])

In [14]:
y_pred = regressor.predict(X_test)
rmse = math.sqrt(mean_squared_error(y_test, y_pred))
print("The RMSE of the trained model is {}.".format(rmse))

The RMSE of the trained model is 201025.95891424106.
