In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import sklearn
from sklearn.model_selection import train_test_split

## Ridge regression

Ridge regression is a form of linear regression that adds regularization to empirical risk minimization (ERM) through the parameter alpha; at the end of this project we will see how to calibrate the value of this parameter.

A regressor is a function

<img src = "images/regressor.png" width = "200">

 where <em>w</em> is a vector of real coefficients (weights), and <em>x</em> is the feature vector (the <em>x</em> component) of an example in the training set.

As anticipated before, the ERM is regularized by parameter <b>alpha</b>

<img src= "images/ridge_regression.png" width = "350"/>

where S is the <b>design matrix</b>

Regularization is needed because the weight vector is not stable: it can change a lot when the dataset is perturbed. Keep in mind that we had to deal with missing values, so we changed a few examples, and this could have introduced a variance error.

In [12]:
def erm(S, w, y, alfa):
    """Evaluate the ridge-regression (regularized ERM) objective for given weights.

    Computes ``||S w - y||^2 + alfa * ||w||^2``. This function only evaluates
    the objective at ``w``; it does not perform the minimization over ``w``.

    Parameters
    ----------
    S : array-like of shape (m, d)
        Design matrix, one example per row.
    w : array-like of shape (d,)
        Weight vector at which the objective is evaluated.
    y : array-like of shape (m,)
        Target values.
    alfa : float
        Regularization strength multiplying the squared norm of ``w``.

    Returns
    -------
    float
        Value of the regularized objective at ``w``.
    """
    S = np.asarray(S, dtype=float)
    w = np.asarray(w, dtype=float)
    y = np.asarray(y, dtype=float)

    # Matrix-vector product (not elementwise `*`) gives the predictions S w.
    residual = S @ w - y
    # The norm lives in np.linalg; there is no top-level np.norm.
    data_term = np.square(np.linalg.norm(residual))
    penalty = alfa * np.square(np.linalg.norm(w))
    return data_term + penalty

### Loading preprocessed datasets

Let's load the preprocessed datasets. We will handle four cases:
1. the complete dataset with replaced missing values normalized with min-max
2. the reduced dataset (missing values have been removed) with min-max normalization
3. the complete dataset with z-score normalization
4. the reduced dataset normalized with z-score

In [13]:
# Min-max normalized variants: the complete dataset, then the reduced one.
min_max_norm_dataset, min_max_norm_reduced_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/min_max_norm_dataset",
        "datasets/min_max_norm_reduced_dataset",
    )
)

# Z-score normalized variants, in the same order (complete, reduced).
zscore_norm_dataset, zscore_norm_reduced_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/zscore_norm_dataset",
        "datasets/zscore_norm_reduced_dataset",
    )
)

In [14]:
# Remove the leftover "Unnamed: 0" column from every frame (presumably the
# index written out when the CSVs were saved -- confirm against the
# preprocessing notebook).
# errors="ignore" keeps this cell idempotent: `del df[col]` raises KeyError
# when the cell is executed a second time on the same kernel.
min_max_norm_dataset = min_max_norm_dataset.drop(columns=["Unnamed: 0"], errors="ignore")
min_max_norm_reduced_dataset = min_max_norm_reduced_dataset.drop(columns=["Unnamed: 0"], errors="ignore")
zscore_norm_dataset = zscore_norm_dataset.drop(columns=["Unnamed: 0"], errors="ignore")
zscore_norm_reduced_dataset = zscore_norm_reduced_dataset.drop(columns=["Unnamed: 0"], errors="ignore")

### Affine transformation

So that the linear model can also learn an intercept (i.e. an affine rather than purely linear function), we add an extra dimension to each *x* vector whose value is always equal to 1.

In [15]:
# Append the constant feature "added" (always 1) to each of the four frames.
for _frame in (
    min_max_norm_dataset,
    min_max_norm_reduced_dataset,
    zscore_norm_dataset,
    zscore_norm_reduced_dataset,
):
    _frame["added"] = 1

# Preview the first rows to check the new column.
min_max_norm_dataset.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,added
0,0.211155,0.567481,0.784314,0.022331,0.019863,0.008941,0.020556,0.539668,0.902266,0.25,1
1,0.212151,0.565356,0.392157,0.180503,0.171477,0.06721,0.186976,0.538027,0.708247,0.25,1
2,0.210159,0.564293,1.0,0.03726,0.02933,0.013818,0.028943,0.466028,0.695051,0.25,1
3,0.209163,0.564293,1.0,0.032352,0.036313,0.015555,0.035849,0.354699,0.672783,0.25,1
4,0.209163,0.564293,1.0,0.04133,0.043296,0.015752,0.042427,0.230776,0.674638,0.25,1
5,0.209163,0.564293,1.0,0.023323,0.032899,0.011491,0.031574,0.243921,0.525155,0.25,1
6,0.209163,0.563231,1.0,0.064423,0.075729,0.030578,0.084361,0.217873,0.585979,0.25,1
7,0.209163,0.563231,1.0,0.078895,0.106456,0.032344,0.106233,0.180694,0.466804,0.25,1
8,0.208167,0.563231,0.803922,0.064932,0.103042,0.033717,0.097681,0.108998,0.436495,0.25,1
9,0.209163,0.563231,1.0,0.090213,0.109559,0.043387,0.11725,0.220087,0.507423,0.25,1


### Splitting in training and test set

In [6]:
def split_dataset(dataset, test_size=0.2, random_state=None):
    """Split a dataset into training and test features/targets.

    The target is the ``"median_house_value"`` column; every other column is
    used as a feature.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``"median_house_value"`` column.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; the default reproduces the
        previously hard-coded 80/20 split.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to obtain the same
        split on every run; ``None`` keeps the previous unseeded behavior.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    target = dataset["median_house_value"]
    features = dataset.drop(columns="median_house_value")
    return train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

### First dataset

In [7]:
# Case 1: split the min-max normalized complete dataset into train/test
# features (x_*) and targets (y_*) using split_dataset's default split.
x_train, x_test, y_train, y_test = split_dataset(min_max_norm_dataset)