In [1]:
cd ..

/home/jovyan


In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import lib.lm as lm
from lib.preprocessing import BoxCoxTransformer
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning

In [3]:
import warnings

# Silence numeric and solver-convergence warnings for the rest of the session.
# A filter with no message/module pattern matches every warning of the
# given category, which is what `simplefilter` installs.
warnings.simplefilter("ignore", category=RuntimeWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [4]:
# Load the Boston housing dataset shipped with scikit-learn.
# NOTE(review): `load_boston` is deprecated/removed in recent scikit-learn
# releases — confirm the scikit-learn version this notebook is pinned to.
boston = load_boston()

In [5]:
# Feature matrix as a DataFrame, with columns labelled by the dataset's feature names.
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)

In [6]:
# Regression target taken from the loaded dataset object.
y = boston.target

In [7]:
# Preview four randomly chosen rows. The fixed seed keeps the displayed rows
# identical across Restart & Run All instead of changing on every execution.
X.sample(4, random_state=0)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
247,0.19657,22.0,5.86,0.0,0.431,6.226,79.2,8.0555,7.0,330.0,19.1,376.14,10.15
309,0.3494,0.0,9.9,0.0,0.544,5.972,76.7,3.1025,4.0,304.0,18.4,396.24,9.97
405,67.9208,0.0,18.1,0.0,0.693,5.683,100.0,1.4254,24.0,666.0,20.2,384.97,22.98
391,5.29305,0.0,18.1,0.0,0.7,6.051,82.5,2.1678,24.0,666.0,20.2,378.38,18.76


## Standardization

Regularized linear models need standardized features (and possibly skew normalization) to work properly, because the penalty is applied to every coefficient on the same scale.

Note from the statistical description below that the features differ significantly in mean and standard deviation. Left uncorrected, these scale differences will impact regularization performance.

In [8]:
# Summary statistics per feature, transposed so each feature occupies one row.
samp_stats = X.describe().transpose()
samp_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CRIM,506.0,3.593761,8.596783,0.00632,0.082045,0.25651,3.647423,88.9762
ZN,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
INDUS,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
CHAS,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
NOX,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
RM,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
AGE,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
DIS,506.0,3.795043,2.10571,1.1296,2.100175,3.20745,5.188425,12.1265
RAD,506.0,9.549407,8.707259,1.0,4.0,5.0,24.0,24.0
TAX,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0


## Standardized Model


In [9]:
# Start from an empty results table; each grid-search call below takes it as
# input and hands back the updated table.
results = pd.DataFrame()

In [10]:
# Baseline: standardize the features, then fit an unregularized linear regression.
# Only the first returned value (the results table) is kept.
# NOTE(review): re-running this cell presumably adds another row to `results`
# — confirm against `lib.lm.simple_alpha_grid_search`.
results, _, _ = lm.simple_alpha_grid_search(
    make_pipeline(StandardScaler(), LinearRegression()),
    X,
    y,
    results,
    n=1000,
)

100%|██████████| 1000/1000 [00:04<00:00, 243.42it/s]


In [11]:
# Display the results table after the baseline linear-regression run.
results

Unnamed: 0,alpha,data_preprocessing,model,model_name,test_score,train_score
0,,[standardscaler],"Pipeline(steps=[('standardscaler', StandardSca...",linearregression,0.711009,0.743956


In [12]:
# Same search with a Lasso estimator on standardized features.
# Only the first returned value (the results table) is kept.
# NOTE(review): re-running this cell presumably adds another row to `results`
# — confirm against `lib.lm.simple_alpha_grid_search`.
results, _, _ = lm.simple_alpha_grid_search(
    make_pipeline(StandardScaler(), Lasso()),
    X,
    y,
    results,
    n=1000,
)

100%|██████████| 1000/1000 [00:03<00:00, 268.28it/s]
100%|██████████| 1000/1000 [00:05<00:00, 169.23it/s]
100%|██████████| 1000/1000 [00:04<00:00, 242.80it/s]
100%|██████████| 1000/1000 [00:03<00:00, 328.70it/s]
100%|██████████| 1000/1000 [00:03<00:00, 283.68it/s]
100%|██████████| 1000/1000 [00:03<00:00, 305.79it/s]
100%|██████████| 1000/1000 [00:02<00:00, 350.96it/s]
100%|██████████| 1000/1000 [00:03<00:00, 321.66it/s]
100%|██████████| 1000/1000 [00:03<00:00, 319.74it/s]
100%|██████████| 1000/1000 [00:03<00:00, 329.47it/s]
100%|██████████| 1000/1000 [00:02<00:00, 361.21it/s]
100%|██████████| 1000/1000 [00:03<00:00, 293.61it/s]


In [13]:
# Same search with a Ridge estimator on standardized features.
# Only the first returned value (the results table) is kept.
# NOTE(review): re-running this cell presumably adds another row to `results`
# — confirm against `lib.lm.simple_alpha_grid_search`.
results, _, _ = lm.simple_alpha_grid_search(
    make_pipeline(StandardScaler(), Ridge()),
    X,
    y,
    results,
    n=1000,
)

100%|██████████| 1000/1000 [00:03<00:00, 316.85it/s]
100%|██████████| 1000/1000 [00:03<00:00, 319.65it/s]
100%|██████████| 1000/1000 [00:03<00:00, 300.90it/s]
100%|██████████| 1000/1000 [00:03<00:00, 287.73it/s]
100%|██████████| 1000/1000 [00:03<00:00, 262.36it/s]
100%|██████████| 1000/1000 [00:03<00:00, 252.14it/s]
100%|██████████| 1000/1000 [00:03<00:00, 268.82it/s]
100%|██████████| 1000/1000 [00:03<00:00, 305.21it/s]
100%|██████████| 1000/1000 [00:03<00:00, 332.90it/s]
100%|██████████| 1000/1000 [00:02<00:00, 339.24it/s]
100%|██████████| 1000/1000 [00:03<00:00, 293.79it/s]
100%|██████████| 1000/1000 [00:03<00:00, 318.62it/s]


In [14]:
# Display the accumulated results for the linear-regression, Lasso and Ridge runs.
results

Unnamed: 0,alpha,data_preprocessing,model,model_name,test_score,train_score
0,,[standardscaler],"Pipeline(steps=[('standardscaler', StandardSca...",linearregression,0.711009,0.743956
0,0.01,[standardscaler],"Pipeline(steps=[('standardscaler', StandardSca...",lasso,0.711215,0.74388
0,10.0,[standardscaler],"Pipeline(steps=[('standardscaler', StandardSca...",ridge,0.711298,0.742562
