# Entry 39 notebook - Lasso Regression

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import openml

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso

In [2]:
auto_mpg = openml.datasets.get_dataset(42372).get_data()[0]
auto_mpg.head()

Unnamed: 0,Displacement,Horse_power,Weight,Acceleration,Model_year,Mpg
0,91.0,70.0,1955.0,20.5,71.0,26.0
1,232.0,100.0,2789.0,15.0,73.0,18.0
2,350.0,145.0,4055.0,12.0,76.0,13.0
3,318.0,140.0,4080.0,13.7,78.0,17.5
4,113.0,95.0,2372.0,15.0,70.0,24.0


In [5]:
def split_data(df, target, train_size):
    y = df[[target]]
    X = df.drop(target, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, random_state=12)
    return X_train, X_test, y_train, y_test

In [10]:
X_train, X_test, y_train, y_test = split_data(auto_mpg, 'Mpg', 0.8)
pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), Lasso())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.7775410799630239

In [11]:
cross_validate(pipe, X_train, y_train, cv=10, scoring=['explained_variance', 'r2', 'max_error',
                                                      'neg_median_absolute_error', 'neg_mean_absolute_error',
                                                      'neg_mean_squared_error', 'neg_mean_squared_log_error',
                                                      'neg_root_mean_squared_error', 'neg_mean_poisson_deviance',
                                                      'neg_mean_gamma_deviance'])

{'fit_time': array([0.01100326, 0.00899291, 0.00499725, 0.0059936 , 0.00599599,
        0.00499654, 0.00499606, 0.00599623, 0.00499678, 0.01050425]),
 'score_time': array([0.02298546, 0.01799035, 0.01199484, 0.01200438, 0.00999308,
        0.01199389, 0.01099157, 0.00899434, 0.01499248, 0.01250362]),
 'test_explained_variance': array([0.74420373, 0.5099557 , 0.73099909, 0.68574597, 0.84817765,
        0.83692851, 0.88958368, 0.64463845, 0.87797686, 0.84752695]),
 'test_r2': array([0.73069703, 0.50983266, 0.72240683, 0.56572218, 0.83992868,
        0.83492247, 0.88787067, 0.64277753, 0.87167303, 0.77083425]),
 'test_max_error': array([ -6.91695053, -12.39988472, -12.52348508,  -8.29355691,
         -5.38421628,  -5.72519369,  -2.71144837, -13.38126664,
         -4.71849147,  -6.63658635]),
 'test_neg_median_absolute_error': array([-2.19840094, -3.55914229, -2.32942059, -1.38221191, -1.37707867,
        -1.86410256, -0.63494608, -2.82978494, -1.90753917, -2.71767648]),
 'test_neg_mean_ab