# Entry 39 notebook - Lasso Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import openml

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge

## Regression

In [2]:
auto_mpg = openml.datasets.get_dataset(42372).get_data()[0]
auto_mpg.head()

Unnamed: 0,Displacement,Horse_power,Weight,Acceleration,Model_year,Mpg
0,91.0,70.0,1955.0,20.5,71.0,26.0
1,232.0,100.0,2789.0,15.0,73.0,18.0
2,350.0,145.0,4055.0,12.0,76.0,13.0
3,318.0,140.0,4080.0,13.7,78.0,17.5
4,113.0,95.0,2372.0,15.0,70.0,24.0


In [3]:
def split_data(df, target, train_size):
    y = df[[target]]
    X = df.drop(target, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, random_state=12)
    return X_train, X_test, y_train, y_test

In [4]:
X_train, X_test, y_train, y_test = split_data(auto_mpg, 'Mpg', 0.8)
pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), Ridge())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.7952241582592139

In [5]:
cross_validate(pipe, X_train, y_train, cv=10, scoring=['explained_variance', 'r2', 'max_error',
                                                      'neg_median_absolute_error', 'neg_mean_absolute_error',
                                                      'neg_mean_squared_error', 'neg_mean_squared_log_error',
                                                      'neg_root_mean_squared_error', 'neg_mean_poisson_deviance',
                                                      'neg_mean_gamma_deviance'])

{'fit_time': array([0.00999284, 0.01399016, 0.00699902, 0.00598955, 0.00699568,
        0.00599718, 0.00599527, 0.00599933, 0.00599837, 0.00699592]),
 'score_time': array([0.01489043, 0.01898837, 0.0159924 , 0.0089941 , 0.00899386,
        0.0099957 , 0.00899625, 0.00998998, 0.0139904 , 0.00899172]),
 'test_explained_variance': array([0.83128433, 0.60689203, 0.80370128, 0.69517281, 0.73150325,
        0.91739607, 0.78382119, 0.7096364 , 0.84486155, 0.82653895]),
 'test_r2': array([0.81045283, 0.60583915, 0.78618695, 0.51535798, 0.72851554,
        0.91596174, 0.77512014, 0.70636946, 0.8067257 , 0.73656715]),
 'test_max_error': array([ -4.85243676, -11.33429   , -11.10555761,  -8.39018571,
         -4.93522033,  -3.78434049,  -3.55791604, -11.30443158,
         -6.51524644,  -6.54671858]),
 'test_neg_median_absolute_error': array([-2.11760491, -2.51693084, -1.7716712 , -1.36396416, -3.34015662,
        -1.60576451, -1.70001065, -2.57416412, -1.34371706, -4.21178764]),
 'test_neg_mean_ab

## Classification

In [6]:
from sklearn.linear_model import RidgeClassifier

from sklearn.datasets import load_breast_cancer

In [7]:
cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
target = cancer.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df, target, train_size=0.8, random_state=12)
pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), RidgeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.956140350877193

In [9]:
cross_validate(pipe, X_train, y_train, cv=10, scoring=['precision', 'average_precision', 'recall', 'f1'])

{'fit_time': array([0.03002548, 0.01498866, 0.00799489, 0.00699544, 0.00899506,
        0.00999808, 0.00699878, 0.01351786, 0.00899887, 0.00999546]),
 'score_time': array([0.0139792 , 0.00599527, 0.00599694, 0.00699568, 0.0059948 ,
        0.00849938, 0.00699234, 0.0069809 , 0.00699186, 0.0059948 ]),
 'test_precision': array([0.90625   , 0.90625   , 1.        , 0.93548387, 0.96666667,
        0.93548387, 0.96551724, 0.96666667, 0.90625   , 1.        ]),
 'test_average_precision': array([0.98823104, 0.99885057, 1.        , 0.99642999, 0.97666351,
        1.        , 0.99885057, 0.99885057, 0.99885057, 1.        ]),
 'test_recall': array([0.96666667, 1.        , 1.        , 1.        , 1.        ,
        1.        , 0.96551724, 1.        , 1.        , 1.        ]),
 'test_f1': array([0.93548387, 0.95081967, 1.        , 0.96666667, 0.98305085,
        0.96666667, 0.96551724, 0.98305085, 0.95081967, 1.        ])}