The complete explanation for this notebook is available at https://youranalystbuddy.com/regularization/

## Overfitting demonstration - cubic models in auto-mpg data

data and pipeline are just like before 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

data = pd.read_csv('auto-mpg.csv')

train, test = train_test_split(data, test_size=0.2)

num_cols = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year']
cat_cols = ['origin']
target = 'mpg'

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

#pipeline for class features
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder())
])

### measurements in training and testing data

In [8]:
#pipeline for numeric features
num_pipeline_poly3 = Pipeline([    
    ('impute', SimpleImputer(strategy='median')),
    ('polynomial', PolynomialFeatures(degree=3)),
    ('standardize', StandardScaler()),
])

#full processing pipeline
data_pipeline_poly3 = ColumnTransformer([
    ('numeric', num_pipeline_poly3, num_cols),
    ('class', cat_pipeline, cat_cols)
])

#model pipeline
poly3_reg_pipeline = Pipeline([
    ('processing', data_pipeline_poly3),
    ('modeling', LinearRegression())
])

poly3_reg_pipeline.fit(train, train[[target]])

from sklearn.model_selection import cross_val_score
r2_10cv = cross_val_score(poly3_reg_pipeline, train, train[[target]], cv=10, scoring='r2')
np.mean(r2_10cv)

0.7284627674958539

## Ridge regression on cubic data

In [4]:
from sklearn.linear_model import Ridge

ridge_reg_pipeline = Pipeline([
    ('processing', data_pipeline_poly3),
    ('modeling', Ridge())
])

r2_10cv = cross_val_score(ridge_reg_pipeline, train, train[[target]], cv=10, scoring='r2')
np.mean(r2_10cv)

0.8531922643781689

In [9]:
data_pipeline_poly3.transform(train).shape

(318, 87)

## Ridge regression on quadratic data

In [5]:
num_pipeline_poly2 = Pipeline([    
    ('impute', SimpleImputer(strategy='median')),
    ('polynomial', PolynomialFeatures(degree=2)),
    ('standardize', StandardScaler()),
])

data_pipeline_poly2 = ColumnTransformer([
    ('numeric', num_pipeline_poly2, num_cols),
    ('class', cat_pipeline, cat_cols)
])

ridge_reg_pipeline_2 = Pipeline([
    ('processing', data_pipeline_poly2),
    ('modeling', Ridge())
])

r2_10cv = cross_val_score(ridge_reg_pipeline_2, train, train[[target]], cv=10, scoring='r2')
np.mean(r2_10cv)

0.8518732915328598

In [11]:
data_pipeline_poly2.fit_transform(train).shape

(318, 31)

## How about bi-quadratic?

In [6]:
num_pipeline_poly4 = Pipeline([    
    ('impute', SimpleImputer(strategy='median')),
    ('polynomial', PolynomialFeatures(degree=4)),
    ('standardize', StandardScaler()),
])

data_pipeline_poly4 = ColumnTransformer([
    ('numeric', num_pipeline_poly4, num_cols),
    ('class', cat_pipeline, cat_cols)
])

ridge_reg_pipeline_4 = Pipeline([
    ('processing', data_pipeline_poly4),
    ('modeling', Ridge())
])

r2_10cv = cross_val_score(ridge_reg_pipeline_4, train, train[[target]], cv=10, scoring='r2')
np.mean(r2_10cv)

0.8556905493636462

In [13]:
data_pipeline_poly4.fit_transform(train).shape

(318, 213)

### Linear model for reference

In [7]:
num_pipeline_linear = Pipeline([    
    ('impute', SimpleImputer(strategy='median')),
    ('standardize', StandardScaler()),
])

data_pipeline_linear = ColumnTransformer([
    ('numeric', num_pipeline_linear, num_cols),
    ('class', cat_pipeline, cat_cols)
])

linear_reg_pipeline = Pipeline([
    ('processing', data_pipeline_linear),
    ('modeling', LinearRegression())
])

r2_10cv = cross_val_score(linear_reg_pipeline, train, train[[target]], cv=10, scoring='r2')
np.mean(r2_10cv)

0.8060313624814313