The complete explanation for this notebook is available at https://youranalystbuddy.com/the-overfitting-problem/

## Overfitting demonstration

data and pipelines are just like before 

### Loading data and categorical pipeline

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

data = pd.read_csv('auto-mpg.csv')

train, test = train_test_split(data, test_size=0.2)

num_cols = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year']
cat_cols = ['origin']
target = 'mpg'

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

#pipeline for class features
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder())
])

### Linear model

In [2]:
#pipeline for numeric features
num_pipeline_linear = Pipeline([    
    ('impute', SimpleImputer(strategy='median')),
    ('standardize', StandardScaler())
])

#full data pipeline
data_pipeline_linear = ColumnTransformer([
    ('numeric', num_pipeline_linear, num_cols),
    ('class', cat_pipeline, cat_cols)
])

#model pipeline
linear_reg_pipeline = Pipeline([
    ('processing', data_pipeline_linear),
    ('modeling', LinearRegression())
])

#### measurements in training and testing data

In [3]:
linear_reg_pipeline.fit(train, train[[target]])

print('training R2:', linear_reg_pipeline.score(train, train[[target]]))
print('testing R2:', linear_reg_pipeline.score(test, test[[target]]))

training R2: 0.835723005971064
testing R2: 0.7760959508464778


### Quadratic model

In [4]:
#pipeline for numeric features
num_pipeline_poly2 = Pipeline([    
    ('impute', SimpleImputer(strategy='median')),
    ('quadratic features', PolynomialFeatures(degree=2)),
    ('standardize', StandardScaler())
])

#full data pipeline
data_pipeline_poly2 = ColumnTransformer([
    ('numeric', num_pipeline_poly2, num_cols),
    ('class', cat_pipeline, cat_cols)
])

#model pipeline
poly2_reg_pipeline = Pipeline([
    ('processing', data_pipeline_poly2),
    ('modeling', LinearRegression())
])

#### measurements in training and testing data

In [5]:
poly2_reg_pipeline.fit(train, train[[target]])

print('training R2:', poly2_reg_pipeline.score(train, train[[target]]))
print('testing R2:', poly2_reg_pipeline.score(test, test[[target]]))

training R2: 0.8920039412387385
testing R2: 0.8115434439880399


### Cubic model

In [6]:
#pipeline for numeric features
num_pipeline_poly3 = Pipeline([    
    ('impute', SimpleImputer(strategy='median')),
    ('quadratic features', PolynomialFeatures(degree=3)),
    ('standardize', StandardScaler())
])

#full data pipeline
data_pipeline_poly3 = ColumnTransformer([
    ('numeric', num_pipeline_poly3, num_cols),
    ('class', cat_pipeline, cat_cols)
])

#model pipeline
poly3_reg_pipeline = Pipeline([
    ('processing', data_pipeline_poly3),
    ('modeling', LinearRegression())
])

#### measurements in training and testing data

In [7]:
poly3_reg_pipeline.fit(train, train[[target]])

print('training R2:', poly3_reg_pipeline.score(train, train[[target]]))
print('testing R2:', poly3_reg_pipeline.score(test, test[[target]]))

training R2: 0.9353098210465611
testing R2: 0.5525286388346851


### Bi-quadratic model

In [8]:
#pipeline for numeric features
num_pipeline_poly4 = Pipeline([    
    ('impute', SimpleImputer(strategy='median')),
    ('quadratic features', PolynomialFeatures(degree=4)),
    ('standardize', StandardScaler())
])

#full data pipeline
data_pipeline_poly4 = ColumnTransformer([
    ('numeric', num_pipeline_poly4, num_cols),
    ('class', cat_pipeline, cat_cols)
])

#model pipeline
poly4_reg_pipeline = Pipeline([
    ('processing', data_pipeline_poly4),
    ('modeling', LinearRegression())
])

#### measurements in training and testing data

In [9]:
poly4_reg_pipeline.fit(train, train[[target]])

print('training R2:', poly4_reg_pipeline.score(train, train[[target]]))
print('testing R2:', poly4_reg_pipeline.score(test, test[[target]]))

training R2: 0.9586116405583816
testing R2: -411.12073625815236
