The complete explanation for this notebook is available at https://youranalystbuddy.com/model-tuning/

## Loading data and creating a pipeline for quadratic features

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset; the CSV is read from the notebook's working directory.
data = pd.read_csv('auto-mpg.csv')

# Hold out 20% of the rows for the final evaluation. A fixed random_state makes
# the split identical on every Restart & Run All, so reported scores can be
# reproduced instead of changing with each random shuffle.
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Column roles used by the preprocessing pipelines and the model.
num_cols = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year']
cat_cols = ['origin']
target = 'mpg'

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

# Pipeline for categorical (class) features: one-hot encode each category.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising an error at transform time (this can happen in
# a cross-validation fold or on new data).
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Pipeline for numeric features: fill missing values with the column median,
# expand to degree-2 (quadratic and interaction) terms, then standardize.
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    # include_bias=False: the constant bias column would become an all-zero
    # column after standardization and carries no information for the model.
    ('polynomial', PolynomialFeatures(degree=2, include_bias=False)),
    ('standardize', StandardScaler()),
])

# Route each group of columns through its own pipeline. Only the columns listed
# in num_cols and cat_cols are passed on; ColumnTransformer drops the remaining
# columns by default.
data_pipeline = ColumnTransformer([
    ('numeric', num_pipeline, num_cols),
    ('class', cat_pipeline, cat_cols)
])


## Ridge regression with different alphas

In [6]:
from sklearn.model_selection import cross_val_score

# Baseline model: shared preprocessing followed by Ridge with default settings.
ridge_reg_pipeline = Pipeline(steps=[
    ('processing', data_pipeline),
    ('modeling', Ridge()),
])

# 10-fold cross-validated R^2 on the training set; the cell displays the mean.
r2_10cv = cross_val_score(
    estimator=ridge_reg_pipeline,
    X=train,
    y=train[[target]],
    cv=10,
    scoring='r2',
)
r2_10cv.mean()

0.8598551283094334

In [12]:
# Same preprocessing, but with a much weaker penalty (alpha=0.01).
ridge_reg_pipeline_2 = Pipeline(steps=[
    ('processing', data_pipeline),
    ('modeling', Ridge(alpha=0.01)),
])

# 10-fold cross-validated R^2 on the training set; the cell displays the mean.
r2_10cv = cross_val_score(
    estimator=ridge_reg_pipeline_2,
    X=train,
    y=train[[target]],
    cv=10,
    scoring='r2',
)
r2_10cv.mean()

0.8616209431618989

In [13]:
# Same preprocessing, but with a much stronger penalty (alpha=100).
ridge_reg_pipeline_3 = Pipeline(steps=[
    ('processing', data_pipeline),
    ('modeling', Ridge(alpha=100)),
])

# 10-fold cross-validated R^2 on the training set; the cell displays the mean.
r2_10cv = cross_val_score(
    estimator=ridge_reg_pipeline_3,
    X=train,
    y=train[[target]],
    cv=10,
    scoring='r2',
)
r2_10cv.mean()

0.8234359459188229

## Grid search CV

In [18]:
from sklearn.model_selection import GridSearchCV

# Preprocessing + Ridge again; the model step is named 'ridge' so the grid
# below can address its alpha as 'ridge__alpha'.
ridge_reg_pipeline = Pipeline(steps=[
    ('processing', data_pipeline),
    ('ridge', Ridge()),
])

# Candidate regularization strengths, from 0.001 up to 1000.
param_grid = [{
    'ridge__alpha': [
        0.001, 0.005, 0.01, 0.05, 0.1, 0.5,
        1, 5, 10, 50, 100, 500, 1000,
    ],
}]

# Evaluate every candidate with 10-fold CV scored by R^2; training-fold scores
# are recorded as well.
grid_search = GridSearchCV(
    estimator=ridge_reg_pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='r2',
    return_train_score=True,
)

grid_search.fit(train, train[[target]])

In [19]:
# Display the best pipeline found by the grid search.
grid_search.best_estimator_

In [20]:
# Display the best score found by the grid search (scoring='r2', cv=10 above).
grid_search.best_score_

0.8633998630137383

In [23]:
# Re-score the selected pipeline with the same 10-fold CV setup used by the
# grid search, to compare the result against grid_search.best_score_.
best_ridge_reg = grid_search.best_estimator_

r2_10cv = cross_val_score(
    estimator=best_ridge_reg,
    X=train,
    y=train[[target]],
    cv=10,
    scoring='r2',
)
r2_10cv.mean()

0.8633998630137383

## Inference with the best model

In [24]:
# Take the pipeline selected by the grid search and predict on the held-out
# test rows; preprocessing is applied inside the pipeline.
best_ridge_reg = grid_search.best_estimator_

testY_pred = best_ridge_reg.predict(X=test)

In [25]:
from sklearn.metrics import r2_score

# R^2 of the predictions against the true target values of the test set.
r2_score(y_true=test[[target]], y_pred=testY_pred)

0.8558428749586002