In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
# Read the Boston CSV from local disk into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will not run elsewhere until the file exists at this location.
boston = pd.read_csv(
    filepath_or_buffer=r"C:\Hogwarts\advanced_analytics\Datasets\Boston.csv"
)

In [3]:
# Target is the 'medv' column; the feature frame is every remaining column.
y = boston['medv']
X = boston.drop(columns='medv')

In [5]:
# Seeded train/test partition: 70% of the rows are assigned to the training
# set (train_size = 0.7); the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=2022,
)

In [6]:
# Baseline: ridge regression with the estimator's default penalty
# (alpha = 1.0), fitted on the training split and scored by R^2 on the
# held-out test split.
ridge = Ridge()
y_pred = ridge.fit(X_train, y_train).predict(X_test)
print(r2_score(y_true=y_test, y_pred=y_pred))

0.7361969139334597


In [8]:
# Same fit-and-score procedure as the baseline, but with a stronger
# penalty (alpha = 1.5), to compare the test-set R^2.
ridge = Ridge(alpha=1.5)
y_pred = ridge.fit(X_train, y_train).predict(X_test)
print(r2_score(y_true=y_test, y_pred=y_pred))

0.734564628782998


In [9]:
from sklearn.model_selection import KFold, GridSearchCV

## Grid Search CV

In [10]:
# With no explicit count, np.linspace produces 50 evenly spaced values
# (both endpoints, 1 and 11, included); the count is spelled out here.
np.linspace(start=1, stop=11, num=50)

array([ 1.        ,  1.20408163,  1.40816327,  1.6122449 ,  1.81632653,
        2.02040816,  2.2244898 ,  2.42857143,  2.63265306,  2.83673469,
        3.04081633,  3.24489796,  3.44897959,  3.65306122,  3.85714286,
        4.06122449,  4.26530612,  4.46938776,  4.67346939,  4.87755102,
        5.08163265,  5.28571429,  5.48979592,  5.69387755,  5.89795918,
        6.10204082,  6.30612245,  6.51020408,  6.71428571,  6.91836735,
        7.12244898,  7.32653061,  7.53061224,  7.73469388,  7.93877551,
        8.14285714,  8.34693878,  8.55102041,  8.75510204,  8.95918367,
        9.16326531,  9.36734694,  9.57142857,  9.7755102 ,  9.97959184,
       10.18367347, 10.3877551 , 10.59183673, 10.79591837, 11.        ])

In [12]:
# Unfitted ridge estimator to be tuned, plus a seeded, shuffled 5-fold
# splitter used as the cross-validation scheme.
ridge = Ridge()
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=2022,
)

In [13]:
# Candidate penalties: np.linspace(0.001, 11, 20) yields 20 evenly spaced
# alpha values from 0.001 to 11 (the third argument is the number of values).
params = {'alpha': np.linspace(start=0.001, stop=11, num=20)}

# Exhaustive search over the alpha grid, scored by R^2 with the k-fold splitter.
# NOTE(review): the search is fitted on the full X, y, not on X_train/y_train.
gcv = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='r2',
    cv=kfold,
)
gcv.fit(X, y)

# Best alpha on the first line, its mean cross-validated score on the second.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'alpha': 0.001}
0.7076373475407247


In [14]:
# Take the estimator selected by the grid search (gcv) and print its fitted
# coefficient vector.
# NOTE(review): presumably one coefficient per column of X, in column order —
# confirm against X.columns before interpreting individual values.
best_model = gcv.best_estimator_
print(best_model.coef_)

[-1.08005626e-01  4.64220587e-02  2.05100877e-02  2.68656088e+00
 -1.77550976e+01  3.80995606e+00  6.81976275e-04 -1.47539719e+00
  3.06022426e-01 -1.23355051e-02 -9.52619511e-01  9.31228383e-03
 -5.24771088e-01]


In [15]:
# Show the column labels of X (the feature frame passed to gcv.fit) so they
# can be read alongside the coefficient vector printed above.
X.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'black', 'lstat'],
      dtype='object')