# Ridge regression algorithm

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
df = pd.read_csv(url)
print(df.head())

      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        b  lstat  medv  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90   5.33  36.2  


In [3]:
df.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat', 'medv'],
      dtype='object')

In [4]:
# X = df[['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
#        'ptratio', 'b', 'lstat']].values    #feature variable or independent variable i.e. the dataset without the target variable

# or  alternatively use: X = df.drop('medv', axis=1).values
# or  X = df.iloc[:,:-1].values  # all rows, all columns except last column

X = df.drop('medv', axis=1)   # keep as DataFrame
# then train_test_split will return DataFrames and .columns will work

Y = df['medv'].values      #target variable or dependent variable i.e. Median value of owner-occupied homes in $1000's(medv)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [6]:
X_train

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
5,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21
116,0.13158,0.0,10.01,0,0.547,6.176,72.5,2.7301,6,432,17.8,393.30,12.04
45,0.17142,0.0,6.91,0,0.448,5.682,33.8,5.1004,3,233,17.9,396.90,10.21
16,1.05393,0.0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21.0,386.85,6.58
468,15.57570,0.0,18.10,0,0.580,5.926,71.0,2.9084,24,666,20.2,368.74,18.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.17120,0.0,8.56,0,0.520,5.836,91.9,2.2110,5,384,20.9,395.67,18.66
270,0.29916,20.0,6.96,0,0.464,5.856,42.1,4.4290,3,223,18.6,388.65,13.00
348,0.01501,80.0,2.01,0,0.435,6.635,29.7,8.3440,4,280,17.0,390.94,5.99
435,11.16040,0.0,18.10,0,0.740,6.629,94.6,2.1247,24,666,20.2,109.85,23.27


In [7]:
# standardizing the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test  = pd.DataFrame(scaler.transform(X_test),  columns=X.columns, index=X_test.index)
# now this will work

# check the statistics of the standardized data
# it should have mean ~0 and std ~1 so no large feature dominates others


In [8]:
X_train.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
count,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0
mean,0.0,-5.017957e-18,3.763468e-16,1.003591e-17,-6.874601e-16,-3.51257e-16,-6.774242e-17,-1.405028e-16,-6.272446000000001e-17,-1.756285e-16,4.0143660000000004e-17,-4.541251e-16,-2.508979e-18
std,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415
min,-0.416762,-0.505125,-1.432137,-0.2815463,-1.477013,-3.428977,-2.385402,-1.241753,-0.9779889,-1.32327,-2.519507,-4.140252,-1.505295
25%,-0.40794,-0.505125,-0.8663857,-0.2815463,-0.9196796,-0.6073179,-0.8243866,-0.7970573,-0.6302741,-0.7700089,-0.7223839,0.1967778,-0.7826466
50%,-0.386964,-0.505125,-0.2082672,-0.2815463,-0.1651363,-0.1366932,0.3094984,-0.3102052,-0.5143691,-0.46331,0.1428974,0.3674826,-0.2106528
75%,-0.022663,0.3807519,1.0055,-0.2815463,0.6322788,0.4957956,0.8931822,0.7700449,1.687825,1.557294,0.8528718,0.42016,0.5355791
max,10.296908,3.70279,2.396785,3.551814,2.690124,3.417358,1.129371,3.938069,1.687825,1.827911,1.651593,0.4285411,3.597242


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score


In [10]:
regression = LinearRegression()

In [11]:
mse = cross_val_score(regression, X_train, Y_train, scoring='neg_mean_squared_error', cv=5)

In [12]:
reg_pred = regression.fit(X_train, Y_train).predict(X_test)
reg_pred

array([28.64896005, 36.49501384, 15.4111932 , 25.40321303, 18.85527988,
       23.14668944, 17.3921241 , 14.07859899, 23.03692679, 20.59943345,
       24.82286159, 18.53057049, -6.86543527, 21.80172334, 19.22571177,
       26.19191985, 20.27733882,  5.61596432, 40.44887974, 17.57695918,
       27.44319095, 30.1715964 , 10.94055823, 24.02083139, 18.07693812,
       15.934748  , 23.12614028, 14.56052142, 22.33482544, 19.3257627 ,
       22.16564973, 25.19476081, 25.31372473, 18.51345025, 16.6223286 ,
       17.50268505, 30.94992991, 20.19201752, 23.90440431, 24.86975466,
       13.93767876, 31.82504715, 42.56978796, 17.62323805, 27.01963242,
       17.19006621, 13.80594006, 26.10356557, 20.31516118, 30.08649576,
       21.3124053 , 34.15739602, 15.60444981, 26.11247588, 39.31613646,
       22.99282065, 18.95764781, 33.05555669, 24.85114223, 12.91729352,
       22.68101452, 30.80336295, 31.63522027, 16.29833689, 21.07379993,
       16.57699669, 20.36362023, 26.15615896, 31.06833034, 11.98

In [13]:
sns.displot(Y_test - reg_pred, kind ='kde')  # residuals should be normally distributed

<seaborn.axisgrid.FacetGrid at 0x1d4c6c2ae40>

In [14]:
from sklearn.metrics import r2_score
r2_score(Y_test, reg_pred)

0.7112260057484934

In [15]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [16]:
ridge_model = Ridge() # now we found the best alpha value using GridSearchCV

In [17]:
parameters = {'alpha': [ 1, 5 ,10, 20, 30, 35, 40, 45, 50, 75, 100, 200]} # through this we will find the best alpha value among all these values
ridge_cv = GridSearchCV(ridge_model, parameters, scoring='neg_mean_squared_error', cv=5) # 5 fold cross validation i.e. divide data into 5 parts and train on 4 parts and test on 1 part and repeat this 5 times
ridge_cv.fit(X_train, Y_train)

In [18]:
print(ridge_cv.best_params_)
print(ridge_cv.best_score_) # this is negative mean squared error for the best alpha value which is slight improvement over linear regression

{'alpha': 10}
-25.807228822291467


In [19]:
ridge_pred = ridge_cv.predict(X_test)
# ridge_pred
r2_score(Y_test, ridge_pred)

0.7072830902371285

## Lasso implementation

In [None]:
from sklearn.linear_model import Lasso   ## lasso also do feature selection mtlb some coefficients become zero
from sklearn.model_selection import GridSearchCV
lasso = Lasso()

In [22]:
parameters = {'alpha': [ 1, 5 ,10, 20, 30, 35, 40, 45, 50, 75, 100, 200]} # through this we will find the best alpha value among all these values
lasso_cv = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv=5) # 5 fold cross validation i.e. divide data into 5 parts and train on 4 parts and test on 1 part and repeat this 5 times
lasso_cv.fit(X_train, Y_train)

In [23]:
lasso_cv.best_params_
lasso_cv.best_score_ # this is negative mean squared error for the best alpha value which is slight improvement over linear regression

np.float64(-31.153603752119004)