In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

In [5]:
df = pd.read_csv('auto-insurance.csv', header=None)

In [6]:
df.head()

Unnamed: 0,0,1
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       63 non-null     int64  
 1   1       63 non-null     float64
dtypes: float64(1), int64(1)
memory usage: 1.1 KB


In [21]:
data = df[0]
data = data.values.reshape(-1,1)
data

array([[108],
       [ 19],
       [ 13],
       [124],
       [ 40],
       [ 57],
       [ 23],
       [ 14],
       [ 45],
       [ 10],
       [  5],
       [ 48],
       [ 11],
       [ 23],
       [  7],
       [  2],
       [ 24],
       [  6],
       [  3],
       [ 23],
       [  6],
       [  9],
       [  9],
       [  3],
       [ 29],
       [  7],
       [  4],
       [ 20],
       [  7],
       [  4],
       [  0],
       [ 25],
       [  6],
       [  5],
       [ 22],
       [ 11],
       [ 61],
       [ 12],
       [  4],
       [ 16],
       [ 13],
       [ 60],
       [ 41],
       [ 37],
       [ 55],
       [ 41],
       [ 11],
       [ 27],
       [  8],
       [  3],
       [ 17],
       [ 13],
       [ 13],
       [ 15],
       [  8],
       [ 29],
       [ 30],
       [ 24],
       [  9],
       [ 31],
       [ 14],
       [ 53],
       [ 26]], dtype=int64)

In [22]:
target = df[1]
target = target.values.reshape(-1,1)
target

array([[392.5],
       [ 46.2],
       [ 15.7],
       [422.2],
       [119.4],
       [170.9],
       [ 56.9],
       [ 77.5],
       [214. ],
       [ 65.3],
       [ 20.9],
       [248.1],
       [ 23.5],
       [ 39.6],
       [ 48.8],
       [  6.6],
       [134.9],
       [ 50.9],
       [  4.4],
       [113. ],
       [ 14.8],
       [ 48.7],
       [ 52.1],
       [ 13.2],
       [103.9],
       [ 77.5],
       [ 11.8],
       [ 98.1],
       [ 27.9],
       [ 38.1],
       [  0. ],
       [ 69.2],
       [ 14.6],
       [ 40.3],
       [161.5],
       [ 57.2],
       [217.6],
       [ 58.1],
       [ 12.6],
       [ 59.6],
       [ 89.9],
       [202.4],
       [181.3],
       [152.8],
       [162.8],
       [ 73.4],
       [ 21.3],
       [ 92.6],
       [ 76.1],
       [ 39.9],
       [142.1],
       [ 93. ],
       [ 31.9],
       [ 32.1],
       [ 55.6],
       [133.3],
       [194.5],
       [137.9],
       [ 87.4],
       [209.8],
       [ 95.5],
       [244.6],
       [

In [23]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, random_state=1)
print(Xtrain.shape, Xtest.shape)

(47, 1) (16, 1)


In [24]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

In [25]:
lr.fit(Xtrain, ytrain)

LinearRegression()

In [26]:
print(lr.intercept_)

[21.21293115]


In [27]:
print(lr.coef_)

[[3.38455013]]


In [28]:
print('train R^2', lr.score(Xtrain, ytrain))
print('test R^2', lr.score(Xtest, ytest))

train R^2 0.8411115219729022
test R^2 0.7971386424990525


In [29]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

ytest_pred = lr.predict(Xtest)

print("Mean absolute error", mean_absolute_error(ytest, ytest_pred))

Mean absolute error 27.391486320581414


In [30]:
print("Mean squared error", mean_squared_error(ytest, ytest_pred))

Mean squared error 1074.4851020539998


In [64]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

poly = make_pipeline(PolynomialFeatures(), Ridge())
poly.fit(Xtrain, ytrain)
print('train R^2', poly.score(Xtrain, ytrain))
print('test R^2', poly.score(Xtest, ytest))

train R^2 0.8440279931781216
test R^2 0.8027824472370432


In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
poly = make_pipeline(PolynomialFeatures(2), Ridge())

In [34]:
poly.named_steps

{'polynomialfeatures': PolynomialFeatures(), 'ridge': Ridge()}

In [56]:
grid_params = {'ridge__alpha':[0.001, 0.01, 0.1, 1, 10, 100],
               'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sag'],
               'ridge__fit_intercept': [True, False],
               'ridge__normalize':[True, False]}

grid = GridSearchCV(poly, grid_params, n_jobs=-1, cv=5)

In [57]:
grid.fit(Xtrain, ytrain)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('polynomialfeatures',
                                        PolynomialFeatures()),
                                       ('ridge', Ridge())]),
             n_jobs=-1,
             param_grid={'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
                         'ridge__fit_intercept': [True, False],
                         'ridge__normalize': [True, False],
                         'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sag']})

In [58]:
grid.best_score_

0.509270682511758

In [59]:
grid.best_params_

{'ridge__alpha': 0.01,
 'ridge__fit_intercept': True,
 'ridge__normalize': True,
 'ridge__solver': 'sag'}

In [60]:
grid.cv_results_

{'mean_fit_time': array([0.00200005, 0.00179901, 0.00279951, 0.00179992, 0.00140018,
        0.00138407, 0.00179973, 0.00560098, 0.00100031, 0.00120001,
        0.00179987, 0.00539947, 0.0010005 , 0.00100002, 0.00119996,
        0.00539975, 0.00120058, 0.00140009, 0.00199928, 0.00159984,
        0.00119967, 0.00120029, 0.00180011, 0.00579891, 0.00099959,
        0.00079999, 0.00159726, 0.00540004, 0.00099978, 0.00119824,
        0.00140033, 0.0057992 , 0.00140004, 0.00159988, 0.00180016,
        0.00139966, 0.00100017, 0.00140257, 0.0015976 , 0.00600233,
        0.00099988, 0.00119972, 0.00179996, 0.00559688, 0.00119972,
        0.00079999, 0.00160384, 0.00600033, 0.00139985, 0.0016026 ,
        0.00200291, 0.00139961, 0.0012002 , 0.00099988, 0.00119991,
        0.00560012, 0.00099988, 0.00119967, 0.00140195, 0.00579963,
        0.00100284, 0.00099936, 0.00159974, 0.00719967, 0.00099978,
        0.00120039, 0.00139966, 0.00160427, 0.00119996, 0.00119991,
        0.00160398, 0.00559907,

In [61]:
grid.score(Xtrain, ytrain)

0.8429626771320229

In [62]:
grid.score(Xtest, ytest)

0.7998958991124473

In [63]:
rbest = grid.best_estimator_
rbest.score(Xtest, ytest)

0.7998958991124473