In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV

In [4]:
# Read the pre-processed automobile dataset and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='automobile_processed.csv')
data.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,age
0,12.0,8,350.0,180,4499,12.5,48
1,37.2,4,86.0,65,2019,16.4,41
2,21.0,6,199.0,90,2648,15.0,51
3,36.0,4,105.0,74,1980,15.3,39
4,25.0,4,110.0,87,2672,17.5,51


In [5]:
# Features: every column except the target ('mpg') and 'age'.
X = data.drop(columns=['mpg', 'age'])

# Target: the 'mpg' column.
Y = data['mpg']

# Hold out 20% of the rows for the final evaluation.
# A fixed random_state makes the split reproducible across kernel restarts;
# without it every "Restart & Run All" produces different train/test rows.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [14]:
# Search for the Lasso alpha (the constant that multiplies the penalty term)
# that gives the best cross-validated score on the training data.

parameters = {'alpha': [0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0]}

grid_search = GridSearchCV(
    estimator=Lasso(),
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'alpha': 0.6}

alpha is the constant that multiplies the penalty term of the Lasso regression model. The grid contains 7 alpha values, so 7 different candidate models are built, one for each alpha value.<br>

cv = 3 - Use 3-fold cross-validation to find the best model: the training data is split into 3 parts, and each candidate model is trained in three separate runs, each time using 2 parts for training and the remaining part for testing.<br>

Each model is scored and evaluated using the default scoring mechanism of the particular estimator object. In the case of regression models, the default score is the R² score.

In [15]:
# Report every candidate alpha together with its mean cross-validated score and rank.
cv_results = grid_search.cv_results_

for candidate, mean_score, rank in zip(cv_results['params'],
                                       cv_results['mean_test_score'],
                                       cv_results['rank_test_score']):
    print('parameters : ', candidate)
    print('mean_test_score : ', mean_score)
    print('Rank : ', rank)

parameters :  {'alpha': 0.2}
mean_test_score :  0.6996758888558544
Rank :  7
parameters :  {'alpha': 0.4}
mean_test_score :  0.7006414155380508
Rank :  4
parameters :  {'alpha': 0.6}
mean_test_score :  0.7006997595305625
Rank :  1
parameters :  {'alpha': 0.7}
mean_test_score :  0.7006736072902807
Rank :  2
parameters :  {'alpha': 0.8}
mean_test_score :  0.7006466054656864
Rank :  3
parameters :  {'alpha': 0.9}
mean_test_score :  0.7006183739319639
Rank :  5
parameters :  {'alpha': 1.0}
mean_test_score :  0.7005889001887646
Rank :  6


In [16]:
# Train a Lasso model on the full training set with the best alpha from the grid search
# (best_params_ holds exactly the searched hyperparameter, so it can be unpacked directly).
lasso_model = Lasso(**grid_search.best_params_).fit(x_train, y_train)

In [17]:
# Compare the score on the training data with the R² score on the held-out test data.
train_score = lasso_model.score(x_train, y_train)

y_pred = lasso_model.predict(x_test)
test_score = r2_score(y_test, y_pred)

print('Training Score : ', train_score)
print('Test Score : ', test_score)

Training Score :  0.7046480677722311
Test Score :  0.7091285918751662


<h3>Hyperparameter tuning on multiple different regression models</h3>

In [19]:
# Search for the number of neighbors that gives the best cross-validated score.
# NOTE(review): the features are passed to this distance-based model unscaled in this
# cell (StandardScaler is imported above but not applied here) - confirm this is intended.
parameters = {'n_neighbors': [10, 12, 14, 18, 20, 25, 30, 35, 50]}

grid_search = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'n_neighbors': 50}

In [20]:
# Report every candidate n_neighbors together with its mean cross-validated score and rank.
cv_results = grid_search.cv_results_

for candidate, mean_score, rank in zip(cv_results['params'],
                                       cv_results['mean_test_score'],
                                       cv_results['rank_test_score']):
    print('parameters : ', candidate)
    print('mean_test_score : ', mean_score)
    print('Rank : ', rank)

parameters :  {'n_neighbors': 10}
mean_test_score :  0.6782973960878521
Rank :  8
parameters :  {'n_neighbors': 12}
mean_test_score :  0.6747225405314263
Rank :  9
parameters :  {'n_neighbors': 14}
mean_test_score :  0.6819013013876152
Rank :  7
parameters :  {'n_neighbors': 18}
mean_test_score :  0.6859339775495327
Rank :  6
parameters :  {'n_neighbors': 20}
mean_test_score :  0.6949225071248226
Rank :  5
parameters :  {'n_neighbors': 25}
mean_test_score :  0.7027913513090455
Rank :  4
parameters :  {'n_neighbors': 30}
mean_test_score :  0.7048712840139543
Rank :  2
parameters :  {'n_neighbors': 35}
mean_test_score :  0.703590767856785
Rank :  3
parameters :  {'n_neighbors': 50}
mean_test_score :  0.706301971560003
Rank :  1


In [21]:
# Train a k-nearest-neighbors regressor on the full training set with the best
# n_neighbors from the grid search (the only key in best_params_).
kneighbors_model = KNeighborsRegressor(**grid_search.best_params_).fit(x_train, y_train)

In [22]:
# Compare the score on the training data with the R² score on the held-out test data.
train_score = kneighbors_model.score(x_train, y_train)

y_pred = kneighbors_model.predict(x_test)
test_score = r2_score(y_test, y_pred)

print('Training Score : ', train_score)
print('Test Score : ', test_score)

Training Score :  0.7213335488431176
Test Score :  0.7058645102896566


<h3>Decision Tree Regressor</h3>

In [23]:
# Search for the tree depth that gives the best cross-validated score.
parameters = {'max_depth': [1, 2, 3, 4, 5, 7, 8]}

# scikit-learn permutes the features at every split, so equally good splits can be
# chosen differently from run to run; a fixed random_state makes the search repeatable.
grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 2}

In [24]:
# Report every candidate max_depth together with its mean cross-validated score and rank.
cv_results = grid_search.cv_results_

for candidate, mean_score, rank in zip(cv_results['params'],
                                       cv_results['mean_test_score'],
                                       cv_results['rank_test_score']):
    print('parameters : ', candidate)
    print('mean_test_score : ', mean_score)
    print('Rank : ', rank)

parameters :  {'max_depth': 1}
mean_test_score :  0.5893347678292989
Rank :  5
parameters :  {'max_depth': 2}
mean_test_score :  0.6854673971611914
Rank :  1
parameters :  {'max_depth': 3}
mean_test_score :  0.6847669438796132
Rank :  2
parameters :  {'max_depth': 4}
mean_test_score :  0.6622569406259705
Rank :  3
parameters :  {'max_depth': 5}
mean_test_score :  0.6257573830786636
Rank :  4
parameters :  {'max_depth': 7}
mean_test_score :  0.5697944494975046
Rank :  7
parameters :  {'max_depth': 8}
mean_test_score :  0.5790619456531001
Rank :  6


In [25]:
# Train a decision tree on the full training set with the best max_depth from the grid search.
# A fixed random_state keeps tie-breaking between equally good splits identical across runs,
# so the scores reported for this model are reproducible.
decisiontree_model = DecisionTreeRegressor(
    max_depth=grid_search.best_params_['max_depth'],
    random_state=42,
).fit(x_train, y_train)

In [26]:
# Compare the score on the training data with the R² score on the held-out test data.
train_score = decisiontree_model.score(x_train, y_train)

y_pred = decisiontree_model.predict(x_test)
test_score = r2_score(y_test, y_pred)

print('Training Score : ', train_score)
print('Test Score : ', test_score)

Training Score :  0.7357933420986243
Test Score :  0.6335150802108107


<h3>Support Vector Regressor</h3>

In [27]:
# epsilon - size of the margin
# C - penalty applied to outlier points that lie outside of the margin
# The grid has 4 * 2 = 8 parameter combinations; each one is evaluated with 3-fold cross-validation.

parameters = {
    'epsilon': [0.05, 0.1, 0.2, 0.3],
    'C': [0.2, 0.3],
}

grid_search = GridSearchCV(
    estimator=SVR(kernel='linear'),
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'C': 0.3, 'epsilon': 0.05}

In [29]:
# Train a linear-kernel SVR on the full training set with the best epsilon and C
# from the grid search (the two keys held in best_params_).
svr_model = SVR(kernel='linear', **grid_search.best_params_).fit(x_train, y_train)

In [30]:
# Compare the score on the training data with the R² score on the held-out test data.
train_score = svr_model.score(x_train, y_train)

y_pred = svr_model.predict(x_test)
test_score = r2_score(y_test, y_pred)

print('Training Score : ', train_score)
print('Test Score : ', test_score)

Training Score :  0.6847961812231318
Test Score :  0.6753327953196764
