In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

In [2]:
#date: 12 Jul
mpg_df = pd.read_csv('/Users/pawankumarkc/Documents/vscodepython/MLAlgo/datasets/auto-mpg.csv')
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
mpg_df = mpg_df.drop('car name', axis=1)
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


In [4]:
mpg_df['origin'].value_counts()

1    249
3     79
2     70
Name: origin, dtype: int64

In [5]:
mpg_df['origin'] = mpg_df['origin'].replace({1:'America', 2:'Europe', 3:'Asia'})
mpg_df['origin'].value_counts()

America    249
Asia        79
Europe      70
Name: origin, dtype: int64

In [6]:
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])

In [7]:
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_America,origin_Asia,origin_Europe
0,18.0,8,307.0,130,3504,12.0,70,1,0,0
1,15.0,8,350.0,165,3693,11.5,70,1,0,0
2,18.0,8,318.0,150,3436,11.0,70,1,0,0
3,16.0,8,304.0,150,3433,12.0,70,1,0,0
4,17.0,8,302.0,140,3449,10.5,70,1,0,0


In [8]:
#Whenever we do onehot encoding, we need to drop one variable
mpg_df = mpg_df.drop('origin_America', axis=1)

In [9]:
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_Asia,origin_Europe
0,18.0,8,307.0,130,3504,12.0,70,0,0
1,15.0,8,350.0,165,3693,11.5,70,0,0
2,18.0,8,318.0,150,3436,11.0,70,0,0
3,16.0,8,304.0,150,3433,12.0,70,0,0
4,17.0,8,302.0,140,3449,10.5,70,0,0


In [10]:
mpg_df.isnull().sum()
# No missing values

mpg              0
cylinders        0
displacement     0
horsepower       0
weight           0
acceleration     0
model year       0
origin_Asia      0
origin_Europe    0
dtype: int64

In [12]:
#Finding unique values in columns
for i in mpg_df.columns:
    print('******************************************', i, '*********************************************')
    print()
    print(set(mpg_df[i].to_list()))

****************************************** mpg *********************************************

{9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.5, 27.5, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 29.5, 35.0, 31.5, 36.0, 33.5, 36.1, 39.4, 43.1, 35.7, 37.3, 41.5, 38.1, 46.6, 40.8, 44.3, 43.4, 44.6, 44.0, 14.5, 15.5, 16.5, 16.9, 17.5, 17.6, 18.5, 18.6, 18.1, 19.9, 19.4, 19.1, 20.5, 20.6, 21.5, 21.1, 21.6, 22.5, 22.4, 23.9, 23.6, 23.5, 24.5, 25.4, 25.1, 25.5, 26.4, 26.6, 27.4, 27.9, 28.4, 28.1, 29.9, 30.5, 30.9, 31.9, 31.6, 32.8, 32.1, 32.2, 32.7, 32.4, 33.8, 32.3, 33.7, 32.9, 34.1, 34.2, 34.5, 34.3, 34.7, 35.1, 34.4, 34.0, 36.4, 37.2, 37.0, 37.7, 38.0, 39.1, 39.0, 40.9, 16.2, 17.7, 18.2, 19.2, 19.8, 20.2, 20.8, 20.3, 22.3, 23.2, 23.8, 23.7, 24.3, 24.2, 25.8, 26.8, 27.2, 28.8, 29.8, 30.7, 31.8, 31.3}
****************************************** cylinders *********************************************

{3, 4, 5, 6, 8}
****************

In [15]:
mpg_df = mpg_df.replace('?', np.nan)
mpg_df = mpg_df.apply(lambda x: x.fillna(x.median(), axis=0))

In [16]:
#Seperate independent variable and dependent variable

x = mpg_df.drop('mpg', axis=1)
y = mpg_df[['mpg']]

In [17]:
x.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin_Asia,origin_Europe
0,8,307.0,130,3504,12.0,70,0,0
1,8,350.0,165,3693,11.5,70,0,0
2,8,318.0,150,3436,11.0,70,0,0
3,8,304.0,150,3433,12.0,70,0,0
4,8,302.0,140,3449,10.5,70,0,0


In [18]:
y.head()

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0


In [None]:
# Treat outlier - TODO

In [21]:
#Feature scaling
from sklearn.preprocessing import StandardScaler
x_scaled = StandardScaler()
x_scaled = x_scaled.fit_transform(x)
x_scaled = pd.DataFrame(x_scaled, columns=x.columns)

In [23]:
x_scaled

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin_Asia,origin_Europe
0,1.498191,1.090604,0.673118,0.630870,-1.295498,-1.627426,-0.497643,-0.461968
1,1.498191,1.503514,1.589958,0.854333,-1.477038,-1.627426,-0.497643,-0.461968
2,1.498191,1.196232,1.197027,0.550470,-1.658577,-1.627426,-0.497643,-0.461968
3,1.498191,1.061796,1.197027,0.546923,-1.295498,-1.627426,-0.497643,-0.461968
4,1.498191,1.042591,0.935072,0.565841,-1.840117,-1.627426,-0.497643,-0.461968
...,...,...,...,...,...,...,...,...
393,-0.856321,-0.513026,-0.479482,-0.213324,0.011586,1.621983,-0.497643,-0.461968
394,-0.856321,-0.925936,-1.370127,-0.993671,3.279296,1.621983,-0.497643,2.164651
395,-0.856321,-0.561039,-0.531873,-0.798585,-1.440730,1.621983,-0.497643,-0.461968
396,-0.856321,-0.705077,-0.662850,-0.408411,1.100822,1.621983,-0.497643,-0.461968


In [22]:
#Split data into train and test
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=1)

# Building linear regression model


In [24]:
linear = LinearRegression()
linear.fit(x_train, y_train)

for idx, col_name in enumerate(x_train.columns):
    print("The coeff. for {} is {}".format(col_name, linear.coef_[0][idx]))

The coeff. for cylinders is -0.510727148924759
The coeff. for displacement is 2.4406994641535573
The coeff. for horsepower is -0.6849487368203032
The coeff. for weight is -5.886755316305747
The coeff. for acceleration is 0.2888056095488505
The coeff. for model year is 3.0589637623211514
The coeff. for origin_Asia is 1.072992305702572
The coeff. for origin_Europe is 1.0803661840485703


In [25]:
intercept = linear.intercept_[0]
print("The intercept for liner model is {}".format(intercept))

The intercept for liner model is 23.510190092051925


## Regularization method

In [26]:
## part 1 - Lasso method
# gives exact zero or closer to zero.
# alpha is penalty, for lasso keep it to 0.1 and for Ridge keep it to 0.3
lasso = Lasso(alpha=0.1)
lasso.fit(x_train, y_train)
print('Lasso model : ', lasso.coef_)

# the co.eff is reduced from its earlier values

Lasso model :  [-0.          0.         -0.28819342 -4.80639497  0.04406849  2.86610654
  0.67201844  0.62820971]


In [30]:
linear.coef_

array([[-0.51072715,  2.44069946, -0.68494874, -5.88675532,  0.28880561,
         3.05896376,  1.07299231,  1.08036618]])

In [27]:
lasso.intercept_

array([23.52158927])

In [33]:
# Accuracy of Lasso method
y_pred_train_lasso = lasso.predict(x_train)
y_pred_test_lasso = lasso.predict(x_test)
print('Training accuracy Lasso: ', r2_score(y_train , y_pred_train_lasso))
print('Training accuracy Lasso: ', r2_score(y_test , y_pred_test_lasso))

Training accuracy Lasso:  0.8085089302256353
Training accuracy Lasso:  0.8632534347842027


In [34]:
# Accuracy of Linear method
y_pred_train_linear = linear.predict(x_train)
y_pred_test_linear = linear.predict(x_test)
print('Training accuracy Lasso: ', r2_score(y_train , y_pred_train_linear))
print('Training accuracy Lasso: ', r2_score(y_test , y_pred_test_linear))

Training accuracy Lasso:  0.8143785617196869
Training accuracy Lasso:  0.8647729653026789


In [28]:
## part 2 - Ridge regression(L2 regularization) [Most used]
# closer to zero, but not exactly zero.

ridge = Ridge(alpha=0.3)
ridge.fit(x_train, y_train)
print('Ridge model : ', ridge.coef_)


Ridge model :  [[-0.48568005  2.3423919  -0.69387387 -5.81972716  0.27582324  3.05010754
   1.0663768   1.06964322]]


In [29]:
linear.coef_

array([[-0.51072715,  2.44069946, -0.68494874, -5.88675532,  0.28880561,
         3.05896376,  1.07299231,  1.08036618]])

In [31]:
y_pred_train_ridge = ridge.predict(x_train)
y_pred_test_ridge = ridge.predict(x_test)

In [32]:
print('Training accuracy: ', r2_score(y_train , y_pred_train_ridge))
print('Training accuracy: ', r2_score(y_test , y_pred_test_ridge))

Training accuracy:  0.8143682953093296
Training accuracy:  0.8648106635132775
