In [1]:
# The "mpg" dataset, which stands for "miles per gallon". It contains information about various car models and their characteristics, such as cylinders, displacement, horsepower, weight, acceleration, model year, origin, and miles per gallon (mpg) fuel efficiency.

# Here's a brief explanation of each column:

# mpg: Miles per gallon, representing the fuel efficiency of the car.
# cylinders: Number of cylinders in the engine.
# displacement: Engine displacement, the measure of the cylinder volume swept by all of the pistons of a piston engine.
# horsepower: The power of the engine, typically measured in horsepower (hp).
# weight: Weight of the car, often measured in pounds.
# acceleration: Acceleration of the car from 0 to 60 miles per hour (mph) in seconds.
# model year: Year of manufacturing of the car model.
# origin: Origin of the car, represented as a categorical variable (1: USA, 2: Europe, 3: Japan).
# name: The name of the car model.
# This dataset is commonly used for regression tasks, where the goal is to predict the fuel efficiency (mpg) of a car based on its other characteristics

In [2]:
#1 mile is 1.6 km
#1 gallon is 3.7 km

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import warnings
warnings.filterwarnings('ignore')

In [4]:
df = sns.load_dataset('mpg')

In [5]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [6]:
df.drop("name", axis=1, inplace = True)

In [7]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [9]:
df.dtypes

Unnamed: 0,0
mpg,float64
cylinders,int64
displacement,float64
horsepower,float64
weight,int64
acceleration,float64
model_year,int64
origin,object


In [10]:
df.shape

(398, 8)

In [11]:
df.isna().sum()

Unnamed: 0,0
mpg,0
cylinders,0
displacement,0
horsepower,6
weight,0
acceleration,0
model_year,0
origin,0


In [12]:
#since we have not done outlier treatment, then better idea would be to replace the missing value with median

df['horsepower'].median()

93.5

In [13]:
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

In [14]:
df.isna().sum()

Unnamed: 0,0
mpg,0
cylinders,0
displacement,0
horsepower,0
weight,0
acceleration,0
model_year,0
origin,0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [16]:
df.dtypes

Unnamed: 0,0
mpg,float64
cylinders,int64
displacement,float64
horsepower,float64
weight,int64
acceleration,float64
model_year,int64
origin,object


In [17]:
df['origin'].value_counts()

Unnamed: 0_level_0,count
origin,Unnamed: 1_level_1
usa,249
japan,79
europe,70


In [18]:
#data encoding
df["origin"] = df["origin"].map({"usa":1, "japan":2, "europe":3})

In [19]:
# df["origin"] = df["origin"].astype(int)

In [20]:
df.dtypes

Unnamed: 0,0
mpg,float64
cylinders,int64
displacement,float64
horsepower,float64
weight,int64
acceleration,float64
model_year,int64
origin,int64


In [21]:
#separate into X and y

In [22]:
X = df.drop('mpg', axis=1)
y = df['mpg']

In [23]:
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,1
1,8,350.0,165.0,3693,11.5,70,1
2,8,318.0,150.0,3436,11.0,70,1
3,8,304.0,150.0,3433,12.0,70,1
4,8,302.0,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,1
394,4,97.0,52.0,2130,24.6,82,3
395,4,135.0,84.0,2295,11.6,82,1
396,4,120.0,79.0,2625,18.6,82,1


In [24]:
y

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0
...,...
393,27.0
394,44.0
395,32.0
396,28.0


In [25]:
#train test split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1)

In [26]:
X_train.shape

(278, 7)

In [27]:
X_test.shape

(120, 7)

In [28]:
#simple linear regression model
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression()

In [29]:
regression_model

In [30]:
regression_model.fit(X_train, y_train)

In [31]:
regression_model.coef_

array([-0.31761423,  0.02623748, -0.01827076, -0.00748775,  0.05040673,
        0.84709514,  1.51909584])

In [32]:
for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient for {col_name} is {regression_model.coef_[i]}")

The coefficient for cylinders is -0.31761423027992997
The coefficient for displacement is 0.02623748259907894
The coefficient for horsepower is -0.018270764913124644
The coefficient for weight is -0.007487750398361904
The coefficient for acceleration is 0.050406734619713886
The coefficient for model_year is 0.8470951427061371
The coefficient for origin is 1.5190958387975042


In [33]:
#observations:
#coefficients are relatively smaller, if one independent variable changes
#there will be not much difference in prediction
#This is sometime called as smoother model

#these features might not be contributing in model training

In [34]:
from sklearn.metrics import r2_score

y_pred_linear = regression_model.predict(X_test)
r2_linear = r2_score(y_test, y_pred_linear)
print(f"The R square of linear regression {r2_linear}")

The R square of linear regression 0.8348001123742284


In [35]:
#regularised model
#ridge regression

In [36]:
from sklearn.linear_model import Ridge
ridge_regression_model = Ridge(alpha=0.1)
ridge_regression_model
#in practical implementation lambda is alpha

In [37]:
ridge_regression_model.fit(X_train, y_train)

In [38]:
for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient for {col_name} is {ridge_regression_model.coef_[i]}")

The coefficient for cylinders is -0.31700321010067906
The coefficient for displacement is 0.02621324975798342
The coefficient for horsepower is -0.018263252481449534
The coefficient for weight is -0.00748732605021309
The coefficient for acceleration is 0.050368969474425776
The coefficient for model_year is 0.8470062938903167
The coefficient for origin is 1.5174528285653937


In [39]:
#for ridge regression evaluation
y_pred_ridge = ridge_regression_model.predict(X_test)
r2_ridge = r2_score(y_test, y_pred_ridge)
print(f"The R square of Ridge regression {r2_ridge}")

The R square of Ridge regression 0.8348084889168355


In [40]:
#we dont see much variation in coeff of ridge regression as compared to linear regression

In [41]:
from sklearn.linear_model import Lasso

In [42]:
lasso_regression_model = Lasso(alpha = 0.5)
lasso_regression_model

In [43]:
lasso_regression_model.fit(X_train, y_train)
for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient for {col_name} is {lasso_regression_model.coef_[i]}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.006208198888300358
The coefficient for horsepower is -0.011058382987169565
The coefficient for weight is -0.0069826731680230885
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.744654952003819
The coefficient for origin is 0.0


In [44]:
#three features coefficient is 0, lasso helps in feature selection

In [45]:
y_pred_lasso = lasso_regression_model.predict(X_test)
r2_lasso = r2_score(y_test, y_pred_lasso)
print(f"The R square of Lasso regression {r2_lasso}")

The R square of Lasso regression 0.8277934716635555


In [46]:
#Elastic net

from sklearn.linear_model import ElasticNet
elastic_net_model = ElasticNet(alpha = 1, l1_ratio=0.5)
elastic_net_model.fit(X_train, y_train)

In [47]:
for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient for {col_name} is {elastic_net_model.coef_[i]}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.005888869953667563
The coefficient for horsepower is -0.012403874933570126
The coefficient for weight is -0.006934550516257631
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.7133150744603874
The coefficient for origin is 0.0


In [48]:
y_pred_elastic = elastic_net_model.predict(X_test)
r2_elastic = r2_score(y_test, y_pred_elastic)
print(f"The R square of Elastic regression {r2_elastic}")

The R square of Elastic regression 0.8284840073256804


In [49]:
#regularisation with crossvalidation
from sklearn.linear_model import LassoCV
lasso_cv = LassoCV(cv = 5, verbose=2)

In [50]:
lasso_cv

In [51]:
lasso_cv.fit(X_train, y_train)

Path: 000 out of 100
Path: 001 out of 100
Path: 002 out of 100
Path: 003 out of 100
Path: 004 out of 100
Path: 005 out of 100
Path: 006 out of 100
Path: 007 out of 100
Path: 008 out of 100
Path: 009 out of 100
Path: 010 out of 100
Path: 011 out of 100
Path: 012 out of 100
Path: 013 out of 100
Path: 014 out of 100
Path: 015 out of 100
Path: 016 out of 100
Path: 017 out of 100
Path: 018 out of 100
Path: 019 out of 100
Path: 020 out of 100
Path: 021 out of 100
Path: 022 out of 100
Path: 023 out of 100
Path: 024 out of 100
Path: 025 out of 100
Path: 026 out of 100
Path: 027 out of 100
Path: 028 out of 100
Path: 029 out of 100
Path: 030 out of 100
Path: 031 out of 100
Path: 032 out of 100
Path: 033 out of 100
Path: 034 out of 100
Path: 035 out of 100
Path: 036 out of 100
Path: 037 out of 100
Path: 038 out of 100
Path: 039 out of 100
Path: 040 out of 100
Path: 041 out of 100
Path: 042 out of 100
Path: 043 out of 100
Path: 044 out of 100
Path: 045 out of 100
Path: 046 out of 100
Path: 047 out

In [52]:
y_pred = lasso_cv.predict(X_test)
score = r2_score(y_test, y_pred)
print(f"The R square of Lasso CV {score}")

The R square of Lasso CV 0.8082805983844751


In [53]:
from sklearn.linear_model import RidgeCV
ridgecv = RidgeCV(cv = 5)
ridgecv.fit(X_train, y_train)

In [54]:
y_pred = ridgecv.predict(X_test)
r2_score(y_test, y_pred)

0.8354145247502054

In [55]:
ridgecv.get_params()

{'alpha_per_target': False,
 'alphas': (0.1, 1.0, 10.0),
 'cv': 5,
 'fit_intercept': True,
 'gcv_mode': None,
 'scoring': None,
 'store_cv_results': None,
 'store_cv_values': 'deprecated'}

In [56]:
from sklearn.linear_model import ElasticNetCV

In [57]:
elastic_cv = ElasticNetCV(cv = 5)
elastic_cv.fit(X_train, y_train)

In [58]:
y_pred = elastic_cv.predict(X_test)

In [59]:
r2_score(y_test, y_pred)

0.792863401804916