In [78]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/autompg-dataset/auto-mpg.csv


In [42]:
raw_data = pd.read_csv('/kaggle/input/autompg-dataset/auto-mpg.csv')
raw_data.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl


In [43]:
raw_data.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


In [44]:
raw_data.shape

(398, 9)

In [45]:
raw_data.isnull().any()
# No missing values in columns

mpg             False
cylinders       False
displacement    False
horsepower      False
weight          False
acceleration    False
model year      False
origin          False
car name        False
dtype: bool

# Version 1

In [46]:
# Description in data says attribute "Horsepower" has missing values, hence, inspecting
raw_data.horsepower.unique()
raw_data.loc[raw_data['horsepower']=='?']

# 7 records found with '?' value - treating as missing value

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,?,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,?,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,?,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,?,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,?,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,?,3035,20.5,82,1,amc concord dl


In [47]:
# Exploring attribute types and their significance
raw_data.displacement.unique() # Continuous
raw_data.horsepower.unique() # Continuous
raw_data.weight.unique() # Continous but can be put into buckets and treated as categorical
raw_data.acceleration.unique() # Continous but can be put into buckets and treated as categorical

raw_data.cylinders.unique() # Multi-valued discrete - Treating as Categorical
raw_data['model year'].unique() # Categorical
raw_data.origin.unique() # Categorical

raw_data['car name'].unique() # Unique, can be excluded from training and prediction. First word of the name (Manufacturer) can be used.
raw_data.mpg.unique() # Target attribute - Continuous - First guess will be regression algorithms

array([18. , 15. , 16. , 17. , 14. , 24. , 22. , 21. , 27. , 26. , 25. ,
       10. , 11. ,  9. , 28. , 19. , 12. , 13. , 23. , 30. , 31. , 35. ,
       20. , 29. , 32. , 33. , 17.5, 15.5, 14.5, 22.5, 24.5, 18.5, 29.5,
       26.5, 16.5, 31.5, 36. , 25.5, 33.5, 20.5, 30.5, 21.5, 43.1, 36.1,
       32.8, 39.4, 19.9, 19.4, 20.2, 19.2, 25.1, 20.6, 20.8, 18.6, 18.1,
       17.7, 27.5, 27.2, 30.9, 21.1, 23.2, 23.8, 23.9, 20.3, 21.6, 16.2,
       19.8, 22.3, 17.6, 18.2, 16.9, 31.9, 34.1, 35.7, 27.4, 25.4, 34.2,
       34.5, 31.8, 37.3, 28.4, 28.8, 26.8, 41.5, 38.1, 32.1, 37.2, 26.4,
       24.3, 19.1, 34.3, 29.8, 31.3, 37. , 32.2, 46.6, 27.9, 40.8, 44.3,
       43.4, 36.4, 44.6, 40.9, 33.8, 32.7, 23.7, 23.6, 32.4, 26.6, 25.8,
       23.5, 39.1, 39. , 35.1, 32.3, 37.7, 34.7, 34.4, 29.9, 33.7, 32.9,
       31.6, 28.1, 30.7, 24.2, 22.4, 34. , 38. , 44. ])

#### Treating missing values

In [48]:
# Attribute 'Horsepower' - Continuous - So, using mean values to replace '?'
# Here missing value is not NaN, so can't use traditional imputation methods
# 7 records found with '?' value in feature 'horsepower'

sum_horsepower = 0
for value in raw_data.horsepower:
    if value != '?':
        sum_horsepower += int(value)
mean_horsepower = int(sum_horsepower / (raw_data.shape[0] - 7))

In [49]:
# Replacing '?' with mean_horsepower value
raw_data.loc[raw_data['horsepower'] == '?', 'horsepower'] = str(mean_horsepower)

In [50]:
raw_data.horsepower.unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '104', '100', '105', '175', '153', '180',
       '110', '72', '86', '70', '76', '65', '69', '60', '80', '54', '208',
       '155', '112', '92', '145', '137', '158', '167', '94', '107', '230',
       '49', '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [51]:
# Ideally just could've dropped those 7 rows, but as the total observations are less, hence replaced with mean

#### Handling Categorical attributes

In [52]:
# Categorical attributes - cylinders, model year, origin
raw_data.cylinders.unique()
raw_data['model year'].unique()
raw_data.origin.unique()

# All the attributes have numerical values, so Label encoding is not required here
# Using One Hot encoding/get_dummies
raw_data = pd.get_dummies(raw_data, columns=['cylinders', 'model year', 'origin'], prefix=['cylinders', 'model', 'origin'])

In [53]:
raw_data.head(20)

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,car name,cylinders_3,cylinders_4,cylinders_5,cylinders_6,...,model_76,model_77,model_78,model_79,model_80,model_81,model_82,origin_1,origin_2,origin_3
0,18.0,307.0,130,3504,12.0,chevrolet chevelle malibu,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,15.0,350.0,165,3693,11.5,buick skylark 320,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,18.0,318.0,150,3436,11.0,plymouth satellite,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,16.0,304.0,150,3433,12.0,amc rebel sst,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,17.0,302.0,140,3449,10.5,ford torino,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,15.0,429.0,198,4341,10.0,ford galaxie 500,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,14.0,454.0,220,4354,9.0,chevrolet impala,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,14.0,440.0,215,4312,8.5,plymouth fury iii,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,14.0,455.0,225,4425,10.0,pontiac catalina,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9,15.0,390.0,190,3850,8.5,amc ambassador dpl,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [54]:
raw_data.shape

(398, 27)

In [55]:
# Dropping attribute 'car name'
del raw_data['car name']
raw_data.head(20)

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,...,model_76,model_77,model_78,model_79,model_80,model_81,model_82,origin_1,origin_2,origin_3
0,18.0,307.0,130,3504,12.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,15.0,350.0,165,3693,11.5,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,18.0,318.0,150,3436,11.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,16.0,304.0,150,3433,12.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,17.0,302.0,140,3449,10.5,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5,15.0,429.0,198,4341,10.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
6,14.0,454.0,220,4354,9.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
7,14.0,440.0,215,4312,8.5,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
8,14.0,455.0,225,4425,10.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
9,15.0,390.0,190,3850,8.5,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [56]:
# TODO - Standardization - Will be done in later revisions

In [57]:
# Divide Dependent and Independent variables
X = raw_data.iloc[:, 1:]
y = raw_data.iloc[:, 0]

In [125]:
# Divide Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

#### Modeling

In [62]:
# Starting with a Linear Regression model
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

y_pred = model_lr.predict(X_test)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Linear - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

# R2 value is ~87% - Good model. 
# TODO - We might want to later analyze the R2 value.

Linear - MSE: 7.692813307560194, R-squared: 0.8677441867705891


In [63]:
# Ridge Regression
model_ridge = Ridge()
model_ridge.fit(X_train, y_train)

y_pred = model_ridge.predict(X_test)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Ridge - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

Ridge - MSE: 8.007540462339524, R-squared: 0.8623333579701797


In [64]:
# Lasso Regression
model_lasso = Lasso()
model_lasso.fit(X_train, y_train)

y_pred = model_lasso.predict(X_test)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Lasso - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

Lasso - MSE: 18.996291107052475, R-squared: 0.6734133758014388


In [65]:
# ElasticNet Regression
model_elastic = ElasticNet()
model_elastic.fit(X_train, y_train)

y_pred = model_elastic.predict(X_test)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Elastic - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

Elastic - MSE: 19.00547834085804, R-squared: 0.6732554277231864


# Version 2

In [123]:
# Using the data from version1 - raw_data, X_train, X_test, y_train, y_test
raw_data.head(20)

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,...,model_76,model_77,model_78,model_79,model_80,model_81,model_82,origin_1,origin_2,origin_3
0,18.0,307.0,130,3504,12.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,15.0,350.0,165,3693,11.5,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,18.0,318.0,150,3436,11.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,16.0,304.0,150,3433,12.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,17.0,302.0,140,3449,10.5,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5,15.0,429.0,198,4341,10.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
6,14.0,454.0,220,4354,9.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
7,14.0,440.0,215,4312,8.5,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
8,14.0,455.0,225,4425,10.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
9,15.0,390.0,190,3850,8.5,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [67]:
# Applying Standardization

# Normalization or Standardization?
# Normalization is good to use when you know that the distribution of your data does not follow a Gaussian distribution. 
# This can be useful in algorithms that do not assume any distribution of the data like K-Nearest Neighbors and Neural Networks.
# Standardization, on the other hand, can be helpful in cases where the data follows a Gaussian distribution.
# Source: https://www.analyticsvidhya.com/blog/2020/04/feature-scaling-machine-learning-normalization-standardization/

In [70]:
# Normalization (Min-Max Scaling)
norm_scaler = MinMaxScaler().fit(X_train)
X_train_norm = norm_scaler.transform(X_train)
X_test_norm = norm_scaler.transform(X_test)

In [72]:
pd.DataFrame(X_train_norm).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.645995,0.565217,0.632979,0.357143,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.731266,0.5,0.69208,0.309524,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.131783,0.277174,0.234043,0.410714,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.496124,0.347826,0.723109,0.654762,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.136951,0.375,0.349291,0.458333,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.470284,0.293478,0.507092,0.446429,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,0.124031,0.23913,0.150709,0.357143,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,0.297158,0.168478,0.566489,0.720238,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.728682,0.538043,0.835402,0.357143,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,0.728682,0.538043,0.701832,0.297619,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [109]:
# Linear Regression model (Normalized data)
model_lr = LinearRegression()
model_lr.fit(X_train_norm, y_train)

y_pred = model_lr.predict(X_test_norm)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Linear (Normalized data) - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

# No difference!

Linear (Normalized data) - MSE: 7.692813307560207, R-squared: 0.8677441867705888


In [74]:
# Ridge Regression (Normalized)
model_ridge = Ridge()
model_ridge.fit(X_train_norm, y_train)

y_pred = model_ridge.predict(X_test_norm)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Ridge (Normalized data) - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

# MSE increased

Ridge (Normalized data) - MSE: 8.443259870407976, R-squared: 0.854842415144712


In [75]:
# Lasso Regression (Normalized)
model_lasso = Lasso()
model_lasso.fit(X_train_norm, y_train)

y_pred = model_lasso.predict(X_test_norm)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Lasso (Normalized data) - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

Lasso (Normalized data) - MSE: 27.29227010331996, R-squared: 0.5307878622448844


In [76]:
# ElasticNet Regression (Normalized)
model_elastic = ElasticNet()
model_elastic.fit(X_train_norm, y_train)

y_pred = model_elastic.predict(X_test_norm)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Elastic (Normalized data) - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

Elastic (Normalized data) - MSE: 32.09540971455245, R-squared: 0.44821168238183506


In [111]:
# Overall after Normalization results degraded (MSE stayed same or increased)

In [132]:
# Standardization
# We need to only use standardization on non-dummy features,
# as we don't want to add distribution to categorical (dummy) features

# All training columns
# X_train.columns

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

cols_to_scale = ['displacement', 'horsepower', 'weight', 'acceleration']
for col in cols_to_scale:
    std_scaler = StandardScaler().fit(X_train[[col]])
    X_train_scaled[col] = std_scaler.transform(X_train[[col]])
    X_test_scaled[col] = std_scaler.transform(X_test[[col]])

Unnamed: 0,displacement,horsepower,weight,acceleration,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,model_70,...,model_76,model_77,model_78,model_79,model_80,model_81,model_82,origin_1,origin_2,origin_3
142,79.0,67,1963,15.5,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
185,98.0,79,2255,17.7,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
362,146.0,120,2930,13.8,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
187,305.0,140,4215,13.0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
360,145.0,76,3160,19.6,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,70.0,100,2420,12.5,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
144,76.0,52,1649,16.5,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
290,351.0,142,4054,14.3,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
239,97.0,67,1985,16.4,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [133]:
# Linear Regression model (Standardized data)
model_lr = LinearRegression()
model_lr.fit(X_train_scaled, y_train)

y_pred = model_lr.predict(X_test_scaled)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Linear (Standardized data) - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

Linear (Standardized data) - MSE: 9.265085388632638, R-squared: 0.8316726957913729


In [134]:
# Ridge Regression (Standardized)
model_ridge = Ridge()
model_ridge.fit(X_train_scaled, y_train)

y_pred = model_ridge.predict(X_test_scaled)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Ridge (Standardized data) - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

Ridge (Standardized data) - MSE: 9.166849082331527, R-squared: 0.8334574448704659


In [135]:
# Lasso Regression (Standardized)
model_lasso = Lasso()
model_lasso.fit(X_train_scaled, y_train)

y_pred = model_lasso.predict(X_test_scaled)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Lasso (Standardized data) - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

Lasso (Standardized data) - MSE: 18.824242935815384, R-squared: 0.6580027129548421


In [136]:
# ElasticNet Regression (Standardized)
model_elastic = ElasticNet()
model_elastic.fit(X_train_scaled, y_train)

y_pred = model_elastic.predict(X_test_scaled)

lr_mse = mean_squared_error(y_test, y_pred)
lr_r2_score = r2_score(y_test, y_pred)
print('Elastic (Standardized data) - MSE: {}, R-squared: {}'.format(lr_mse, lr_r2_score))

Elastic (Standardized data) - MSE: 19.33821784225144, R-squared: 0.6486648594108899


In [138]:
# Results have degraded after Normalization and Standardization
# TODO - Analyze data again