In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/autompg-dataset/auto-mpg.csv


#### Using solution_1 as the base reference and building on it

In [10]:
# Load the Auto MPG data; the literal '?' marker is parsed as NaN.
AUTO_MPG_CSV = "/kaggle/input/autompg-dataset/auto-mpg.csv"
raw_data = pd.read_csv(AUTO_MPG_CSV, na_values=['?'])

# Preview the first 20 records.
raw_data.head(20)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198.0,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220.0,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215.0,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225.0,4425,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190.0,3850,8.5,70,1,amc ambassador dpl


In [11]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
raw_data.shape

(398, 9)

In [3]:
# Summary statistics of the numeric columns, stored in `describe`.
# (Computed once; the earlier duplicate call and bare expression were discarded no-ops.)
describe = raw_data.describe()

# Spot-check how the first record's 'horsepower' value was parsed.
# NOTE(review): the recorded output below shows a string, which looks like it came
# from a run before the CSV was read with na_values='?' - re-run to refresh it.
first_row = raw_data.iloc[0]
first_row['horsepower']

'130'

In [12]:
# Flag, per column, whether it contains any missing value.
# In the recorded output below only 'horsepower' is flagged as having missing values.
raw_data.isnull().any()

mpg             False
cylinders       False
displacement    False
horsepower       True
weight          False
acceleration    False
model year      False
origin          False
car name        False
dtype: bool

## Data Exploration and Engineering

### 1. Missing values

In [14]:
# Count the missing values in each column.
# (The preceding per-column any() call was dropped: its result was discarded.)
raw_data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [13]:
# The dataset description says the "horsepower" attribute has missing values, so inspect it.
# They are encoded as '?' in the CSV (6 records, per the null count above) and are treated as missing.
# Handled by reading the csv with the 'na_values' argument, so they show up here as nan.

raw_data.horsepower.unique()

array([130., 165., 150., 140., 198., 220., 215., 225., 190., 170., 160.,
        95.,  97.,  85.,  88.,  46.,  87.,  90., 113., 200., 210., 193.,
        nan, 100., 105., 175., 153., 180., 110.,  72.,  86.,  70.,  76.,
        65.,  69.,  60.,  80.,  54., 208., 155., 112.,  92., 145., 137.,
       158., 167.,  94., 107., 230.,  49.,  75.,  91., 122.,  67.,  83.,
        78.,  52.,  61.,  93., 148., 129.,  96.,  71.,  98., 115.,  53.,
        81.,  79., 120., 152., 102., 108.,  68.,  58., 149.,  89.,  63.,
        48.,  66., 139., 103., 125., 133., 138., 135., 142.,  77.,  62.,
       132.,  84.,  64.,  74., 116.,  82.])

In [17]:
# Fill the missing 'horsepower' entries (a continuous attribute) with the column mean.
# NOTE(review): the mean is taken over all rows, before the train/test split further
# down, so test rows contribute to the imputed value - confirm this is acceptable.
hp_mean = raw_data['horsepower'].mean()
raw_data['horsepower'] = raw_data['horsepower'].fillna(hp_mean)

# Re-count nulls per column to confirm nothing is left missing.
raw_data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

### Continuous and Categorical values

In [18]:
# Exploring attribute types and their significance.
# Only the final expression of the cell is displayed; the other calls serve as
# per-attribute notes and their results are discarded.
raw_data.displacement.unique() # Continuous
raw_data.horsepower.unique() # Continuous
raw_data.weight.unique() # Continuous, but can be put into buckets and treated as categorical
raw_data.acceleration.unique() # Continuous, but can be put into buckets and treated as categorical

raw_data.cylinders.unique() # Multi-valued discrete - treated as categorical
raw_data['model year'].unique() # Categorical
raw_data.origin.unique() # Categorical

raw_data['car name'].unique() # Unique, can be excluded from training and prediction. The first word of the name (manufacturer) could be used instead.
raw_data.mpg.unique() # Target attribute - continuous - first guess is a regression algorithm

array([18. , 15. , 16. , 17. , 14. , 24. , 22. , 21. , 27. , 26. , 25. ,
       10. , 11. ,  9. , 28. , 19. , 12. , 13. , 23. , 30. , 31. , 35. ,
       20. , 29. , 32. , 33. , 17.5, 15.5, 14.5, 22.5, 24.5, 18.5, 29.5,
       26.5, 16.5, 31.5, 36. , 25.5, 33.5, 20.5, 30.5, 21.5, 43.1, 36.1,
       32.8, 39.4, 19.9, 19.4, 20.2, 19.2, 25.1, 20.6, 20.8, 18.6, 18.1,
       17.7, 27.5, 27.2, 30.9, 21.1, 23.2, 23.8, 23.9, 20.3, 21.6, 16.2,
       19.8, 22.3, 17.6, 18.2, 16.9, 31.9, 34.1, 35.7, 27.4, 25.4, 34.2,
       34.5, 31.8, 37.3, 28.4, 28.8, 26.8, 41.5, 38.1, 32.1, 37.2, 26.4,
       24.3, 19.1, 34.3, 29.8, 31.3, 37. , 32.2, 46.6, 27.9, 40.8, 44.3,
       43.4, 36.4, 44.6, 40.9, 33.8, 32.7, 23.7, 23.6, 32.4, 26.6, 25.8,
       23.5, 39.1, 39. , 35.1, 32.3, 37.7, 34.7, 34.4, 29.9, 33.7, 32.9,
       31.6, 28.1, 30.7, 24.2, 22.4, 34. , 38. , 44. ])

In [19]:
# Distinct codes present in the 'origin' attribute.
raw_data.origin.unique()

array([1, 3, 2])

In [20]:
# Distinct values present in the 'model year' attribute, in order of appearance.
pd.unique(raw_data['model year'])

array([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82])

In [21]:
# Distinct cylinder counts present in the data.
raw_data.cylinders.unique()

array([8, 4, 6, 3, 5])

In [22]:
# Handling categorical values: 'cylinders' and 'origin' ('model year' is left as-is).

# Both attributes already hold numeric codes, so label encoding is not required;
# one-hot encode them with get_dummies instead.
# dtype=int pins the indicator columns to 0/1; without it pandas >= 2.0 emits booleans.
# NOTE: this rebinds raw_data, so re-running the cell without re-loading the data
# fails because the source columns no longer exist.
raw_data = pd.get_dummies(
    raw_data,
    columns=['cylinders', 'origin'],
    prefix=['cylinders', 'origin'],
    dtype=int,
)

In [23]:
# Preview the first 20 records to check the new indicator columns.
raw_data.head(20)

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model year,car name,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,origin_1,origin_2,origin_3
0,18.0,307.0,130.0,3504,12.0,70,chevrolet chevelle malibu,0,0,0,0,1,1,0,0
1,15.0,350.0,165.0,3693,11.5,70,buick skylark 320,0,0,0,0,1,1,0,0
2,18.0,318.0,150.0,3436,11.0,70,plymouth satellite,0,0,0,0,1,1,0,0
3,16.0,304.0,150.0,3433,12.0,70,amc rebel sst,0,0,0,0,1,1,0,0
4,17.0,302.0,140.0,3449,10.5,70,ford torino,0,0,0,0,1,1,0,0
5,15.0,429.0,198.0,4341,10.0,70,ford galaxie 500,0,0,0,0,1,1,0,0
6,14.0,454.0,220.0,4354,9.0,70,chevrolet impala,0,0,0,0,1,1,0,0
7,14.0,440.0,215.0,4312,8.5,70,plymouth fury iii,0,0,0,0,1,1,0,0
8,14.0,455.0,225.0,4425,10.0,70,pontiac catalina,0,0,0,0,1,1,0,0
9,15.0,390.0,190.0,3850,8.5,70,amc ambassador dpl,0,0,0,0,1,1,0,0


In [24]:
# Drop the free-text 'car name' attribute so it is not part of the modelling frame.
# errors='ignore' keeps the cell re-runnable once the column is already gone
# (a plain `del` raises KeyError on the second run).
raw_data = raw_data.drop(columns=['car name'], errors='ignore')
raw_data.head(20)

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model year,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,origin_1,origin_2,origin_3
0,18.0,307.0,130.0,3504,12.0,70,0,0,0,0,1,1,0,0
1,15.0,350.0,165.0,3693,11.5,70,0,0,0,0,1,1,0,0
2,18.0,318.0,150.0,3436,11.0,70,0,0,0,0,1,1,0,0
3,16.0,304.0,150.0,3433,12.0,70,0,0,0,0,1,1,0,0
4,17.0,302.0,140.0,3449,10.5,70,0,0,0,0,1,1,0,0
5,15.0,429.0,198.0,4341,10.0,70,0,0,0,0,1,1,0,0
6,14.0,454.0,220.0,4354,9.0,70,0,0,0,0,1,1,0,0
7,14.0,440.0,215.0,4312,8.5,70,0,0,0,0,1,1,0,0
8,14.0,455.0,225.0,4425,10.0,70,0,0,0,0,1,1,0,0
9,15.0,390.0,190.0,3850,8.5,70,0,0,0,0,1,1,0,0


## Modeling

### Dependent/Independent variables

In [25]:
# Divide dependent (target) and independent (feature) variables.
# 'mpg' is selected by name so the split does not rely on it being the first column.
y = raw_data['mpg']
X = raw_data.drop(columns=['mpg'])

In [29]:
# Divide into train and test sets, holding out 30% of the rows for testing.
# A fixed random_state makes the split (and therefore the reported MSE) reproducible
# across kernel restarts; without it every run shuffles differently.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Standardization/Normalization

In [30]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Model

In [31]:
# Fit a linear regression on the scaled training features.
lr_model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and report the mean squared error.
y_pred_lr = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
print("MSE_LinearRegression: ", lr_mse)

MSE_LinearRegression:  12.449898488003043
