### create a model that can predict the mpg(millege per gallon) of any car that come to garage.

##### Stages of building any model 
##### 1. Load data set
##### 2. Pre-process data
##### 3. Train Model
##### 4. Test model
##### 5. Evaluate model performance

In [31]:
# Load the data set

import pandas as pd
import numpy as np

df = pd.read_csv("autompg.csv")

In [32]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [34]:
#Horsepower column has some null values lets drop all that

df.dropna(inplace =True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model_year    392 non-null    int64  
 7   origin        392 non-null    object 
 8   name          392 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 30.6+ KB


In [35]:
#name column will nor reqiure to calculate mpg, lets drop it
df.drop('name',axis =1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa


In [36]:
#Creating matrix of predictors i.e x values
X = df.iloc[:, 1:8]
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,usa
1,8,350.0,165.0,3693,11.5,70,usa
2,8,318.0,150.0,3436,11.0,70,usa
3,8,304.0,150.0,3433,12.0,70,usa
4,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,usa
394,4,97.0,52.0,2130,24.6,82,europe
395,4,135.0,84.0,2295,11.6,82,usa
396,4,120.0,79.0,2625,18.6,82,usa


In [37]:
#Creating target i.e y
y = df.iloc[:, 0]
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 392, dtype: float64

In [42]:
#pd.get_dummies(data= df, columns=['origin'])
X = pd.get_dummies(X)
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa
0,8,307.0,130.0,3504,12.0,70,0,0,1
1,8,350.0,165.0,3693,11.5,70,0,0,1
2,8,318.0,150.0,3436,11.0,70,0,0,1
3,8,304.0,150.0,3433,12.0,70,0,0,1
4,8,302.0,140.0,3449,10.5,70,0,0,1
...,...,...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,0,0,1
394,4,97.0,52.0,2130,24.6,82,1,0,0
395,4,135.0,84.0,2295,11.6,82,0,0,1
396,4,120.0,79.0,2625,18.6,82,0,0,1


In [46]:
#Split the data
# Scikit library is used for this

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)


In [50]:
X_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa
397,4,119.0,82.0,2720,19.4,82,0,0,1
160,6,231.0,110.0,3907,21.0,75,0,0,1
143,4,97.0,78.0,2300,14.5,74,1,0,0
346,4,97.0,67.0,2065,17.8,81,0,1,0
67,8,429.0,208.0,4633,11.0,72,0,0,1
...,...,...,...,...,...,...,...,...,...
325,4,90.0,48.0,2085,21.7,80,1,0,0
194,6,232.0,90.0,3085,17.6,76,0,0,1
118,4,116.0,75.0,2158,15.5,73,1,0,0
48,6,250.0,88.0,3139,14.5,71,0,0,1


In [52]:
y_train

397    31.0
160    17.0
143    26.0
346    32.3
67     11.0
       ... 
325    44.3
194    22.5
118    24.0
48     18.0
174    18.0
Name: mpg, Length: 274, dtype: float64

In [56]:
#Applying standard scaler on the data
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scale.fit_transform(X_train)
scale.transform(X_test)



array([[-0.88659063, -1.02317636, -0.78464367, ..., -0.44329059,
        -0.50114025,  0.75809804],
       [-0.88659063, -0.53828382, -0.44166898, ..., -0.44329059,
        -0.50114025,  0.75809804],
       [ 1.48341749,  1.49826488,  1.45788314, ..., -0.44329059,
        -0.50114025,  0.75809804],
       ...,
       [-0.88659063, -0.88740645, -0.2569903 , ...,  2.25585658,
        -0.50114025, -1.3190906 ],
       [-0.88659063, -0.43160746, -0.38890364, ..., -0.44329059,
        -0.50114025,  0.75809804],
       [ 0.29841343,  0.60606259, -0.2569903 , ..., -0.44329059,
        -0.50114025,  0.75809804]])

In [62]:
#Importing model for training data set
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

#fitting the model
reg.fit(X_train, y_train)

#Check the coeffient and intercet
m = reg.coef_
c = reg.intercept_
m,c


(array([-0.42567331,  0.01570832, -0.01351716, -0.00634677,  0.05903662,
         0.73695747,  0.68063071,  0.79609034, -1.47672105]),
 -13.399081636532241)

In [72]:
#Predicting mpg in the training dataset
y_pred_on_train = reg.predict(X_train)


In [92]:
# y_pred_on_train

In [73]:
#Predicting mpg in the testing data set
y_pred_on_test = reg.predict(X_test)

In [91]:
# y_pred_on_test

In [74]:
# Prediction Accuracy by r2 square score on train data set
from sklearn.metrics import r2_score
r2_S = r2_score(y_train, y_pred_on_train)
r2_S


0.8215150184732999

In [75]:
#On test data set
r2_s = r2_score(y_test, y_pred_on_test)
r2_s

0.8234552425657374

In [77]:
reg.score(X_test, y_test)

0.8234552425657374

In [79]:
df1 = X_test.copy()
df1['calculated_mpg'] = y_test
df1['predicted_mpg'] = y_pred_on_test
df1

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa,calculated_mpg,predicted_mpg
146,4,90.0,75.0,2125,14.5,74,0,0,1,28.0,25.725462
282,4,140.0,88.0,2890,17.3,79,0,0,1,22.3,25.329966
69,8,350.0,160.0,4456,13.5,72,0,0,1,12.0,10.630700
378,4,105.0,63.0,2125,14.7,82,0,0,1,38.0,32.030760
331,4,97.0,67.0,2145,18.0,80,0,1,0,33.8,32.717807
...,...,...,...,...,...,...,...,...,...,...,...
291,8,267.0,125.0,3605,15.0,79,0,0,1,19.2,20.448369
388,4,156.0,92.0,2585,14.5,82,0,0,1,26.0,29.508565
22,4,104.0,95.0,2375,17.5,70,1,0,0,25.0,23.474975
392,4,151.0,90.0,2950,17.3,82,0,0,1,27.0,27.305789


In [81]:
df1['Square_Error'] = (df1['calculated_mpg'] - df1['predicted_mpg'])**2
df1

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa,calculated_mpg,predicted_mpg,Square_Error
146,4,90.0,75.0,2125,14.5,74,0,0,1,28.0,25.725462,5.173521
282,4,140.0,88.0,2890,17.3,79,0,0,1,22.3,25.329966,9.180693
69,8,350.0,160.0,4456,13.5,72,0,0,1,12.0,10.630700,1.874982
378,4,105.0,63.0,2125,14.7,82,0,0,1,38.0,32.030760,35.631824
331,4,97.0,67.0,2145,18.0,80,0,1,0,33.8,32.717807,1.171142
...,...,...,...,...,...,...,...,...,...,...,...,...
291,8,267.0,125.0,3605,15.0,79,0,0,1,19.2,20.448369,1.558425
388,4,156.0,92.0,2585,14.5,82,0,0,1,26.0,29.508565,12.310029
22,4,104.0,95.0,2375,17.5,70,1,0,0,25.0,23.474975,2.325702
392,4,151.0,90.0,2950,17.3,82,0,0,1,27.0,27.305789,0.093507


In [82]:
RMSE = (np.sum(df1['Square_Error']))**0.5
RMSE

35.79817570801195

In [88]:
# df1['predicted_mpg']= y_pred_on_test
# df1.to_csv('result.csv')