# Multiple Linear Regression – Car Dataset

## Importing the libraries

In [1]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## Importing the dataset

In [2]:
# Load the car listings; the path is relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer="CAR.csv")

In [3]:
# Preview the first five rows of the raw data.
dataset.head(n=5)

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [4]:
# Show each column's dtype and non-null count. When every non-null count equals
# the number of entries, there are no missing values to impute.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year           4337 non-null   int64 
 1   selling_price  4337 non-null   int64 
 2   km_driven      4337 non-null   int64 
 3   fuel           4337 non-null   object
 4   seller_type    4337 non-null   object
 5   transmission   4337 non-null   object
 6   owner          4337 non-null   object
dtypes: int64(3), object(4)
memory usage: 237.3+ KB


### Split the dataset into independent and dependent variables

In [5]:
# List the column names available for the feature/target split.
dataset.columns

Index(['year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')

In [6]:
# Independent variables: every column except the target. The categorical
# columns are still raw strings at this point.
X = dataset.loc[
    :, ["year", "km_driven", "fuel", "seller_type", "transmission", "owner"]
]

# Dependent variable, selected with a list so it stays a one-column DataFrame (2-D).
y = dataset.loc[:, ["selling_price"]]

### Work with the categorical data

In [7]:
# One-hot encode the string-valued predictors (fuel, seller_type, transmission,
# owner); the numeric columns (year, km_driven) pass through unchanged.
# drop_first=True omits the first category of each encoded column, which then
# becomes the all-zeros baseline and avoids perfectly collinear dummy columns.
# dtype=int pins the indicators to 0/1 integers on every pandas version:
# pandas >= 2.0 would otherwise emit booleans, and mixing bool with int64
# columns turns `X.values` into an object array.
X = pd.get_dummies(
    dataset[["year", "km_driven", "fuel", "seller_type", "transmission", "owner"]],
    drop_first=True,
    dtype=int,
)

In [8]:
# Preview the first five rows of the encoded feature matrix.
X.head(n=5)

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0


In [9]:
# Spot-check ten randomly chosen encoded rows. random_state fixes the draw so
# the same rows are shown on every Restart & Run All.
X.sample(n=10, random_state=20)

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
2833,2016,81595,1,0,0,0,0,0,1,0,0,0,0
2013,2020,10000,0,0,0,1,1,0,1,0,0,0,0
960,2020,1500,1,0,0,0,1,0,0,0,0,0,0
224,2017,20000,1,0,0,0,1,0,1,0,0,0,0
628,1995,100000,0,0,0,1,1,0,1,0,1,0,0
3142,2017,40000,0,0,0,1,1,0,0,0,0,0,0
2004,2018,50000,1,0,0,0,1,0,1,0,0,0,0
4335,2016,90000,1,0,0,0,1,0,1,0,0,0,0
2373,2018,31000,0,0,0,1,0,0,1,0,0,0,0
2968,2014,110000,1,0,0,0,1,0,1,0,1,0,0


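### Encoding Guide

Each code below lists the values of that feature's dummy columns, in the order the columns appear in `X`. The category removed by `drop_first=True` is the all-zeros baseline.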
##### fuel_Diesel = 1000
##### fuel_Electric = 0100
##### fuel_LPG = 0010
##### fuel_Petrol = 0001
##### fuel_CNG = 0000 (dropped)
##### seller_type_Individual = 10
##### seller_type_TrustmarkDealer = 01
##### seller_type_Dealer = 00 (dropped)
##### transmission_Manual = 1
##### transmission_Automatic = 0 (dropped)
##### owner_Fourth&AboveOwner = 1000
##### owner_SecondOwner = 0100
##### owner_TestDriveCar = 0010
##### owner_ThirdOwner = 0001
##### owner_FirstOwner = 0000 (dropped)

## Splitting the dataset into the Training set and Test set
- Random State 20

In [10]:
# Hold out 20% of the rows as the test set. random_state=20 fixes the shuffle
# so the split, and every result derived from it, is reproducible.
# train_test_split is imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=20
)

## Training the Multiple Linear Regression model on the Training set

In [11]:
# Fit an ordinary least-squares model on the training split.
# LinearRegression is imported in the notebook's import cell.
# The features are passed as a plain NumPy array (.values), matching how the
# later predict() calls pass plain arrays/lists rather than DataFrames.
regressor = LinearRegression()
regressor.fit(X_train.values, y_train)

LinearRegression()

## Intercept and Coefficient

In [12]:
# Report the fitted parameters: one coefficient per feature column, then the intercept.
for label, fitted_value in (
    ("Coefficients: ", regressor.coef_),
    ("Intercept: ", regressor.intercept_),
):
    print(label, fitted_value)

Coefficients:  [[ 3.62245575e+04 -8.31584195e-01  2.88702246e+05  5.82076609e-11
   4.65464254e+04  2.64057958e+03 -6.04980455e+04  1.71882689e+05
  -8.64323880e+05 -3.50851884e+03 -4.04890692e+04  1.83178786e+05
  -2.83903020e+04]]
Intercept:  [-71683645.58006924]


In [13]:
# Column order of the training features. The model was fitted on X_train.values,
# so regressor.coef_ follows this same order.
X_train.columns

Index(['year', 'km_driven', 'fuel_Diesel', 'fuel_Electric', 'fuel_LPG',
       'fuel_Petrol', 'seller_type_Individual', 'seller_type_Trustmark Dealer',
       'transmission_Manual', 'owner_Fourth & Above Owner',
       'owner_Second Owner', 'owner_Test Drive Car', 'owner_Third Owner'],
      dtype='object')

### Equation
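$$
\begin{aligned}
\widehat{\mathrm{selling\_price}} ={} & -71683645.58 \\
& + 3.62\times10^{4}\,(\mathrm{year}) \\
& - 8.32\times10^{-1}\,(\mathrm{km\_driven}) \\
& + 2.89\times10^{5}\,(\mathrm{fuel\_Diesel}) \\
& + 5.82\times10^{-11}\,(\mathrm{fuel\_Electric}) \\
& + 4.65\times10^{4}\,(\mathrm{fuel\_LPG}) \\
& + 2.64\times10^{3}\,(\mathrm{fuel\_Petrol}) \\
& - 6.05\times10^{4}\,(\mathrm{seller\_type\_Individual}) \\
& + 1.72\times10^{5}\,(\mathrm{seller\_type\_Trustmark\ Dealer}) \\
& - 8.64\times10^{5}\,(\mathrm{transmission\_Manual}) \\
& - 3.51\times10^{3}\,(\mathrm{owner\_Fourth\ \&\ Above\ Owner}) \\
& - 4.05\times10^{4}\,(\mathrm{owner\_Second\ Owner}) \\
& + 1.83\times10^{5}\,(\mathrm{owner\_Test\ Drive\ Car}) \\
& - 2.84\times10^{4}\,(\mathrm{owner\_Third\ Owner})
\end{aligned}
$$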

## Predicting the Test set results

In [14]:
# Predict selling prices for the held-out rows, passing a plain NumPy array
# just as the model was fitted on one.
y_pred = regressor.predict(X_test.to_numpy())

### Calculate R-squared, MSE and RMSE

In [15]:
# Evaluate the model on the held-out test set.
# mean_squared_error, r2_score and math are imported in the notebook's import cell.
# The MSE is computed once and reused for the RMSE instead of being recalculated.
mse = mean_squared_error(y_test, y_pred)

print(f"R Square: {r2_score(y_test,y_pred):.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {math.sqrt(mse):.2f}")

R Square: 0.52
MSE: 142267011638.16
RMSE: 377182.99


## Validation case scenario:
#### 1. Predict the selling price of a car with the following attributes:
- year 2014 
- 70000 km driven 
- fuel type Diesel
- Seller type Dealer
- manual transmission
- first owner

**Actual selling price: 465000**


In [16]:
# Preview the first five test rows to confirm the feature column order.
X_test.head(n=5)

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
406,2012,80000,1,0,0,0,1,0,1,0,0,0,1
3027,2013,100000,1,0,0,0,1,0,1,0,0,0,0
2277,2016,30000,0,0,0,1,1,0,1,0,0,0,0
799,2017,7658,0,0,0,1,0,0,0,0,0,0,0
2738,2012,110000,0,0,0,1,1,0,1,0,1,0,0


In [17]:
# Validation scenario: year 2014, 70,000 km driven, Diesel, Dealer seller,
# Manual transmission, First Owner.
# Only the non-zero features are listed. "Dealer" and "First Owner" are the
# dropped baseline categories, so all of their dummy columns stay at 0.
scenario = {
    "year": 2014,
    "km_driven": 70000,
    "fuel_Diesel": 1,
    "transmission_Manual": 1,
}

# Guard against typos: reindex() would silently discard a key that is not a feature.
unknown_features = set(scenario) - set(X_train.columns)
assert not unknown_features, f"unknown feature(s): {sorted(unknown_features)}"

# Align the single row to the training column order (missing dummies filled
# with 0) so no value can land in the wrong position.
scenario_row = pd.DataFrame([scenario]).reindex(
    columns=X_train.columns, fill_value=0
)
regressor.predict(scenario_row.values)

array([[638780.60333151]])

In [18]:
# The model predicts a selling price of $638,780.60.
# The actual selling price is $465,000, so the model over-predicts by
# approximately $173,781 (about 37% of the actual price).