# Multiple Linear Regression- Car dataset

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the dataset

In [2]:
dataset=pd.read_csv("CAR.csv")
dataset.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [3]:
dataset.isnull().sum()

year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

### Split the dataset into independent and Dependent variables

In [10]:
X=dataset.drop('selling_price', axis=1)
y=dataset[['selling_price']]

In [11]:
X.head()

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner
0,2007,70000,Petrol,Individual,Manual,First Owner
1,2007,50000,Petrol,Individual,Manual,First Owner
2,2012,100000,Diesel,Individual,Manual,First Owner
3,2017,46000,Petrol,Individual,Manual,First Owner
4,2014,141000,Diesel,Individual,Manual,Second Owner


In [12]:
X.columns

Index(['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner'], dtype='object')

### Work with the catagorical data

In [13]:
X = pd.get_dummies(dataset[['year',
                            'km_driven',
                            'fuel',
                            'seller_type',
                            'transmission',
                            'owner']],
                  drop_first=True)

In [14]:
X.head()


Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0


## Splitting the dataset into the Training set and Test set
- Random State 20

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,
                                                 test_size=.2,
                                                 random_state=20)

In [17]:
X_test, X_val, y_test, y_val=train_test_split(X_test,
                                             y_test,
                                             test_size=.5,
                                             random_state=20)

## Training the Multiple Linear Regression model on the Training set

In [19]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train.values, y_train)

LinearRegression()

## Intercept and Coefficient

In [20]:
print("Coefficients", regressor.coef_)
print("Intercept", regressor.intercept_)

Coefficients [[ 3.62245575e+04 -8.31584195e-01  2.88702246e+05  1.60071068e-10
   4.65464254e+04  2.64057958e+03 -6.04980455e+04  1.71882689e+05
  -8.64323880e+05 -3.50851884e+03 -4.04890692e+04  1.83178786e+05
  -2.83903020e+04]]
Intercept [-71683645.58006921]


## Predicting the Test set results

In [21]:
y_pred=regressor.predict(X_test.values)

### Calculate RMSE, R-Square

In [23]:
from sklearn.metrics import mean_squared_error, r2_score
import math 
print(f'r-square: {r2_score(y_test, y_pred):.2f}')
print(f'MSE: {mean_squared_error(y_test,y_pred):.2f}')
print(f'RMSE: {math.sqrt(mean_squared_error(y_test,y_pred)):.2f}')

r-square: 0.54
MSE: 154171100550.96
RMSE: 392646.28


## Validation case scenario:
#### 1. Predict how much will be the car selling price for a car of 
- year 2014 
- 70000 km driven 
- fuel type Diesel
- Seller type Dealer
- manual transmission
- first owner

** 465000 ** actual


In [24]:
X.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0


In [25]:
X

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4332,2014,80000,1,0,0,0,1,0,1,0,1,0,0
4333,2014,80000,1,0,0,0,1,0,1,0,1,0,0
4334,2009,83000,0,0,0,1,1,0,1,0,1,0,0
4335,2016,90000,1,0,0,0,1,0,1,0,0,0,0


In [26]:
dataset

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner
...,...,...,...,...,...,...,...
4332,2014,409999,80000,Diesel,Individual,Manual,Second Owner
4333,2014,409999,80000,Diesel,Individual,Manual,Second Owner
4334,2009,110000,83000,Petrol,Individual,Manual,Second Owner
4335,2016,865000,90000,Diesel,Individual,Manual,First Owner


In [27]:
regressor.predict([[2014,70000,1,0,0,0,0,1,1,0,0,0,0]])

array([[810663.29247583]])