# Multiple Linear Regression

## Import Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Import Dataset 

In [3]:
dataset = pd.read_csv("car_price_prediction.csv")

In [4]:
dataset.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

## Update the Missing Values

In [6]:
dataset.isnull().sum()

ID                  0
Price               0
Levy                0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64

## Split the dataset into independent and Dependent variables

In [7]:
dataset.columns

Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod. year',
       'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',
       'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',
       'Airbags'],
      dtype='object')

In [8]:
X = dataset[['ID', 'Levy', 'Manufacturer', 'Model', 'Prod. year',
       'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',
       'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',
       'Airbags']]
y = dataset[['Price']]

## Handling Categorical Variables

In [9]:
X = pd.get_dummies(dataset[['ID', 'Levy', 'Manufacturer', 'Model', 'Prod. year',
       'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',
       'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',
       'Airbags']],drop_first = True)

In [10]:
X.head()

Unnamed: 0,ID,Prod. year,Cylinders,Airbags,Levy_1011,Levy_1016,Levy_1017,Levy_1018,Levy_1024,Levy_1028,...,Color_Green,Color_Grey,Color_Orange,Color_Pink,Color_Purple,Color_Red,Color_Silver,Color_Sky blue,Color_White,Color_Yellow
0,45654403,2010,6.0,12,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,44731507,2011,6.0,8,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,45774419,2006,4.0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,45769185,2011,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,45809263,2014,4.0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


## Splitting the dataset into the Training set, Test set and validation set

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_rest,y_train,y_rest = train_test_split(X,y,
                                                 test_size = 0.2, 
                                                 random_state = 42)

In [12]:
# test_size is helping to get the percentage of validate data
X_test,X_val,y_test,y_val = train_test_split(X_rest,y_rest,
                                                 test_size = 0.5, 
                                                 random_state = 42)

## Training the Multiple Linear Regression model on the Training set

In [13]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

## Intercept and Coefficient

In [14]:
print("Coefficients", regressor.coef_)
print("Intercept", regressor.intercept_)

Coefficients [[ 2.80617568e-04  8.63994378e+02  4.46331114e+03 ... -1.97331893e+04
  -3.57875316e+04 -4.98078311e+03]]
Intercept [-1743446.25361881]


In [15]:
X_train.columns

Index(['ID', 'Prod. year', 'Cylinders', 'Airbags', 'Levy_1011', 'Levy_1016',
       'Levy_1017', 'Levy_1018', 'Levy_1024', 'Levy_1028',
       ...
       'Color_Green', 'Color_Grey', 'Color_Orange', 'Color_Pink',
       'Color_Purple', 'Color_Red', 'Color_Silver', 'Color_Sky blue',
       'Color_White', 'Color_Yellow'],
      dtype='object', length=10047)

## Validation set

In [16]:
y_predval = regressor.predict(X_val.values)

In [17]:
y_validate = y_val.to_numpy() # converting dataframe to numpy array

In [18]:
np.hstack((y_predval, y_validate)).round()

array([[ 1.0830e+03,  1.4110e+03],
       [ 9.2863e+05,  2.7755e+04],
       [ 7.0700e+02,  7.0600e+02],
       ...,
       [ 2.4328e+04,  6.5860e+03],
       [ 9.4100e+02,  9.4100e+02],
       [-8.7770e+03,  3.3000e+01]])

## Predicting the Test set results

In [19]:
y_pred = regressor.predict(X_test.values)

## RMSE and R Square

In [20]:
from sklearn.metrics import mean_squared_error, r2_score
import math
print(f"r-square: {r2_score(y_test, y_pred): 0.2f}")
print(f"MSE: {mean_squared_error(y_test, y_pred): 0.2f}")
print(f"RMSE: {math.sqrt(mean_squared_error(y_test, y_pred)): 0.2f}")

r-square: -31.61
MSE:  10819465605.45
RMSE:  104016.66
