LIBRARIES 
    - PANDAS, NUMPY, SEABORN AND MATPLOTLIB
    - SKLEARN FOR LINEAR REGRESSION
    - MODEL SELECTION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the seaborn theme to all plots (color_codes is enabled explicitly).
sns.set_theme(color_codes=True)

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns',None)
# Render floats with two decimal places in DataFrame output.
pd.set_option('display.float_format', lambda x: '%.2f' % x)


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score

READ THE DATA FROM CSV FILE

In [2]:
# Load the raw Toyota Corolla listings from the CSV file in the working directory.
# NOTE(review): 'unicode_escape' is an unusual choice of text encoding; presumably the
# file contains non-UTF-8 bytes -- confirm, and consider 'latin-1' if that is the case.
df = pd.read_csv('ToyotaCorolla.csv', encoding='unicode_escape')

In [6]:
# Preview the first five rows of the raw DataFrame to check that the load worked.
df.head()

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,Color,Automatic,cc,Doors,Cylinders,Gears,Quarterly_Tax,Weight,Mfr_Guarantee,BOVAG_Guarantee,Guarantee_Period,ABS,Airbag_1,Airbag_2,Airco,Automatic_airco,Boardcomputer,CD_Player,Central_Lock,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,Blue,0,2000,3,4,5,210,1165,0,1,3,1,1,1,0,0,1,0,1,1,1,0,0,0,1,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,Silver,0,2000,3,4,5,210,1165,0,1,3,1,1,1,1,0,1,1,1,0,1,0,0,0,1,0,0,0
2,3,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,Blue,0,2000,3,4,5,210,1165,1,1,3,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,Black,0,2000,3,4,5,210,1165,1,1,3,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,Black,0,2000,3,4,5,210,1170,1,1,3,1,1,1,1,0,1,0,1,1,1,0,1,0,1,0,0,0


FILTER THE NECESSARY COLUMNS FROM THE ORIGINAL DATAFRAME INTO A WORKING DATAFRAME

In [7]:
# Python equivalent of the R subset Corolla[c("Price", "Age_08_04", ...)]:
# keep only the target (Price) and the eight predictors used by the model.
model_columns = [
    "Price",
    "Age_08_04",
    "KM",
    "HP",
    "cc",
    "Doors",
    "Gears",
    "Quarterly_Tax",
    "Weight",
]
filtered_df = df.loc[:, model_columns]
filtered_df.head()

Unnamed: 0,Price,Age_08_04,KM,HP,cc,Doors,Gears,Quarterly_Tax,Weight
0,13500,23,46986,90,2000,3,5,210,1165
1,13750,23,72937,90,2000,3,5,210,1165
2,13950,24,41711,90,2000,3,5,210,1165
3,14950,26,48000,90,2000,3,5,210,1165
4,13750,30,38500,90,2000,3,5,210,1170


ASSIGNING DEPENDENT AND INDEPENDENT VARIABLES

In [8]:
# Independent variables: every filtered column except the target.
X = filtered_df.drop('Price', axis=1)

# Dependent variable: Price, kept as a one-column DataFrame.
# Taken from filtered_df (not the raw df) so X and y always come from the
# same working frame and stay row-aligned if filtered_df is ever subset.
y = filtered_df[['Price']]

In [None]:
# Inspect the predictor matrix (all filtered columns except Price).
X

In [None]:
# Inspect the target: a single-column DataFrame holding Price.
y

MODEL CREATION

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [12]:
# (rows, columns) of the training predictors.
X_train.shape

(1148, 8)

In [13]:
# (rows, columns) of the held-out test predictors.
X_test.shape

(288, 8)

In [14]:
# (rows, columns) of the training target; one column because y is a one-column DataFrame.
y_train.shape

(1148, 1)

In [15]:
# (rows, columns) of the held-out test target.
y_test.shape

(288, 1)

FIT THE REGRESSION MODEL

In [16]:
# Fit an ordinary least-squares linear regression on the training split only;
# the test split is kept aside for evaluation.
reg_model = LinearRegression().fit(X_train, y_train)

In [None]:
# Intercept (b, the bias term) of the fitted model.
# It is a length-1 array because y was passed as a one-column DataFrame.
b = reg_model.intercept_ 
b  # recorded output: array([-4736.57306226])

array([-4736.57306226])

In [None]:
# Coefficients (w, the weights) of the fitted model, one per predictor, in the
# column order of X: Age_08_04, KM, HP, cc, Doors, Gears, Quarterly_Tax, Weight.
w= reg_model.coef_
w
# Recorded output:
## array([[-1.21492290e+02, -1.81611668e-02,  4.28116598e+01,
##        -2.31248176e+00, -3.18132240e+01,  4.67833377e+02,
##         8.94751241e+00,  1.84244866e+01]])

array([[-1.21492290e+02, -1.81611668e-02,  4.28116598e+01,
        -2.31248176e+00, -3.18132240e+01,  4.67833377e+02,
         8.94751241e+00,  1.84244866e+01]])

PREDICTION FOR MULTIPLE LINEAR REGRESSION

In [38]:
# Predict the price of one hypothetical car.
# The values below are the ones actually fed to the model (the earlier header
# comments listed a different scenario that did not match the code or output).
new_car = {
    "Age_08_04": 20,
    "KM": 21000,
    "HP": 90,
    "cc": 2000,
    "Doors": 3,
    "Gears": 5,
    "Quarterly_Tax": 210,
    "Weight": 1175,
}

# Build a one-row frame carrying the same column names, in the same order, as
# the training data. This avoids sklearn's "X does not have valid feature
# names" warning and removes the dependence on positional ordering.
yindep_value = pd.DataFrame([new_car], columns=X_train.columns)

reg_model.predict(yindep_value)  # recorded output: array([[17451.75901451]])



array([[17451.75901451]])

In [39]:
# Train RMSE: root of the mean squared error between the training targets
# and the model's predictions on the training predictors.
y_pred = reg_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred)
np.sqrt(train_mse)  # recorded output: 1330.2387045843745

1330.2387045843745

In [40]:
# Train R-squared: score() of the fitted regressor evaluated on the training split.
reg_model.score(X_train, y_train)  # recorded output: 0.8629195100847789

0.8629195100847789

In [41]:
# Test RMSE: same metric as the train RMSE, computed on the held-out split.
# Note that y_pred is overwritten here with the test-set predictions.
y_pred = reg_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)  # recorded output: 2380.0336297881863

2380.0336297881863

In [42]:
# Test R-squared: score() of the fitted regressor evaluated on the held-out split.
reg_model.score(X_test,y_test)  # recorded output: 0.5978784673802031

0.5978784673802031

In [43]:
## Cross-validation with 10 folds on the full data set: per-fold MSE.
## The scorer returns negated MSE, so the leading minus makes the values positive.
## NOTE(review): the first fold's error is far larger than the others, which suggests
## the rows are ordered and the folds are contiguous -- confirm, and consider passing
## a shuffled KFold as cv.

-cross_val_score(reg_model, X, y, cv=10, scoring="neg_mean_squared_error") 

array([9389367.20321333, 3674764.50702764, 2446718.75766696,
       1519714.72939808, 1868819.18947632, 1013253.73249337,
       1504571.18169813, 1897143.48667803, 1159314.23094662,
       1491591.82031488])

In [44]:
## Cross-validation with 10 folds: per-fold RMSE (square root of the per-fold MSE).

np.sqrt(-cross_val_score(reg_model, X, y, cv=10, scoring="neg_mean_squared_error"))  # minus sign turns the negated MSE positive before the root

array([3064.20743476, 1916.96752894, 1564.19907866, 1232.76710266,
       1367.04761785, 1006.60505288, 1226.6096289 , 1377.36831918,
       1076.71455407, 1221.30742253])

In [45]:
## Cross-validation with 5 folds: mean of the per-fold RMSE values.
## This is a separate 5-fold run, not the mean of the 10-fold scores computed earlier.

np.mean(np.sqrt(-cross_val_score(reg_model, X, y, cv=5, scoring="neg_mean_squared_error")))  # minus sign turns the negated MSE positive before the root
# recorded output: 1723.1941023261602

1723.1941023261602

TEST RMSE (20% HOLD-OUT SET): 2380.03
TRAINING RMSE (80% TRAINING SET): 1330.24
MEAN RMSE FROM 10-FOLD CROSS VALIDATION: 1505.38 (MEAN OF THE TEN PER-FOLD RMSE VALUES SHOWN ABOVE)
MEAN RMSE FROM 5-FOLD CROSS VALIDATION: 1723.19

SINCE OUR DATA SET IS ON THE SMALLER SIDE, TRUSTING THE 10-FOLD CROSS-VALIDATION ESTIMATE LOOKS LIKE THE SMARTER BET.