# Problem Statement
Predicting the costs of used cars given the data collected from various sources and distributed across various locations in India.

## Import libraries

In [1]:
#Importing all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.simplefilter('ignore')
from pandas import set_option
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.neighbors import KNeighborsRegressor
import copy as cp

import re
warnings.filterwarnings('ignore')

## Importing the Datasets

In [2]:
#Importing the Datasets
df_train = pd.read_excel("Data_Train.xlsx")
df_test = pd.read_excel("Data_Test.xlsx")

## Performing EDA - Exploratory Data Analysis

In [3]:
#Identifying the number of features in the Datasets
df_train.shape , df_test.shape

((6019, 12), (1234, 11))

In [4]:
#Identifying the features in the Datasets
print(list(df_train.columns))
print(list(df_test.columns))

['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats', 'Price']
['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats']


In [5]:
#Identifying the data types of features provided in train and test set
print("\nTraining Set : \n","\n", df_train.dtypes)
print("\nTest Set : \n","\n",df_test.dtypes)


Training Set : 
 
 Name                  object
Location              object
Year                   int64
Kilometers_Driven      int64
Fuel_Type             object
Transmission          object
Owner_Type            object
Mileage               object
Engine                object
Power                 object
Seats                float64
Price                float64
dtype: object

Test Set : 
 
 Name                  object
Location              object
Year                   int64
Kilometers_Driven      int64
Fuel_Type             object
Transmission          object
Owner_Type            object
Mileage               object
Engine                object
Power                 object
Seats                float64
dtype: object


In [6]:
#Identifying the nummber of empty/null cells or NaNs by features
print(df_train.isnull().sum())
print()
print(df_test.isnull().sum())

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               36
Power                36
Seats                42
Price                 0
dtype: int64

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               0
Engine               10
Power                10
Seats                11
dtype: int64


In [7]:
#Check statistics for train data
df_train.describe(include = 'all')

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
count,6019,6019,6019.0,6019.0,6019,6019,6019,6017,5983,5983,5977.0,6019.0
unique,1876,11,,,5,2,4,442,146,372,,
top,Mahindra XUV500 W8 2WD,Mumbai,,,Diesel,Manual,First,17.0 kmpl,1197 CC,74 bhp,,
freq,49,790,,,3205,4299,4929,172,606,235,,
mean,,,2013.358199,58738.38,,,,,,,5.278735,9.479468
std,,,3.269742,91268.84,,,,,,,0.80884,11.187917
min,,,1998.0,171.0,,,,,,,0.0,0.44
25%,,,2011.0,34000.0,,,,,,,5.0,3.5
50%,,,2014.0,53000.0,,,,,,,5.0,5.64
75%,,,2016.0,73000.0,,,,,,,5.0,9.95


In [8]:
#Check statistics for test data
df_test.describe(include = 'all')

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats
count,1234,1234,1234.0,1234.0,1234,1234,1234,1234,1224,1224,1223.0
unique,768,11,,,4,2,4,301,104,249,
top,Maruti Alto LXi,Mumbai,,,Diesel,Manual,First,17.0 kmpl,1197 CC,74 bhp,
freq,9,159,,,647,905,1023,35,126,45,
mean,,,2013.400324,58507.288493,,,,,,,5.284546
std,,,3.1797,35598.702098,,,,,,,0.825622
min,,,1996.0,1000.0,,,,,,,2.0
25%,,,2011.0,34000.0,,,,,,,5.0
50%,,,2014.0,54572.5,,,,,,,5.0
75%,,,2016.0,75000.0,,,,,,,5.0


## Data Cleaning

In [9]:
#Appending Test and Train Data Frame In to One dataFrame
df = df_train.append(df_test, ignore_index=True, sort=False)

In [10]:
#removing Electric vehicals
df = df[df['Fuel_Type'] != 'Electric']
len(df)

7251

In [11]:
#Adding Are age according to 2020
df['Car_Age'] = 2020 - df['Year']
#Removing Unit
df['Mileage'] = df['Mileage'].apply(lambda x : str(x).split(' ')[0]).astype(float)
df['Engine'] = df['Engine'].apply(lambda x : str(x).split(" ")[0]).astype(float)
df['Power'] = df['Power'].replace('null bhp','0 bhp').apply(lambda x : str(x).split(' ')[0]).astype(float)
#Adding seat as 5 where seat value is null
df['Seats'] = df['Seats'].fillna(5)
#Creating columms of company followed by car model ---> Car_Brand1
df['Car_Brand1'] = df['Name'].apply(lambda x: ' '.join(x.split(' ')[:2]))
#substituting Engine and Power null value  with there mean
df['Engine'] = df.groupby(['Car_Brand1']).transform(lambda x: x.fillna(x.median()))['Engine']
df['Power'] = df.groupby(['Car_Brand1']).transform(lambda x: x.fillna(x.median()))['Power']
#Creating columms of company ---> Car_Brand2
df['Car_Brand2'] = df['Name'].apply(lambda x: x.split(' ')[0])

In [12]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Car_Age,Car_Brand1,Car_Brand2
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75,10,Maruti Wagon,Maruti
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5,5,Hyundai Creta,Hyundai
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5,9,Honda Jazz,Honda
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0,8,Maruti Ertiga,Maruti
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74,7,Audi A4,Audi


In [13]:
#changing catagorical variable to numbers
df_obj = df.select_dtypes(exclude=['int64','float64'])
df_num = df.select_dtypes(include=['int64','float64'])
df_encoded = df_obj.apply(LabelEncoder().fit_transform)
df_2 = df_num.join(df_encoded)

In [14]:
df_obj.shape, df_num.shape, df_encoded.shape, df_2.shape

((7251, 7), (7251, 8), (7251, 7), (7251, 15))

In [15]:
df_2['Mileage'].replace(0.00, np.nan, inplace= True) #As Milage can't be 0.00
df_2['Seats'].replace(0.00, np.nan, inplace= True) #As Seats can't be 0.00

df_2['Mileage'].replace(0.00, np.nan, inplace= True) #As Milage can't be 0.00
df_2['Seats'].replace(0.00, np.nan, inplace= True) #As Seats can't be 0.00

In [16]:
#Dropping name and Year because we have age and car_brand1 and car_brand 2
df_2.drop(columns=['Name','Year'], axis = 1, inplace=True)
df_2['Price'] = df_2['Price'].fillna(0.00)

In [17]:
#Attend to missing values
df_2['Mileage']=df_2['Mileage'].fillna(df_2['Mileage'].median())
df_2['Seats']=df_2['Seats'].fillna(5)

In [18]:
#Covert Seats and Engine feature to int
df_2['Seats']=df_2['Seats'].astype(int)
df_2['Engine']=df_2['Engine'].astype(int)

In [19]:
df_2.head()

Unnamed: 0,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,Car_Age,Location,Fuel_Type,Transmission,Owner_Type,Car_Brand1,Car_Brand2
0,72000,26.6,998,58.16,5,1.75,10,9,0,1,0,129,19
1,41000,19.67,1582,126.2,5,12.5,5,10,1,1,0,68,11
2,46000,18.2,1199,88.7,5,4.5,9,2,3,1,0,63,10
3,87000,20.77,1248,88.76,7,6.0,8,2,1,1,0,116,19
4,40670,15.2,1968,140.8,5,17.74,7,3,1,0,2,2,1


In [20]:
df_2.isnull().sum()

Kilometers_Driven    0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
Car_Age              0
Location             0
Fuel_Type            0
Transmission         0
Owner_Type           0
Car_Brand1           0
Car_Brand2           0
dtype: int64

### OneHotEncoding

In [21]:
# importing one hot encoder from sklearn 
from sklearn.preprocessing import OneHotEncoder 
#One hot encoding catagorical variables
onehotencoder = OneHotEncoder(categorical_features = [7,8,9,10,11,12]) 
df_2 = onehotencoder.fit_transform(df_2).toarray()
df_2 = pd.DataFrame(df_2)

In [22]:
df_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,272,273,274,275,276,277,278,279,280,281
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,72000.0,26.6,998.0,58.16,5.0,1.75,10.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,41000.0,19.67,1582.0,126.2,5.0,12.5,5.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,46000.0,18.2,1199.0,88.7,5.0,4.5,9.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,87000.0,20.77,1248.0,88.76,7.0,6.0,8.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,40670.0,15.2,1968.0,140.8,5.0,17.74,7.0


In [23]:
#dividing traning and test dataset 
train_df = df_2[df_2[280]!=0.0]# 280 = Price
test_df = df_2[df_2[280]==0.0]
test_df.drop(columns=[280], axis = 1, inplace=True)

In [24]:
train_df.shape, test_df.shape

((6017, 282), (1234, 281))

In [25]:
#No of null values for each feature
print(train_df.isnull().sum(),'\n',test_df.isnull().sum())

0      0
1      0
2      0
3      0
4      0
      ..
277    0
278    0
279    0
280    0
281    0
Length: 282, dtype: int64 
 0      0
1      0
2      0
3      0
4      0
      ..
276    0
277    0
278    0
279    0
281    0
Length: 281, dtype: int64


In [26]:
### Scaling/Normalization of Features
#sc = StandardScaler()
#test_df_arr_scld = sc.fit_transform(test_df)
#test_df_2=pd.DataFrame(test_df_arr_scld, columns=test_df.columns)
test_df_2 = test_df.copy()
#train_df_arr_scld = sc.fit_transform(train_df)
#train_df_2=pd.DataFrame(train_df_arr_scld, columns=train_df.columns)
train_df_2 = train_df.copy()

In [27]:
train_df_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,272,273,274,275,276,277,278,279,280,281
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,72000.0,26.6,998.0,58.16,5.0,1.75,10.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,41000.0,19.67,1582.0,126.2,5.0,12.5,5.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,46000.0,18.2,1199.0,88.7,5.0,4.5,9.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,87000.0,20.77,1248.0,88.76,7.0,6.0,8.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,40670.0,15.2,1968.0,140.8,5.0,17.74,7.0


In [28]:
test_df_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,271,272,273,274,275,276,277,278,279,281
6017,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,40929.0,32.26,998.0,58.2,4.0,6.0
6018,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,54493.0,24.7,796.0,47.3,5.0,7.0
6019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,34000.0,13.68,2393.0,147.8,7.0,3.0
6020,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,139000.0,23.59,1364.0,0.0,5.0,8.0
6021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,29000.0,18.5,1197.0,82.85,5.0,6.0


## Model Building, Predicting and Evaluation

In [29]:
#dividing dataset into X and Y 
train_y = train_df_2[280]
train_df_2.drop(columns=[280], axis = 1, inplace=True)
train_x = train_df_2

In [30]:
train_x.columns#280 is missing so length and last column is 281

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            271, 272, 273, 274, 275, 276, 277, 278, 279, 281],
           dtype='int64', length=281)

In [31]:
print(train_y)

0        1.75
1       12.50
2        4.50
3        6.00
4       17.74
        ...  
6012     4.75
6013     4.00
6014     2.90
6015     2.65
6016     2.50
Name: 280, Length: 6017, dtype: float64


In [32]:
#Train Test Split on the Train dataset
seed = 15
test_size = 0.3
X_train, X_val, Y_train, Y_val = train_test_split(train_x, train_y, test_size = test_size, random_state = seed)

In [33]:
Y_train.isnull().sum()

0

## RandomForestRegressor without any hyperparameters, i.e Default paramters

In [34]:
reg = RandomForestRegressor()
reg.fit(X_train,  Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [35]:
Y_train_pred = reg.predict(X_train)
Y_val_pred = reg.predict(X_val)
Y_test_pridiction = reg.predict(test_df_2)#this line pridicts the price vlues of the test dataset

train_RMSE=np.sqrt(mean_squared_error(Y_train,Y_train_pred))
val_RMSE=np.sqrt(mean_squared_error(Y_val,Y_val_pred))
print('RandomForestRegressor Train RMSE: ', train_RMSE)
print('RandomForestRegressor Validation RMSE: ', val_RMSE)
print('Score for Train Data: ', reg.score(X_train,Y_train))
print('Score for Validation Data: ', reg.score(X_val,Y_val))

RandomForestRegressor Train RMSE:  1.6731187878686375
RandomForestRegressor Validation RMSE:  4.547440504063445
Score for Train Data:  0.9761061598968589
Score for Validation Data:  0.8560671439744345


In [36]:
#Using Cross Validation
scores = cross_val_score(reg, train_x, train_y, cv=10)

scores

array([0.90919851, 0.92216819, 0.90372741, 0.9153646 , 0.94710547,
       0.81847155, 0.77294153, 0.92729282, 0.91159744, 0.83846971])

In [37]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.89 (+/- 0.11)


In [38]:
#test_df_arr_scld = sc.inverse_transform(df_2)
#test_df_2=pd.DataFrame(test_df_arr_scld, columns=test_df_2.columns)


#### Writing Result in 'Output_RandomForestRegressor.xlsx'

In [39]:
df_test['Price'] = Y_test_pridiction

In [40]:
df_test.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,2.775
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,2.778
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0,16.908
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0,3.725
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0,4.898


In [51]:
df_sub = pd.DataFrame(data=df_test)
writer = pd.ExcelWriter('Output_RandomForestRegressor.xlsx', engine='xlsxwriter')
df_sub.to_excel(writer,sheet_name='Sheet1', index=False)
writer.save()

## RandomForestRegressor with hyperparameters

In [42]:
#Finding optimal parameters via grid_search
reg = RandomForestRegressor()
param_dist = {"max_features": sp_randint(1, 10),
              "min_samples_split": sp_randint(2, 10),
              "max_depth": [2,3,4,5,6,7,8,9,10],
              "min_samples_leaf": sp_randint(2, 10),
              "n_estimators" : sp_randint(1, 40)}

n_iter_search = 40
random_search = RandomizedSearchCV(reg, param_distributions=param_dist, cv=10,
                                   n_iter=n_iter_search)
random_search.fit(train_x,train_y)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_st...


In [43]:
random_search.best_params_

{'max_depth': 9,
 'max_features': 8,
 'min_samples_leaf': 5,
 'min_samples_split': 6,
 'n_estimators': 28}

In [44]:
reg = RandomForestRegressor(n_estimators=35,min_samples_split=15,max_features=7,max_depth=9,min_samples_leaf=2)
reg.fit(X_train,  Y_train)

reg_temp = cp.deepcopy(reg) #After all analysis, this turns out to be the model with highest accuracy, hence keeping a copy of it

In [45]:
Y_train_pred = reg.predict(X_train)
Y_val_pred = reg.predict(X_val)
Y_test_pridiction = reg.predict(test_df_2)#this line pridicts the price vlues of the test dataset

train_RMSE=np.sqrt(mean_squared_error(Y_train,Y_train_pred))
val_RMSE=np.sqrt(mean_squared_error(Y_val,Y_val_pred))
print('RandomForestRegressor Train RMSE: ', train_RMSE)
print('RandomForestRegressor Validation RMSE: ', val_RMSE)
print('Score for Train Data: ', reg.score(X_train,Y_train))
print('Score for Validation Data: ', reg.score(X_val,Y_val))

RandomForestRegressor Train RMSE:  6.749308597253119
RandomForestRegressor Validation RMSE:  8.075674790510591
Score for Train Data:  0.6111778722662836
Score for Validation Data:  0.5460755933353392


In [46]:
scores = cross_val_score(reg, train_x, train_y, cv=10)

scores

array([0.61008824, 0.55081252, 0.62222832, 0.55949693, 0.63094496,
       0.65281798, 0.5276967 , 0.64419226, 0.65029071, 0.54856955])

In [47]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.60 (+/- 0.09)


#### Writing Result in 'Output_RandomForestRegressor_hyperparameters.xlsx'

In [48]:
df_test['Price'] = Y_test_pridiction

In [49]:
df_test.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,5.65279
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,5.321865
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0,12.597875
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0,7.225822
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0,5.530752


In [50]:
df_sub = pd.DataFrame(data=df_test)
writer = pd.ExcelWriter('Output_RandomForestRegressor_hyperparameters.xlsx', engine='xlsxwriter')
df_sub.to_excel(writer,sheet_name='Sheet1', index=False)
writer.save()

## KNNRegressor without any hyperparameters

In [52]:
reg = KNeighborsRegressor()
reg.fit(X_train,  Y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [53]:
Y_train_pred = reg.predict(X_train)
Y_val_pred = reg.predict(X_val)
Y_test_pridiction = reg.predict(test_df_2)#this line pridicts the price vlues of the test dataset

train_RMSE=np.sqrt(mean_squared_error(Y_train,Y_train_pred))
val_RMSE=np.sqrt(mean_squared_error(Y_val,Y_val_pred))
print('KNeighborsRegressor Train RMSE: ', train_RMSE)
print('KNeighborsRegressor Validation RMSE: ', val_RMSE)
print('Score for Train Data: ', reg.score(X_train,Y_train))
print('Score for Validation Data: ', reg.score(X_val,Y_val))

KNeighborsRegressor Train RMSE:  5.739066503492769
KNeighborsRegressor Validation RMSE:  8.049760985975917
Score for Train Data:  0.7188650006320806
Score for Validation Data:  0.5489840897323454


In [54]:
scores = cross_val_score(reg, train_x, train_y, cv=10)

scores

array([0.56450849, 0.57980151, 0.56963149, 0.52226228, 0.61253383,
       0.56326604, 0.49388524, 0.61801347, 0.56425099, 0.60333262])

In [55]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.57 (+/- 0.07)


#### Writing Result in 'Output_KNNRegressor.xlsx'

In [56]:
df_test['Price'] = Y_test_pridiction

In [57]:
df_test.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,3.36
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,3.48
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0,20.95
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0,5.56
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0,4.36


In [58]:
df_sub = pd.DataFrame(data=df_test)
writer = pd.ExcelWriter('Output_KNNRegressor.xlsx', engine='xlsxwriter')
df_sub.to_excel(writer,sheet_name='Sheet1', index=False)
writer.save()

## KNNRegressor with hyperparameters

In [59]:
k_range = range(5,15)

for k in k_range:
    reg = KNeighborsRegressor(k)
    reg.fit(X_train,Y_train)
    Y_train_pred = reg.predict(X_train)
    Y_val_pred = reg.predict(X_val)
    Y_test_pridiction = reg.predict(test_df_2)#this line pridicts the price vlues of the test dataset

    train_RMSE=np.sqrt(mean_squared_error(Y_train,Y_train_pred))
    val_RMSE=np.sqrt(mean_squared_error(Y_val,Y_val_pred))
    print("For K value : ",k)
    print("---------------------------------------------------")
    print('KNeighborsRegressor Train RMSE: ', train_RMSE)
    print('KNeighborsRegressor Validation RMSE: ', val_RMSE)
    print('KNeighborsRegressor: Test RMSE - Validation RMSE: ', train_RMSE-val_RMSE)
    print("\n")
    print('Score for Train Data: ', reg.score(X_train,Y_train))
    print('Score for Validation Data: ', reg.score(X_val,Y_val))
    print("\n")

For K value :  5
---------------------------------------------------
KNeighborsRegressor Train RMSE:  5.739066503492769
KNeighborsRegressor Validation RMSE:  8.049760985975917
KNeighborsRegressor: Test RMSE - Validation RMSE:  -2.3106944824831483


Score for Train Data:  0.7188650006320806
Score for Validation Data:  0.5489840897323454


For K value :  6
---------------------------------------------------
KNeighborsRegressor Train RMSE:  5.945594389040329
KNeighborsRegressor Validation RMSE:  8.13046895619079
KNeighborsRegressor: Test RMSE - Validation RMSE:  -2.1848745671504606


Score for Train Data:  0.6982668982381763
Score for Validation Data:  0.5398948615606475


For K value :  7
---------------------------------------------------
KNeighborsRegressor Train RMSE:  6.1625011438986865
KNeighborsRegressor Validation RMSE:  8.227543292747336
KNeighborsRegressor: Test RMSE - Validation RMSE:  -2.06504214884865


Score for Train Data:  0.6758497008027158
Score for Validation Data:  0.5

In [60]:
reg = KNeighborsRegressor(7)
reg.fit(X_train,  Y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                    weights='uniform')

In [61]:
Y_train_pred = reg.predict(X_train)
Y_val_pred = reg.predict(X_val)
Y_test_pridiction = reg.predict(test_df_2)#this line pridicts the price vlues of the test dataset

train_RMSE=np.sqrt(mean_squared_error(Y_train,Y_train_pred))
val_RMSE=np.sqrt(mean_squared_error(Y_val,Y_val_pred))
print('KNeighborsRegressor Train RMSE: ', train_RMSE)
print('KNeighborsRegressor Validation RMSE: ', val_RMSE)
print('Score for Train Data: ', reg.score(X_train,Y_train))
print('Score for Validation Data: ', reg.score(X_val,Y_val))

KNeighborsRegressor Train RMSE:  6.1625011438986865
KNeighborsRegressor Validation RMSE:  8.227543292747336
Score for Train Data:  0.6758497008027158
Score for Validation Data:  0.5288423532083327


In [62]:
scores = cross_val_score(reg, train_x, train_y, cv=10)

scores

array([0.5744737 , 0.5518754 , 0.5623545 , 0.50852333, 0.63246532,
       0.58227552, 0.46339278, 0.61727032, 0.6022581 , 0.59097663])

In [63]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.57 (+/- 0.10)


#### Writing Result in 'Output_KNNRegressor_hyperparameters.xlsx'

In [64]:
df_test['Price'] = Y_test_pridiction

In [65]:
df_test.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,3.86
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,4.234286
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0,21.278571
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0,4.271429
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0,4.864286


In [66]:
df_sub = pd.DataFrame(data=df_test)
writer = pd.ExcelWriter('Output_KNNRegressor_hyperparameters.xlsx', engine='xlsxwriter')
df_sub.to_excel(writer,sheet_name='Sheet1', index=False)
writer.save()

## Decision Tree Regressor

In [70]:
#X_train, X_test, Y_train, Y_test = train_test_split(features_final, span_new['price'], test_size=0.33, random_state=42)
reg = tree.DecisionTreeRegressor(max_depth=3)
reg.fit(X_train,Y_train)
Y_train_pred = reg.predict(X_train)
Y_val_pred = reg.predict(X_val)
Y_test_pridiction = reg.predict(test_df_2)#this line pridicts the price vlues of the test dataset

train_RMSE=np.sqrt(mean_squared_error(Y_train,Y_train_pred))
val_RMSE=np.sqrt(mean_squared_error(Y_val,Y_val_pred))
print('Decision Tree Regressor Train RMSE: ', train_RMSE)
print('Decision Tree Regressor Validation RMSE: ', val_RMSE)
print('Score for Train Data: ', reg.score(X_train,Y_train))
print('Score for Validation Data: ', reg.score(X_val,Y_val))

Decision Tree Regressor Train RMSE:  5.359756950882754
Decision Tree Regressor Validation RMSE:  6.343867010081507
Score for Train Data:  0.7547987988267252
Score for Validation Data:  0.7198865053593095


In [72]:
trknn_scores=[]
teknn_scores= []
rmse_scores=[]
for i in np.arange(1,20,1):
    reg = tree.DecisionTreeRegressor(max_depth=i,random_state=42)
    reg.fit(X_train,Y_train)
    Y_train_pred = reg.predict(X_train)
    Y_val_pred = reg.predict(X_val)
    Y_test_pridiction = reg.predict(test_df_2)
    train_scores = reg.score(X_train,Y_train)
    val_scores = reg.score(X_val,Y_val)
    # The Root mean squared error
    trknn_scores.append(train_scores)
    teknn_scores.append(val_scores)
    rmse_scores.append(np.sqrt(mean_squared_error(Y_val, Y_val_pred)))

In [73]:
from sklearn import tree
from sklearn.metrics import mean_squared_error
#X_train, X_test, y_train, y_test = train_test_split(features_final, span_new['price'], test_size=0.33, random_state=42)
reg = tree.DecisionTreeRegressor(max_depth=8,random_state=42)
reg.fit(X_train,Y_train)
Y_train_pred = reg.predict(X_train)
Y_val_pred = reg.predict(X_val)
Y_test_pridiction = reg.predict(test_df_2)


train_RMSE=np.sqrt(mean_squared_error(Y_train,Y_train_pred))
val_RMSE=np.sqrt(mean_squared_error(Y_val,Y_val_pred))
print('Decision Tree Regressor Train RMSE: ', train_RMSE)
print('Decision Tree Regressor Validation RMSE: ', val_RMSE)
print('Score for Train Data: ', reg.score(X_train,Y_train))
print('Score for Validation Data: ', reg.score(X_val,Y_val))

Decision Tree Regressor Train RMSE:  2.3867360149072883
Decision Tree Regressor Validation RMSE:  4.852000808467565
Score for Train Data:  0.9513770645962355
Score for Validation Data:  0.836142010922837


In [74]:
scores = cross_val_score(reg, train_x, train_y, cv=10)

scores

array([0.86666347, 0.8570604 , 0.80713371, 0.79352683, 0.86544284,
       0.81880558, 0.7412111 , 0.84985702, 0.85282096, 0.77702013])

In [75]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.82 (+/- 0.08)


#### Writing Result in 'Output_Decision_Tree_Regressor.xlsx'

In [76]:
df_test['Price'] = Y_test_pridiction

In [77]:
df_test.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,3.66439
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,2.726
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0,18.420833
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0,2.690392
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0,4.58016


In [78]:
df_sub = pd.DataFrame(data=df_test)
writer = pd.ExcelWriter('Output_Decision_Tree_Regressor.xlsx', engine='xlsxwriter')
df_sub.to_excel(writer,sheet_name='Sheet1', index=False)
writer.save()

## Linear Regression Model


In [81]:
# Linear Regression Sklearn
from sklearn.linear_model import LinearRegression
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
reg = LinearRegression()
reg.fit(X_train,Y_train)
Y_train_pred = reg.predict(X_train)
Y_val_pred = reg.predict(X_val)
Y_test_pridiction = reg.predict(test_df_2)

train_RMSE=np.sqrt(mean_squared_error(Y_train,Y_train_pred))
val_RMSE=np.sqrt(mean_squared_error(Y_val,Y_val_pred))
print('Linear Regression Train RMSE: ', train_RMSE)
print('Linear Regression Validation RMSE: ', val_RMSE)
print('Score for Train Data: ', reg.score(X_train,Y_train))
print('Score for Validation Data: ', reg.score(X_val,Y_val))

Linear Regression Train RMSE:  4.127971521299212
Linear Regression Validation RMSE:  10152663934907.166
Score for Train Data:  0.8545526590622473
Score for Validation Data:  -7.174405394901521e+23


In [74]:
scores = cross_val_score(reg, train_x, train_y, cv=10)

scores

array([0.86666347, 0.8570604 , 0.80713371, 0.79352683, 0.86544284,
       0.81880558, 0.7412111 , 0.84985702, 0.85282096, 0.77702013])

#### Writing Result in 'Output_Linear_Regression.xlsx'

In [82]:
df_test['Price'] = Y_test_pridiction

In [83]:
df_test.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,28368320000000.0
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,68885130000000.0
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0,76178080000000.0
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0,81136790000000.0
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0,47048230000000.0


In [84]:
df_sub = pd.DataFrame(data=df_test)
writer = pd.ExcelWriter('Output_Linear_Regression.xlsx', engine='xlsxwriter')
df_sub.to_excel(writer,sheet_name='Sheet1', index=False)
writer.save()

In [75]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.82 (+/- 0.08)


### Conclusion

We observe that, among the models implemented, RandomForestRegressor performs well on the provided dataset