Import Libraries


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

Data Preprocessing

In [16]:
# Read the Ford car dataset from a CSV path relative to the working directory.
df = pd.read_csv(filepath_or_buffer="ford.csv")
# Bare name as the last expression so the notebook renders the frame.
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,B-MAX,2017,8999,Manual,16700,Petrol,150,47.1,1.4
17962,B-MAX,2014,7499,Manual,40700,Petrol,30,57.7,1.0
17963,Focus,2015,9999,Manual,7010,Diesel,20,67.3,1.6
17964,KA,2018,8299,Manual,5007,Petrol,145,57.7,1.2


In [7]:
# Count the missing entries per column (`isna` is the same check as `isnull`).
df.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [9]:
# List the distinct labels in each text column so they can be encoded next.
for column_name in ('transmission', 'fuelType'):
    print(df[column_name].unique())

['Automatic' 'Manual' 'Semi-Auto']
['Petrol' 'Diesel' 'Hybrid' 'Electric' 'Other']


In [24]:
# Integer codes for the two categorical columns. The mappings cover every
# label printed by the `unique()` calls for this dataset.
transmission_codes = {'Automatic': 0, 'Manual': 1, 'Semi-Auto': 2}
fuel_type_codes = {'Petrol': 0, 'Diesel': 1, 'Hybrid': 2, 'Electric': 3, 'Other': 4}

# Build a new frame rather than calling replace(..., inplace=True):
#  * `codes.get(v, v)` leaves already-encoded values untouched, so re-running
#    this cell is a no-op instead of corrupting the columns;
#  * `astype('int64')` makes the integer dtype explicit instead of relying on
#    pandas silently downcasting an object column, and raises immediately if a
#    label without a code is present.
df = df.assign(
    transmission=df['transmission']
    .map(lambda v: transmission_codes.get(v, v))
    .astype('int64'),
    fuelType=df['fuelType']
    .map(lambda v: fuel_type_codes.get(v, v))
    .astype('int64'),
)





In [26]:
# Re-display the frame to check that transmission and fuelType now hold integer codes.
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,0,15944,0,150,57.7,1.0
1,Focus,2018,14000,1,9083,0,150,57.7,1.0
2,Focus,2017,13000,1,12456,0,150,57.7,1.0
3,Fiesta,2019,17500,1,10460,0,145,40.3,1.5
4,Fiesta,2019,16500,0,1482,0,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,B-MAX,2017,8999,1,16700,0,150,47.1,1.4
17962,B-MAX,2014,7499,1,40700,0,30,57.7,1.0
17963,Focus,2015,9999,1,7010,1,20,67.3,1.6
17964,KA,2018,8299,1,5007,0,145,57.7,1.2


Splitting Data into Features and Labels

In [29]:
# Label: the price column.
y = df['price']
# Features: every remaining column except `model`, which is still text
# (only transmission and fuelType were encoded above).
x = df.drop(columns=['model', 'price'])

In [30]:
# Display the feature frame (all columns of df except model and price).
x

Unnamed: 0,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,2017,0,15944,0,150,57.7,1.0
1,2018,1,9083,0,150,57.7,1.0
2,2017,1,12456,0,150,57.7,1.0
3,2019,1,10460,0,145,40.3,1.5
4,2019,0,1482,0,145,48.7,1.0
...,...,...,...,...,...,...,...
17961,2017,1,16700,0,150,47.1,1.4
17962,2014,1,40700,0,30,57.7,1.0
17963,2015,1,7010,1,20,67.3,1.6
17964,2018,1,5007,0,145,57.7,1.2


In [31]:
# Display the label series (the price column).
y

0        12000
1        14000
2        13000
3        17500
4        16500
         ...  
17961     8999
17962     7499
17963     9999
17964     8299
17965     8299
Name: price, Length: 17966, dtype: int64

Standardise the Data

In [34]:
# Fit a StandardScaler on the feature frame so each column can be standardised.
# NOTE(review): the scaler is fitted on every row of x, i.e. before the
# train/test split performed later in this notebook, so rows that end up in
# the test set also contribute to the scaling statistics. Consider fitting on
# the training split only; confirm whether this is intentional.
scaler = StandardScaler()
scaler.fit(x)

In [35]:
# Apply the fitted scaler to the features. The result is a plain NumPy array
# (see the output below), so the column labels of x are not carried over.
standardized_x = scaler.transform(x)
standardized_x

array([[ 0.06512772, -2.67003231, -0.38099808, ...,  0.59135805,
        -0.02044162, -0.81138621],
       [ 0.55286624,  0.04135139, -0.73335899, ...,  0.59135805,
        -0.02044162, -0.81138621],
       [ 0.06512772,  0.04135139, -0.56013157, ...,  0.59135805,
        -0.02044162, -0.81138621],
       ...,
       [-0.91034931,  0.04135139, -0.83982222, ..., -1.50505332,
         0.92766777,  0.57636151],
       [ 0.55286624,  0.04135139, -0.94269045, ...,  0.51072684,
        -0.02044162, -0.34880364],
       [-0.91034931,  0.04135139, -0.94269045, ..., -1.47280084,
        -0.02044162, -0.81138621]])

In [36]:
# From here on `x` is the standardized array rather than the original
# DataFrame; `y` is bound (again) to the price column.
x, y = standardized_x, df['price']

In [42]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42
)

# Print full / train / test shapes, first for the features and then for the labels.
for full_set, train_part, test_part in ((x, x_train, x_test), (y, y_train, y_test)):
    print(full_set.shape, train_part.shape, test_part.shape)

(17966, 7) (16169, 7) (1797, 7)
(17966,) (16169,) (1797,)


XGB Regressor

In [44]:
# Create a new (untrained) XGBoost regressor; no hyperparameters are passed,
# so the library defaults apply.
xgb_model = XGBRegressor()

In [45]:
# Train the regressor on the training split (standardized features, price labels).
xgb_model.fit(x_train,y_train)


Model Evaluation

In [46]:
# Score the fitted model on the same rows it was trained on.
training_data_pred = xgb_model.predict(x_train)

# Coefficient of determination and mean absolute error for the training split.
score_1 = metrics.r2_score(y_true=y_train, y_pred=training_data_pred)
mae = metrics.mean_absolute_error(y_true=y_train, y_pred=training_data_pred)

print('R2 score on training data: ', score_1)
print('Mean Absolute Error on training data:', mae)

R2 score on training data:  0.9531964580818831
Mean Absolute Error on training data: 740.4128602406732


In [47]:
# Prediction on the held-out test data
test_data_pred = xgb_model.predict(x_test)

# R2 score on test data
# (score_1 and mae are reused, so they overwrite the training-set values.)
score_1 = metrics.r2_score(y_test, test_data_pred)

# Mean absolute error on test data
mae = metrics.mean_absolute_error(y_test, test_data_pred)

print('R2 score on test data: ', score_1)
# Label corrected: this value is computed from y_test, not from the training data.
print('Mean Absolute Error on test data:', mae)

R2 score on test data:  0.9116280023596516
Mean Absolute Error on training data: 907.3636198473694


Making Predictions

In [54]:
# Feature values for one car, listed in the column order the scaler was fitted on.
feature_names = ['year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']
input_data = (2019, 1, 10460, 0, 145, 40.3, 1.5)

# Wrap the single row in a DataFrame with named columns. The scaler was fitted
# on a DataFrame, so passing a bare array makes scikit-learn warn that the
# input has no valid feature names; naming the columns also lets scikit-learn
# check that the values are supplied in the expected order.
input_changed = pd.DataFrame([input_data], columns=feature_names)

# Standardize the input with the same scaler that was used for the training data
std_input = scaler.transform(input_changed)

prediction = xgb_model.predict(std_input)
print(prediction)

print("This car price estimation is: ", prediction)

[18320.613]
This car price estimation is:  [18320.613]




In [55]:
# Feature values for one car, listed in the column order the scaler was fitted on.
feature_names = ['year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']
input_data = (2015, 1, 7010, 0, 20, 67.3, 1.6)

# Wrap the single row in a DataFrame with named columns. The scaler was fitted
# on a DataFrame, so passing a bare array makes scikit-learn warn that the
# input has no valid feature names; naming the columns also lets scikit-learn
# check that the values are supplied in the expected order.
input_changed = pd.DataFrame([input_data], columns=feature_names)

# Standardize the input with the same scaler that was used for the training data
std_input = scaler.transform(input_changed)

prediction = xgb_model.predict(std_input)
print(prediction)
print("This car price estimation is: ", prediction)


[10309.842]
This car price estimation is:  [10309.842]




Saving Our Model and Scaler

In [57]:
import joblib

# Serialize the trained regressor to xgb_model.pkl in the working directory.
# NOTE(review): joblib/pickle files are generally tied to the library versions
# that wrote them; presumably the consumer loads this with matching
# xgboost/scikit-learn versions (confirm).
joblib.dump(xgb_model, 'xgb_model.pkl')

# Serialize the fitted scaler as well: the prediction cells pass inputs through
# scaler.transform before calling the model, so the same scaler is needed
# wherever the model is reused.
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']