In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score
# Load the Ford used-car listings from CSV into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks like a hosted-session
# upload location — confirm the file exists there before running elsewhere.
df=pd.read_csv('/content/ford 2.csv')
# Show the loaded frame as the cell output.
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,B-MAX,2017,8999,Manual,16700,Petrol,150,47.1,1.4
17962,B-MAX,2014,7499,Manual,40700,Petrol,30,57.7,1.0
17963,Focus,2015,9999,Manual,7010,Diesel,20,67.3,1.6
17964,KA,2018,8299,Manual,5007,Petrol,145,57.7,1.2


**Data Preprocessing**

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax               int64
mpg             float64
engineSize      float64
dtype: object

In [None]:
# Count missing values per column.
df.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

154

In [None]:
# Keep only the first occurrence of each row, then renumber the index from 0.
is_repeat = df.duplicated()
df1 = df.loc[~is_repeat].reset_index(drop=True)

In [None]:
# Verify that no duplicate rows remain in the deduplicated frame.
df1.duplicated().sum()

0

In [None]:
# (rows, columns) of the deduplicated frame.
df1.shape

(17812, 9)

In [None]:
# Discard any row whose price is below zero.
non_negative_price = df1['price'] >= 0
df1 = df1.loc[non_negative_price]

**Model Building**

In [None]:
# The target is the listing price; every remaining column is a predictor.
y = df1['price']
X = df1.drop(columns=['price'])

In [None]:
# Integer-encode the categorical columns.
# A separate LabelEncoder is fitted for each column and stored in `encoders`
# (keyed by column name). Re-fitting a single shared encoder would overwrite
# its classes on every iteration, leaving no way to encode the 'model' or
# 'transmission' of a new listing consistently with the training data.
# After the loop `le` still refers to the encoder fitted on the last column.
lst=['model','transmission','fuelType']
encoders={}
for i in lst:
  le=LabelEncoder()
  X[i]=le.fit_transform(X[i])
  encoders[i]=le

In [None]:
# Display the feature matrix to check the integer-encoded categorical columns.
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,5,2017,0,15944,4,150,57.7,1.0
1,6,2018,1,9083,4,150,57.7,1.0
2,6,2017,1,12456,4,150,57.7,1.0
3,5,2019,1,10460,4,145,40.3,1.5
4,5,2019,0,1482,4,145,48.7,1.0
...,...,...,...,...,...,...,...,...
17807,0,2017,1,16700,4,150,47.1,1.4
17808,0,2014,1,40700,4,30,57.7,1.0
17809,6,2015,1,7010,0,20,67.3,1.6
17810,11,2018,1,5007,4,145,57.7,1.2


In [None]:
# Rescale every feature to the [0, 1] range with MinMaxScaler (min-max scaling,
# not standardization).
# NOTE(review): the scaler is fitted here on the full feature matrix, so rows
# that later end up in the test split contribute to the fitted min/max.
ms=MinMaxScaler()
X_ms=ms.fit_transform(X)

In [None]:
# Hold out 30% of the rows for testing, then scale.
# The split is done on the unscaled features and the scaler is fitted on the
# training rows only, so no test-set statistics leak into preprocessing.
# train_test_split with a fixed random_state partitions rows by position, so
# the train/test membership is the same as when splitting a scaled copy of X.
X_train_raw,X_test_raw,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
# `ms` is (re)bound to the train-only scaler; later cells use it to transform new samples.
ms=MinMaxScaler().fit(X_train_raw)
X_train=ms.transform(X_train_raw)
X_test=ms.transform(X_test_raw)

In [None]:
# Multiple linear regression: fit on the training split, predict the test split.
mlr = LinearRegression().fit(X_train, y_train)
y_pred = mlr.predict(X_test)
y_pred

array([19743.83972635, 18253.13830937, 17448.11730824, ...,
       11683.22573171, 14601.24635879, 10278.46057461])

In [None]:
# Validation metrics for the multiple regression model on the test split.
# numpy is already imported in the notebook's first cell, so it is not
# re-imported here; the MSE is computed once and reused for the RMSE.
mse_lr=mean_squared_error(y_test,y_pred)
print("mean absolute error:",mean_absolute_error(y_test,y_pred))
print("mean squared error:",mse_lr)
print("root mean squared error:",np.sqrt(mse_lr))
print("r2-score:",r2_score(y_test,y_pred))

mean absolute error: 1744.4062565534568
mean squared error: 6111403.357249421
root mean squared error: 2472.125271350427
r2-score: 0.7195451674275322


In [None]:
# Random forest regressor: fit on the training split, predict the test split.
# A fixed random_state makes the bootstrap sampling and feature selection
# repeatable, so the reported metrics can be reproduced on a re-run
# (0 matches the seed used for the train/test split).
rs=RandomForestRegressor(random_state=0)
rs.fit(X_train,y_train)
y_pred1=rs.predict(X_test)
y_pred1

array([21171.86 , 20755.32 , 17572.26 , ..., 11412.315, 13295.42 ,
       11814.37 ])

In [None]:
# Test-set error metrics for the random forest predictions.
mae_rf = mean_absolute_error(y_test, y_pred1)
mse_rf = mean_squared_error(y_test, y_pred1)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred1)
print("mean absolute error:", mae_rf)
print("mean squared error:", mse_rf)
print("root mean squared error:", rmse_rf)
print("R2 score:", r2_rf)

mean absolute error: 889.4743106510193
mean squared error: 1699546.1869131576
root mean squared error: 1303.6664400501984
R2 score: 0.9220071212065388


In [None]:
# XGBoost regressor with default hyper-parameters: fit on the training split,
# then predict the test split.
xgb = XGBRegressor().fit(X_train, y_train)
y_pred4 = xgb.predict(X_test)
y_pred4

array([21397.762, 20585.121, 17926.434, ..., 10762.372, 13357.588,
       11120.473], dtype=float32)

In [None]:
# Test-set error metrics for the XGBoost predictions.
mae_xgb = mean_absolute_error(y_test, y_pred4)
mse_xgb = mean_squared_error(y_test, y_pred4)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred4)
print("mean absolute error:", mae_xgb)
print("mean squared error:", mse_xgb)
print("root mean squared error:", rmse_xgb)
print("R2 score:", r2_xgb)

mean absolute error: 828.0110195639604
mean squared error: 1487618.290191304
root mean squared error: 1219.679585051461
R2 score: 0.9317325802080395


In [None]:
# Side-by-side table of the true test prices and each model's predictions.
# The row labels come from y_test, i.e. the original row index of each test listing.
comparison_columns = {
    'Actual Values': y_test,
    'linear_Model': y_pred,
    'Random_Forest': y_pred1,
    'Xg_Boost': y_pred4,
}
Result = pd.DataFrame(comparison_columns)
Result

Unnamed: 0,Actual Values,linear_Model,Random_Forest,Xg_Boost
2122,21845,19743.839726,21171.860,21397.761719
12465,20000,18253.138309,20755.320,20585.121094
5989,20000,17448.117308,17572.260,17926.433594
1243,5998,8679.130973,6676.890,6804.084473
16105,15850,15474.870636,16199.600,16542.343750
...,...,...,...,...
12498,16000,15216.166212,16693.910,16869.708984
8676,11175,10678.409605,10554.600,10409.725586
1530,11298,11683.225732,11412.315,10762.372070
12995,12500,14601.246359,13295.420,13357.587891


Here we can see that the XGBoost model is the most accurate of the three (R² ≈ 0.93, versus ≈ 0.92 for Random Forest and ≈ 0.72 for the linear model), so we predict the price using the **XGBoost regressor** model.

In [None]:
# Price prediction for one hand-encoded listing. Values follow the column order
# of X: model, year, transmission, mileage, fuelType, tax, mpg, engineSize
# (categorical fields are given as their integer codes).
listing_2010 = [[4, 2010, 1, 10, 4, 150, 40, 2]]
y_new = xgb.predict(ms.transform(listing_2010))
y_new

array([9393.382], dtype=float32)

In [None]:
# Same hand-encoded listing with the year set to 2025. Values follow the column
# order of X: model, year, transmission, mileage, fuelType, tax, mpg, engineSize.
listing_2025 = [[4, 2025, 1, 10, 4, 150, 40, 2]]
y_new = xgb.predict(ms.transform(listing_2025))
y_new

array([22868.488], dtype=float32)