In [1]:
#Import Needed Libraries
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import iplot , plot
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler , LabelEncoder
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor , BaggingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error , r2_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the car details CSV into a DataFrame (path is relative to the working directory)
df = pd.read_csv('car details v3.csv')


In [3]:

# Show 5 randomly chosen rows (no random_state is passed, so the rows differ between runs)
df.sample(5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
404,Mahindra XUV500 AT W10 AWD,2016,1290000,50000,Diesel,Individual,Automatic,First Owner,16.0 kmpl,2179 CC,140 bhp,330Nm@ 1600-2800rpm,7.0
7097,Mahindra TUV 300 mHAWK100 T8,2017,700000,60000,Diesel,Individual,Manual,First Owner,18.49 kmpl,1493 CC,100 bhp,240Nm@ 1600-2800rpm,7.0
3958,Maruti Wagon R VXI BS IV,2014,300000,31000,Petrol,Individual,Manual,First Owner,20.51 kmpl,998 CC,67.04 bhp,90Nm@ 3500rpm,5.0
1404,Ford Fiesta EXi 1.4 TDCi Ltd,2011,170000,80000,Diesel,Individual,Manual,Third Owner,17.8 kmpl,1399 CC,68 bhp,"16.3@ 2,000(kgm@ rpm)",5.0
4686,Honda CR-V 2.4L 4WD MT,2008,400000,120000,Petrol,Individual,Manual,First Owner,11.1 kmpl,2354 CC,158.8 bhp,218Nm@ 4200rpm,5.0


In [4]:
# Report how many rows and columns the raw data has
print(f"Number of Row : {len(df)}\nNumber of Columns : {len(df.columns)}")

Number of Row : 8128
Number of Columns : 13


In [5]:
# Column dtypes and non-null counts of the raw data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


In [6]:
# Count missing (NaN) values in each column
df.isna().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

In [7]:
# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,year,selling_price,km_driven,seats
count,8128.0,8128.0,8128.0,7907.0
mean,2013.804011,638271.8,69819.51,5.416719
std,4.044249,806253.4,56550.55,0.959588
min,1983.0,29999.0,1.0,2.0
25%,2011.0,254999.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,675000.0,98000.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


In [8]:
# Summary (count / unique / top / freq) of the non-numeric columns
df.describe(exclude=np.number)

Unnamed: 0,name,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque
count,8128,8128,8128,8128,8128,7907,7907,7913,7906
unique,2058,4,3,2,5,393,121,322,441
top,Maruti Swift Dzire VDI,Diesel,Individual,Manual,First Owner,18.9 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm
freq,129,4402,6766,7078,5289,225,1017,377,530


In [9]:
# Per-column overview: total rows, missing values (count and percent), distinct values
pd.DataFrame({
    'Count': len(df),
    'Null': df.isna().sum(),
    'Null %': df.isna().mean() * 100,
    'Cardinality': df.nunique(),
})

Unnamed: 0,Count,Null,Null %,Cardinality
name,8128,0,0.0,2058
year,8128,0,0.0,29
selling_price,8128,0,0.0,677
km_driven,8128,0,0.0,921
fuel,8128,0,0.0,4
seller_type,8128,0,0.0,3
transmission,8128,0,0.0,2
owner,8128,0,0.0,5
mileage,8128,221,2.718996,393
engine,8128,221,2.718996,121


In [10]:
# Handling Missing Data (Nulls)

In [11]:
# Drop every row that has at least one missing value and renumber the index from 0
df = df.dropna(ignore_index=True)

In [12]:
# Random sample of 5 rows after the rows with nulls were dropped
df.sample(5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
302,Maruti Ertiga VXI,2013,535000,90000,Petrol,Individual,Manual,Third Owner,16.02 kmpl,1373 CC,93.7 bhp,130Nm@ 4000rpm,7.0
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0 kmpl,1399 CC,68.1 bhp,160Nm@ 2000rpm,5.0
5861,Ford Fiesta 1.4 TDCi EXI,2005,150000,120000,Diesel,Individual,Manual,Third Owner,17.8 kmpl,1399 CC,68 bhp,"16.3@ 2,000(kgm@ rpm)",5.0
1549,Tata Nexon 1.5 Revotorq XM,2018,700000,35000,Diesel,Individual,Manual,First Owner,21.5 kmpl,1497 CC,108.5 bhp,260Nm@ 1500-2750rpm,5.0
709,Hyundai Xcent 1.1 CRDi SX Option,2017,550000,70000,Diesel,Individual,Manual,Second Owner,24.4 kmpl,1120 CC,71 bhp,180.4Nm@ 1750-2500rpm,5.0


In [13]:
# Shape of the data after dropping rows with nulls
print(f"Number of Row : {len(df)}\nNumber of Columns : {len(df.columns)}")

Number of Row : 7906
Number of Columns : 13


In [14]:
# Column dtypes and non-null counts after dropping rows with nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7906 entries, 0 to 7905
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7906 non-null   object 
 1   year           7906 non-null   int64  
 2   selling_price  7906 non-null   int64  
 3   km_driven      7906 non-null   int64  
 4   fuel           7906 non-null   object 
 5   seller_type    7906 non-null   object 
 6   transmission   7906 non-null   object 
 7   owner          7906 non-null   object 
 8   mileage        7906 non-null   object 
 9   engine         7906 non-null   object 
 10  max_power      7906 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7906 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 803.1+ KB


In [15]:
# Confirm that no NaN values remain in any column
df.isna().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
mileage          0
engine           0
max_power        0
torque           0
seats            0
dtype: int64

In [16]:
# Summary statistics of the numeric columns (after dropping nulls)
df.describe()

Unnamed: 0,year,selling_price,km_driven,seats
count,7906.0,7906.0,7906.0,7906.0
mean,2013.983936,649813.7,69188.66,5.416393
std,3.863695,813582.7,56792.3,0.959208
min,1994.0,29999.0,1.0,2.0
25%,2012.0,270000.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,690000.0,95425.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


In [17]:
# Summary of the non-numeric columns (after dropping nulls)
df.describe(exclude=np.number)

Unnamed: 0,name,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque
count,7906,7906,7906,7906,7906,7906,7906,7906,7906
unique,1982,4,3,2,5,393,121,320,441
top,Maruti Swift Dzire VDI,Diesel,Individual,Manual,First Owner,18.9 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm
freq,129,4299,6563,6865,5215,225,1017,377,530


In [18]:
# Per-column overview after dropping nulls: rows, missing values, distinct values
pd.DataFrame({
    'Count': len(df),
    'Null': df.isna().sum(),
    'Null %': df.isna().mean() * 100,
    'Cardinality': df.nunique(),
})

Unnamed: 0,Count,Null,Null %,Cardinality
name,7906,0,0.0,1982
year,7906,0,0.0,27
selling_price,7906,0,0.0,670
km_driven,7906,0,0.0,898
fuel,7906,0,0.0,4
seller_type,7906,0,0.0,3
transmission,7906,0,0.0,2
owner,7906,0,0.0,5
mileage,7906,0,0.0,393
engine,7906,0,0.0,121


In [19]:
# EDA of Data and Get Insights

In [20]:
# Brand = first space-separated token of the full car name.
# The vectorised .str accessor replaces the manual index loop and keeps df's own
# index, so assigning the result back to df stays aligned row-for-row.
Brands_of_car = df['name'].str.split(' ').str[0]

In [21]:
# Overwrite the full model name in 'name' with just the brand
df['name'] = Brands_of_car

In [22]:
# Number of distinct values in the 'name' column (brands, once names are reduced to brand)
print(f"Number of Unique Brand of Car {df['name'].nunique()}")

Number of Unique Brand of Car 31


In [23]:

# Bar chart of the 20 most frequent brands in the data
Count_of_Brands_Car = df['name'].value_counts().head(20)
fig = px.bar(
    x=Count_of_Brands_Car.index,
    y=Count_of_Brands_Car.values,
    color=Count_of_Brands_Car.values,
    labels={'x': 'Brand of Car', 'y': 'Number of Frequency in Data'},
    title='Top 20 Brands of Car',
    template='plotly_dark',
    text_auto=True,
)
fig.show(renderer='iframe')


In [24]:
# Group the 'name' column by 'year' so each year's values can be pulled with get_group()
brand_production_per_Year = df.groupby('year')['name']

In [25]:
# Number of rows whose 'year' is 2020
print(f"Number of Cars Production in 2020 only '{len(brand_production_per_Year.get_group(2020))}'")

Number of Cars Production in 2020 only '74'


In [26]:
# Take the five years that follow the most recent one when years are ordered
# newest-first (2015-2019 for this data, whose latest year is 2020), then
# put them in ascending order.
last_five_years = sorted(
    df['year'].value_counts().sort_index(ascending=False).iloc[1:6].index
)

In [27]:
# One bar chart per year: the 10 most frequent brands among that year's rows
for year in last_five_years:
    # Compute the year's top-10 counts once and reuse them for both bar height and colour
    top_brands = brand_production_per_Year.get_group(year).value_counts().head(10)
    iplot(px.bar(top_brands,
                 title = f'Top 10 Brands of Car in {year}',
                 template = 'plotly_dark',
                 color = top_brands,
                 labels={'value':f'Number of Cars Produced in {year}','name':'Brand of Car'},
                 text_auto=True
                 ))

In [28]:
# Sum of selling prices for each year, drawn as a line chart
sum_Price_Years = df.groupby('year')['selling_price'].sum()
px.line(
    sum_Price_Years,
    x=sum_Price_Years.index,
    y=sum_Price_Years.values,
    markers=True,
    line_shape='spline',
    log_x=True,  # the year axis is drawn on a logarithmic scale
    color_discrete_sequence=['red'],
    template='plotly_dark',
    labels={'y': 'Total Selling Price', 'year': 'Year of Selling the Car'},
)

In [29]:
# Year with the largest summed selling price, and that sum formatted with thousands separators
print(f"Best year for total car sales profit in the Market in '{sum_Price_Years.idxmax()}' is ${sum_Price_Years.values.max():,.2f}")

Best year for total car sales profit in the Market in '2019' is $1,035,982,984.00


In [30]:
# Pie chart of how many rows fall under each fuel type
type_of_fuel = df['fuel'].value_counts()
px.pie(
    names=type_of_fuel.index,
    values=type_of_fuel.values,
    title='Types of Fuel',
    template='plotly_dark',
).update_traces(textinfo='label+percent')

In [31]:
# Mean selling price for each fuel type
price_Type_Fuel = df.groupby('fuel')['selling_price'].mean()
px.bar(x = price_Type_Fuel.index,
       y = price_Type_Fuel.values,
       template = 'plotly_dark',
       color = price_Type_Fuel.index,
       text_auto = True,
       title = 'Fuel type affects selling Price',
       # The bars are group means, so the axis is labelled as an average rather than a total
       labels = {'y':'Average Selling Price','x':'Type of Fuel'}
       )

In [32]:
# First -> Effect of Transmission Type on Selling Price

In [33]:
# Pie chart of how many rows fall under each transmission type
type_transmission = df['transmission'].value_counts()
px.pie(
    names=type_transmission.index,
    values=type_transmission.values,
    title='Types of Transmission',
    template='plotly_dark',
).update_traces(textinfo='label+percent')

In [34]:
# Mean selling price for each transmission type
Selling_price_transmission = df.groupby('transmission')['selling_price'].mean()
px.bar(x = Selling_price_transmission.index,
       y = Selling_price_transmission.values,
       template = 'plotly_dark',
       color = Selling_price_transmission.index,
       text_auto = True,
       title = 'Transmission type affects selling Price',
       # The bars are group means, so the axis is labelled as an average rather than a total
       labels = {'y':'Average Selling Price','x':'Type of Transmission'}
       )

In [35]:
# Second -> Effect of Seller Type on Selling Price

In [36]:
# Pie chart of how many rows fall under each seller type.
# The title describes the distribution shown here (matching the fuel and transmission pies);
# the price comparison by seller type is a separate bar chart.
seller_types = df['seller_type'].value_counts()
px.pie(title = 'Types of Seller',
       values = seller_types.values,
       names = seller_types.index,
       template = 'plotly_dark'
       ).update_traces(textinfo='label+percent')

In [37]:
# Mean selling price for each seller type
Selling_price_seller_types = df.groupby('seller_type')['selling_price'].mean()
px.bar(x = Selling_price_seller_types.index,
       y = Selling_price_seller_types.values,
       template = 'plotly_dark',
       color = Selling_price_seller_types.index,
       text_auto = True,
       title = 'Seller type affects selling Price',
       # The bars are group means, so the axis is labelled as an average rather than a total
       labels = {'y':'Average Selling Price','x':'Type of Seller'}
       )

In [38]:
# Third -> Effect of Owner Type on Selling Price

In [39]:
# Pie chart of how many rows fall under each owner category.
# The title names the owner distribution; it previously repeated the seller-type title.
owner_types = df['owner'].value_counts()
px.pie(title = 'Types of Owner',
       values = owner_types.values,
       names = owner_types.index,
       template = 'plotly_dark'
       ).update_traces(textinfo='label+percent')

In [40]:
# Mean selling price for each owner category
Selling_price_owner_types = df.groupby('owner')['selling_price'].mean()
px.bar(x = Selling_price_owner_types.index,
       y = Selling_price_owner_types.values,
       template = 'plotly_dark',
       color = Selling_price_owner_types.index,
       text_auto = True,
       title = 'Owner type affects selling Price',
       # The bars are group means, so the axis is labelled as an average rather than a total
       labels = {'y':'Average Selling Price','x':'Type of Owner'}
       )

In [41]:
# Data Preprocessing

In [42]:
# Strip the unit text that follows the number in 'mileage', 'engine' and 'max_power'
# (letters, plus '/' for mileage), then cast the three columns to float.
df = df.replace(
    {'mileage': '[A-Za-z/]', 'engine': '[A-Za-z]', 'max_power': '[A-Za-z]'},
    '',
    regex=True,
).astype({'mileage': float, 'engine': float, 'max_power': float})

In [43]:
# 'torque' is free-form text mixing several formats and units, so it is left out of the features
df = df.drop(columns='torque')

In [44]:
# Convert the 'year' column into the car's age in years and rename it to 'age'.
# REFERENCE_YEAR is the year ages are measured from. It is pinned to a constant
# (rather than read from the clock) so re-running the notebook later produces the
# same feature values; code that feeds new data to the trained model should
# presumably compute age against the same reference year.
REFERENCE_YEAR = 2025
df['year'] = REFERENCE_YEAR - df['year']
df.rename(columns={'year':'age'},inplace=True)

In [45]:
# First rows after unit stripping, dropping 'torque' and converting year to age
df.head()

Unnamed: 0,name,age,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti,11,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda,11,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda,19,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai,15,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti,18,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


In [46]:
# Column dtypes after the numeric conversions
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7906 entries, 0 to 7905
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7906 non-null   object 
 1   age            7906 non-null   int64  
 2   selling_price  7906 non-null   int64  
 3   km_driven      7906 non-null   int64  
 4   fuel           7906 non-null   object 
 5   seller_type    7906 non-null   object 
 6   transmission   7906 non-null   object 
 7   owner          7906 non-null   object 
 8   mileage        7906 non-null   float64
 9   engine         7906 non-null   float64
 10  max_power      7906 non-null   float64
 11  seats          7906 non-null   float64
dtypes: float64(4), int64(3), object(5)
memory usage: 741.3+ KB


In [47]:
# Encode Object DataType

In [48]:
column_to_encode = ['name','fuel','seller_type','transmission','owner']
# Fit a separate LabelEncoder for each column and keep them all. Refitting one shared
# encoder would leave only the last column's label-to-code mapping available, so the
# codes of the other columns could not be inverted or re-applied to new data.
label_encoders = {}
for column in column_to_encode:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [49]:
# First rows after label-encoding the categorical columns
df.head()

Unnamed: 0,name,age,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,20,11,450000,145500,1,1,1,0,23.4,1248.0,74.0,5.0
1,26,11,370000,120000,1,1,1,2,21.14,1498.0,103.52,5.0
2,10,19,158000,140000,3,1,1,4,17.7,1497.0,78.0,5.0
3,11,15,225000,127000,1,1,1,0,23.0,1396.0,90.0,5.0
4,20,18,130000,120000,3,1,1,0,16.1,1298.0,88.2,5.0


In [50]:
# Column dtypes after encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7906 entries, 0 to 7905
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7906 non-null   int64  
 1   age            7906 non-null   int64  
 2   selling_price  7906 non-null   int64  
 3   km_driven      7906 non-null   int64  
 4   fuel           7906 non-null   int64  
 5   seller_type    7906 non-null   int64  
 6   transmission   7906 non-null   int64  
 7   owner          7906 non-null   int64  
 8   mileage        7906 non-null   float64
 9   engine         7906 non-null   float64
 10  max_power      7906 non-null   float64
 11  seats          7906 non-null   float64
dtypes: float64(4), int64(8)
memory usage: 741.3 KB


In [51]:
# The target is the selling price; every other column is used as a feature
y = df['selling_price']
X = df.drop(columns='selling_price')

In [52]:
# Hold out 25% of the rows for testing; rows are shuffled with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=44,
)

In [53]:
# Print the shape of each part of the split
for split_label, split_part in (
    ('X_Train', X_train),
    ('X_Test', X_test),
    ('Y_Train', y_train),
    ('Y_Test', y_test),
):
    print(f'Shape of {split_label} {split_part.shape}')

Shape of X_Train (5929, 11)
Shape of X_Test (1977, 11)
Shape of Y_Train (5929,)
Shape of Y_Test (1977,)


In [54]:
# Modeling

In [55]:
# Helper functions used to evaluate each model algorithm
def Kfold(model, model_name, cv=10):
    """Cross-validate ``model`` on the full ``X`` / ``y`` and print the mean score.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``, which scores it with the
        estimator's default scorer. ``cross_val_score`` works on clones, so
        ``model`` itself is not fitted by this call.
    model_name : str
        Label used in the printed message.
    cv : int, default 10
        Cross-validation setting forwarded to ``cross_val_score``.

    Notes
    -----
    Reads the notebook-level ``X`` and ``y``. Prints the result; returns nothing.
    """
    # Keep the per-fold scores under their own name instead of rebinding `model`
    fold_scores = cross_val_score(model , X , y , cv = cv)
    model_score = np.average(fold_scores)
    print(f"{model_name} score on cross validation: {model_score * 100}%")

def train(model, model_name):
    """Fit ``model`` on the training split and print its score on both splits.

    Uses the notebook-level ``X_train`` / ``y_train`` for fitting and scores the
    fitted model with ``model.score`` on the training and the test split.
    The model is fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train)
    train_pct = model.score(X_train, y_train) * 100
    test_pct = model.score(X_test, y_test) * 100
    print(f"{model_name} model score on Training data: {train_pct}%")
    print(f"{model_name} model score on Testing data: {test_pct}%")

def r2(model, model_name):
    """Print the R2 score of an already fitted ``model`` on the test split.

    Predicts on the notebook-level ``X_test`` and compares against ``y_test``
    with ``r2_score``. Prints the result; returns nothing.
    """
    test_predictions = model.predict(X_test)
    score = r2_score(y_test, test_predictions)
    print(f"R2 Score for {model_name} is {score * 100}%")

In [56]:
# Linear Regression Model: cross-validate, then fit on the training split and report scores
lr = LinearRegression()
Kfold(lr,'Linear Regression')
train(lr,'Linear Regression')
r2(lr,'Linear Regression')

Linear Regression score on cross validation: 66.0028390382205%
Linear Regression model score on Training data: 67.61174801974936%
Linear Regression model score on Testing data: 68.83581706675936%
R2 Score for Linear Regression is 68.83581706675936%


In [57]:
# Random Forest Model
# random_state is fixed (same seed as the train/test split) so that the
# bootstrap sampling, and therefore the reported scores, are reproducible across runs.
rf_model = RandomForestRegressor(n_estimators = 100 , max_depth = 10 , random_state = 44)
Kfold(rf_model, "Random Forest")
train(rf_model, "Random Forest")
r2(rf_model, "Random Forest")

Random Forest score on cross validation: 96.35380814696323%
Random Forest model score on Training data: 98.83957963102648%
Random Forest model score on Testing data: 97.24793111875968%
R2 Score for Random Forest is 97.24793111875968%


In [58]:
# Bagging Model
# random_state is fixed (same seed as the train/test split) so that the
# resampling, and therefore the reported scores, are reproducible across runs.
bag = BaggingRegressor(n_estimators= 10 , random_state = 44)
Kfold(bag,'Bagging')
train(bag,'Bagging')
r2(bag,'Bagging')

Bagging score on cross validation: 96.31202190951565%
Bagging model score on Training data: 99.38972892925726%
Bagging model score on Testing data: 96.78389737278083%
R2 Score for Bagging is 96.78389737278083%


In [59]:
# Gradient Boosting Model
# random_state is fixed (same seed as the train/test split) so the reported
# scores are reproducible across runs.
GrBoost = GradientBoostingRegressor(random_state = 44)
Kfold(GrBoost,'Gradient Boosting')
train(GrBoost,'Gradient Boosting')
r2(GrBoost,'Gradient Boosting')

Gradient Boosting score on cross validation: 95.44396218336692%
Gradient Boosting model score on Training data: 97.45946904753747%
Gradient Boosting model score on Testing data: 96.3381270748732%
R2 Score for Gradient Boosting is 96.3381270748732%


In [60]:
# KNN regressor with default settings: cross-validate, fit, and report scores.
# NOTE(review): the features are passed in unscaled here (MinMaxScaler is imported
# but not applied in the visible cells). KNN is distance-based, so large-valued
# columns such as km_driven likely dominate the distance - consider scaling; confirm.
knn = KNeighborsRegressor()
Kfold(knn,'KNN')
train(knn,'KNN')
r2(knn,'KNN')

KNN score on cross validation: 79.96605737923105%
KNN model score on Training data: 86.79748626374615%
KNN model score on Testing data: 82.5546602971281%
R2 Score for KNN is 82.5546602971281%


In [61]:
# Decision Tree Regressor
# random_state is fixed (same seed as the train/test split) so the reported
# scores are reproducible across runs.
DTR = DecisionTreeRegressor(random_state = 44)
Kfold(DTR,'Decision Tree')
train(DTR,'Decision Tree')
r2(DTR,'Decision Tree')

Decision Tree score on cross validation: 94.32484424267301%
Decision Tree model score on Training data: 99.9662127253933%
Decision Tree model score on Testing data: 95.93677821856593%
R2 Score for Decision Tree is 95.93677821856593%


In [62]:
# XGBoost regressor with default hyperparameters: cross-validate, fit, and report scores.
# The fitted `xgboost` object is the model that gets saved to disk in the following cell.
xgboost = XGBRegressor()
Kfold(xgboost, "XGBoost")
train(xgboost, "XGBoost")
r2(xgboost, "XGBoost")

XGBoost score on cross validation: 96.26465590866546%
XGBoost model score on Training data: 99.61505280210022%
XGBoost model score on Testing data: 97.41086015985965%
R2 Score for XGBoost is 97.41086015985965%


In [63]:
# Persist the fitted XGBoost regressor to 'XGBoost.sav' with joblib
# (joblib.dump returns the list of file names it wrote)
joblib.dump(xgboost,'XGBoost.sav')

['XGBoost.sav']

In [64]:
# Streamlit Application 
##  [🚀 Click here to go to the GitHub repo](https://github.com/ahmedismaiill/AI-Projects-Main/tree/main/1-%20Machine%20Learning%20/1-%20Regression/Car%20Price%20Prediction)