In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Loading Dataset

In [2]:
df=pd.read_csv('cardata.csv')

In [3]:
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


## Checking null values

In [4]:
df.shape

(301, 9)

In [5]:
df.isna().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

## There is no null values

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [8]:
from sklearn.preprocessing import OneHotEncoder,LabelBinarizer
cat_process=ColumnTransformer([
    ('encoder',OneHotEncoder(),['Fuel_Type','Seller_Type','Transmission'])
])

In [9]:
x=df.drop(['Selling_Price','Car_Name'],axis=1)
y=df['Selling_Price']

In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [11]:
import numpy as np
import pandas as pd
def outliers(X):
    Q1=np.quantile(X,0.25)
    Q3=np.quantile(X,0.75)
    IQR=Q3-Q1
    upper_bound=Q3+1.5*IQR
    lower_bound=Q1-1.5*IQR
    data_cliped=X.clip(lower=lower_bound,upper=upper_bound)    

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [14]:
model_1=Pipeline([
    ('data',cat_process),
    ('Linear',LinearRegression())
])
model_1.fit(x_train,y_train)

In [15]:
model_2=Pipeline([
    ('data',cat_process),
    ('SVM',SVR())
])
model_2.fit(x_train,y_train)

In [16]:
model_3=Pipeline([
    ('data',cat_process),
    ('Random',RandomForestRegressor())
])
model_3.fit(x_train,y_train)

In [17]:
model_4=Pipeline([
    ('data',cat_process),
    ('descion',DecisionTreeRegressor())
])
model_4.fit(x_train,y_train)

## Model Evaluation

In [18]:
from sklearn.metrics import r2_score,mean_squared_error

#### Prediction values

In [19]:
y_linear=model_1.predict(x_test)
y_svr=model_2.predict(x_test)
y_random=model_3.predict(x_test)
y_descion=model_4.predict(x_test)

In [20]:
print("The r2 score of Linear Regresion is :",r2_score(y_test,y_linear))
print("The r2 score of SVR is :",r2_score(y_test,y_svr))
print("The r2 score of Random Forest Regresion is :",r2_score(y_test,y_random))
print("The r2 score of Desicion tree Regresion is :",r2_score(y_test,y_descion))

The r2 score of Linear Regresion is : 0.520836035041201
The r2 score of SVR is : 0.619063991211094
The r2 score of Random Forest Regresion is : 0.7516090885770593
The r2 score of Desicion tree Regresion is : 0.7549817230619136


### Exporting the model

In [21]:
transform_data=model_1.named_steps['data'].transform(x)

In [22]:
transform_data.shape

(301, 7)

In [23]:
num_columns = transform_data.shape[1]
print(f"Number of columns after one-hot encoding: {num_columns}")


Number of columns after one-hot encoding: 7


In [24]:
df['Owner'].value_counts()

Owner
0    290
1     10
3      1
Name: count, dtype: int64

In [25]:
df['Fuel_Type'].value_counts()

Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64

In [26]:
df['Owner'].value_counts()

Owner
0    290
1     10
3      1
Name: count, dtype: int64

In [27]:
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [28]:
df.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [29]:
#name year Selling_price kms_driven fuel_type se
df_new=df.copy()

In [37]:
x_new=df_new.drop(columns=["Car_Name","Present_Price"])
y_new=df_new["Present_Price"]

In [31]:
x_new.head()

Unnamed: 0,Year,Selling_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,27000,Petrol,Dealer,Manual,0
1,2013,4.75,43000,Diesel,Dealer,Manual,0
2,2017,7.25,6900,Petrol,Dealer,Manual,0
3,2011,2.85,5200,Petrol,Dealer,Manual,0
4,2014,4.6,42450,Diesel,Dealer,Manual,0


In [32]:
df_new["Fuel_Type"]=df_new["Fuel_Type"].map({"Petrol":0,"Diesel":1,"CNG":3})
df_new["Seller_Type"]=df_new["Seller_Type"].map({"Individual":0,"Dealer":1})
df_new["Transmission"]=df_new["Transmission"].map({"Manual":0,"Automatic":1})

In [33]:
df_new

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,0,1,0,0
1,sx4,2013,4.75,9.54,43000,1,1,0,0
2,ciaz,2017,7.25,9.85,6900,0,1,0,0
3,wagon r,2011,2.85,4.15,5200,0,1,0,0
4,swift,2014,4.60,6.87,42450,1,1,0,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,9.50,11.60,33988,1,1,0,0
297,brio,2015,4.00,5.90,60000,0,1,0,0
298,city,2009,3.35,11.00,87934,0,1,0,0
299,city,2017,11.50,12.50,9000,1,1,0,0


In [40]:
x_new

Unnamed: 0,Year,Selling_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,27000,0,1,0,0
1,2013,4.75,43000,1,1,0,0
2,2017,7.25,6900,0,1,0,0
3,2011,2.85,5200,0,1,0,0
4,2014,4.60,42450,1,1,0,0
...,...,...,...,...,...,...,...
296,2016,9.50,33988,1,1,0,0
297,2015,4.00,60000,0,1,0,0
298,2009,3.35,87934,0,1,0,0
299,2017,11.50,9000,1,1,0,0


In [41]:
x_train_new,x_test_new,y_train_new,y_test_new=train_test_split(x_new,y_new,test_size=0.2,random_state=42)

In [42]:
rfr=RandomForestRegressor()
rfr.fit(x_train_new,y_train_new)

In [43]:
y_pre=rfr.predict(x_test_new)

In [44]:
r2_score(y_test_new,y_pre)

0.8511799352793695

In [46]:
import joblib
joblib.dump(rfr,'model.pkl')

['model.pkl']