In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import joblib

In [2]:
# Load the dataset
df = pd.read_csv('car data.csv')

In [3]:
df.shape

(301, 9)

In [4]:
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [5]:
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [7]:
df.Fuel_Type.value_counts()

Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64

In [8]:
df.Seller_Type.value_counts()

Seller_Type
Dealer        195
Individual    106
Name: count, dtype: int64

In [9]:
df.Transmission.value_counts()

Transmission
Manual       261
Automatic     40
Name: count, dtype: int64

In [10]:
# Data preprocessing
# Replace categorical values with numerical values
df.replace({'Fuel_Type':{'Petrol':0, 'Diesel':1, 'CNG':2}},inplace=True)
df.replace({'Seller_Type': {'Dealer':0, 'Individual':1}},inplace=True)
df.replace({'Transmission': {'Manual':0, 'Automatic':1}},inplace=True)

In [11]:
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,0,0,0,0
1,sx4,2013,4.75,9.54,43000,1,0,0,0
2,ciaz,2017,7.25,9.85,6900,0,0,0,0
3,wagon r,2011,2.85,4.15,5200,0,0,0,0
4,swift,2014,4.6,6.87,42450,1,0,0,0


In [12]:
# Define the features and target variable
X = df.drop(['Car_Name', 'Selling_Price'],axis=1)
Y = df['Selling_Price']

In [13]:
# Split the data into training and testing sets
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.15, random_state=2)

In [14]:
train_x.shape

(255, 7)

In [15]:
train_y.shape

(255,)

In [16]:
test_x.shape

(46, 7)

In [17]:
test_y.shape

(46,)

In [18]:
# Initialize the model
model = RandomForestRegressor()

In [19]:
# Train the model
model.fit(train_x, train_y)

In [20]:
# Evaluate the model
model.score(test_x, test_y)

0.9750561648664589

In [21]:
model.score(train_x, train_y)

0.9882100243066974

In [22]:
y_pred = model.predict(test_x)

In [23]:
print("Accuracy: ", r2_score(test_y, y_pred))

Accuracy:  0.9750561648664589


In [24]:
joblib_file = "random_forest_model.pkl"
joblib.dump(model, joblib_file)

['random_forest_model.pkl']