In [185]:
import numpy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning package
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import pickle

%matplotlib inline

## Load dataset

In [144]:
# Load the raw car listings; the path is relative to the notebook's working directory.
data = pd.read_csv('CarPrice.csv')
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [145]:
# Show the column labels of the loaded frame.
data.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage', 'engine', 'max_power', 'torque',
       'seats'],
      dtype='object')

# Exploratory analysis

In [146]:
# (rows, columns) of the raw frame, before any cleaning.
data.shape

(8128, 13)

In [147]:
# Number of missing values in each column.
data.isnull().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

### Remove null values

In [148]:
# Drop every row that has at least one missing value. The index is NOT reset,
# so the surviving rows keep their original (now non-contiguous) labels.
data = data.dropna(how='any')
# Shape after removing incomplete rows.
data.shape

(7906, 13)

### Extract numeric value from the torque column

In [149]:
# Raw free-text torque strings, one per listing.
torque = data['torque']

In [150]:
# Match, in priority order: a decimal number ("12.7"), a number with a comma
# thousands separator ("2,700"), or a plain run of digits ("190").
pattern = r'(\d+\.\d+|\d+,\d+|\d+)'
# One output row per match, indexed by (original row label, match number);
# the matched text is in column 0.
torque = torque.str.extractall(pattern)

def convert_to_number(x):
    """Parse one numeric token captured from a torque string.

    A token containing a comma is treated as a thousands-separated integer
    ("2,700" -> 2700); otherwise a token containing a dot becomes a float
    ("12.7" -> 12.7) and anything else becomes an int ("190" -> 190).
    """
    if ',' in x:
        without_separator = x.replace(',', '')
        return int(without_separator)
    parse = float if '.' in x else int
    return parse(x)

# Convert every matched token to a number, then keep the largest number found
# in each original row. `extractall` omits rows whose text contains no digits,
# so re-align on the frame's index before flattening to a list: such rows
# become NaN instead of shortening or shifting the list, which is later
# assigned to the frame positionally.
torque = (
    torque[0]
    .apply(convert_to_number)
    .groupby(level=0)
    .max()
    .reindex(data.index)
    .tolist()
)

### Extract numeric value from the mileage column

In [151]:
# Each mileage entry looks like "<number> <unit>"; keep the leading number as a float.
mileage = [float(text.split(maxsplit=1)[0]) for text in data.mileage]

### Extract integer value in engine column

In [152]:
# Each engine entry looks like "<number> CC"; keep the leading number as an int.
engine = [int(text.split(maxsplit=1)[0]) for text in data.engine]

### Extract numeric value from the max_power column

In [153]:
# Each max_power entry looks like "<number> bhp"; keep the leading number as a float.
max_power = [float(text.split(maxsplit=1)[0]) for text in data.max_power]

### Compute car age

In [154]:
# Car age in years, measured against the hard-coded reference year 2024.
age = (2024 - data.year).tolist()

### Get car brand

In [155]:
# The brand is taken to be the first whitespace-separated word of the model
# name, so multi-word makes are truncated to their first word.
name = [full_name.split(maxsplit=1)[0] for full_name in data.name]

### Create new columns in our dataset

In [156]:
# Attach the derived features to the frame. Each value list is assigned
# positionally, so it must have exactly one entry per row of `data`.
# NOTE(review): 'torque_rmp' looks like a typo for "rpm"; the name is kept
# because later cells and the rendered outputs use it.
derived_columns = {
    'torque_rmp': torque,
    'mileage_kmpl': mileage,
    'engine_cc': engine,
    'max_power_bhp': max_power,
    'age': age,
    'brand': name,
}
for column_name, values in derived_columns.items():
    data[column_name] = values


In [157]:
# Remove the raw text columns now replaced by numeric features; 'owner' is
# discarded as well, with no derived replacement.
raw_columns = ['name', 'year', 'mileage', 'engine', 'max_power', 'torque', 'owner']
data = data.drop(columns=raw_columns)
data.head()

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,seats,torque_rmp,mileage_kmpl,engine_cc,max_power_bhp,age,brand
0,450000,145500,Diesel,Individual,Manual,5.0,2000.0,23.4,1248,74.0,10,Maruti
1,370000,120000,Diesel,Individual,Manual,5.0,2500.0,21.14,1498,103.52,10,Skoda
2,158000,140000,Petrol,Individual,Manual,5.0,2700.0,17.7,1497,78.0,18,Honda
3,225000,127000,Diesel,Individual,Manual,5.0,2750.0,23.0,1396,90.0,14,Hyundai
4,130000,120000,Petrol,Individual,Manual,5.0,4500.0,16.1,1298,88.2,17,Maruti


In [158]:
# Number of listings per brand, most frequent first.
data.brand.value_counts()

brand
Maruti           2367
Hyundai          1360
Mahindra          758
Tata              719
Honda             466
Toyota            452
Ford              388
Chevrolet         230
Renault           228
Volkswagen        185
BMW               118
Skoda             104
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               41
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Name: count, dtype: int64

### Deal with rare brands (group them into 'Other')

In [159]:
# Brands with fewer than `threshold` listings are treated as rare.
threshold = 100
count = data['brand'].value_counts()
# Labels of the rare brands.
aggregate = count.loc[count.lt(threshold)].index

In [160]:
# Replace every rare brand with the single bucket 'Other'; other brands are unchanged.
data['brand'] = data['brand'].mask(data['brand'].isin(aggregate), 'Other')


In [161]:
# Re-count listings per brand after rare brands were merged into 'Other'.
data.brand.value_counts()

brand
Maruti        2367
Hyundai       1360
Mahindra       758
Tata           719
Other          531
Honda          466
Toyota         452
Ford           388
Chevrolet      230
Renault        228
Volkswagen     185
BMW            118
Skoda          104
Name: count, dtype: int64

### Encode categorical features

In [163]:
# Work on a copy so `data` keeps the human-readable category strings.
data_copy = data.copy()
cols = ['brand', 'fuel', 'seller_type', 'transmission']
# Fit a separate encoder per column and keep each one. Refitting a single
# shared encoder would retain only the last column's string -> code mapping,
# making it impossible to decode the other columns or to encode new rows
# consistently at prediction time.
label_encoders = {}
for col in cols:
    label_encoder = LabelEncoder()
    data_copy[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

## Visualize the dataset

In [164]:
# Preview the frame after the categorical columns were replaced by integer codes.
data_copy.head()

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,seats,torque_rmp,mileage_kmpl,engine_cc,max_power_bhp,age,brand
0,450000,145500,1,1,1,5.0,2000.0,23.4,1248,74.0,10,6
1,370000,120000,1,1,1,5.0,2500.0,21.14,1498,103.52,10,9
2,158000,140000,3,1,1,5.0,2700.0,17.7,1497,78.0,18,3
3,225000,127000,1,1,1,5.0,2750.0,23.0,1396,90.0,14,4
4,130000,120000,3,1,1,5.0,4500.0,16.1,1298,88.2,17,6


In [165]:
# Summary statistics of every (now numeric) column.
data_copy.describe()

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,seats,torque_rmp,mileage_kmpl,engine_cc,max_power_bhp,age,brand
count,7906.0,7906.0,7906.0,7906.0,7906.0,7906.0,7906.0,7906.0,7906.0,7906.0,7906.0,7906.0
mean,649813.7,69188.66,1.888313,0.889831,0.868328,5.416393,3069.864154,19.419861,1458.708829,91.587374,10.016064,5.906274
std,813582.7,56792.3,1.001478,0.397182,0.338155,0.959208,943.6621,4.036263,503.893057,35.747216,3.863695,2.717376
min,29999.0,1.0,0.0,0.0,0.0,2.0,400.0,0.0,624.0,32.8,4.0,0.0
25%,270000.0,35000.0,1.0,1.0,1.0,5.0,2400.0,16.78,1197.0,68.05,7.0,4.0
50%,450000.0,60000.0,1.0,1.0,1.0,5.0,3000.0,19.3,1248.0,82.0,9.0,6.0
75%,690000.0,95425.0,3.0,1.0,1.0,5.0,4000.0,22.32,1582.0,102.0,12.0,7.0
max,10000000.0,2360457.0,3.0,2.0,1.0,14.0,21800.0,42.0,3604.0,400.0,30.0,12.0


### Normalize data

In [141]:
def normalize(col):
    """Min-max scale a numeric pandas Series.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to rescale.

    Returns
    -------
    list of float
        ``(x - min) / (max - min)`` for each element, in the original order.
        A constant column (``max == min``) maps to all zeros rather than
        dividing by zero and producing NaN.
    """
    min_ = col.min()
    span = col.max() - min_
    if span == 0:
        # Every value equals the minimum; the scaled value is defined as 0.
        return [0.0] * len(col)
    # Vectorised arithmetic; flattened to a list to keep the return type.
    return ((col - min_) / span).tolist()

#### Normalize

In [170]:
# Rescale every column except the target ('selling_price') and the
# 'brand' codes, which are left as they are.
for col in data_copy.columns:
    if col not in ('selling_price', 'brand'):
        data_copy[col] = normalize(data_copy[col])

### Machine learning model

#### Split data

In [172]:
# Features: every column except the price. Target: the selling price.
X = data_copy.drop(columns='selling_price')
Y = data_copy['selling_price']

In [174]:
# Hold out 30% of the rows for evaluation. The split is seeded so the
# train/test partition, and therefore the reported test score, is the same
# on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
# Seeded random forest with 300 trees; n_jobs=-1 uses all available CPU cores.
RF_model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)

In [175]:
# Train the forest on the training split only.
RF_model.fit(X_train, Y_train)

In [176]:
# Predicted selling prices for the held-out rows.
y_pred = RF_model.predict(X_test)

### R² score on test data

In [183]:
# For a regressor, `score` is the coefficient of determination (R²), not a
# classification accuracy. It is shown here as a percentage, rounded to two decimals.
test_r2 = RF_model.score(X_test, Y_test)
acc = round(test_r2 * 100, 2)
acc

96.75

### Save the trained model

In [187]:

# Persist the fitted estimator. Only the model object is written here; the
# category encodings and min/max scaling applied to the features are not.
with open('carprice_model.pkl', 'wb') as model_file:
    pickle.dump(RF_model, model_file)
