## Reading cleaned dataset

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("dataset/cleaned.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,car_name,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,selling_price
0,0,Maruti Alto,9,120000,Individual,Petrol,Manual,19.7,120000
1,1,Hyundai Grand,5,20000,Individual,Petrol,Manual,18.9,550000
2,2,Hyundai i20,11,60000,Individual,Petrol,Manual,17.0,215000
3,3,Maruti Alto,9,37000,Individual,Petrol,Manual,20.92,226000
4,4,Ford Ecosport,6,30000,Dealer,Diesel,Manual,22.77,570000


## Basic data preparation

In [3]:
data = data.drop(["Unnamed: 0"], axis=1)

In [4]:
data.head()

Unnamed: 0,car_name,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,selling_price
0,Maruti Alto,9,120000,Individual,Petrol,Manual,19.7,120000
1,Hyundai Grand,5,20000,Individual,Petrol,Manual,18.9,550000
2,Hyundai i20,11,60000,Individual,Petrol,Manual,17.0,215000
3,Maruti Alto,9,37000,Individual,Petrol,Manual,20.92,226000
4,Ford Ecosport,6,30000,Dealer,Diesel,Manual,22.77,570000


## One-hot encoding

In [5]:
encoded_data = pd.get_dummies(data, columns=['seller_type', 'fuel_type', 'transmission_type'], dtype=int )
encoded_data.head()

Unnamed: 0,car_name,vehicle_age,km_driven,mileage,selling_price,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,fuel_type_CNG,fuel_type_Diesel,fuel_type_LPG,fuel_type_Petrol,transmission_type_Automatic,transmission_type_Manual
0,Maruti Alto,9,120000,19.7,120000,0,1,0,0,0,0,1,0,1
1,Hyundai Grand,5,20000,18.9,550000,0,1,0,0,0,0,1,0,1
2,Hyundai i20,11,60000,17.0,215000,0,1,0,0,0,0,1,0,1
3,Maruti Alto,9,37000,20.92,226000,0,1,0,0,0,0,1,0,1
4,Ford Ecosport,6,30000,22.77,570000,1,0,0,0,1,0,0,0,1


In [6]:
## Sliding the selling price column to the end for better splitting

target_price = data.loc[:,'selling_price']
encoded_data = encoded_data.drop(['selling_price'], axis=1)
encoded_data['selling_price'] = target_price

In [7]:
encoded_data.head(2)

Unnamed: 0,car_name,vehicle_age,km_driven,mileage,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,fuel_type_CNG,fuel_type_Diesel,fuel_type_LPG,fuel_type_Petrol,transmission_type_Automatic,transmission_type_Manual,selling_price
0,Maruti Alto,9,120000,19.7,0,1,0,0,0,0,1,0,1,120000
1,Hyundai Grand,5,20000,18.9,0,1,0,0,0,0,1,0,1,550000


## Converting to Numpy

In [8]:
data = encoded_data.to_numpy()

In [9]:
type(data)

numpy.ndarray

In [10]:
data.shape

(13539, 14)

In [11]:
data[0]

array(['Maruti Alto', 9, 120000, 19.7, 0, 1, 0, 0, 0, 0, 1, 0, 1, 120000],
      dtype=object)

## Handling car_name column

In [12]:
cars = data[:,0]
cars

array(['Maruti Alto', 'Hyundai Grand', 'Hyundai i20', ...,
       'Maruti Ertiga', 'Skoda Rapid', 'Honda City'],
      shape=(13539,), dtype=object)

In [13]:
# this will give us unique names

cars = set(cars)

In [14]:
# mapping car names to numbers

car_map = {}
for index, name in enumerate(cars):
    car_map[name] = index+1

print(car_map)
# Total 92 unique cars in the dataset.

{'Ford Figo': 1, 'Datsun redi-GO': 2, 'Datsun RediGO': 3, 'Toyota Fortuner': 4, 'Honda Jazz': 5, 'Mahindra Marazzo': 6, 'Hyundai Santro': 7, 'Maruti Vitara': 8, 'Skoda Superb': 9, 'Honda WR-V': 10, 'Skoda Octavia': 11, 'Honda Amaze': 12, 'Nissan X-Trail': 13, 'BMW 3': 14, 'Mercedes-Benz CLS': 15, 'Audi A4': 16, 'Jeep Compass': 17, 'Audi A6': 18, 'Mahindra Bolero': 19, 'Honda City': 20, 'Hyundai Venue': 21, 'Jaguar XF': 22, 'Maruti Ertiga': 23, 'Mini Cooper': 24, 'Land Rover Rover': 25, 'Maruti Alto': 26, 'Honda Civic': 27, 'Hyundai Grand': 28, 'Ford Aspire': 29, 'Renault Triber': 30, 'Maruti Dzire ZXI': 31, 'Tata Tigor': 32, 'Mahindra XUV500': 33, 'Toyota Innova': 34, 'Hyundai Aura': 35, 'Toyota Yaris': 36, 'Tata Safari': 37, 'Honda CR': 38, 'Tata Nexon': 39, 'Mahindra KUV100': 40, 'Maruti S-Presso': 41, 'Mahindra Thar': 42, 'Datsun GO': 43, 'Volkswagen Vento': 44, 'Toyota Camry': 45, 'Maruti Swift': 46, 'Ford Freestyle': 47, 'Mercedes-Benz S-Class': 48, 'Maruti Baleno': 49, 'Hyundai i

In [15]:
import json

file_path = "dataset/car-name-mapping.json"

with open(file_path, "w") as file:
    json.dump(car_map, file, indent=4)

In [16]:
for row in data:
    row[0] = car_map[row[0]]

In [17]:
data[4012]

array([69, 3, 29000, 28.4, 0, 1, 0, 0, 1, 0, 0, 0, 1, 720000],
      dtype=object)

## Data Preprocessing
Split the dataset into training and testing sets, and scale the features.

In [18]:
from sklearn.preprocessing import StandardScaler

X = data[:, :-1]  # Features
y = data[:, -1]   # Target


In [19]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X[:,1:4])

In [20]:
X[:,1:4] = scaled_features

In [21]:
X[1]

array([28, -0.3490331650851443, -1.1159125268287682, -0.34313373766196315,
       0, 1, 0, 0, 0, 0, 1, 0, 1], dtype=object)

In [22]:
price_scaler = StandardScaler()
scaled_price = price_scaler.fit_transform(y.reshape(-1,1))

In [23]:
scaled_price

array([[-1.64891969],
       [-0.08983841],
       [-1.3044715 ],
       ...,
       [ 1.2698255 ],
       [-0.54305971],
       [ 2.26691236]], shape=(13539, 1))

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,scaled_price,test_size=0.2, random_state=42)

## Linear Regression Model

In [25]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [26]:
model.fit(X_train, y_train)

In [27]:
y_pred = model.predict(X_test)

In [28]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_pred=y_pred, y_true=y_test)

0.47139769343469

In [29]:
print(model.score(X_test, y_test))

0.6338318359571987


In [30]:
from sklearn.metrics import r2_score

print(r2_score(y_test, y_pred))

0.6338318359571987


## Polynomial Regression

In [31]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)

X_train_poly = poly.fit_transform(X_train)

lin2 = LinearRegression()

lin2.fit(X_train_poly, y_train)


In [32]:
X_test_poly = poly.transform(X_test)

y_pred_poly = lin2.predict(X_test_poly)

In [33]:
lin2.score(X_test_poly, y_test)

0.7095315262640465

In [35]:
mean_absolute_error(y_pred=y_pred_poly, y_true=y_test)

0.4019474634082295

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor()

rf_model.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [None]:
rf_model.score(X_test, y_test)

0.878099518924329

In [None]:
y_pred = rf_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_pred))

0.253339180640986


In [None]:
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)

0.878099518924329

In [None]:
from sklearn.metrics import root_mean_squared_error

root_mean_squared_error(y_test, y_pred)


0.35344380742518183

## Polynomial Regression without Car_name feature

In [None]:
X_train = X_train_poly[:,1:]
X_test = X_test_poly[:,1:]

In [None]:
X_train

array([[48.        , -0.34903317,  0.65912404, ...,  1.        ,
         1.        ,  1.        ],
       [29.        , -1.37158655, -0.76325392, ...,  0.        ,
         0.        ,  1.        ],
       [58.        ,  0.67352022,  0.12974821, ...,  1.        ,
         1.        ,  1.        ],
       ...,
       [ 2.        ,  1.01437134,  0.98004338, ...,  1.        ,
         1.        ,  1.        ],
       [26.        ,  0.67352022,  1.35269773, ...,  0.        ,
         0.        ,  1.        ],
       [25.        , -1.03073542, -0.93958322, ...,  1.        ,
         1.        ,  1.        ]], shape=(10831, 65))

In [None]:
model2 = LinearRegression()

model2.fit(X_train, y_train)

In [None]:
model2.score(X_test, y_test)

0.7059486555192144