In [8]:
import sys

# Make modules under ../scripts importable from this notebook (presumably where
# the `preprocessing` module imported below lives -- confirm against the repo).
# Guarded so that re-running the cell does not append the same entry again.
if '../scripts' not in sys.path:
    sys.path.append('../scripts')

In [9]:
import pandas as pd
import numpy as np
import preprocessing

In [10]:
# Read the raw car-price CSV into a DataFrame and preview its first rows.
raw_csv_path = '../data/raw/car_price_prediction.csv'
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [11]:
# Run the project's preprocessing pipeline on the loaded frame. The pipeline
# prints its own progress log (duplicate removal, outlier cleaning, feature
# engineering, column dropping) and returns the processed DataFrame.
data = data.pipe(preprocessing.preprocessing_pipeline)

Preprocessing started...
Initial shape: (19237, 18)
After dropping duplicates: (18924, 18)
Replacing categorical values...
After cleaning outliers: (16037, 18)
Feature engineering...
Dropping columns...
Final shape: (16037, 16)


In [12]:
# Preview the first rows of the preprocessed frame.
data.head(n=5)

Unnamed: 0,Price,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Wheel,Color,Airbags,Age
0,13328,1399,LEXUS,RX 450,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,Left wheel,Silver,12,14
1,16621,1018,CHEVROLET,Equinox,Jeep,No,Petrol,3.0,192000,6.0,Tiptronic,4x4,Left wheel,Black,8,13
2,8467,0,HONDA,FIT,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,Right-hand drive,Black,2,18
3,3607,862,FORD,Escape,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,Left wheel,White,0,13
4,11726,446,HONDA,FIT,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,Left wheel,Silver,4,10


In [13]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Categorical features that are expanded into one indicator column per value.
one_hot_columns = ['Leather interior', 'Gear box type', 'Drive wheels', 'Wheel']

# Dense output so the result can be wrapped in a DataFrame directly; categories
# not seen while fitting are ignored at transform time instead of raising.
oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# NOTE(review): the encoder is fitted on the whole frame, i.e. before the
# train/test split performed later in this notebook.
oh_encoder.fit(data[one_hot_columns])
oh_encoded_train = oh_encoder.transform(data[one_hot_columns])

# Generated column names, e.g. '<feature>_<category>'.
oh_encoded_columns = oh_encoder.get_feature_names_out(one_hot_columns)


In [14]:
# Wrap the encoded array in a DataFrame that reuses `data`'s index, so the two
# frames line up row-for-row when they are concatenated.
oh_encoded_train_df = pd.DataFrame(
    data=oh_encoded_train,
    index=data.index,
    columns=oh_encoded_columns,
)


In [15]:
# Swap the original categorical columns for their one-hot encoded versions.
# The drop happens inside the same expression (no `inplace=True`), so if this
# cell is re-run after the source columns are already gone, `drop` raises a
# KeyError *before* `data` is reassigned and the frame is left untouched,
# rather than ending up with duplicated encoded columns.
data = pd.concat(
    [data.drop(columns=one_hot_columns), oh_encoded_train_df],
    axis=1,
)

In [16]:
import os
import pickle

# Save the fitted one-hot encoder for future use.
# Create the output directory first so the cell also works when `../models`
# does not exist yet (otherwise `open` fails with FileNotFoundError).
os.makedirs('../models', exist_ok=True)
with open('../models/one_hot_encoder.pkl', 'wb') as f:
    pickle.dump(oh_encoder, f)

In [17]:
# Categorical features that are mapped to integer codes instead of being
# one-hot encoded.
label_encode_columns = ['Manufacturer', 'Model', 'Category', 'Fuel type', 'Color']

# One independent encoder per column, keyed by column name, so each mapping
# can be reused later.
label_encoders = {column: LabelEncoder() for column in label_encode_columns}

# Fit every encoder on its own column and overwrite that column with the codes.
for column, encoder in label_encoders.items():
    data[column] = encoder.fit_transform(data[column])

In [18]:
# Persist the dict of fitted label encoders (keyed by column name) for reuse.
with open('../models/label_encoders.pkl', mode='wb') as encoders_file:
    pickle.dump(label_encoders, encoders_file)

In [19]:
# Separate the regression target (`Price`) from the feature columns.
y = data['Price']
X = data.drop(columns='Price')

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

# Report the size of each partition.
print(f"Train set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

Train set: 13631 samples
Test set: 2406 samples


In [21]:
# StandardScaler's public home is sklearn.preprocessing; importing it from
# sklearn.discriminant_analysis only works because that module happens to
# import it internally.
from sklearn.preprocessing import StandardScaler

# Continuous features that are standardised (zero mean, unit variance).
numerical_columns = ['Levy', 'Engine volume', 'Mileage', 'Age']

# Work on explicit copies so the column assignments below do not write into
# frames derived from `X` (avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training rows only, then apply the same transformation to the
# test rows so no test-set statistics leak into the scaler.
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [22]:
# Persist the fitted scaler so the same standardisation can be reapplied later.
with open('../models/scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [26]:
# Display the training feature frame (pandas truncates the rendered rows/columns).
X_train

Unnamed: 0,Levy,Manufacturer,Model,Category,Fuel type,Engine volume,Mileage,Cylinders,Color,Airbags,...,Leather interior_Yes,Gear box type_Automatic,Gear box type_Manual,Gear box type_Tiptronic,Gear box type_Variator,Drive wheels_4x4,Drive wheels_Front,Drive wheels_Rear,Wheel_Left wheel,Wheel_Right-hand drive
10363,-1.253429,35,391,3,5,-0.891360,0.246311,4.0,12,4,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
16277,0.519755,20,541,9,5,-0.555948,-0.103803,4.0,2,12,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
6811,1.723769,38,1311,10,5,-0.891360,1.066891,4.0,7,10,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
14217,0.791205,20,698,10,1,0.617993,-0.491465,4.0,14,4,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
11844,0.576672,11,361,3,5,-0.220536,0.010083,4.0,14,8,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16106,1.533316,50,1024,4,1,0.953405,-0.661101,4.0,12,4,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
6437,0.421245,37,2,9,5,-0.220536,-0.743420,4.0,14,12,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1011,-1.253429,53,750,3,5,-0.555948,-0.387773,4.0,14,0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
18952,1.312215,25,1121,4,5,2.295052,-0.097798,6.0,7,10,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [29]:
# List the feature column names, in the order the model receives them.
X_train.columns

Index(['Levy', 'Manufacturer', 'Model', 'Category', 'Fuel type',
       'Engine volume', 'Mileage', 'Cylinders', 'Color', 'Airbags', 'Age',
       'Leather interior_No', 'Leather interior_Yes',
       'Gear box type_Automatic', 'Gear box type_Manual',
       'Gear box type_Tiptronic', 'Gear box type_Variator', 'Drive wheels_4x4',
       'Drive wheels_Front', 'Drive wheels_Rear', 'Wheel_Left wheel',
       'Wheel_Right-hand drive'],
      dtype='object')

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

# Seed the forest so that the fitted model and the metrics printed below are
# reproducible across runs (same seed as the train/test split).
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_test_pred = rf.predict(X_test)
rmse = root_mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print(f"Root Mean Squared Error: {rmse}")
print(f"R^2 Score: {r2}")

Root Mean Squared Error: 5374.411087632199
R^2 Score: 0.7783866029256523


In [25]:
# Save the trained random forest model for future use
with open('../models/model.pkl', 'wb') as f:
    pickle.dump(rf, f)