In [110]:
import pandas as pd
import numpy as np
import preprocessing

In [111]:
# Load the raw car listings (path is relative to the working directory)
# and preview the first rows.
csv_path = './car_price_prediction.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [112]:
# Run the project's preprocessing pipeline on the loaded frame.
# NOTE: this rebinds `data`, so re-running this cell without first re-running
# the load cell feeds the already-processed frame back into the pipeline.
data = preprocessing.preprocessing_pipeline(data)

Preprocessing started...
Initial shape: (19237, 18)
After dropping duplicates: (18924, 18)
Replacing categorical values...
After cleaning outliers: (16037, 18)
Feature engineering...
Dropping columns...
Final shape: (16037, 16)


In [113]:
# Preview the frame returned by the preprocessing pipeline.
data.head()

Unnamed: 0,Price,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Wheel,Color,Airbags,Age
0,13328,1399,LEXUS,RX 450,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,Left wheel,Silver,12,14
1,16621,1018,CHEVROLET,Equinox,Jeep,No,Petrol,3.0,192000,6.0,Tiptronic,4x4,Left wheel,Black,8,13
2,8467,0,HONDA,FIT,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,Right-hand drive,Black,2,18
3,3607,862,FORD,Escape,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,Left wheel,White,0,13
4,11726,446,HONDA,FIT,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,Left wheel,Silver,4,10


In [114]:
from sklearn.preprocessing import LabelEncoder

# Expand these categorical features into one indicator column per level.
# NOTE: get_dummies replaces the listed columns, so they no longer exist in
# `data` afterwards; re-run from the preprocessing cell rather than re-running
# this cell on its own output.
one_hot_columns = ['Leather interior', 'Gear box type', 'Drive wheels', 'Wheel']

data = pd.get_dummies(data, columns=one_hot_columns)

# These categorical features are replaced by integer codes.
label_encode_columns = ['Category', 'Fuel type', 'Color']

# Fit a separate encoder per column and keep each one, so the integer codes
# can be mapped back to the original labels later via
# `label_encoders[column].classes_` / `.inverse_transform(...)`.
# (Refitting a single shared encoder would retain only the last column's mapping.)
label_encoders = {}

for column in label_encode_columns:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

In [115]:
# Display the encoded frame (pandas truncates the middle rows and columns).
# NOTE(review): in the recorded output the index labels still run up to 19235
# although preprocessing reported 16037 rows, i.e. the index was not reset
# after rows were dropped — confirm nothing downstream relies on positional labels.
data

Unnamed: 0,Price,Levy,Manufacturer,Model,Category,Fuel type,Engine volume,Mileage,Cylinders,Color,...,Leather interior_Yes,Gear box type_Automatic,Gear box type_Manual,Gear box type_Tiptronic,Gear box type_Variator,Drive wheels_4x4,Drive wheels_Front,Drive wheels_Rear,Wheel_Left wheel,Wheel_Right-hand drive
0,13328,1399,LEXUS,RX 450,4,2,3.5,186005,6.0,12,...,1,1,0,0,0,1,0,0,1,0
1,16621,1018,CHEVROLET,Equinox,4,5,3.0,192000,6.0,1,...,0,0,0,1,0,1,0,0,1,0
2,8467,0,HONDA,FIT,3,5,1.3,200000,4.0,1,...,0,0,0,0,1,0,1,0,0,1
3,3607,862,FORD,Escape,4,2,2.5,168966,4.0,14,...,1,1,0,0,0,1,0,0,1,0
4,11726,446,HONDA,FIT,3,5,1.3,91901,4.0,12,...,1,1,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,8467,0,MERCEDES-BENZ,CLK 200,1,0,2.0,300000,4.0,12,...,1,0,1,0,0,0,0,1,1,0
19233,15681,831,HYUNDAI,Sonata,9,5,2.4,161600,4.0,11,...,1,0,0,1,0,0,1,0,1,0
19234,26108,836,HYUNDAI,Tucson,4,1,2.0,116365,4.0,7,...,1,1,0,0,0,0,1,0,1,0
19235,5331,1288,CHEVROLET,Captiva,4,1,2.0,51258,4.0,1,...,1,1,0,0,0,0,1,0,1,0


In [116]:
# Separate the regression target (`Price`) from the predictor columns.
y = data['Price']
X = data.drop(columns=['Price'])

In [117]:
from sklearn.model_selection import train_test_split

# First hold out 30 % of the rows, then halve that hold-out into validation
# and test parts. Both splits use the same fixed seed so they are repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Report the resulting partition sizes.
print(f"Train set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")


Train set: 11225 samples
Validation set: 2406 samples
Test set: 2406 samples


In [118]:
# Columns replaced by the mean training-set price of their category.
target_encoding_columns = ['Fuel type', 'Model', 'Airbags', 'Cylinders', 'Manufacturer']

# Work on explicit copies: the frames returned by train_test_split are
# selections of `X`, and assigning columns into them can be flagged by pandas
# (SettingWithCopyWarning). Copying makes the ownership unambiguous.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# Encoding statistics come from the training split only, so no price
# information from validation/test rows leaks into the features.
train = pd.concat([X_train, y_train], axis=1)

# Fallback for categories that never occur in the training split.
# `train` is not modified below, so this is computed once instead of per column.
global_mean = train['Price'].mean()

for col in target_encoding_columns:
    mean_encoded = train.groupby(col)['Price'].mean()
    for split_features in (X_train, X_val, X_test):
        split_features[col] = split_features[col].map(mean_encoded).fillna(global_mean)

In [119]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise: the raw numeric features plus the target-encoded ones.
numerical_columns = [
    'Levy', 'Engine volume', 'Mileage', 'Age',
    'Fuel type', 'Model', 'Airbags', 'Cylinders', 'Manufacturer',
]

# Learn mean/variance from the training split only, then apply the same
# transformation to every split.
scaler = StandardScaler()
scaler.fit(X_train[numerical_columns])
for split_features in (X_train, X_val, X_test):
    split_features[numerical_columns] = scaler.transform(split_features[numerical_columns])

In [120]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the prepared training features.
# `fit` returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)
lr

In [121]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the linear baseline on the validation split.
y_val_pred = lr.predict(X_val)

# RMSE is taken as the square root of the MSE explicitly: the `squared=False`
# keyword of mean_squared_error is deprecated in scikit-learn 1.4 and removed
# in 1.6, whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
r2 = r2_score(y_val, y_val_pred)

print(f"Root Mean Squared Error: {rmse}")
print(f"R^2 Score: {r2}")

Root Mean Squared Error: 8118.740467306492
R^2 Score: 0.4674953913982821


In [122]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fix the seed so the forest's random sampling, and therefore the metrics
# printed below, are reproducible on a fresh Restart & Run All.
# 42 matches the seed used for the data splits.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Score the forest on the validation split.
y_val_pred = rf.predict(X_val)

# RMSE via an explicit square root: the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
r2 = r2_score(y_val, y_val_pred)

print(f"Root Mean Squared Error: {rmse}")
print(f"R^2 Score: {r2}")

Root Mean Squared Error: 5592.693530987943
R^2 Score: 0.7473099759653388
