In [1]:
%cd ../..

/home/ivanstefanov/Repositories/used-cars-price-prediction


In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from price_predictor.data_preparation.pick_or_drop_features import (
    pick_by_datatypes,
    drop_anonymous_features,
    drop_by_name
)

from price_predictor.trainers.train_linear_model import train_linear_regression
from price_predictor.trainers.train_ensamble_model import train_random_forest
from price_predictor.evaluate import evaluate_model

In [5]:
data_all = pd.read_csv('data/cars.csv')
data_all

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,True,False,True,True,False,False,False,False,True,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,Chrysler,300,automatic,silver,290000,2000,gasoline,False,gasoline,3.5,...,True,False,False,True,True,False,False,True,True,301
38527,Chrysler,PT Cruiser,mechanical,blue,321000,2004,diesel,False,diesel,2.2,...,True,False,False,True,True,False,False,True,True,317
38528,Chrysler,300,automatic,blue,777957,2000,gasoline,False,gasoline,3.5,...,True,False,False,True,True,False,False,True,True,369
38529,Chrysler,PT Cruiser,mechanical,black,20000,2001,gasoline,False,gasoline,2.0,...,True,False,False,False,False,False,False,False,True,490


In [6]:
data = drop_anonymous_features(data_all)
data = drop_by_name(data, 'model_name')
data.dropna(inplace=True)

In [7]:
dummies = pd.get_dummies(data)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(dummies.drop('price_usd', axis=1), dummies['price_usd'])

## Linear regression

In [9]:
lr = train_linear_regression(X_train, y_train)
evaluate_model(lr, X_test, y_test)

{'rmse': 3322.9451141235413,
 'mae': 2088.739809984353,
 'r2': 0.7372320104070034}

## Random forest

In [10]:
rf = train_random_forest(X_train, y_train, min_samples_leaf = 5)
evaluate_model(rf, X_test, y_test)

{'rmse': 2044.2666738884416,
 'mae': 1176.2974554651066,
 'r2': 0.9005508169618486}

## There is a bit of improvement with the categorical values

## Lets try with scaling

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
scaled_dummies = pd.DataFrame(MinMaxScaler().fit_transform(dummies), index=dummies.index, columns=dummies.columns)

In [13]:
scaled_dummies

Unnamed: 0,odometer_value,year_produced,engine_has_gas,engine_capacity,has_warranty,price_usd,is_exchangeable,number_of_photos,up_counter,duration_listed,...,state_owned,drivetrain_all,drivetrain_front,drivetrain_rear,location_region_Брестская обл.,location_region_Витебская обл.,location_region_Гомельская обл.,location_region_Гродненская обл.,location_region_Минская обл.,location_region_Могилевская обл.
0,0.190000,0.883117,0.0,0.294872,0.0,0.217984,0.0,0.094118,0.006452,0.007168,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.290000,0.779221,0.0,0.358974,0.0,0.099982,1.0,0.129412,0.028495,0.037186,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.402000,0.766234,0.0,0.294872,0.0,0.055981,1.0,0.035294,0.038172,0.067652,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.010000,0.740260,0.0,0.358974,0.0,0.199964,1.0,0.094118,0.022043,0.038530,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.280000,0.766234,0.0,0.294872,0.0,0.042663,1.0,0.152941,0.003226,0.003136,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,0.290000,0.753247,0.0,0.423077,0.0,0.054981,1.0,0.047059,0.045161,0.134857,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
38527,0.321000,0.805195,0.0,0.256410,0.0,0.095982,1.0,0.035294,0.010215,0.142025,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
38528,0.777957,0.753247,0.0,0.423077,0.0,0.085982,0.0,0.023529,0.033333,0.165323,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
38529,0.020000,0.766234,0.0,0.230769,0.0,0.079982,1.0,0.070588,0.083333,0.219534,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(scaled_dummies.drop('price_usd', axis=1), scaled_dummies['price_usd'])

In [15]:
rf = train_random_forest(X_train, y_train, min_samples_leaf = 5)
evaluate_model(rf, X_test, y_test)

{'rmse': 0.04180921465546592,
 'mae': 0.023613764605643767,
 'r2': 0.8950883312518049}

## Successfully squeezed out some more performance

## Lets try only the categorical features

In [22]:
target = data_all['price_usd']

In [24]:
data = drop_anonymous_features(data_all)
data = drop_by_name(data, 'model_name')
data.dropna(inplace=True)

data = pick_by_datatypes(data, ['categorical'])
data = pd.concat([data, target], axis=1)
data

Unnamed: 0,manufacturer_name,transmission,color,engine_fuel,engine_type,body_type,state,drivetrain,location_region,price_usd
0,Subaru,automatic,silver,gasoline,gasoline,universal,owned,all,Минская обл.,10900.00
1,Subaru,automatic,blue,gasoline,gasoline,universal,owned,all,Минская обл.,5000.00
2,Subaru,automatic,red,gasoline,gasoline,suv,owned,all,Минская обл.,2800.00
3,Subaru,mechanical,blue,gasoline,gasoline,sedan,owned,all,Минская обл.,9999.00
4,Subaru,automatic,black,gasoline,gasoline,universal,owned,all,Гомельская обл.,2134.11
...,...,...,...,...,...,...,...,...,...,...
26222,,,,,,,,,,12900.00
26582,,,,,,,,,,14500.00
26914,,,,,,,,,,15500.00
27554,,,,,,,,,,24750.00


In [25]:
dummies = pd.get_dummies(data)
scaled_dummies = pd.DataFrame(MinMaxScaler().fit_transform(dummies), index=dummies.index, columns=dummies.columns)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(scaled_dummies.drop('price_usd', axis=1), scaled_dummies['price_usd'])

In [27]:
rf = train_random_forest(X_train, y_train, min_samples_leaf = 5)
evaluate_model(rf, X_test, y_test)

{'rmse': 0.08399667769908672,
 'mae': 0.05576980390771338,
 'r2': 0.5781202666091639}

### So the other features were important as well

## Conclusion: best results so far with Random Forest on all data except models with scaling