In [1]:
%cd ../..

/home/ivanstefanov/Repositories/used-cars-price-prediction


In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from price_predictor.data_preparation.pick_or_drop_features import pick_by_datatypes, drop_anonymous_features
from price_predictor.evaluate import evaluate_model

In [5]:
# Load the raw dataset; the path is relative to the current working directory,
# which the first cell moves to the repository root.
data_all = pd.read_csv('data/cars.csv')

In [6]:
# Select the 'numerical' and 'boolean' feature groups via pick_by_datatypes,
# pass the result through drop_anonymous_features, then discard rows that
# contain missing values.
# dropna() returns a new frame instead of mutating in place, so re-running this
# cell never alters `data_all`, even if one of the helpers returns a view or
# the very same object (NOTE(review): helper return semantics not visible here).
data = (
    data_all
    .pipe(pick_by_datatypes, ['numerical', 'boolean'])
    .pipe(drop_anonymous_features)
    .dropna()
)

In [7]:
# Inspect which columns remain after feature selection; 'price_usd' is the
# target that is split off from the predictors in the next cell.
data.columns

Index(['odometer_value', 'year_produced', 'engine_has_gas', 'engine_capacity',
       'has_warranty', 'price_usd', 'is_exchangeable', 'number_of_photos',
       'up_counter', 'duration_listed'],
      dtype='object')

In [8]:
# Separate the predictors from the target ('price_usd') and make a held-out
# split with scikit-learn's default test size.
# random_state is fixed so that Restart & Run All reproduces the same split and
# the metrics of the models below stay comparable between runs; without it
# every execution draws a different split.
# NOTE: outputs recorded from an unseeded run will differ slightly from a
# re-run with this seed.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('price_usd', axis=1),
    data['price_usd'],
    random_state=42,
)

In [9]:
from price_predictor.trainers.train_linear_model import (
    train_linear_regression,
    train_lasso_cv,
    train_ridge_cv
)

from price_predictor.evaluate import evaluate_model

In [10]:
# Baseline: run the project's train_linear_regression helper on the training
# split; the returned object is scored on the test split in the next cell.
lr = train_linear_regression(X_train, y_train)

In [11]:
# Score the linear model on the held-out split; the displayed result is a dict
# with the keys 'rmse', 'mae' and 'r2'.
evaluate_model(lr, X_test, y_test)

{'rmse': 3900.9288447680665,
 'mae': 2487.9301356143583,
 'r2': 0.6286373970063328}

In [12]:
# Lasso variant, given a logarithmic grid of candidate alphas (1e-3 ... 1e3).
# Presumably train_lasso_cv cross-validates over this grid — confirm in the
# trainer module.
lasso = train_lasso_cv(
    X_train,
    y_train,
    alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
)

In [13]:
# Score the lasso model on the same held-out split (dict of 'rmse', 'mae', 'r2')
# so it can be compared directly with the linear baseline above.
evaluate_model(lasso, X_test, y_test)

{'rmse': 3900.940097767877,
 'mae': 2487.9418478081043,
 'r2': 0.6286352544656739}

In [14]:
# Ridge variant, given the same logarithmic grid of candidate alphas
# (1e-3 ... 1e3) as the lasso run.
# Presumably train_ridge_cv cross-validates over this grid — confirm in the
# trainer module.
ridge = train_ridge_cv(
    X_train,
    y_train,
    alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
)

In [15]:
# Score the ridge model on the same held-out split (dict of 'rmse', 'mae', 'r2').
evaluate_model(ridge, X_test, y_test)

{'rmse': 3900.904360894034, 'mae': 2486.885206277105, 'r2': 0.6286420586482238}

## Ensemble methods

In [16]:
from price_predictor.trainers.train_ensamble_model import (
    train_random_forest,
    search_random_forest,
    train_ada_boost
)

In [17]:
# Random forest trained on the same split; min_samples_leaf=5 is passed to the
# project's trainer (how it is forwarded to the estimator is not visible here).
rf = train_random_forest(
    X_train,
    y_train,
    min_samples_leaf=5,
)

In [18]:
# Score the random forest on the held-out split (dict of 'rmse', 'mae', 'r2').
evaluate_model(rf, X_test, y_test)

{'rmse': 2516.864741358779,
 'mae': 1546.8039388085003,
 'r2': 0.8454098339729921}

In [19]:
# AdaBoost trained on the same split with a learning rate of 0.05 and
# 70 estimators, both passed to the project's trainer.
ada = train_ada_boost(
    X_train,
    y_train,
    learning_rate=0.05,
    n_estimators=70,
)

In [20]:
# Score the AdaBoost model on the held-out split (dict of 'rmse', 'mae', 'r2').
evaluate_model(ada, X_test, y_test)

{'rmse': 3140.1350453266186,
 'mae': 2054.642578136858,
 'r2': 0.7593650184928223}

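## Conclusion: adding the boolean features gives barely any improvement with Random Forest, which is still the best model here (R² ≈ 0.85, versus ≈ 0.76 for AdaBoost and ≈ 0.63 for the linear models)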