In [1]:
%cd ..

/home/ivanstefanov/Repositories/used-cars-price-prediction


In [2]:
import pandas as pd
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import MinMaxScaler

In [4]:
from price_predictor.prepare_data import get_numerical, split_train_test_valid
from price_predictor.evaluate import evaluate

In [5]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Read data

In [6]:
# Load the car listings; the path is relative to the current working
# directory, so the `%cd ..` cell at the top must have been run first.
data_all = pd.read_csv('data/cars.csv')
# Bare last expression -> rich (truncated) display of the raw frame.
data_all

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,True,False,True,True,False,False,False,False,True,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,Chrysler,300,automatic,silver,290000,2000,gasoline,False,gasoline,3.5,...,True,False,False,True,True,False,False,True,True,301
38527,Chrysler,PT Cruiser,mechanical,blue,321000,2004,diesel,False,diesel,2.2,...,True,False,False,True,True,False,False,True,True,317
38528,Chrysler,300,automatic,blue,777957,2000,gasoline,False,gasoline,3.5,...,True,False,False,True,True,False,False,True,True,369
38529,Chrysler,PT Cruiser,mechanical,black,20000,2001,gasoline,False,gasoline,2.0,...,True,False,False,False,False,False,False,False,True,490


## Filter data

In [7]:
# Drop every row that has a missing value in any column. Rebinding the name
# (instead of inplace=True) avoids mutating the frame object that the previous
# cell displayed; downstream cells see the same NaN-free data either way.
data_all = data_all.dropna()

In [8]:
# Reduce the frame with the project helper get_numerical. Per the displayed
# result it yields odometer_value, year_produced, engine_capacity, price_usd,
# number_of_photos, up_counter and duration_listed.
data = get_numerical(data_all)
data

Unnamed: 0,odometer_value,year_produced,engine_capacity,price_usd,number_of_photos,up_counter,duration_listed
0,190000,2010,2.5,10900.00,9,13,16
1,290000,2002,3.0,5000.00,12,54,83
2,402000,2001,2.5,2800.00,4,72,151
3,10000,1999,3.0,9999.00,9,42,86
4,280000,2001,2.5,2134.11,14,7,7
...,...,...,...,...,...,...,...
38526,290000,2000,3.5,2750.00,5,85,301
38527,321000,2004,2.2,4800.00,4,20,317
38528,777957,2000,3.5,4300.00,3,63,369
38529,20000,2001,2.0,4000.00,7,156,490


In [9]:
# Scale every column of `data` with MinMaxScaler (default feature range), then
# rebuild a DataFrame with the original column names. The index is not passed
# through, so data_scaled gets a fresh 0..n-1 index.
# NOTE(review): the scaler is fitted on ALL rows before the train/test/valid
# split, and on every column of `data` (price_usd included), so statistics of
# the held-out rows leak into training. Consider fitting on the training rows
# only.
scaler = MinMaxScaler().fit(data)
data_scaled = pd.DataFrame(
    scaler.transform(data),
    columns=data.columns,
)

In [10]:
# Inspect the scaled frame (rich display is truncated by pandas).
data_scaled

Unnamed: 0,odometer_value,year_produced,engine_capacity,price_usd,number_of_photos,up_counter,duration_listed
0,0.190000,0.883117,0.294872,0.217984,0.094118,0.006452,0.007168
1,0.290000,0.779221,0.358974,0.099982,0.129412,0.028495,0.037186
2,0.402000,0.766234,0.294872,0.055981,0.035294,0.038172,0.067652
3,0.010000,0.740260,0.358974,0.199964,0.094118,0.022043,0.038530
4,0.280000,0.766234,0.294872,0.042663,0.152941,0.003226,0.003136
...,...,...,...,...,...,...,...
38516,0.290000,0.753247,0.423077,0.054981,0.047059,0.045161,0.134857
38517,0.321000,0.805195,0.256410,0.095982,0.035294,0.010215,0.142025
38518,0.777957,0.753247,0.423077,0.085982,0.023529,0.033333,0.165323
38519,0.020000,0.766234,0.230769,0.079982,0.070588,0.083333,0.219534


In [11]:
# Split the scaled frame into feature/target sets for training, testing and
# validation via the project helper.
# NOTE(review): the target column, split ratios and any seeding live inside
# split_train_test_valid and are not visible here — confirm they are fixed.
(
    X_train, X_test, X_valid,
    y_train, y_test, y_valid,
) = split_train_test_valid(data_scaled)

## Linear regression

In [12]:
from price_predictor.trainers.train_linear_model import (
    train_linear_regression,
    train_lasso_cv,
    train_ridge_cv
)

from price_predictor.evaluate import evaluate_model

In [13]:
# Fit the linear-regression baseline on the training split (project helper).
lr = train_linear_regression(X_train, y_train)

In [14]:
# Evaluate on the test split; the helper's result is a dict with 'mse' and
# 'r2' (see output). The split comes from data_scaled, so the error is
# measured on scaled values.
evaluate_model(lr, X_test, y_test)

{'mse': 0.006147453755118817, 'r2': 0.6155077631035596}

In [25]:
# Lasso with the regularisation strength chosen from a log-spaced grid
# (1e-3 .. 1e3) by the project helper.
lasso = train_lasso_cv(
    X_train,
    y_train,
    alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
)

In [26]:
# Test-split metrics for the Lasso model ('mse' and 'r2', see output).
evaluate_model(lasso, X_test, y_test)

{'mse': 0.006484474930486148, 'r2': 0.5944287878464374}

In [27]:
# Ridge with the regularisation strength chosen from a log-spaced grid
# (1e-3 .. 1e3) by the project helper.
ridge = train_ridge_cv(
    X_train,
    y_train,
    alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
)

In [28]:
# Test-split metrics for the Ridge model ('mse' and 'r2', see output).
evaluate_model(ridge, X_test, y_test)

{'mse': 0.006147945208172388, 'r2': 0.6154770251929669}

## Decision tree

In [19]:
# Unconstrained decision-tree regressor fitted on the training split.
# random_state is pinned so that a Restart & Run All reproduces the same tree
# (scikit-learn permutes candidate features at each split, so an unseeded tree
# can differ between runs).
dt = DecisionTreeRegressor(random_state=42)
# Trailing ';' suppresses the estimator repr in the cell output.
dt.fit(X_train, y_train);

In [20]:
# R^2 of the tree on the validation split (score() on a regressor returns R^2).
# NOTE(review): the other models in this notebook are evaluated with
# evaluate_model on X_test / y_test, so this figure is not directly comparable
# with theirs — pick one split for the model comparison.
dt.score(X_valid, y_valid)

0.6795313234142557

## Ensemble model

In [21]:
from price_predictor.trainers.train_ensamble_model import (
    train_random_forest,
    search_random_forest,
    train_ada_boost
)

In [22]:
# Random forest trained through the project helper with at least 5 samples
# required per leaf.
# NOTE(review): no seed is passed here; whether the helper fixes one is not
# visible from this notebook — confirm for reproducibility.
rf = train_random_forest(
    X_train,
    y_train,
    min_samples_leaf=5,
)

In [29]:
# Test-split metrics for the random forest ('mse' and 'r2', see output).
evaluate_model(rf, X_test, y_test)

{'mse': 0.0025868286518290168, 'r2': 0.8382069106284284}

In [24]:
# Hyper-parameter grid for the random-forest search: 3 x 3 combinations.
param_grid = {
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [2, 4, 8],
}

# NOTE(review): the saved run of this cell ended in KeyboardInterrupt, so it is
# evidently slow; rf_search was not assigned in that session.
rf_search = search_random_forest(X_train, y_train, param_grid)

KeyboardInterrupt: 

In [None]:
# Best parameter combination found by the search.
# NOTE(review): needs the search cell above to have run to completion; this
# cell has no recorded execution and would raise NameError if rf_search was
# never assigned.
rf_search.best_params_

In [66]:
# AdaBoost trained through the project helper: 100 estimators with a 0.1
# learning rate.
# NOTE(review): no seed is passed here; whether the helper fixes one is not
# visible from this notebook — confirm for reproducibility.
ada = train_ada_boost(
    X_train,
    y_train,
    learning_rate=0.1,
    n_estimators=100,
)

In [67]:
# Test-split metrics for the AdaBoost model ('mse' and 'r2', see output).
evaluate_model(ada, X_test, y_test)

{'mse': 0.004005036066098238, 'r2': 0.7495051874733023}