In [0]:
# Colab setup: uncomment and run once on a fresh runtime if these packages are
# missing or outdated (eli5 and xgboost are imported by the notebook).
# NOTE(review): `tables` is presumably the HDF5 backend needed by pd.read_hdf — confirm.
# !pip install --upgrade tables
# !pip install eli5
# !pip install xgboost

In [3]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix_two/dw-predict-car/"

/content/drive/My Drive/Colab Notebooks/dw_matrix_two/dw-predict-car


In [0]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import  mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold

import eli5
from eli5.sklearn import PermutationImportance

In [8]:
# Load the car data set; the path is relative to the current working directory.
CAR_DATA_PATH = 'data/car.h5'
df = pd.read_hdf(CAR_DATA_PATH)
df.shape  # displayed as (rows, columns)

(106494, 155)

## Features

In [0]:
SUFFIX_CAT = '__cat'

# Add an integer-coded '<name>__cat' copy of every column.
# The column names are snapshotted first because the loop adds columns to df.
for feat in list(df.columns):
  # Already an encoded column (e.g. when the cell is re-run): nothing to do.
  if SUFFIX_CAT in feat:
    continue

  column = df[feat]
  # List-valued columns are skipped. Only the first row is inspected, and it
  # is read by position (.iloc) so the check does not depend on the index
  # containing the label 0; the length guard keeps an empty frame from raising.
  if len(column) > 0 and isinstance(column.iloc[0], list):
    continue

  df[feat + SUFFIX_CAT] = column.factorize()[0]

In [10]:
# Candidate model inputs: every encoded column except the ones whose name
# mentions the price (the target used for training is 'price_value').
cat_feats = [
    col for col in df.columns
    if SUFFIX_CAT in col and 'price' not in col
]
len(cat_feats)

151

In [0]:
def run_model(model, feats):
  """Cross-validate ``model`` on the given columns of the global ``df``.

  Args:
    model: estimator accepted by ``cross_val_score``.
    feats: list of ``df`` column names used as input features.

  Returns:
    Tuple ``(mean, std)`` of the 3-fold ``neg_mean_absolute_error`` scores
    for predicting ``df['price_value']``. The scores are negated errors, so
    values closer to zero are better.
  """
  features = df[feats].values
  target = df['price_value'].values

  fold_scores = cross_val_score(
      model,
      features,
      target,
      cv=3,
      scoring='neg_mean_absolute_error',
  )
  return np.mean(fold_scores), np.std(fold_scores)

## Decision Tree

In [0]:
# Baseline: a shallow decision tree on all encoded features.
# random_state is fixed, like in the other model cells, so that repeated runs
# give the same score (ties between equally good splits are otherwise broken
# at random).
run_model(DecisionTreeRegressor(max_depth=5, random_state=0), cat_feats)

## Random Forest

In [16]:
# Random forest of 50 shallow trees on all encoded features, with a fixed seed.
model = RandomForestRegressor(
    n_estimators=50,
    max_depth=5,
    random_state=5,
)
run_model(model, cat_feats)

(-18750.31117742187, 101.5436716155277)

## XGBoost

In [17]:
# XGBoost hyper-parameters, kept in one dict so other cells can reuse them.
xgb_params = dict(
    max_depth=5,
    n_estimators=50,
    learning_rate=0.1,
    seed=0,
)

# Gradient-boosted trees on all encoded features.
model = xgb.XGBRegressor(**xgb_params)
run_model(model, cat_feats)



(-13108.379065811214, 74.32158265003798)

In [18]:
# X and y only exist as local variables inside run_model, so they are built
# here explicitly; otherwise this cell raises NameError on a fresh kernel.
# The columns are cat_feats, matching the feature_names passed to show_weights.
X = df[cat_feats].values
y = df['price_value'].values

# Fit one model on the full data set and rank features by permutation importance.
m = xgb.XGBRegressor(max_depth=5, n_estimators=50, learning_rate=0.1, seed=0)
m.fit(X, y)

imp = PermutationImportance(m, random_state=5).fit(X, y)
eli5.show_weights(imp, feature_names=cat_feats)



Weight,Feature
0.1192  ± 0.0020,param_napęd__cat
0.1128  ± 0.0021,param_rok-produkcji__cat
0.1090  ± 0.0031,param_stan__cat
0.0601  ± 0.0002,param_skrzynia-biegów__cat
0.0574  ± 0.0019,param_faktura-vat__cat
0.0488  ± 0.0008,param_moc__cat
0.0280  ± 0.0005,param_marka-pojazdu__cat
0.0237  ± 0.0006,feature_kamera-cofania__cat
0.0212  ± 0.0007,param_typ__cat
0.0176  ± 0.0013,seller_name__cat


In [39]:
# Hand-picked feature set. Three columns ('param_rok-produkcji', 'param_moc',
# 'param_pojemność-skokowa') are used as plain numbers instead of their
# factorized '__cat' versions.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'seller_name__cat',
    'param_pojemność-skokowa',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]

# The three numeric columns must be parsed to integers before they can be fed
# to the model. The same parsing also appears in a later cell of the notebook;
# it is repeated here so that this cell works when the notebook is run top to
# bottom on a fresh kernel. The dtype guard makes it a no-op once a column has
# already been converted.
numeric_parsers = {
    'param_rok-produkcji': lambda x: -1 if str(x) == 'None' else int(x),
    'param_moc': lambda x: -1 if str(x) == 'None' else int(str(x).split(' ')[0]),
    'param_pojemność-skokowa': lambda x: -1 if str(x) == 'None' else int(str(x).split('cm')[0].replace(' ', '')),
}
for col, parse in numeric_parsers.items():
  if df[col].dtype == object:
    df[col] = df[col].map(parse)

run_model(xgb.XGBRegressor(**xgb_params), feats)



(-9569.227198767323, 72.83561801421891)

In [0]:
# Turn three text columns into integers; values that print as 'None' become -1.
# Re-applying these parsers to already converted integers leaves them unchanged.

def _parse_whole_value(raw):
  """Parse the whole value as an integer (-1 for 'None')."""
  if str(raw) == 'None':
    return -1
  return int(raw)


def _parse_first_token(raw):
  """Parse the text before the first space as an integer (-1 for 'None').

  NOTE(review): a number that itself contains a space would be cut at that
  space — confirm no such values occur in 'param_moc'.
  """
  if str(raw) == 'None':
    return -1
  return int(str(raw).split(' ')[0])


def _parse_before_cm(raw):
  """Parse the text before 'cm', with spaces removed, as an integer (-1 for 'None')."""
  if str(raw) == 'None':
    return -1
  return int(str(raw).split('cm')[0].replace(' ', ''))


df['param_rok-produkcji'] = df['param_rok-produkcji'].map(_parse_whole_value)
df['param_moc'] = df['param_moc'].map(_parse_first_token)
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(_parse_before_cm)

In [0]:
# Same conversion as applied to this column in the previous cell: the text
# before 'cm' (spaces removed) becomes an integer, 'None' becomes -1.
# Applied to already converted integers it leaves the values unchanged.
displacement = df['param_pojemność-skokowa']
df['param_pojemność-skokowa'] = displacement.map(
    lambda x: -1 if str(x) == 'None' else int(str(x).split('cm')[0].replace(' ', ''))
)