In [2]:
!pip install --upgrade tables
!pip install eli5
!pip install xgboost

Collecting tables
[?25l  Downloading https://files.pythonhosted.org/packages/ed/c3/8fd9e3bb21872f9d69eb93b3014c86479864cca94e625fd03713ccacec80/tables-3.6.1-cp36-cp36m-manylinux1_x86_64.whl (4.3MB)
[K     |████████████████████████████████| 4.3MB 2.8MB/s 
Installing collected packages: tables
  Found existing installation: tables 3.4.4
    Uninstalling tables-3.4.4:
      Successfully uninstalled tables-3.4.4
Successfully installed tables-3.6.1


In [0]:
 import pandas as pd
 import numpy as np

 from sklearn.dummy import DummyRegressor
 from sklearn.tree import DecisionTreeRegressor
 from sklearn.ensemble import RandomForestRegressor

 import xgboost as xgb

 from sklearn.metrics import mean_absolute_error as mae
 from sklearn.model_selection import cross_val_score, KFold

 import eli5
 from eli5.sklearn import PermutationImportance 

Wczytywanie danych


In [26]:
cd /content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car

/content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car


In [27]:
df = pd.read_hdf('data/car.h5')
df.shape

(106494, 155)

Features

In [0]:
SUFFIX_CAT = '__cat'
for feat in df.columns:
  if isinstance(df[feat][0],list): continue

  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
     df[feat] = factorized_values 
  else:   
     df[feat + SUFFIX_CAT] = factorized_values

In [29]:
cat_feats = [x for x in df.columns if SUFFIX_CAT in x ]
cat_feats = [x for x in cat_feats if 'price' not in x ]
len(cat_feats)

151

In [0]:
def run_model(model, feats):
  X = df[feats].values
  y = df['price_value'].values

  scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

##DecisionTree


In [35]:
run_model(DecisionTreeRegressor(max_depth=5),cat_feats)


(-19695.13091100928, 148.72570644015792)

## Random Forest

In [38]:
model = RandomForestRegressor(max_depth=5,n_estimators=50, random_state=0)
run_model(model, cat_feats)

(-18718.657185256638, 64.5424578125788)

##XGBOOST

In [43]:
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'seed': 0
}

run_model(xgb.XGBRegressor(**xgb_params),cat_feats)



(-13108.379065811214, 74.32158265003798)

In [49]:
m = xgb.XGBRegressor(max_depth=5, n_estimatort=50, learning_rate=0.1, seed=0)
m.fit(X, y)

imp = PermutationImportance(m, random_state=0).fit(X, y)
eli5.show_weights(imp, feature_names=cat_feats)



Weight,Feature
0.1341  ± 0.0029,param_rok-produkcji__cat
0.1037  ± 0.0028,param_stan__cat
0.0956  ± 0.0029,param_napęd__cat
0.0501  ± 0.0020,param_skrzynia-biegów__cat
0.0442  ± 0.0014,param_moc__cat
0.0438  ± 0.0018,param_faktura-vat__cat
0.0401  ± 0.0008,param_marka-pojazdu__cat
0.0216  ± 0.0007,param_typ__cat
0.0182  ± 0.0012,feature_kamera-cofania__cat
0.0175  ± 0.0006,param_wersja__cat


In [52]:
feats= ['param_rok-produkcji__cat', 'param_stan__cat', 'param_stan__cat', 'param_napęd__cat', 'param_skrzynia-biegów__cat', 'param_moc__cat', 'param_faktura-vat__cat', 'param_marka-pojazdu__cat', 'param_typ__cat', 'feature_kamera-cofania__cat', 'param_wersja__cat', 'param_model-pojazdu__cat', 'param_pojemność-skokowa__cat', 'param_kod-silnika__cat', 'seller_name__cat','feature_wspomaganie-kierownicy__cat', 'feature_czujniki-parkowania-przednie__cat', 'param_uszkodzony__cat','feature_system-start-stop__cat', 'feature_regulowane-zawieszenie__cat']

run_model(xgb.XGBRegressor(**xgb_params),feats)




(-13355.809019132956, 50.20090088643549)

In [58]:
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: -1 if str(x) =='None' else int(x))

feats= ['param_rok-produkcji', 'param_stan__cat', 'param_stan__cat', 'param_napęd__cat', 'param_skrzynia-biegów__cat', 'param_moc__cat', 'param_faktura-vat__cat', 'param_marka-pojazdu__cat', 'param_typ__cat', 'feature_kamera-cofania__cat', 'param_wersja__cat', 'param_model-pojazdu__cat', 'param_pojemność-skokowa__cat', 'param_kod-silnika__cat', 'seller_name__cat','feature_wspomaganie-kierownicy__cat', 'feature_czujniki-parkowania-przednie__cat', 'param_uszkodzony__cat','feature_system-start-stop__cat', 'feature_regulowane-zawieszenie__cat']

run_model(xgb.XGBRegressor(**xgb_params),feats)



(-11016.198217595891, 55.73248199030712)

In [78]:
df['param_moc'].unique()

array([ 90, 115, 262, 110, 310, 105, 140, 175, 125, 185, 190, 440, 141,
       200, 224,  75,  99, 184, 109, 233, 116,  68, 286, 126, 160, 135,
       120, 272,  -1, 150, 180, 136, 102, 131, 218, 245, 170, 112, 250,
       252,  73, 100, 313, 101, 285,  70, 383, 174, 277, 132, 130, 215,
        60, 330, 163, 177,  98,  78, 189, 156, 143,  69, 113,  65, 122,
        82, 251,  95, 197, 235, 238, 171, 381, 400, 178,  80, 165,  85,
       258, 142, 204, 124,  55, 144, 231, 248, 152, 181, 210, 340, 129,
       147,  50,  54, 290, 306, 193,  77, 164,  96, 194, 111, 166, 206,
       118, 360, 211, 271, 455, 280, 106, 114, 421,  74, 213, 121, 275,
       435, 384, 326,  88, 220, 260,  64,  86, 128, 256, 240, 244, 162,
       237, 350,  35, 265, 202, 133,  83, 117, 146,  92, 192, 145, 525,
       254, 182, 328, 367, 148, 456,  97, 270, 107, 108, 203, 155,  94,
        93, 241,  20,  71, 173,  58, 205, 236,   1, 557,  84, 457,  72,
       295, 134, 425, 228,  81, 230, 201,  87, 234, 299, 585, 20

In [85]:
df['param_moc']= df['param_moc'].map(lambda x: -1 if str(x) =='None' else int(str(x).split(' ')[0]) )

df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: -1 if str(x) =='None' else int(x))

feats= ['param_rok-produkcji', 'param_stan__cat', 'param_stan__cat', 'param_napęd__cat', 'param_skrzynia-biegów__cat', 'param_moc', 'param_faktura-vat__cat', 'param_marka-pojazdu__cat', 'param_typ__cat', 'feature_kamera-cofania__cat', 'param_wersja__cat', 'param_model-pojazdu__cat', 'param_pojemność-skokowa__cat', 'param_kod-silnika__cat', 'seller_name__cat','feature_wspomaganie-kierownicy__cat', 'feature_czujniki-parkowania-przednie__cat', 'param_uszkodzony__cat','feature_system-start-stop__cat', 'feature_regulowane-zawieszenie__cat']

run_model(xgb.XGBRegressor(**xgb_params),feats)




(-9448.31115406246, 81.02244120471303)

In [94]:
df['param_pojemność-skokowa']= df['param_pojemność-skokowa'].map(lambda x: -1 if str(x) =='None' else int(str(x).split('cm')[0].replace(' ','')) )

feats= ['param_rok-produkcji', 'param_stan__cat', 'param_stan__cat', 'param_napęd__cat', 'param_skrzynia-biegów__cat', 'param_moc', 'param_faktura-vat__cat', 'param_marka-pojazdu__cat', 'param_typ__cat', 'feature_kamera-cofania__cat', 'param_wersja__cat', 'param_model-pojazdu__cat', 'param_pojemność-skokowa', 'param_kod-silnika__cat', 'seller_name__cat','feature_wspomaganie-kierownicy__cat', 'feature_czujniki-parkowania-przednie__cat', 'param_uszkodzony__cat','feature_system-start-stop__cat', 'feature_regulowane-zawieszenie__cat']

run_model(xgb.XGBRegressor(**xgb_params),feats)



(-9241.721058179113, 43.49460469751184)

In [0]:
 !git config --global user.email "konrad.zemla@gmail.com"
 !git config --global user.name "Konrad"

In [3]:
!git add day4_xgboost.ipynb
!git commit -m "xgboost"

fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


In [0]:
!git push -u origin master