In [0]:
!pip install --upgrade tables
!pip install eli5
!pip install xgboost

In [0]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

import eli5
from eli5.sklearn import PermutationImportance

In [13]:
ls

[0m[01;34mdw_marix_car[0m/


In [15]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_two/dw_marix_car"

/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_two/dw_marix_car


In [0]:
# Load the car dataset from HDF5; the path is relative to the current working directory.
df = pd.read_hdf('data/car.h5')

In [17]:
# Size of the loaded frame as (rows, columns).
df.shape

(106494, 155)

In [0]:
SUFFIX_CAT = '__cat'

# Integer-encode every scalar-valued column so the tree models can consume it.
# Iterate over a snapshot of the labels because the loop itself adds columns.
for feat in list(df.columns):
  # Columns whose cells are lists cannot be factorized; skip them.
  # `.iloc[0]` inspects the first row by position, so the check does not
  # depend on the index happening to contain the label 0.
  if isinstance(df[feat].iloc[0], list): continue

  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    # Column name already carries the suffix: overwrite it with its codes.
    df[feat] = factorized_values
  else:
    # Raw column: keep it and store the codes alongside as `<name>__cat`.
    df[feat + SUFFIX_CAT] = factorized_values

In [53]:
# Candidate model inputs: every encoded (`__cat`) column except those whose
# name mentions the price, since the price is the prediction target.
cat_feats = [
    col for col in df.columns
    if SUFFIX_CAT in col and 'price' not in col
]
len(cat_feats)

151

In [0]:
def run_model(model, feats, cv=3, scoring='neg_mean_absolute_error'):
  """Cross-validate `model` on selected columns of the global `df`.

  Parameters
  ----------
  model : estimator accepted by `cross_val_score`.
  feats : list of `df` column names used as model inputs.
  cv : cross-validation setting forwarded to `cross_val_score`
       (default 3, the value that used to be hard-coded).
  scoring : scoring name forwarded to `cross_val_score`
       (default 'neg_mean_absolute_error').

  Returns
  -------
  (mean, std) of the per-fold scores. With the default scoring the scores
  are negated mean absolute errors, so values closer to zero are better.
  """
  X = df[feats].values
  y = df['price_value'].values  # target: the listing price

  scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
  return np.mean(scores), np.std(scores)

## Decision Tree

In [58]:
# Baseline: a shallow decision tree on all encoded features.
# The seed is fixed, as for the other models in this notebook, so that the
# cross-validation score is repeatable between runs.
run_model(DecisionTreeRegressor(max_depth=5, random_state=0), cat_feats)

(-19747.81093847179, 75.95212671816309)

## Random Forest

In [0]:
# Shallow random forest with a fixed seed, evaluated in the next cell.
model = RandomForestRegressor(
    n_estimators=50,
    max_depth=5,
    random_state=0,
)

In [60]:
# Cross-validate the random forest on all encoded features; returns (mean, std) of the fold scores.
run_model(model, cat_feats)

(-18864.27843495869, 38.24233578175403)

## XGBoost


In [61]:
# Shared XGBoost hyper-parameters, reused by the later experiments.
# `random_state` is the scikit-learn wrapper's seed argument; the older
# `seed` keyword is a deprecated alias for it.
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'random_state': 0,
}

# Cross-validate gradient boosting on all encoded features.
run_model(xgb.XGBRegressor(**xgb_params), cat_feats)



(-13034.249470063352, 103.73566521784606)

In [63]:
# Build the training arrays explicitly. `X` and `y` are only ever created as
# locals inside run_model, so without these two lines this cell depends on
# leftover kernel state and fails on a fresh "Restart & Run All".
X = df[cat_feats].values
y = df['price_value'].values

# Fit a single model on the full data for the permutation-importance step,
# reusing the shared hyper-parameters instead of repeating them.
m = xgb.XGBRegressor(**xgb_params)
m.fit(X, y)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=50,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=None,
             subsample=1, verbosity=1)

In [0]:
# Permutation importance of the fitted model `m`.
# The data is passed explicitly (same columns that are later used as
# `feature_names`) rather than through notebook-level `X` / `y`, which no
# cell is guaranteed to have defined.
imp = PermutationImportance(m, random_state=0).fit(
    df[cat_feats].values,
    df['price_value'].values,
)

In [65]:
# Render the permutation-importance table, labelling rows with the encoded column names.
eli5.show_weights(imp, feature_names = cat_feats)

Weight,Feature
0.1194  ± 0.0031,param_napęd__cat
0.1132  ± 0.0032,param_rok-produkcji__cat
0.1090  ± 0.0025,param_stan__cat
0.0619  ± 0.0025,param_skrzynia-biegów__cat
0.0568  ± 0.0016,param_faktura-vat__cat
0.0489  ± 0.0014,param_moc__cat
0.0273  ± 0.0007,param_marka-pojazdu__cat
0.0242  ± 0.0013,feature_kamera-cofania__cat
0.0212  ± 0.0008,param_typ__cat
0.0174  ± 0.0008,param_pojemność-skokowa__cat


In [66]:
# Hand-picked subset of 20 encoded features. The first ten are the rows shown
# in the importance table above; the remaining ten are presumably the next
# rows of the full table -- TODO confirm.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji__cat',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]
len(feats)

20

In [67]:
# Cross-validate XGBoost on the reduced feature list only.
run_model(xgb.XGBRegressor(**xgb_params), feats)



(-13475.642177294301, 72.42283542993323)

In [71]:
# Preview only (nothing is assigned): production year parsed to int,
# with values whose string form is "None" mapped to -1.
df['param_rok-produkcji'].map(lambda x: -1 if str(x) == "None" else int(x))

0         2018
2         2011
3         2015
4         2009
5         2017
          ... 
160609    2013
160610    2013
160611    2006
160614    2003
160615    2006
Name: param_rok-produkcji, Length: 106494, dtype: int64

In [0]:
# Store the production year as an integer column; entries whose string form
# is "None" become the sentinel -1.
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(
    lambda year: -1 if str(year) == "None" else int(year)
)

In [73]:
# Same 20 features as before, but the production year is now the numeric
# column instead of its factorized `__cat` version.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc__cat',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]
len(feats)
run_model(xgb.XGBRegressor(**xgb_params), feats)



(-11327.392744425772, 66.22128239389319)

In [0]:
# Store engine power as an integer: keep the token before the first space and
# map entries whose string form is "None" to the sentinel -1.
# `str(x)` makes the cell safe to re-run: once the column holds ints, calling
# `.split` directly on a value would raise AttributeError.
df['param_moc'] = df['param_moc'].map(
    lambda x: -1 if str(x) == 'None' else int(str(x).split(' ')[0])
)

In [76]:
# Feature list with both the production year and the engine power taken from
# their numeric columns rather than the factorized ones.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa__cat',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]
len(feats)
run_model(xgb.XGBRegressor(**xgb_params), feats)



(-9743.019185925381, 71.23754505718838)

In [77]:
# Inspect the raw engine-displacement strings before parsing them.
df['param_pojemność-skokowa'].unique()

array(['898 cm3', '1 560 cm3', '3 000 cm3', ..., '5 992 cm3', '1 966 cm3',
       '142 280 cm3'], dtype=object)

In [78]:
# Preview only (nothing is assigned): drop the "cm3" unit and the spaces used
# as thousands separators, and map "None" entries to -1.
# `str(x)` matches the assignment cell below and keeps this preview runnable
# after the column has already been converted to integers.
df['param_pojemność-skokowa'].map(
    lambda x: -1 if str(x) == 'None' else int(str(x).split('cm')[0].replace(' ', ''))
)

0          898
2         1560
3         3000
4         1560
5         1984
          ... 
160609    1120
160610    1200
160611    1900
160614    2500
160615    1360
Name: param_pojemność-skokowa, Length: 106494, dtype: int64

In [0]:
# Store engine displacement as an integer: take the text before "cm", remove
# the embedded spaces, and use -1 for entries whose string form is "None".
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(
    lambda raw: -1 if str(raw) == 'None' else int(str(raw).split('cm')[0].replace(' ', ''))
)


In [82]:
# Feature list with year, power and displacement all taken from their numeric
# columns; the remaining features stay factorized.
feats = [
    'param_napęd__cat',
    'param_rok-produkcji',
    'param_stan__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',
    'param_moc',
    'param_marka-pojazdu__cat',
    'feature_kamera-cofania__cat',
    'param_typ__cat',
    'param_pojemność-skokowa',
    'seller_name__cat',
    'feature_wspomaganie-kierownicy__cat',
    'param_model-pojazdu__cat',
    'param_wersja__cat',
    'param_kod-silnika__cat',
    'feature_system-start-stop__cat',
    'feature_asystent-pasa-ruchu__cat',
    'feature_czujniki-parkowania-przednie__cat',
    'feature_łopatki-zmiany-biegów__cat',
    'feature_regulowane-zawieszenie__cat',
]
run_model(xgb.XGBRegressor(**xgb_params), feats)



(-9570.015830397702, 53.097008450207305)

In [0]:
# Configure the git identity for this session and stage the notebook.
# NOTE(review): a personal e-mail address and name are hard-coded here and get
# stored with the notebook; consider configuring git outside the notebook.
!git config --global user.email "kuband93@gmail.com"
!git config --global user.name "Jakub Andrzejewski"
!git add day4.ipynb

In [89]:
# Commit the staged notebook.
# NOTE(review): the recorded output reports "no changes added to commit", so
# nothing was staged when this ran -- confirm that day4.ipynb is the right file.
!git commit -m 'xgboost traning to imporve model'

On branch master
Your branch is up to date with 'origin/master'.

Changes not staged for commit:
	[31mmodified:   day2_visualization.ipynb[m
	[31mmodified:   day3_simple_model.ipynb[m
	[31mmodified:   day5.ipynb[m

no changes added to commit


In [0]:
# Push the local master branch to origin and set it as the upstream.
!git push -u origin master