In [1]:
from utils import prepare_jupyter
# Project-local notebook setup; what it configures is defined in `utils`, not in this notebook.
prepare_jupyter()

In [2]:
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt

from matplotlib.pylab import rcParams
from structure import XGBoostEnsemble
from data import Dataset

# Default size for matplotlib figures created in this notebook.
rcParams['figure.figsize'] = 12, 10

# Alternative dataset, kept for quick switching:
# train_set, val_set = Dataset.create_cancer().split(0.5)
# The 0.5 argument is passed to Dataset.split as-is; presumably the split fraction — confirm in `data`.
train_set, val_set = Dataset.create_iris().split(0.5)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
# Small model: 3 boosting rounds, trees limited to depth 5.
params = dict(n_estimators=3, max_depth=5)

# `fit` returns the estimator itself, so construction and training can be chained.
clf = xgb.XGBClassifier(**params).fit(train_set.X, train_set.y)

# Predicted class labels for the validation features (displayed as the cell output).
clf.predict(val_set.X)

array([2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1, 0, 1, 2, 1, 0, 0, 1, 2, 1,
       0, 2, 0, 0, 1, 2, 1, 0, 1, 1, 1, 0, 1, 1, 1, 2, 2, 2, 0, 2, 2, 1,
       1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 1, 0, 0, 1, 1, 2, 0, 0, 0, 2, 2, 2,
       0, 2, 2, 0, 2, 1, 1, 0, 2])

In [4]:
# Sanity check: the estimator count configured on the classifier.
clf.n_estimators

3

In [5]:
from structure.xgboost import XGBoostTree

# Export the tree at index 1 as a Graphviz object; its string form is what XGBoostTree.parse
# receives. Exporting via Graphviz was chosen as the best way to retrieve per-node info;
# additional data is available from `trees_to_dataframe`.
gv = xgb.to_graphviz(clf, num_trees=1)

xgb_tree = XGBoostTree.parse(str(gv), train_set)

# Predictions of the parsed single tree on the validation features.
xgb_preds = xgb_tree.predict(val_set.X)
xgb_preds

array([-6.62337765e-02, -6.62337765e-02,  1.37142867e-01, -6.62337765e-02,
        1.37142867e-01, -6.62337765e-02, -6.62337765e-02,  1.37142867e-01,
       -6.62337765e-02,  1.37142867e-01,  1.37142867e-01,  1.37142867e-01,
        1.37142867e-01, -6.99248239e-02,  1.37142867e-01, -6.62337765e-02,
        1.37142867e-01, -6.99248239e-02, -6.99248239e-02,  1.37142867e-01,
       -6.62337765e-02,  1.37142867e-01, -6.99248239e-02, -6.62337765e-02,
       -6.99248239e-02, -6.99248239e-02,  1.37142867e-01, -2.55448485e-09,
        1.37142867e-01, -6.99248239e-02,  1.37142867e-01,  1.37142867e-01,
        1.37142867e-01, -6.99248239e-02,  1.37142867e-01,  1.37142867e-01,
        1.37142867e-01, -6.62337765e-02, -2.55448485e-09, -2.55448485e-09,
       -6.99248239e-02, -6.62337765e-02, -6.62337765e-02,  1.37142867e-01,
        1.37142867e-01, -6.62337765e-02,  1.37142867e-01, -6.99248239e-02,
       -6.62337765e-02, -6.99248239e-02,  1.37142867e-01, -6.62337765e-02,
       -6.62337765e-02,  

In [6]:
# Graphviz renders independently of matplotlib, so no pyplot figure is created here
# (a `plt.figure(...)` call would only add an empty "0 Axes" figure to the output).
# `render()` writes the graph to disk and returns the path of the produced file.
xgb.to_graphviz(clf).render()

'Digraph.gv.pdf'

<Figure size 1152x720 with 0 Axes>

In [7]:
# Text dump of the first tree: split condition, child node ids and leaf values.
clf.get_booster().get_dump()[0]

'0:[f2<2.70000005] yes=1,no=2,missing=1\n\t1:leaf=0.139849633\n\t2:leaf=-0.0713513568\n'

In [8]:
# Booster object behind the fitted classifier, reused by the cells below.
booster = clf.get_booster()

In [9]:
# Tabular view of the trees, one row per node; show the first 13 rows.
booster.trees_to_dataframe().head(n=13)

Unnamed: 0,Tree,Node,ID,Feature,Split,Yes,No,Missing,Gain,Cover
0,0,0,0-0,f2,2.7,0-1,0-2,0-1,38.31858,33.333332
1,0,1,0-1,Leaf,,,,,0.1398496,13.777777
2,0,2,0-2,Leaf,,,,,-0.07135136,19.555553
3,1,0,1-0,f2,2.7,1-1,1-2,1-1,12.42016,33.333332
4,1,1,1-1,Leaf,,,,,-0.06992482,13.777777
5,1,2,1-2,f3,1.7,1-3,1-4,1-3,19.99545,19.555553
6,1,3,1-3,Leaf,,,,,0.1371429,10.666666
7,1,4,1-4,f0,6.05,1-5,1-6,1-5,0.5060558,8.888888
8,1,5,1-5,Leaf,,,,,-2.554485e-09,1.333333
9,1,6,1-6,Leaf,,,,,-0.06623378,7.555555


In [10]:
from math import ceil, log2

trees_df = booster.trees_to_dataframe()
# Keep only the rows that belong to tree 0.
first_tree = trees_df[trees_df['Tree'] == 0]
# Alias used as the input of divide_nodes below.
nodes = first_tree

def get_info(node):
    """Summarise one row of ``Booster.trees_to_dataframe()`` as a plain dict.

    Parameters
    ----------
    node : sequence
        A positional row (e.g. one entry of ``DataFrame.values``). Only four
        positions are read: 1 (``Node``), 3 (``Feature``), 8 (``Gain``) and
        9 (``Cover``).

    Returns
    -------
    dict
        ``id``          - the node number within its tree;
        ``type``        - ``'leaf'`` when the Feature column is ``'Leaf'``, else ``'split'``;
        ``feature_idx`` - integer feature index for splits, ``None`` for leaves;
        ``value``       - the Gain column (for leaf rows this column holds the leaf value);
        ``count``       - the Cover column truncated to an int.
    """
    feature = node[3]
    is_leaf = feature == 'Leaf'

    return {
        'id': node[1],
        'type': 'leaf' if is_leaf else 'split',
        # Split rows name their feature like 'f2'; drop the one-character prefix
        # to obtain the index. Leaf rows carry no feature, so nothing is parsed.
        'feature_idx': None if is_leaf else int(feature[1:]),
        'value': node[8],
        'count': int(node[9]),
    }


def divide_nodes(nodes):
    """Group tree rows into consecutive chunks of size 1, 2, 4, ...

    Each row is first converted with ``get_info``. The converted rows are then
    sliced in order: the first chunk holds one node, the next two, the next
    four, and so on until every row has been consumed. The last chunk may be
    shorter than its nominal size.

    The grouping is purely positional: a chunk corresponds to a tree level only
    if the rows are in breadth-first order and the tree is complete.

    Parameters
    ----------
    nodes : iterable of sequences
        Rows accepted by ``get_info`` (e.g. ``DataFrame.values``).

    Returns
    -------
    list of numpy.ndarray
        Object arrays of node dicts, one per chunk; ``[]`` for empty input.
    """
    # dtype=object keeps the result a 1-D array of dicts, also for empty input.
    transformed = np.array([get_info(node) for node in nodes], dtype=object)

    divided_nodes = []
    start, size = 0, 1
    # Consume rows until none are left, rather than pre-computing the number of
    # levels as ceil(log2(n)): that formula raises for n == 0, yields no levels
    # for n == 1, and drops trailing rows whenever n is a power of two.
    while start < len(transformed):
        divided_nodes.append(transformed[start:start + size])
        start += size
        size *= 2

    return divided_nodes
    
# `.values` yields positional rows, which is what get_info indexes into.
divided = divide_nodes(nodes.values)
divided

[array([{'id': 0, 'type': 'split', 'feature_idx': 2, 'value': 38.3185768, 'count': 33}],
       dtype=object),
 array([{'id': 1, 'type': 'leaf', 'feature_idx': None, 'value': 0.139849633, 'count': 13},
        {'id': 2, 'type': 'leaf', 'feature_idx': None, 'value': -0.0713513568, 'count': 19}],
       dtype=object)]

In [11]:
# Extract the numeric part of a 'leaf=<value>' token: everything after the '='.
leaf_str = 'leaf=-0.0697674453'
float(leaf_str.partition('=')[2])

-0.0697674453

# XGBoost Ensemble testing

Wykorzystanie własnej struktury do ewaluacji

In [12]:
# Same small configuration as the plain XGBClassifier cell: 3 estimators, depth 5.
params = {
    'n_estimators': 3,
    'max_depth': 5
}

# Project-local wrapper under test; it receives the parameter dict as a single argument.
xgb_ensemble = XGBoostEnsemble(params)

In [13]:
# Split each dataset; the 0.90 argument is passed to Dataset.split as-is.
iris_train, iris_val = Dataset.create_iris().split(0.90)
cancer_train, cancer_val = Dataset.create_cancer().split(0.90)

xgb_ensemble.fit(iris_train)

X_test = iris_val.X
y_test = iris_val.y

# Class probabilities: custom structure vs. the classifier exposed as `.clf`
# (presumably the underlying XGBoost model — confirm in `structure`).
my_preds_raw = xgb_ensemble.predict_proba(X_test)
clf_preds_raw = xgb_ensemble.clf.predict_proba(X_test)

# Class labels: the reference predictions must come from `.clf` as well.
# Calling `xgb_ensemble.predict` for both would compare the custom structure with itself.
my_preds_cls = xgb_ensemble.predict(X_test)
clf_preds_cls = xgb_ensemble.clf.predict(X_test)

# Bugfixing

In [14]:
train_set, val_set = Dataset.create_iris().split(0.5)

params = {
    'learning_rate': 0.1,
    'max_depth': 2,
    'n_estimators': 60,
    # NOTE(review): `num_leaves` is a LightGBM parameter name; XGBoost's counterpart is
    # `max_leaves` — confirm how XGBoostEnsemble treats this key.
    'num_leaves': 30
}

ensemble = XGBoostEnsemble(params)
ensemble.fit(train_set)

# Predictions from the custom structure and from the classifier exposed as `.clf`, for comparison.
my_preds = ensemble.predict(val_set.X)
clf_preds = ensemble.clf.predict(val_set.X)

# Categorical feature

In [40]:
from sklearn.metrics import accuracy_score

train_set, val_set = Dataset.from_openml('aids').split(0.5)
# NOTE(review): only the training split goes through `oh_encoded()` (presumably one-hot
# encoding); `val_set` is used as-is below — confirm this is intended.
train_set = train_set.oh_encoded()

params = {
    'learning_rate': 0.1,
    'max_depth': 2,
    'n_estimators': 60,
    # NOTE(review): `num_leaves` is a LightGBM parameter name — confirm how XGBoostEnsemble treats it.
    'num_leaves': 30
}

ensemble = XGBoostEnsemble(params)
ensemble.fit(train_set)

my_preds = ensemble.predict(val_set.X)
clf_preds = ensemble.clf.predict(val_set.X)
y_true = val_set.y

# NOTE(review): only `clf_preds` is scored; `my_preds` and `y_true` are assigned but not used in this cell.
print(f'Accuracy = {accuracy_score(val_set.y, clf_preds)}')

Accuracy = 0.6


In [None]:
# Encoded
# NOTE(review): `enc_set` is not assigned in any cell visible here and this cell has no
# execution count; unless it is defined elsewhere, a fresh run would stop with NameError.
enc_set