In [3]:
import numpy as np
import lightgbm as lgb

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.metrics import accuracy_score
from structure.Dataset import Dataset
from structure.SklearnEnsemble import SklearnEnsemble
from structure.LGBEnsemble import LGBEnsemble
from predict import majority_voting

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [25]:
# Data
iris_train, iris_val = Dataset.from_sklearn(load_iris()).split()
cancer_train, cancer_val = Dataset.from_sklearn(load_breast_cancer()).split()

# train_data, val_data = iris_train, iris_val
train_data, val_data = cancer_train, cancer_val

## Sklearn

In [5]:
# Somehow it works for max_depth >=7, and not <7
tree = DecisionTreeClassifier(max_depth=7)

clf = AdaBoostClassifier(n_estimators=10, base_estimator=tree)
ensemble = SklearnEnsemble(clf)
ensemble.fit(train_data)

clf_preds = ensemble.clf.predict(val_data.X)
preds = ensemble.predict(val_data.X)

print(clf_preds, preds)
print(accuracy_score(val_data.y, preds))
print(accuracy_score(val_data.y, clf_preds))
print(f'Node div = {ensemble.node_diversity()}')

[1 1 0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1 1 0 1 1
 1 1 0 0 1 0 1 0 0 1 0 1 0 0 1 1 0 1 0 1] [1 1 0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1 1 0 1 1
 1 1 0 0 1 0 1 0 0 1 0 1 0 0 1 1 0 1 0 1]
0.8947368421052632
0.8947368421052632
Node div = 0.0


## Light GBM

In [6]:
clf = lgb.LGBMClassifier(n_estimators=10, objective='binary')
ensemble = LGBEnsemble(clf)
ensemble.fit(train_data)

clf_preds = ensemble.clf.predict(val_data.X)
preds = ensemble.predict(val_data.X)

print(clf_preds)
print(preds)
print(accuracy_score(val_data.y, preds))
print(accuracy_score(val_data.y, clf_preds))
print(f'Node div = {ensemble.node_diversity()}')

[1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 0 0 1 1 0 1 0 1 1 0 1 1 1 0 1 1
 1 1 0 0 1 0 1 0 0 1 0 1 0 0 1 1 0 1 1 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.3333333333333333
0.9473684210526315
Node div = 9.340235543068493


In [57]:
from scipy.special import softmax, expit

X = val_data.X
y = val_data.y

trees = ensemble.trees
clf = ensemble.clf

n_classes = len(clf.classes_)
# THIS IS BAD FOR BINARY
n_estimators = clf.n_estimators

predictions = np.array([tree.predict(X) for tree in trees])
predictions = np.rollaxis(predictions, axis=1).reshape(
    (len(X), n_estimators, n_classes))

print(predictions.shape)

# TODO: There's problem here
# print(np.sum(predictions, axis=1).shape)
probs = expit(np.sum(predictions, axis=1))
# print(probs)

np.argmax(probs, axis=1), y

ValueError: cannot reshape array of size 570 into shape (57,10,2)

In [70]:
arr = 
probs = np.array([np.array([1 - prob, prob]) for prob in expit(np.sum(predictions, axis = 0))])
probs

array([[0.77462284, 0.22537716],
       [0.13875402, 0.86124598],
       [0.77771077, 0.22228923],
       [0.13636397, 0.86363603],
       [0.45797214, 0.54202786],
       [0.77870358, 0.22129642],
       [0.18217074, 0.81782926],
       [0.1668994 , 0.8331006 ],
       [0.13624044, 0.86375956],
       [0.76154519, 0.23845481],
       [0.77803804, 0.22196196],
       [0.15825654, 0.84174346],
       [0.77619048, 0.22380952],
       [0.7793897 , 0.2206103 ],
       [0.45797214, 0.54202786],
       [0.15467624, 0.84532376],
       [0.77957465, 0.22042535],
       [0.13628365, 0.86371635],
       [0.49122975, 0.50877025],
       [0.73599776, 0.26400224],
       [0.15881227, 0.84118773],
       [0.13636455, 0.86363545],
       [0.54473573, 0.45526427],
       [0.13587983, 0.86412017],
       [0.47784427, 0.52215573],
       [0.13327572, 0.86672428],
       [0.31743068, 0.68256932],
       [0.77210419, 0.22789581],
       [0.78057362, 0.21942638],
       [0.17538285, 0.82461715],
       [0.

In [62]:
clf.predict_proba(X)

array([[0.77462284, 0.22537716],
       [0.13875402, 0.86124598],
       [0.77771077, 0.22228923],
       [0.13636397, 0.86363603],
       [0.45797214, 0.54202786],
       [0.77870358, 0.22129642],
       [0.18217074, 0.81782926],
       [0.1668994 , 0.8331006 ],
       [0.13624044, 0.86375956],
       [0.76154519, 0.23845481],
       [0.77803804, 0.22196196],
       [0.15825654, 0.84174346],
       [0.77619048, 0.22380952],
       [0.7793897 , 0.2206103 ],
       [0.45797214, 0.54202786],
       [0.15467624, 0.84532376],
       [0.77957465, 0.22042535],
       [0.13628365, 0.86371635],
       [0.49122975, 0.50877025],
       [0.73599776, 0.26400224],
       [0.15881227, 0.84118773],
       [0.13636455, 0.86363545],
       [0.54473573, 0.45526427],
       [0.13587983, 0.86412017],
       [0.47784427, 0.52215573],
       [0.13327572, 0.86672428],
       [0.31743068, 0.68256932],
       [0.77210419, 0.22789581],
       [0.78057362, 0.21942638],
       [0.17538285, 0.82461715],
       [0.