In [1]:
from utils import prepare_jupyter
prepare_jupyter()

In [2]:
import catboost as cb
import numpy as np

from data import Dataset
from structure import CatboostEnsemble

iris_train, iris_val = Dataset.create_iris().split(0.5)
cancer_train, cancer_val = Dataset.create_cancer().split(0.5)
aids_train, aids_val = Dataset.from_openml('aids').split(0.5)

train_set, val_set = cancer_train, cancer_val
# train_set, val_set = iris_train, iris_val

n_classes = train_set.num_classes()

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## Categorical features

In [3]:
import json
from sklearn.datasets import fetch_openml

aids = fetch_openml('aids')
feature_names = np.array(aids.feature_names)
categories = np.array(list(aids.categories.keys()))
feature_names, categories

cat_features = np.array([np.argwhere(feature_names == cat)[0, 0] for cat in categories])

In [8]:
clf = cb.CatBoostClassifier(n_estimators=10, max_depth=3, cat_features=cat_features)
clf = clf.fit(aids_train.X, aids_train.y)

MODEL_FILE = '/tmp/catboost.model.json'
clf.save_model(MODEL_FILE, format='json')    
model = json.load(open(MODEL_FILE, 'r'))

Learning rate set to 0.35706
0:	learn: 0.6655432	total: 1.61ms	remaining: 14.5ms
1:	learn: 0.6394976	total: 2.75ms	remaining: 11ms
2:	learn: 0.6273726	total: 3.73ms	remaining: 8.71ms
3:	learn: 0.6262767	total: 4.48ms	remaining: 6.72ms
4:	learn: 0.5909399	total: 5.89ms	remaining: 5.89ms
5:	learn: 0.5872647	total: 6.96ms	remaining: 4.64ms
6:	learn: 0.5863374	total: 8.44ms	remaining: 3.62ms
7:	learn: 0.5717829	total: 14.8ms	remaining: 3.7ms
8:	learn: 0.5424408	total: 17.5ms	remaining: 1.94ms
9:	learn: 0.5348615	total: 22.7ms	remaining: 0us


In [9]:
tree = model['oblivious_trees'][0]
model['features_info']

{'categorical_features': [{'flat_feature_index': 0, 'feature_index': 0},
  {'flat_feature_index': 1, 'feature_index': 1}],
 'ctrs': [{'borders': [3.9999990463256836,
    7.999999046325684,
    9.999999046325684],
   'prior_numerator': 0,
   'shift': 0,
   'target_border_idx': 0,
   'ctr_type': 'Borders',
   'scale': 15,
   'elements': [{'cat_feature_index': 0,
     'combination_element': 'cat_feature_value'}],
   'identifier': '{\n  "type":"Borders",\n  "identifier":\n    [\n      {\n        "cat_feature_index":0,\n        "combination_element":"cat_feature_value"\n      }\n    ]\n}',
   'prior_denomerator': 1},
  {'borders': [4.999999046325684, 5.999999046325684],
   'prior_numerator': 0.5,
   'shift': 0,
   'target_border_idx': 0,
   'ctr_type': 'Borders',
   'scale': 15,
   'elements': [{'cat_feature_index': 0,
     'combination_element': 'cat_feature_value'}],
   'identifier': '{\n  "type":"Borders",\n  "identifier":\n    [\n      {\n        "cat_feature_index":0,\n        "combina

## Rest

In [10]:
# https://github.com/catboost/tutorials/blob/master/apply_model/model_export_as_json_tutorial.ipynb
from sklearn.metrics import accuracy_score

params = {
    'loss_function': 'MultiClass' if train_set.name == 'iris' else 'Logloss',
    'depth': 2,
    'num_trees': 3,
    'verbose': False
}

clf = cb.CatBoostClassifier(**params)
clf.fit(train_set.X, train_set.y)

y_pred = clf.predict(val_set.X)
y_true = val_set.y

accuracy_score(y_true, y_pred)

0.9578947368421052

In [11]:
# Unfortunately there is no way to store it without file
MODEL_FILE = '/tmp/catboost.model.json'
clf.save_model(MODEL_FILE, format='json')

In [12]:
import json

model = json.load(open(MODEL_FILE, 'r'))
trees = model['oblivious_trees']
tree = trees[0]

In [13]:
cb_ensemble = CatboostEnsemble(params)
cb_ensemble.fit(train_set)
print(cb_ensemble.predict_proba(val_set.X[0:2]))
print()
print(cb_ensemble.clf.predict_proba(val_set.X[0:2]))

AttributeError: 'numpy.ndarray' object has no attribute 'X'

In [14]:
def traverse(x, splits, leaf_values, it=0):
    split = splits[it]
    feature_idx = split['float_feature_index']
    border = float(split['border'])
    
    n = len(leaf_values)
    
    if x[feature_idx] > border:
        next_leaf_values = leaf_values[1::2]
    else:
        next_leaf_values = leaf_values[0::2]
    
    if len(next_leaf_values) == 1:
        return next_leaf_values[0]
        
    return traverse(x, splits, next_leaf_values, it=it+1)


# def traverse_multi(x, splits, leaf_values, it=0):
#     split = splits[it]
#     feature_idx = split['float_feature_index']
#     border = float(split['border'])
    
#     n = len(leaf_values)
    
#     if x[feature_idx] > border:
#         next_leaf_values = leaf_values[1::2]
#     else:
#         next_leaf_values = leaf_values[0::2]
    
#     if n_classes > 2 and n_classes == len(next_leaf_values):
#         return next_leaf_values
    
#     if len(next_leaf_values) == 1:
#         return next_leaf_values[0]
        
#     return traverse(x, splits, next_leaf_values, it=it+1)


def evaluate(X, oblivious_trees):
    results = []
    
    for x in X:
        result = []

        for tree in oblivious_trees:
            if n_classes > 2:
                leaf_values = np.array(tree['leaf_values'])
                leaf_values = leaf_values.reshape((n_classes, len(leaf_values)//n_classes))
                result.append([traverse(x, tree['splits'], lv) for lv in leaf_values])
            else:
                result.append(traverse(x, tree['splits'], tree['leaf_values']))

        results.append(result)
        
    return np.array(results)


def sigmoid(x):
    return 1/(1+np.exp(-x))


def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

In [15]:
X_test = val_set.X[0:2]
print(len(trees))
my_preds = evaluate(X_test, trees)
print(my_preds)

cb_tree = CatboostTree.parse(trees[2], train_set)
cb_tree.predict(X_test)

3
[[-1.24077423 -0.8073566  -0.46290973]
 [ 0.75991268  0.6810476   0.48967347]]


NameError: name 'CatboostTree' is not defined

In [16]:
X_test = val_set.X[0:2]

my_preds = evaluate(X_test, trees)
print(my_preds)
my_preds = np.sum(my_preds, axis=1)

print()
print(my_preds)
print()

my_results_proba = np.array([softmax(el) for el in my_preds])
my_results_cls = np.argmax(my_results_proba, axis=1)

clf_results_proba = clf.predict_proba(X_test)
clf_results_cls = clf.predict(X_test).reshape((len(X_test),))

# print(clf_results_proba[6], my_results_proba[6])
print(my_results_proba)
print()
print(clf_results_proba)
# print(clf_results_cls - my_results_cls)

[[-1.24077423 -0.8073566  -0.46290973]
 [ 0.75991268  0.6810476   0.48967347]]

[-2.51104057  1.93063374]



AxisError: axis 1 is out of bounds for array of dimension 1

In [17]:
X_test = val_set.X

my_preds = evaluate(X_test, trees)
clf_preds_raw = clf.predict(X_test, prediction_type="RawFormulaVal")
clf_preds_cls = clf.predict(X_test)

# every value should be equal
assert(np.sum(my_preds - clf_preds_raw) == 0)

ValueError: operands could not be broadcast together with shapes (285,3) (285,) 

## Jak to działa
Wartości są przepuszczane przez sigmoid lub multiclass, ale najpierw należy je sumować, chyba.

In [18]:
# Sigmoid (dla 2 klas)
X_test = val_set.X

y_clf = clf.predict(X_test)
y_clf_proba = clf.predict_proba(X_test)

results = evaluate(X_test, trees)
results = np.sum(results, axis=1)

if n_classes == 2:
    results_proba = np.array([[1 - r, r] for r in sigmoid(results)])
else:
    results_proba
results_cls = np.argmax(results_proba, axis=1)
results_cls = np.array(results_cls, dtype=np.float32)

assert(np.sum(results_cls - y_clf) == 0)

In [19]:
# Softmax (dla >2 klas)

# Tree parser

In [None]:
from structure.CatboostTree import CatboostTree

cb_tree = CatboostTree.parse(tree, train_set)

In [None]:
from IPython.display import Image
Image('test.png')

In [6]:
from data import Dataset

pyrim = Dataset.from_openml('pyrim')


  " {version}.".format(name=name, version=res[0]['version']))


In [9]:
ls = [(1, 'a'), (2, 'b'), (3, 'c')]
(2, 'a') in ls

False