In [1]:
from utils import prepare_jupyter
# Project-local helper imported from `utils`; presumably prepares the notebook
# environment (e.g. import paths) before the `structure.*` imports below —
# TODO confirm against utils.prepare_jupyter.
prepare_jupyter()

In [3]:
import catboost as cb
import numpy as np

from structure.Dataset import Dataset
from structure.CatboostTree import CatboostTree
from structure.CatboostEnsemble import CatboostEnsemble

# Build train/validation parts for both benchmark datasets via split(0.5).
iris_train, iris_val = Dataset.create_iris().split(0.5)
cancer_train, cancer_val = Dataset.create_cancer().split(0.5)

# Dataset used by the rest of the notebook. For the multi-class experiment
# switch to the commented-out iris pair instead.
train_set = cancer_train
val_set = cancer_val
# train_set, val_set = iris_train, iris_val

# Number of target classes; later cells branch on it (binary vs. multi-class).
n_classes = train_set.num_classes()

In [2]:
# https://github.com/catboost/tutorials/blob/master/apply_model/model_export_as_json_tutorial.ipynb
from sklearn.metrics import accuracy_score

# Training configuration. The same dict is passed to CatboostEnsemble in a
# later cell, so keep both in sync by editing it only here.
params = dict(
    loss_function='MultiClass' if train_set.name == 'iris' else 'Logloss',
    depth=2,
    num_trees=3,
    verbose=False,
)

# Reference CatBoost model that the hand-written evaluation is compared against.
clf = cb.CatBoostClassifier(**params)
clf.fit(train_set.X, train_set.y)

# Hold-out accuracy of the reference model (displayed as the cell output).
y_true = val_set.y
y_pred = clf.predict(val_set.X)

accuracy_score(y_true, y_pred)

0.9438596491228071

In [3]:
# CatBoost can only export its JSON representation to a file, so the model is
# written to disk here and read back in the next cell. Use the platform's
# temporary directory instead of a hard-coded '/tmp', which does not exist on
# every operating system.
import os
import tempfile

MODEL_FILE = os.path.join(tempfile.gettempdir(), 'catboost.model.json')
clf.save_model(MODEL_FILE, format='json')

In [4]:
import json

# Read the exported model back. The context manager closes the file handle
# immediately instead of leaving it open until garbage collection.
with open(MODEL_FILE, 'r') as model_file:
    model = json.load(model_file)

# 'oblivious_trees' holds one dict per boosted tree; keep the first one around
# as a small example for the tree-parser cells further down.
trees = model['oblivious_trees']
tree = trees[0]

In [5]:
# Fit the project's ensemble wrapper with the same parameters as `clf`.
cb_ensemble = CatboostEnsemble(params)
cb_ensemble.fit(train_set)

# Probabilities from the wrapper's own predict_proba, a blank line, then the
# probabilities from `cb_ensemble.clf` (presumably the wrapped CatBoost model —
# confirm in structure.CatboostEnsemble) for the same two validation rows.
print(cb_ensemble.predict_proba(val_set.X[0:2]), end='\n\n')
print(cb_ensemble.clf.predict_proba(val_set.X[0:2]))

[0.1830589  1.64173357 1.35752655]
[[0.45436265 0.54563735]
 [0.16222931 0.83777069]
 [0.2046426  0.7953574 ]]

[[0.32493223 0.67506777]
 [0.0793565  0.9206435 ]]


In [6]:
def traverse(x, splits, leaf_values, it=0):
    """Return the leaf value that sample ``x`` reaches in one oblivious tree.

    Every level of the tree applies a single split; the outcome of the split
    at position ``k`` (counted from ``it``) decides bit ``k`` of the leaf
    index, which is equivalent to repeatedly keeping the odd (condition true)
    or even (condition false) entries of ``leaf_values``.

    Args:
        x: Indexable sample; ``x[feature_idx]`` must be comparable to a float.
        splits: Sequence of dicts with the keys ``'float_feature_index'`` and
            ``'border'`` (the border may be a number or a numeric string).
        leaf_values: Sequence (list or 1-D array) with exactly
            ``2 ** len(splits[it:])`` entries.
        it: Index of the first split to apply; earlier splits are skipped.

    Returns:
        The element of ``leaf_values`` selected by the split outcomes. A tree
        without any remaining splits returns its single leaf value.

    Raises:
        ValueError: If the number of leaf values does not match the number of
            remaining splits.
    """
    remaining_splits = splits[it:]
    expected_leaves = 1 << len(remaining_splits)
    if len(leaf_values) != expected_leaves:
        raise ValueError(
            'expected %d leaf values for %d splits, got %d'
            % (expected_leaves, len(remaining_splits), len(leaf_values))
        )

    leaf_index = 0
    for level, split in enumerate(remaining_splits):
        feature_idx = split['float_feature_index']
        border = float(split['border'])
        # A satisfied condition selects the odd-indexed half at this level,
        # i.e. it sets bit `level` of the final leaf index.
        if x[feature_idx] > border:
            leaf_index |= 1 << level

    return leaf_values[leaf_index]


# def traverse_multi(x, splits, leaf_values, it=0):
#     split = splits[it]
#     feature_idx = split['float_feature_index']
#     border = float(split['border'])
    
#     n = len(leaf_values)
    
#     if x[feature_idx] > border:
#         next_leaf_values = leaf_values[1::2]
#     else:
#         next_leaf_values = leaf_values[0::2]
    
#     if n_classes > 2 and n_classes == len(next_leaf_values):
#         return next_leaf_values
    
#     if len(next_leaf_values) == 1:
#         return next_leaf_values[0]
        
#     return traverse(x, splits, next_leaf_values, it=it+1)


def evaluate(X, oblivious_trees):
    """Compute the raw per-tree outputs of an exported CatBoost model.

    Relies on the notebook-level ``n_classes`` and on ``traverse``.

    Args:
        X: Iterable of samples (each indexable by feature index).
        oblivious_trees: List of tree dicts from the exported JSON model, each
            with the keys ``'splits'`` and ``'leaf_values'``.

    Returns:
        ``np.ndarray`` of shape ``(n_samples, n_trees)`` when
        ``n_classes <= 2`` and ``(n_samples, n_trees, n_classes)`` otherwise.
        The values are NOT summed over trees; callers do that themselves.
    """
    multiclass = n_classes > 2

    if multiclass:
        # CatBoost stores multi-dimensional leaf values leaf-major, i.e.
        # leaf_values[leaf * n_classes + class]. Reshape to
        # (n_leaves, n_classes) and transpose so that each row holds the leaf
        # values of one class. Done once per tree rather than once per sample.
        class_leaf_tables = [
            np.asarray(tree['leaf_values']).reshape((-1, n_classes)).T
            for tree in oblivious_trees
        ]

    results = []

    for x in X:
        result = []

        for tree_idx, tree in enumerate(oblivious_trees):
            if multiclass:
                result.append([
                    traverse(x, tree['splits'], class_leaves)
                    for class_leaves in class_leaf_tables[tree_idx]
                ])
            else:
                result.append(traverse(x, tree['splits'], tree['leaf_values']))

        results.append(result)

    return np.array(results)


def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise.

    Accepts a scalar or a NumPy array and returns a value of the same shape.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def softmax(x, axis=-1):
    """Compute softmax values for each set of scores in ``x``.

    Each slice along ``axis`` is normalised independently, so a 2-D input of
    shape ``(n_samples, n_classes)`` yields one probability distribution per
    row. For 1-D input the result is the same as normalising the whole array.

    Args:
        x: Array-like of scores.
        axis: Axis holding the scores of one set (default: the last axis).

    Returns:
        ``np.ndarray`` with the same shape as ``x`` whose entries along
        ``axis`` sum to 1.
    """
    scores = np.asarray(x)
    # Subtracting the per-set maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large scores.
    shifted = scores - np.max(scores, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=axis, keepdims=True)

In [7]:
# Two validation rows are enough to eyeball the raw per-tree values.
X_test = val_set.X[:2]
my_preds = evaluate(X_test, trees)

# Number of exported trees, followed by the per-tree values for each row.
print(len(trees), my_preds, sep='\n')

# Cross-check against the project's tree wrapper: it is built from trees[2],
# so its predictions should equal the third column printed above.
cb_tree = CatboostTree.parse(trees[2], train_set)
cb_tree.predict(X_test)

3
[[-0.7684336   0.82086679  0.67876327]
 [ 0.9514925   0.82086679  0.67876327]]


array([0.67876327, 0.67876327])

In [None]:
X_test = val_set.X[0:2]

# Per-tree raw values: (n_samples, n_trees) for a binary model,
# (n_samples, n_trees, n_classes) for a multi-class one.
my_preds = evaluate(X_test, trees)
print(my_preds)
# Boosting adds the tree outputs up, so collapse the tree axis.
my_preds = np.sum(my_preds, axis=1)

print()
print(my_preds)
print()

if n_classes > 2:
    # One score per class and sample -> softmax over each row.
    my_results_proba = np.array([softmax(el) for el in my_preds])
else:
    # A single score per sample -> sigmoid gives P(class 1); build the
    # [P(class 0), P(class 1)] layout so it lines up with predict_proba.
    my_results_proba = np.array([[1 - p, p] for p in sigmoid(my_preds)])
my_results_cls = np.argmax(my_results_proba, axis=1)

clf_results_proba = clf.predict_proba(X_test)
clf_results_cls = clf.predict(X_test).reshape((len(X_test),))

# print(clf_results_proba[6], my_results_proba[6])
print(my_results_proba)
print()
print(clf_results_proba)
# print(clf_results_cls - my_results_cls)

In [None]:
X_test = val_set.X

# evaluate() returns one raw value per tree; the model's raw prediction is the
# sum over the tree axis, so reduce before comparing with CatBoost.
my_preds = np.sum(evaluate(X_test, trees), axis=1)
clf_preds_raw = clf.predict(X_test, prediction_type="RawFormulaVal")
clf_preds_cls = clf.predict(X_test)

# Every value should be equal. Compare element-wise with a small tolerance:
# summing the differences would let positive and negative errors cancel, and
# exact float equality is too strict after the JSON round-trip.
# NOTE(review): if the exported model carries a non-zero scale/bias it has to
# be applied to my_preds as well — confirm against the JSON contents.
np.testing.assert_allclose(my_preds, clf_preds_raw, rtol=1e-6, atol=1e-8)

## Jak to działa
Surowe wartości z poszczególnych drzew należy najpierw zsumować, a dopiero potem przepuścić sumę przez sigmoid (2 klasy) lub softmax (więcej niż 2 klasy).

In [None]:
# Sigmoid (for 2 classes) / softmax (for more than 2 classes):
# turn the summed raw scores into probabilities and class labels, then check
# that the labels agree with CatBoost's own predictions.
X_test = val_set.X

y_clf = clf.predict(X_test)
y_clf_proba = clf.predict_proba(X_test)

results = evaluate(X_test, trees)
results = np.sum(results, axis=1)

if n_classes == 2:
    results_proba = np.array([[1 - r, r] for r in sigmoid(results)])
else:
    # One row of per-class scores per sample.
    results_proba = np.array([softmax(r) for r in results])
results_cls = np.argmax(results_proba, axis=1)
results_cls = np.array(results_cls, dtype=np.float32)

# Compare label by label: a summed difference can cancel out (+1 and -1), and
# CatBoost may return predictions with shape (n, 1), so flatten them first.
assert np.array_equal(results_cls, np.asarray(y_clf).reshape(-1))

In [None]:
# Softmax (dla >2 klas)

# Tree parser

In [None]:
from structure.CatboostTree import CatboostTree

# Parse the example tree kept from the JSON export (`tree = trees[0]`) into the
# project's CatboostTree wrapper. This rebinds `cb_tree`, which an earlier cell
# built from trees[2].
cb_tree = CatboostTree.parse(tree, train_set)

In [None]:
from IPython.display import Image
# Display `test.png` from the current working directory inline.
# NOTE(review): no visible cell writes this file — presumably it is produced
# by the tree wrapper or an earlier run; confirm, otherwise this cell fails on
# a fresh checkout.
Image('test.png')