In [54]:
import pandas as pd
import math
from collections import Counter
import numpy as np
import pprint
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Importing data

In [72]:
df = pd.read_csv("titanic-homework.csv").drop(['PassengerId', 'Name'], axis=1)

In [73]:
df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Survived
0,3,male,22,1,0,0
1,1,female,38,1,0,1
2,3,female,26,0,0,1
3,1,female,35,1,0,1
4,3,male,35,0,0,0
...,...,...,...,...,...,...
95,3,male,44,0,0,0
96,1,male,71,0,0,0
97,1,male,23,0,1,1
98,2,female,34,0,1,1


In [75]:
train, test = train_test_split(df, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [76]:
train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Survived
0,1,male,28,0,0,1
1,1,male,23,0,1,1
2,1,female,28,0,0,1
3,3,male,4,3,2,0
4,3,female,8,3,1,0
...,...,...,...,...,...,...
75,1,female,77,1,0,1
76,1,male,13,0,0,1
77,3,male,37,0,0,0
78,3,male,22,0,0,0


In [77]:
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Survived
0,3,male,28,0,0,0
1,1,male,45,1,0,0
2,2,male,29,0,0,0
3,3,male,44,0,0,0
4,2,male,34,0,0,0
5,2,female,5,1,2,1
6,3,male,35,0,0,0
7,3,female,19,0,0,1
8,1,female,35,1,0,1
9,3,male,2,3,1,0


# Example

In [55]:
edf = pd.DataFrame.from_dict({
    'Uczeń': ['A', 'B', 'C', 'D', 'E', 'F'],
    'Matematyka': [4, 4, 3, 5, 4, 3],
    'Biologia': [4, 5, 4, 3, 4, 5],
    'Polski': [5, 4, 4, 5, 4, 3],
    'Wiek': ['dorosły', 'dorosły', 'młody', 'młody', 'młody', 'młody'],
    'Decyzja': ['T', 'T', 'N', 'N', 'N', 'N']
})

In [56]:
edf

Unnamed: 0,Uczeń,Matematyka,Biologia,Polski,Wiek,Decyzja
0,A,4,4,5,dorosły,T
1,B,4,5,4,dorosły,T
2,C,3,4,4,młody,N
3,D,5,3,5,młody,N
4,E,4,4,4,młody,N
5,F,3,5,3,młody,N


# Helper functions

In [57]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

In [58]:
# measure of dissorder, how impure a set of classes is, if all elements belong to one class entropy is 0
def entropy(part):
    labels = Counter(part)
    p = np.array(list(labels.values())) / labels.total()
    return -sum([p_i * math.log2(p_i) if p_i != 0 else 0 for p_i in p])

In [105]:
# how much uncertainty remains in one random variable if we know the value of other variable
def cond_entropy(division):
    if type(division) is Inequality:
        division = division.prepared_for_cond_entropy()
    sizes = np.array([len(x) for x in division.values()])
    freqs = sizes / sizes.sum()
    entropies = np.array([entropy([p[1] for p in part]) for part in division.values()])
    return freqs.dot(entropies)

In [60]:
# how much we reduce the entropy
def gain(initial_entropy, conditional_entropy):
    return initial_entropy - conditional_entropy

In [95]:
class Inequality():
    def __init__(self, boundary, greater_or_equal, smaller):
        self.boundary = boundary
        self.greater_or_equal = greater_or_equal
        self.smaller = smaller

    def prepared_for_cond_entropy(self):
        return {
            f'>={self.boundary}': self.greater_or_equal,
            f'<{self.boundary}': self.smaller
        }

    def leaf(self):
        if len(self.greater_or_equal) > 0:
            return self.self.greater_or_equal[0]
        else:
            return self.smaller[0]

    def __str__(self):
        return f'Inequality({self.boundary}, {self.greater_or_equal}, {self.smaller})'

In [62]:
def number_division(doc, divisor, dividend, boundary):
    res1 = []
    res2 = []
    for i, row in doc.iterrows():
        if row[dividend] >= boundary:
            res1.append((i, row[divisor]))
        if row[dividend] < boundary:
            res2.append((i, row[divisor]))

    return Inequality(boundary, res1, res2)

In [63]:
def number_division_in_subset(doc, subset, divisor, boundary):
    res1 = []
    res2 = []
    subset = sorted(list(map(lambda x: (x[0], x[1], doc.loc[x[0]][divisor]), subset)), key=lambda x: x[2])

    for i, label, divisor_value in subset:
        if divisor_value >= boundary:
            res1.append((i, label))
        if divisor_value < boundary:
            res2.append((i, label))
    
    return Inequality(boundary, res1, res2)

In [64]:
def candidate_boundaries(doc, divisor, dividend):
    res = []
    doc = doc.sort_values(dividend)
    last_label = doc.iloc[0][divisor]
    for i, row in doc.iterrows():
        if last_label != row[divisor]:
            res.append(row[dividend])
            last_label = row[divisor]
            
    return sorted(res)

In [65]:
def candidate_boundaries_in_subset(doc, subset, divisor):
    subset = list(map(lambda x: (x[0], x[1], doc.loc[x[0]][divisor]), subset))
    subset = sorted(subset, key=lambda x: x[2])
    
    res = []
    last_label = subset[0][1]
    for _, label, divisor_value in subset:
        if last_label != label:
            res.append(divisor_value)
            last_label = label
            
    return sorted(res)

In [66]:
def best_number_division(doc, divisor, dividend):
    boundaries = candidate_boundaries(doc, divisor, dividend)
    best_division = None
    best_entropy = 0
    for boundary in boundaries:
        division = number_division(doc, divisor, dividend, boundary)

        current_entropy = cond_entropy(division.prepared_for_cond_entropy())
        if best_division is None or best_entropy > current_entropy:
            best_entropy = current_entropy
            best_division = division
    return best_division

In [67]:
# only for first iteration, split dataset based on divisor column, with dividend column values
def initial_divide(doc, divisor, dividend):
    division = {}
    for i, row in doc.iterrows():
        if row[divisor] not in division:
            division[row[divisor]] = []
        division[row[divisor]].append((i, row[dividend]))
    return division

In [None]:
# for other iterations, same as initial, but works on subsets
def divide(doc, subset, divisor):
    division = {}
    if doc.dtypes[divisor] in numerics:
        boundaries = candidate_boundaries_in_subset(doc, subset, divisor)
        best_division = None
        best_entropy = 0
        for boundary in boundaries:
            division = number_division_in_subset(doc, subset, divisor, boundary)
    
            current_entropy = cond_entropy(division.prepared_for_cond_entropy())
            if best_division is None or best_entropy > current_entropy:
                best_entropy = current_entropy
                best_division = division
        division = best_division
    else:
        for i, label in subset:
            row = doc.iloc[i]
            if row[divisor] not in division:
                division[row[divisor]] = []
            division[row[divisor]].append((i, label))
    return division

In [122]:
def initial_divide(doc, divisor, dividend):
    division = {}
    if doc.dtypes[divisor] in numerics:
        division = best_number_division(doc, dividend, divisor)
    else:
        for i, row in doc.iterrows():
            if row[divisor] not in division:
                division[row[divisor]] = []
            division[row[divisor]].append((i, row[dividend]))
    return division

In [83]:
def is_leaf(subset):
    if type(subset) is Inequality:
        gr_or_eq_subset = Counter([part[1] for part in subset.greater_or_equal])
        sm_subset = Counter([part[1] for part in subset.smaller])
        a = len(gr_or_eq_subset.keys())
        b = len(sm_subset.keys())
        return (a == 1 and b == 0) or (a == 0 and b == 1)
    x = Counter([part[1] for part in subset])
    return len(x.keys()) == 1

In [16]:
division = initial_divide(edf, 'Matematyka', 'Decyzja')
division

{4: [(0, 'T'), (1, 'T'), (4, 'N')], 3: [(2, 'N'), (5, 'N')], 5: [(3, 'N')]}

In [17]:
initial_entropy = entropy(edf['Decyzja'])
initial_entropy

np.float64(0.9182958340544896)

In [18]:
conditional_entropy = cond_entropy(division)
conditional_entropy

np.float64(0.4591479170272448)

In [19]:
gain(initial_entropy, conditional_entropy)

np.float64(0.4591479170272448)

In [20]:
division2 = initial_divide(edf, 'Biologia', 'Decyzja')
division2

{4: [(0, 'T'), (2, 'N'), (4, 'N')], 5: [(1, 'T'), (5, 'N')], 3: [(3, 'N')]}

In [21]:
conditional_entropy2 = cond_entropy(division2)
conditional_entropy2

np.float64(0.792481250360578)

In [22]:
gain(initial_entropy, conditional_entropy2)

np.float64(0.12581458369391152)

In [23]:
division3 = initial_divide(edf, 'Polski', 'Decyzja')
division3

{5: [(0, 'T'), (3, 'N')], 4: [(1, 'T'), (2, 'N'), (4, 'N')], 3: [(5, 'N')]}

In [24]:
conditional_entropy3 = cond_entropy(division3)
conditional_entropy3

np.float64(0.792481250360578)

In [25]:
gain(initial_entropy, conditional_entropy3)

np.float64(0.12581458369391152)

In [26]:
nl_division = divide(edf, [(0, 'T'), (1, 'T'), (4, 'N')], 'Biologia')
nl_division

{np.int64(4): [(0, 'T'), (4, 'N')], np.int64(5): [(1, 'T')]}

In [27]:
conditional_entropy4 = cond_entropy(nl_division)
conditional_entropy4

np.float64(0.6666666666666666)

In [28]:
gain(initial_entropy, conditional_entropy4)

np.float64(0.2516291673878229)

In [29]:
nl_division2 = divide(edf, [(0, 'T'), (1, 'T'), (4, 'N')], 'Polski')
nl_division2

{np.int64(5): [(0, 'T')], np.int64(4): [(1, 'T'), (4, 'N')]}

In [30]:
conditional_entropy5 = cond_entropy(nl_division2)
conditional_entropy5

np.float64(0.6666666666666666)

In [31]:
gain(initial_entropy, conditional_entropy5)

np.float64(0.2516291673878229)

In [32]:
is_leaf(nl_division[5])

True

# Algorithm

In [354]:
def decision_tree(doc, label):
    initial_entropy = entropy(doc[label])

    tree = {}
    columns = doc.columns.drop(label)

    # calculate initial division entropy gains
    # select the best division (biggest gain)
    best_initial_division = None
    best_gain = 0
    best_column = None
    for column in columns:
        division = initial_divide(doc, column, label)
        conditional_entropy = cond_entropy(division) if type(division) is not Inequality else cond_entropy(division.prepared_for_cond_entropy())
        g = gain(initial_entropy, conditional_entropy)
        if best_gain < g:
            best_column = column
            best_gain = g
            best_initial_division = division
    
    current_entropy = best_gain

    # initialize tree root
    tree[best_column] = {}
    curr_tree = tree[best_column]

    # node queue
    nodes = []

    if doc.dtypes[best_column] not in numerics:
        labels_counter = Counter()
        doc_keys = doc[best_column].unique()
    
        # fill tree node
        for k, v in best_initial_division.items():
            if is_leaf(v):
                curr_tree[k] = v[0][1]
                labels_counter[v[0][1]] += 1
            else:
                curr_tree[k] = {}
                nodes.append((curr_tree[k], v, best_column))

        # fill in absent keys
        absent_keys = set(best_initial_division.keys()).symmetric_difference(set(doc_keys))

        for absent_key in absent_keys:
            curr_tree[absent_key] = labels_counter.most_common()[0][0]

    # do the same but for every subnode
    while len(nodes) > 0:
        curr_tree, subset, last_column = nodes.pop(0)

        # find best division
        best_division = None
        best_gain = 0
        best_column = None
        for column in columns.drop(last_column):
            division = divide(doc, subset, column)
            conditional_entropy = cond_entropy(division) if type(division) is not Inequality else cond_entropy(division.prepared_for_cond_entropy())
            g = gain(initial_entropy, conditional_entropy)
            if best_gain < g:
                best_column = column
                best_gain = g
                best_division = division

        # initialize subnode root
        curr_tree[best_column] = {}
        curr_tree = curr_tree[best_column]

        if doc.dtypes[best_column] not in numerics:
            labels_counter = Counter()
            doc_keys = doc[best_column].unique()
    
            # fill in the subnode leafs
            for k, v in best_division.items():
                if is_leaf(v):
                    curr_tree[k] = v[0][1]
                    labels_counter[v[0][1]] += 1
                else:
                    curr_tree[k] = {}
                    nodes.append((curr_tree[k], v, best_column))
    
            # fill in the missing leafs
            absent_keys = set(best_division.keys()).symmetric_difference(set(doc_keys))
    
            for absent_key in absent_keys:
                curr_tree[absent_key] = labels_counter.most_common()[0][0]
    
    return tree

In [355]:
model = decision_tree(example, 'Decyzja')
model

Matematyka


{'Matematyka': {}}

In [67]:
pprint.pp(model)

{'Matematyka': {4: {'Biologia': {np.int64(4): {'Polski': {np.int64(5): 'T',
                                                          np.int64(4): 'N',
                                                          np.int64(3): 'T'}},
                                 np.int64(5): 'T',
                                 np.int64(3): 'T'}},
                3: 'N',
                5: 'N'}}


In [39]:
train.drop(['PassengerId', 'Name'], axis=1)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Survived
0,3,male,31,0,0,0
1,3,male,59,0,0,0
2,3,male,27,0,0,0
3,2,female,27,1,0,1
4,3,male,24,0,0,0
...,...,...,...,...,...,...
75,3,female,8,3,1,0
76,1,male,57,0,0,0
77,3,male,34,0,0,0
78,3,female,63,0,0,1


In [80]:
train.dtypes

Pclass       int64
Sex         object
Age          int64
SibSp        int64
Parch        int64
Survived     int64
dtype: object

In [81]:
train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived'], dtype='object')

In [84]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

In [87]:
train.dtypes['Pclass'] in numerics

True

In [40]:
model = decision_tree(train.drop(['PassengerId', 'Name'], axis=1), 'Survived')
model

{'Age': {31: 0,
  59: 0,
  27: {'Sex': {'male': 0, 'female': 1}},
  24: 0,
  18: {'SibSp': {np.int64(2): 0,
    np.int64(0): 1,
    np.int64(1): 0,
    np.int64(3): 0,
    np.int64(4): 0,
    np.int64(5): 0}},
  44: 0,
  46: 0,
  14: {'Pclass': {np.int64(3): {'SibSp': {np.int64(1): 0,
      np.int64(0): 1,
      np.int64(2): 0,
      np.int64(3): 0,
      np.int64(4): 0,
      np.int64(5): 0}},
    np.int64(2): 1,
    np.int64(1): 1}},
  77: 1,
  58: 1,
  13: 1,
  34: 0,
  4: {'Sex': {'male': 0, 'female': 1}},
  29: 1,
  38: 1,
  55: 1,
  25: 0,
  2: 0,
  52: 0,
  6: 1,
  28: {'Pclass': {np.int64(1): {'SibSp': {np.int64(0): 1,
      np.int64(1): 0,
      np.int64(2): 1,
      np.int64(3): 1,
      np.int64(4): 1,
      np.int64(5): 1}},
    np.int64(3): 0,
    np.int64(2): 0}},
  40: {'Pclass': {np.int64(3): 0, np.int64(1): 1, np.int64(2): 0}},
  21: {'Sex': {'male': 0, 'female': 1}},
  33: 1,
  19: 0,
  32: 0,
  49: 1,
  57: {'Pclass': {np.int64(3): 1, np.int64(1): 0, np.int64(2): 1}}

In [41]:
def enumerate_tree(tree, parent, depth):
    item_i = 0
    for i,k in sorted(enumerate(tree), key=lambda x:x[1]):     
        if isinstance(tree[k], dict):
            enumerate_tree(tree[k], k, depth + 1)
        else:
            sign = ' > ' if item_i == len(tree) - 1 else ' <= '
            if isinstance(k, np.int64) or isinstance(k, str):
                sign = ' = '
            print('|     '*(depth//2),'|---- ', parent, sign , k, sep='')
            print('|     '*(depth//2),'|---- ', tree[k], sep='')
        item_i+= 1

In [42]:
enumerate_tree(model, None, 0)

|---- Age <= 2
|---- 0
|     |---- Sex = female
|     |---- 1
|     |---- Sex = male
|     |---- 0
|---- Age <= 5
|---- 1
|---- Age <= 6
|---- 1
|---- Age <= 7
|---- 0
|---- Age <= 8
|---- 0
|---- Age <= 12
|---- 1
|---- Age <= 13
|---- 1
|     |---- Pclass = 1
|     |---- 1
|     |---- Pclass = 2
|     |---- 1
|     |     |---- SibSp = 0
|     |     |---- 1
|     |     |---- SibSp = 1
|     |     |---- 0
|     |     |---- SibSp = 2
|     |     |---- 0
|     |     |---- SibSp = 3
|     |     |---- 0
|     |     |---- SibSp = 4
|     |     |---- 0
|     |     |---- SibSp = 5
|     |     |---- 0
|---- Age <= 15
|---- 1
|---- Age <= 16
|---- 0
|     |---- SibSp = 0
|     |---- 1
|     |---- SibSp = 1
|     |---- 0
|     |---- SibSp = 2
|     |---- 0
|     |---- SibSp = 3
|     |---- 0
|     |---- SibSp = 4
|     |---- 0
|     |---- SibSp = 5
|     |---- 0
|---- Age <= 19
|---- 0
|---- Age <= 20
|---- 0
|     |---- Sex = female
|     |---- 1
|     |---- Sex = male
|     |---- 0
|---- Age <

In [43]:
def predict(model, example):
    key = list(model.keys())[0]
    subtree = model

    while True:
        subtree = subtree[key][example[key]]
        if type(subtree) is dict:
            key = list(subtree.keys())[0]
        else:
            return subtree

In [44]:
print("Calculating accuracy...")

accurate = 0
for _, row in test.iterrows():
    print(row)
    output = predict(model, row)
    if output == row['Survived']:
        accurate += 1

print(f"Accuracy is {accurate / test.shape[0]}")

Calculating accuracy...
PassengerId                         20
Pclass                               3
Name           Masselmani, Mrs. Fatima
Sex                             female
Age                                 63
SibSp                                0
Parch                                0
Survived                             1
Name: 0, dtype: object
PassengerId                             82
Pclass                                   2
Name           Sheerlinck, Mr. Jan Baptist
Sex                                   male
Age                                     29
SibSp                                    0
Parch                                    0
Survived                                 0
Name: 1, dtype: object
PassengerId                      91
Pclass                            3
Name           Christmann, Mr. Emil
Sex                            male
Age                              29
SibSp                             0
Parch                             0
Survived              

KeyError: 3

In [45]:
predict(model, test.iloc[5])

KeyError: np.int64(3)

In [46]:
test.iloc[5]

PassengerId                                          44
Pclass                                                2
Name           Laroche, Miss. Simonne Marie Anne Andree
Sex                                              female
Age                                                   3
SibSp                                                 1
Parch                                                 2
Survived                                              1
Name: 5, dtype: object

In [47]:
predict(model, test.iloc[0])

1

In [242]:
ex2 = pd.DataFrame.from_dict({
    'age': [18, 21, 24, 26, 34, 41, 46, 52, 56, 70],
    'class': [1, 1, 2, 2, 2, 3, 3, 3, 2, 2],
    'salary': [1000, 1100, 3000, 3100, 3200, 5100, 4900, 5000, 2900, 3050]
})
ex2 = ex2.sample(ex2.shape[0])
ex2

Unnamed: 0,age,class,salary
0,18,1,1000
2,24,2,3000
5,41,3,5100
8,56,2,2900
9,70,2,3050
4,34,2,3200
1,21,1,1100
7,52,3,5000
3,26,2,3100
6,46,3,4900


In [360]:
inequality = initial_divide(ex2, 'class', 'age')
print(inequality)

Inequality(24, [(2, np.int64(2)), (5, np.int64(3)), (8, np.int64(2)), (9, np.int64(2)), (4, np.int64(2)), (7, np.int64(3)), (3, np.int64(2)), (6, np.int64(3))], [(0, np.int64(1)), (1, np.int64(1))])


In [361]:
inequality.test(25)

True

In [362]:
inequality.items()

dict_items([('>=24', [(2, np.int64(2)), (5, np.int64(3)), (8, np.int64(2)), (9, np.int64(2)), (4, np.int64(2)), (7, np.int64(3)), (3, np.int64(2)), (6, np.int64(3))]), ('<24', [(0, np.int64(1)), (1, np.int64(1))])])

In [33]:
class ClassNode():
    def __init__(self, feature_name, nodes={}):
        self.feature_name = feature_name
        self.nodes = nodes

    def add_node(self, node, value):
        self.nodes[value] = node

    def test(self, row):
        print(self)
        feature_value = row[self.feature_name]
        return self.nodes[feature_value].test(row)

    def __str__(self):
        return f'ClassNode({self.feature_name})'

In [34]:
class LeafNode():
    def __init__(self, value):
        self.value = value

    def test(self, row):
        print(self)
        return self.value

    def __str__(self):
        return f'LeafNode({self.value})'

In [53]:
class IntervalNode():
    def __init__(self, feature_name, boundary, gr_eq_node=None, le_node=None):
        self.feature_name = feature_name
        self.boundary = boundary
        self.gr_eq_node = gr_eq_node
        self.le_node = le_node

    def set_gr_eq_node(self, gr_eq_node):
        self.gr_eq_node = gr_eq_node

    def set_le_node(self, le_node):
        self.le_node = le_node
    
    def test(self, row):
        print(self)
        feature_value = row[self.feature_name]
        if feature_value >= self.boundary:
            return self.gr_eq_node.test(row)
        else:
            return self.le_node.test(row)
    
    def __str__(self):
        return f'IntervalNode({self.feature_name}, {self.boundary})'

In [36]:
tree = IntervalNode(
    "Matematyka",
    4,
    IntervalNode(
        "Biologia",
        4,
        IntervalNode(
            "Polski",
            4,
            ClassNode(
                "Wiek",
                {
                    "dorosły": LeafNode("T"),
                    "młody": LeafNode("N")
                }
            ),
            LeafNode("N")
        ),
        LeafNode('N')
    ),
    LeafNode('N')
)


In [37]:
print(tree)

IntervalNode(Matematyka, 4)


In [40]:
edf.iloc[2]

Uczeń             C
Matematyka        3
Biologia          4
Polski            4
Wiek          młody
Decyzja           N
Name: 2, dtype: object

In [41]:
tree.test(edf.iloc[2])

IntervalNode(Matematyka, 4)
LeafNode(N)


'N'

In [None]:
def build_tree(data, label_key):
    initial_entropy = entropy(data[label_key])
    columns = doc.columns.drop(label_key)

    best_division = None
    best_gain = 0
    best_column = None
    for column in columns:
        division = initial_divide(data, column, label_key)
        cond_ent_val = cond_entropy(division)
        g = gain(initial_entropy, cond_ent_val)
        if best_gain < g:
            best_column = column
            best_gain = g
            best_division = division
    current_entropy = best_gain

    if type(best_division) is Inequality:
        tree = IntervalNode(best_column, best_division.boundary)
    else:
        tree = ClassNode(best_column)

    curr_tree = tree
    
    # node queue
    nodes = []
    

In [136]:
data = edf.drop(['Uczeń', 'Wiek'], axis=1)
label_key = 'Decyzja'

In [138]:
initial_entropy = entropy(data[label_key])
columns = data.columns.drop(label_key)

best_division = None
best_gain = 0
best_column = None
for column in columns:
    division = initial_divide(data, column, label_key)
    cond_ent_val = cond_entropy(division)
    g = gain(initial_entropy, cond_ent_val)
    if best_gain < g:
        best_column = column
        best_gain = g
        best_division = division
current_entropy = best_gain

if type(best_division) is Inequality:
    tree = IntervalNode(best_column, best_division.boundary)
    if is_leaf(best_division):
        leaf = best_division.leaf()
        leaf_value = data.loc[leaf[0]][best_column]
        if leaf_value >= best_division.boundary:
            tree.set_gr_eq_node(LeafNode(leaf[1]))
            other_label = [x for x in data[label_key].unique().tolist() if x != leaf[1]][0]
            tree.set_le_node(LeafNode(other_label))
        else:
            tree.set_le_node(LeafNode(leaf[1]))
            other_label = [x for x in data[label_key].unique().tolist() if x != leaf[1]][0]
            tree.set_gr_eq_node(LeafNode(other_label))
else:
    tree = ClassNode(best_column)

    for k, v in best_division.items():
        if is_leaf(v):
            tree.add_node(None, k)


In [128]:
str(tree)

'IntervalNode(Matematyka, 4)'

In [94]:
tree.test(data.iloc[0])

ClassNode(Wiek)


KeyError: 'dorosły'

In [129]:
tree.gr_eq_node

In [130]:
tree.le_node