In [214]:
import pandas as pd
import math
from collections import Counter
import numpy as np
import pprint

# Importing data

In [4]:
df = pd.read_csv("titanic-homework.csv")

In [5]:
df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22,1,0,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,1
2,3,3,"Heikkinen, Miss. Laina",female,26,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,1
4,5,3,"Allen, Mr. William Henry",male,35,0,0,0
...,...,...,...,...,...,...,...,...
95,96,3,"Shorney, Mr. Charles Joseph",male,44,0,0,0
96,97,1,"Goldschmidt, Mr. George B",male,71,0,0,0
97,98,1,"Greenfield, Mr. William Bertram",male,23,0,1,1
98,99,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,1


In [327]:
len(df["Name"].unique())

100

# Example

In [105]:
edf = pd.DataFrame.from_dict({
    'Uczeń': ['A', 'B', 'C', 'D', 'E', 'F'],
    'Matematyka': [4, 4, 3, 5, 4, 3],
    'Biologia': [4, 5, 4, 3, 4, 5],
    'Polski': [5, 4, 4, 5, 4, 3],
    'Decyzja': ['T', 'T', 'N', 'N', 'N', 'N']
})

In [106]:
edf

Unnamed: 0,Uczeń,Matematyka,Biologia,Polski,Decyzja
0,A,4,4,5,T
1,B,4,5,4,T
2,C,3,4,4,N
3,D,5,3,5,N
4,E,4,4,4,N
5,F,3,5,3,N


# Helper functions

In [149]:
def entropy(part):
    labels = Counter(part)
    p = np.array(list(labels.values())) / labels.total()
    return -sum([p_i * math.log2(p_i) if p_i != 0 else 0 for p_i in p])

In [163]:
def cond_entropy(division):
    sizes = np.array([len(x) for x in division.values()])
    freqs = sizes / sizes.sum()
    entropies = np.array([entropy([p[1] for p in part]) for part in division.values()])
    return freqs.dot(entropies)

In [99]:
def gain(initial_entropy, conditional_entropy):
    return initial_entropy - conditional_entropy

In [177]:
def divide(doc, subset, divisor):
    division = {}
    for i, label in subset:
        row = doc.iloc[i]
        if row[divisor] not in division:
            division[row[divisor]] = []
        division[row[divisor]].append((i, label))
    return division

In [137]:
def initial_divide(doc, divisor, dividend):
    division = {}
    for i, row in doc.iterrows():
        if row[divisor] not in division:
            division[row[divisor]] = []
        division[row[divisor]].append((i, row[dividend]))
    return division

In [192]:
def is_leaf(subset):
    x = Counter([part[1] for part in subset])
    return len(x.keys()) == 1

In [154]:
division = initial_divide(edf, 'Matematyka', 'Decyzja')
division

{4: [(0, 'T'), (1, 'T'), (4, 'N')], 3: [(2, 'N'), (5, 'N')], 5: [(3, 'N')]}

In [155]:
initial_entropy = entropy(edf['Decyzja'])
initial_entropy

np.float64(0.9182958340544896)

In [166]:
conditional_entropy = cond_entropy(division)
conditional_entropy

np.float64(0.4591479170272448)

In [167]:
gain(initial_entropy, conditional_entropy)

np.float64(0.4591479170272448)

In [168]:
division2 = initial_divide(edf, 'Biologia', 'Decyzja')
division2

{4: [(0, 'T'), (2, 'N'), (4, 'N')], 5: [(1, 'T'), (5, 'N')], 3: [(3, 'N')]}

In [169]:
conditional_entropy2 = cond_entropy(division2)
conditional_entropy2

np.float64(0.792481250360578)

In [170]:
gain(initial_entropy, conditional_entropy2)

np.float64(0.12581458369391152)

In [171]:
division3 = initial_divide(edf, 'Polski', 'Decyzja')
division3

{5: [(0, 'T'), (3, 'N')], 4: [(1, 'T'), (2, 'N'), (4, 'N')], 3: [(5, 'N')]}

In [172]:
conditional_entropy3 = cond_entropy(division3)
conditional_entropy3

np.float64(0.792481250360578)

In [173]:
gain(initial_entropy, conditional_entropy3)

np.float64(0.12581458369391152)

In [178]:
nl_division = divide(edf, [(0, 'T'), (1, 'T'), (4, 'N')], 'Biologia')
nl_division

{np.int64(4): [(0, 'T'), (4, 'N')], np.int64(5): [(1, 'T')]}

In [180]:
conditional_entropy4 = cond_entropy(nl_division)
conditional_entropy4

np.float64(0.6666666666666666)

In [183]:
gain(initial_entropy, conditional_entropy4)

np.float64(0.2516291673878229)

In [179]:
nl_division2 = divide(edf, [(0, 'T'), (1, 'T'), (4, 'N')], 'Polski')
nl_division2

{np.int64(5): [(0, 'T')], np.int64(4): [(1, 'T'), (4, 'N')]}

In [181]:
conditional_entropy5 = cond_entropy(nl_division2)
conditional_entropy5

np.float64(0.6666666666666666)

In [184]:
gain(initial_entropy, conditional_entropy5)

np.float64(0.2516291673878229)

In [195]:
is_leaf(nl_division[5])

True

# Algorithm

In [200]:
example = edf.drop('Uczeń', axis=1)

In [363]:
def decision_tree(doc, label):
    initial_entropy = entropy(doc[label])

    tree = {}
    columns = doc.columns.drop(label)

    # calculate initial division entropy gains
    # select the best division
    best_initial_division = None
    best_gain = 0
    best_column = None
    for column in columns:
        division = initial_divide(doc, column, label)
        conditional_entropy = cond_entropy(division)
        g = gain(initial_entropy, conditional_entropy)
        if best_gain < g:
            best_column = column
            best_gain = g
            best_initial_division = division
    
    current_entropy = best_gain

    # initialize tree root
    tree[best_column] = {}
    curr_tree = tree[best_column]

    # node queue
    nodes = []

    labels_counter = Counter()
    doc_keys = doc[best_column].unique()

    # fill tree node
    for k, v in best_initial_division.items():
        if is_leaf(v):
            curr_tree[k] = v[0][1]
            labels_counter[v[0][1]] += 1
        else:
            curr_tree[k] = {}
            nodes.append((curr_tree[k], v, best_column))

    # fill in absent keys
    absent_keys = set(best_initial_division.keys()).symmetric_difference(set(doc_keys))

    for absent_key in absent_keys:
        curr_tree[absent_key] = labels_counter.most_common()[0][0]

    # do the same but for every subnode
    while len(nodes) > 0:
        curr_tree, subset, last_column = nodes.pop(0)

        # find best division
        best_division = None
        best_gain = 0
        best_column = None
        for column in columns.drop(last_column):
            division = divide(doc, subset, column)
            conditional_entropy = cond_entropy(division)
            g = gain(initial_entropy, conditional_entropy)
            if best_gain < g:
                best_column = column
                best_gain = g
                best_division = division

        # initialize subnode root
        curr_tree[best_column] = {}
        curr_tree = curr_tree[best_column]

        labels_counter = Counter()
        doc_keys = doc[best_column].unique()

        # fill in the subnode leafs
        for k, v in best_division.items():
            if is_leaf(v):
                curr_tree[k] = v[0][1]
                labels_counter[v[0][1]] += 1
            else:
                curr_tree[k] = {}
                nodes.append((curr_tree[k], v, best_column))

        # fill in the missing leafs
        absent_keys = set(best_division.keys()).symmetric_difference(set(doc_keys))

        for absent_key in absent_keys:
            curr_tree[absent_key] = labels_counter.most_common()[0][0]
    
    return tree

In [364]:
model = decision_tree(example, 'Decyzja')
model

{'Matematyka': {4: {'Biologia': {np.int64(4): {'Polski': {np.int64(5): 'T',
      np.int64(4): 'N',
      np.int64(3): 'T'}},
    np.int64(5): 'T',
    np.int64(3): 'T'}},
  3: 'N',
  5: 'N'}}

In [365]:
pprint.pp(model)

{'Matematyka': {4: {'Biologia': {np.int64(4): {'Polski': {np.int64(5): 'T',
                                                          np.int64(4): 'N',
                                                          np.int64(3): 'T'}},
                                 np.int64(5): 'T',
                                 np.int64(3): 'T'}},
                3: 'N',
                5: 'N'}}


In [366]:
model = decision_tree(df.drop(['PassengerId', 'Name'], axis=1), 'Survived')
model

{'Age': {22: 0,
  38: 1,
  26: {'Sex': {'female': 1, 'male': 0}},
  35: {'Pclass': {np.int64(1): 1, np.int64(3): 0, np.int64(2): 0}},
  34: {'Sex': {'male': 0, 'female': 1}},
  54: 0,
  2: 0,
  27: {'Sex': {'female': 1, 'male': 0}},
  14: {'Pclass': {np.int64(2): 1,
    np.int64(3): {'SibSp': {np.int64(0): 1,
      np.int64(1): 0,
      np.int64(2): 1,
      np.int64(3): 1,
      np.int64(4): 1,
      np.int64(5): 1}},
    np.int64(1): 1}},
  4: {'Sex': {'female': 1, 'male': 0}},
  58: 1,
  20: 0,
  39: 0,
  55: 1,
  12: 1,
  31: 0,
  63: 1,
  15: 1,
  28: {'Pclass': {np.int64(1): {'SibSp': {np.int64(0): 1,
      np.int64(1): 0,
      np.int64(2): 1,
      np.int64(3): 1,
      np.int64(4): 1,
      np.int64(5): 1}},
    np.int64(3): 0,
    np.int64(2): 0}},
  8: 0,
  44: 0,
  19: {'Sex': {'male': 0, 'female': 1}},
  6: 1,
  78: 0,
  40: {'Pclass': {np.int64(1): 1, np.int64(3): 0, np.int64(2): 1}},
  77: 1,
  66: 0,
  42: 0,
  21: {'Sex': {'male': 0, 'female': 1}},
  18: {'SibSp': {np.

In [367]:
def predict(model, example):
    key = list(model.keys())[0]
    subtree = model

    while True:
        subtree = subtree[key][example[key]]
        if type(subtree) is dict:
            key = list(subtree.keys())[0]
        else:
            return subtree

In [368]:
predict(model, df.iloc[4])

0

In [369]:
df.iloc[4]

PassengerId                           5
Pclass                                3
Name           Allen, Mr. William Henry
Sex                                male
Age                                  35
SibSp                                 0
Parch                                 0
Survived                              0
Name: 4, dtype: object