In [247]:
import numpy as np
import pandas as pd

In [248]:
# Suppress NumPy's divide-by-zero and invalid-value (e.g. 0/0) floating-point
# warnings for the whole session.
# NOTE(review): presumably this is for the count normalisation in
# get_entropy, which divides by a sum that can be zero — confirm, because the
# setting also hides such warnings everywhere else in the notebook.
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [249]:
# Locations of the four raw data sets; they all live in one data directory.
DATA_DIR = "../data"

cad_file = DATA_DIR + "/CAD.data"
fishing_file = DATA_DIR + "/fishing.data"
contact_lenses_file = DATA_DIR + "/contact-lenses.data"
caesarian_file = DATA_DIR + "/caesarian.data"

In [250]:
# Parse every data file as header-less CSV; text following a '#' is treated
# as a comment by the parser and does not produce rows.
csv_options = {"header": None, "comment": "#"}

cad = pd.read_csv(cad_file, **csv_options)
fishing = pd.read_csv(fishing_file, **csv_options)
contact_lenses = pd.read_csv(contact_lenses_file, **csv_options)
caesarian = pd.read_csv(caesarian_file, **csv_options)

In [251]:
# function to get colnames from comments
import string

def get_colnames(file):
    """Extract column names from the '#' comment lines of a data file.

    Every line that starts with '#' is collected, stripped of trailing
    whitespace and of leading/trailing '#' characters, and cleared of all
    punctuation; its first whitespace-separated word is taken as a name.

    Parameters
    ----------
    file : path of the data file to read.

    Returns
    -------
    list of str
        The collected names without the first two and without the
        second-to-last one (i.e. ``names[2:-2]`` followed by ``names[-1]``).
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)

    with open(file) as f:
        comment_texts = [
            raw.rstrip().strip('#') for raw in f if raw.startswith('#')
        ]

    names = [text.translate(drop_punctuation).split()[0] for text in comment_texts]
    return names[2:-2] + [names[-1]]

In [252]:
# Derive the column names of every data set from the comment lines of its file.
(cad_colnames,
 fishing_colnames,
 contact_lenses_colnames,
 caesarian_colnames) = [
    get_colnames(path)
    for path in (cad_file, fishing_file, contact_lenses_file, caesarian_file)
]

In [253]:
# Attach the parsed column names to their frames (assigned in place).
for _frame, _names in (
    (cad, cad_colnames),
    (fishing, fishing_colnames),
    (contact_lenses, contact_lenses_colnames),
    (caesarian, caesarian_colnames),
):
    _frame.columns = _names

In [254]:
import pickle

infile = "../data/decision_tree_data.pickle"

# Serialise the four frames back-to-back into one pickle file; whoever reads
# the file has to load them in this same order.
with open(infile, 'wb') as f:
    for _frame in (cad, fishing, contact_lenses, caesarian):
        pickle.dump(_frame, f)

In [255]:
# Read the four frames back, one pickle.load per frame, in the order
# cad, fishing, contact_lenses, caesarian.
with open(infile, 'rb') as f:
    cad1, fishing1, contact_lenses1, caesarian1 = [
        pickle.load(f) for _ in range(4)
    ]

In [258]:
def get_entropy(arr):
    """Return the base-2 Shannon entropy of a collection of category counts.

    Parameters
    ----------
    arr : iterable of numbers or None
        Occurrence counts, one per category. ``None`` entries are treated
        as a count of zero.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the normalised counts, where categories
        with probability zero contribute nothing. An empty or all-zero
        input has no distribution to normalise and yields ``0.0`` (rather
        than the ``nan`` a 0/0 division would produce).
    """
    counts = np.asarray([0 if v is None else v for v in arr], dtype=float)
    total = counts.sum()

    # Nothing observed: define the entropy as 0 instead of dividing by zero.
    if total == 0:
        return 0.0

    probs = counts / total

    # log2(0) is undefined; leave those terms at 0 so that 0 * log2(0) == 0.
    log2p = np.zeros_like(probs)
    nonzero = probs != 0
    log2p[nonzero] = np.log2(probs[nonzero])

    return -np.sum(probs * log2p)

In [259]:
from collections import OrderedDict
import operator
import copy

def _categorical_information_gain(df, attr, class_values, class_entropy):
    """Information gain of splitting ``df`` on every distinct value of ``attr``."""
    col_ind = df.columns.get_loc(attr)

    # Relative frequency of each attribute value.
    value_counts = df[attr].value_counts()
    cardinality_dict = (value_counts / value_counts.sum()).to_dict()

    # Joint (attribute value, class value) counts. Pairs that never occur
    # are absent, so .get() returns None, which get_entropy reads as zero.
    attr_class_count = df.iloc[:, [col_ind, -1]].value_counts()
    entropy_dict = {
        var_value: get_entropy(
            [attr_class_count.get((var_value, class_value)) for class_value in class_values]
        )
        for var_value in df[attr].unique()
    }

    # Order both mappings by attribute value so weights and entropies line up.
    cardinalities = [cardinality_dict[key] for key in sorted(cardinality_dict)]
    entropies = [entropy_dict[key] for key in sorted(entropy_dict)]
    return class_entropy - np.sum(np.multiply(cardinalities, entropies))


def _best_numeric_split(df, attr, class_values, class_entropy):
    """Return ``(threshold, gain)`` for the best ``<`` / ``>=`` split of ``attr``.

    Returns ``None`` when there is no candidate threshold.
    """
    col_ind = df.columns.get_loc(attr)
    ordered = df.iloc[:, [col_ind, -1]].sort_values(by=[attr]).reset_index(drop=True)
    attr_sorted = ordered.iloc[:, 0]
    class_sorted = ordered.iloc[:, 1]
    n_rows = len(ordered)

    # Candidate thresholds: midpoints between consecutive rows whose class
    # differs. A dict is used as an insertion-ordered set so a repeated
    # midpoint is evaluated only once.
    thresholds = {}
    for j in range(n_rows - 1):
        if class_sorted.iloc[j] != class_sorted.iloc[j + 1]:
            thresholds[(attr_sorted.iloc[j] + attr_sorted.iloc[j + 1]) / 2] = None

    best = None
    for value in thresholds:
        cardinalities = []
        entropies = []
        # "<" partition first, then ">=".
        for mask in (attr_sorted < float(value), attr_sorted >= float(value)):
            part_classes = class_sorted[mask]
            counts = [
                int((part_classes == class_value).sum()) for class_value in class_values
            ]
            cardinalities.append(len(part_classes) / n_rows)
            # An empty side (possible when the midpoint equals the smallest
            # attribute value) carries zero weight and zero entropy.
            entropies.append(get_entropy(counts) if len(part_classes) else 0.0)

        gain = class_entropy - np.sum(np.multiply(cardinalities, entropies))
        # Strict ">" keeps the first threshold among equally good ones.
        if best is None or gain > best[1]:
            best = (value, gain)

    return best


def get_attribute_w_max_information_gain(df):
    """Pick the split of ``df`` with the highest information gain.

    The class label is assumed to be in the last column; every other column
    is a candidate attribute.

    * ``object`` columns are treated as categorical: the gain is the class
      entropy minus the entropy of the class within each distinct value,
      weighted by that value's relative frequency.
    * ``int64`` columns are treated as numeric: candidate thresholds are the
      midpoints between consecutive rows (sorted by the attribute) whose
      class differs; each threshold is scored as a binary ``<`` / ``>=``
      split and the best threshold represents the column.
    * Columns of any other dtype are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Attributes followed by the class column.

    Returns
    -------
    The column label of the winning categorical attribute, or, for a
    numeric attribute, the string ``"<column>" + "<" + "<threshold>"``
    (for example ``"Age<34.5"``). On ties the earliest column wins.

    Raises
    ------
    ValueError
        If no attribute could be scored (``max`` of an empty mapping).
    """
    class_entropy = get_entropy(df.iloc[:, -1:].value_counts().values)
    class_values = pd.unique(df.iloc[:, -1])

    information_gain_dict = {}

    for attr in df.columns[:-1]:
        attr_dtype = df[attr].dtype

        if attr_dtype == 'object':
            information_gain_dict[attr] = _categorical_information_gain(
                df, attr, class_values, class_entropy
            )

        elif attr_dtype == 'int64':
            best = _best_numeric_split(df, attr, class_values, class_entropy)
            if best is None:
                # No class change along this attribute: nothing to split on.
                continue
            threshold, gain = best
            # Label the split with the attribute's own name so the caller
            # can recover both the column and the threshold from the key.
            information_gain_dict[str(attr) + '<' + str(threshold)] = gain

    return max(information_gain_dict.items(), key=operator.itemgetter(1))[0]

In [265]:
from treelib import Tree
import uuid, re

def grow_tree(tree, branch, new_id, parent_id):
    """Add a node labelled ``branch`` with identifier ``new_id`` to ``tree``.

    The first node put into an empty tree is created without a parent
    (``parent_id`` is ignored in that case); every later node is attached
    beneath ``parent_id``.
    """
    if tree.size() != 0:
        tree.create_node(branch, new_id, parent=parent_id)
        return
    tree.create_node(branch, new_id)

# Module-level tree that id3() and its recursive calls add nodes to. Rebind
# this name to a fresh Tree() before each top-level id3() call; otherwise the
# nodes of an earlier run are still in it.
tree = Tree()

def id3(df, parent=None):
    """Recursively grow a decision tree for ``df`` inside the module-level ``tree``.

    The class label is read from the last column of ``df``. Nodes are added
    to the global ``tree`` object (not to a tree passed in), each under a
    freshly generated UUID:

    * one distinct class value   -> a leaf holding that value;
    * only the class column left -> a leaf holding the most frequent class;
    * otherwise the attribute chosen by
      ``get_attribute_w_max_information_gain`` becomes a node, with one child
      per attribute value (categorical; the column is dropped before
      recursing) or two children ``name<value`` / ``name>=value`` (numeric;
      the column is kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Attributes followed by the class column.
    parent : identifier of the node to attach to, or ``None`` for the root.

    Returns
    -------
    The module-level ``tree``.
    """
    node_id = uuid.uuid4()

    # Pure node: every row carries the same class.
    if len(pd.unique(df.iloc[:, -1])) == 1:
        grow_tree(tree, df.iloc[0, -1:].values[0], node_id, parent)
        return tree

    # No attributes left to split on: fall back to the majority class.
    if len(df.columns) == 1:
        grow_tree(tree, df.iloc[:, -1:].mode().values[0][0], node_id, parent)
        return tree

    node_var = get_attribute_w_max_information_gain(df)
    grow_tree(tree, node_var, node_id, parent)

    # A result that is an actual column label denotes a categorical split.
    if node_var in df.columns.values:
        for value in list(np.unique(df[node_var].values)):
            value_node_id = uuid.uuid4()
            tree.create_node(value, value_node_id, parent=node_id)
            matching = df[node_var] == value
            id3(df[matching].drop(node_var, axis=1), value_node_id)
        return tree

    # Otherwise the result encodes a numeric split as "<name><<threshold>".
    pieces = str.split(node_var, '<')
    below_label = pieces[0] + '<' + pieces[1]
    above_label = pieces[0] + '>=' + pieces[1]

    below_id = uuid.uuid4()
    above_id = uuid.uuid4()
    tree.create_node(below_label, below_id, parent=node_id)
    tree.create_node(above_label, above_id, parent=node_id)

    name_value = re.split('<|>=', below_label)
    name = name_value[0]
    threshold = float(name_value[1])

    below_df = df[df[name] < threshold]
    above_df = df[df[name] >= threshold]

    for part, sibling, part_id in (
        (below_df, above_df, below_id),
        (above_df, below_df, above_id),
    ):
        # An empty side gets no branch at all.
        if part.size == 0:
            tree.remove_node(part_id)
            continue

        if len(np.unique(part.iloc[:, -1:].values)) == 1:
            grow_tree(tree, part.iloc[0, -1:].values[0], uuid.uuid4(), part_id)
        elif sibling.size == 0:
            # The split separated nothing, so recursing would not shrink the
            # data; stop with the majority class instead.
            grow_tree(tree, part.iloc[:, -1:].mode().values[0][0], uuid.uuid4(), part_id)
        else:
            id3(part, part_id)

    return tree

In [266]:
# Start from an empty tree, grow it on the caesarian data and print it.
# id3() returns the module-level `tree`, so showing `tree` shows its result.
tree = Tree()
id3(caesarian1)
tree.show()

Cardiac
├── abnormal
│   └── BP
│       ├── high
│       │   └── Delivery
│       │       ├── late
│       │       │   └── Age<34.5
│       │       │       ├── Age<34.5
│       │       │       │   └── no
│       │       │       └── Age>=34.5
│       │       │           └── yes
│       │       ├── normal
│       │       │   └── yes
│       │       └── premature
│       │           └── yes
│       ├── low
│       │   └── yes
│       └── normal
│           └── Age<26.5
│               ├── Age<26.5
│               │   └── no
│               └── Age>=26.5
│                   └── Delivery
│                       ├── late
│                       │   └── no
│                       └── normal
│                           └── Age<28.5
│                               ├── Age<28.5
│                               │   └── yes
│                               └── Age>=28.5
│                                   └── Age<29.0
│                                       └── Age>=29.0
│                           

In [267]:
# Fit one tree per data set. id3() writes into the module-level `tree`, so a
# fresh Tree() is bound to that name before every call.
_fitted = []
for _frame in (cad1, fishing1, contact_lenses1, caesarian1):
    tree = Tree()
    _fitted.append(id3(_frame))

dtree_cad, dtree_fishing, dtree_contact_lenses, dtree_caesarian = _fitted

In [268]:
# Display the root-to-leaf paths of the CAD tree; each path is a list of node
# identifiers (the UUIDs generated in id3), not of the node labels.
dtree_cad.paths_to_leaves()

[[UUID('fd0a7ae6-009b-4414-8846-3bf621cd7f41'),
  UUID('f05e872a-d03e-4d40-950b-b44bdc27c6b1'),
  UUID('479d8cdb-e664-4cd2-9eea-4ba6688c3a20'),
  UUID('5be08b7d-bf2f-46d2-8c52-f3f5cecc3e02'),
  UUID('6e04ef7b-fad0-4fbf-96c0-239a9e108027')],
 [UUID('fd0a7ae6-009b-4414-8846-3bf621cd7f41'),
  UUID('f05e872a-d03e-4d40-950b-b44bdc27c6b1'),
  UUID('479d8cdb-e664-4cd2-9eea-4ba6688c3a20'),
  UUID('f01163b1-e24a-4b0e-a6f9-871b975593c9'),
  UUID('8adeb749-13fa-442e-b262-599239aa49e7')],
 [UUID('fd0a7ae6-009b-4414-8846-3bf621cd7f41'),
  UUID('759d8b9e-02df-4b7e-b18b-e93661b9089b'),
  UUID('0c44581b-57dd-4a00-a19b-a400b50bc464'),
  UUID('7cb9491b-3665-40f3-a3cb-228385e98306'),
  UUID('494a4115-0945-4db9-a015-0ad80aaaaced')],
 [UUID('fd0a7ae6-009b-4414-8846-3bf621cd7f41'),
  UUID('759d8b9e-02df-4b7e-b18b-e93661b9089b'),
  UUID('0c44581b-57dd-4a00-a19b-a400b50bc464'),
  UUID('0b287758-e018-4dcc-8639-8d71c6542d6c'),
  UUID('9f6ea932-a8b7-4cd4-a662-a107bec902e8')],
 [UUID('fd0a7ae6-009b-4414-8846-3bf6

In [269]:
# Print the four fitted trees one after another.
for _fitted_tree in (dtree_cad, dtree_fishing, dtree_contact_lenses, dtree_caesarian):
    _fitted_tree.show()

Cholesterol
├── Borderline
│   └── Gender
│       ├── F
│       │   └── No
│       └── M
│           └── No
├── High
│   └── Gender
│       ├── F
│       │   └── No
│       └── M
│           └── Yes
└── Normal
    └── No

Sky
├── Cloudy
│   └── Yes
├── Rainy
│   └── Air
│       ├── Cool
│       │   └── No
│       └── Warm
│           └── Wind
│               ├── Strong
│               │   └── Yes
│               └── Weak
│                   └── No
└── Sunny
    └── Wind
        ├── Strong
        │   └── Yes
        └── Weak
            └── Water
                ├── Cold
                │   └── No
                ├── Moderate
                │   └── Yes
                └── Warm
                    └── No

tearrate
├── normal
│   └── astigmatism
│       ├── no
│       │   └── age
│       │       ├── pre-presbyopic
│       │       │   └── soft
│       │       ├── presbyopic
│       │       │   └── prescription
│       │       │       ├── hypermetrope
│       │       │       │   └── soft


In [270]:
import pygraphviz as pgv

def draw_tree(tr, name):
    """Render tree ``tr`` to ``<name>.dot`` and ``<name>.png``.

    The tree is exported in Graphviz format, read back with pygraphviz,
    laid out with the ``dot`` program and drawn as a PNG image.

    Parameters
    ----------
    tr : tree object providing ``to_graphviz``.
    name : str
        Output path without extension; ``.dot`` and ``.png`` are appended.
    """
    dot_path = name + '.dot'
    tr.to_graphviz(dot_path)

    graph = pgv.AGraph(dot_path)
    graph.layout(prog="dot")
    graph.draw(name + '.png')


In [271]:
# Write a .dot and a .png file for every fitted tree.
for _fitted_tree, _basename in (
    (dtree_cad, 'cad'),
    (dtree_fishing, 'fishing'),
    (dtree_contact_lenses, 'contact_lenses'),
    (dtree_caesarian, 'caesarian'),
):
    draw_tree(_fitted_tree, _basename)