In [12]:
import numpy as np
import pandas as pd
import math

from id3 import ID3DecisionTree

# Helper Functions

In [18]:
def matrix_of_attrs_belonging_to_labels(attr_label_arr):
    """Count how often each attribute value co-occurs with each label value.

    This is the p_i, n_i, ... count table (notation from
    https://hunch.net/~coms-4771/quinlan.pdf) generalized to any number of
    distinct label values.

    Example: for an outlook attribute taking the values
    ['overcast', 'rain', 'sunny'] and labels {0, 1}, the result
    [[0, 4], [2, 3], [3, 2]] means 0 overcast rows are labeled 0 while
    4 overcast rows are labeled 1, and so on for 'rain' and 'sunny'.

    :param attr_label_arr: 2D array-like (<class 'numpy.ndarray'>, nested
        list, ...) where the first column holds the attribute value of
        each sample and the second column holds the label
        (classification) of that sample.

    :return: <class 'list'> of <class 'list'> of <class 'int'> with one
        row per unique attribute value and one column per unique label
        value, both in the sorted order produced by ``np.unique``.
    """

    # Coerce so that nested lists and other array-likes support the
    # column slicing below; an ndarray passes through unchanged.
    attr_label_arr = np.asarray(attr_label_arr)
    attr_column = attr_label_arr[:, 0]
    label_column = attr_label_arr[:, 1]

    # Unique discrete values that the attribute / the label can take on
    unique_attr_values = np.unique(attr_column)
    unique_label_values = np.unique(label_column)

    # The label comparison does not depend on the attribute value, so
    # build each boolean vector once instead of once per attribute value.
    label_match_vectors = [
        label_column == unique_label_value
        for unique_label_value in unique_label_values]

    attr_label_count_matrix = []
    for unique_attr_value in unique_attr_values:
        attr_match_vector = attr_column == unique_attr_value

        # Non-zero entries are the rows where BOTH conditions hold
        counts = [
            int(np.count_nonzero(
                np.logical_and(attr_match_vector, label_match_vector)))
            for label_match_vector in label_match_vectors]

        attr_label_count_matrix.append(counts)

    # Resulting matrix
    return attr_label_count_matrix

# Instantiate Tree

In [14]:
# Instantiate tree with no training data (X=None, y=None); the cells
# below only call its metric methods with hand-written counts
tree = ID3DecisionTree(X=None, y=None)

# Load Example Data

In [15]:
# Load the example dataset; the path is relative, so it is resolved
# against the kernel's current working directory
df = pd.read_excel('sample_data.xlsx')

In [16]:
# Show the loaded example data
df

Unnamed: 0,outlook,temperature,humidity,windy,class
0,sunny,hot,high,False,N
1,sunny,hot,high,True,N
2,overcast,hot,high,False,P
3,rain,mild,high,False,P
4,rain,cool,normal,False,P
5,rain,cool,normal,True,N
6,overcast,cool,normal,True,P
7,sunny,mild,high,False,N
8,sunny,cool,normal,False,P
9,rain,mild,normal,False,P


In [19]:
# Testing helper function
# Pair the 'outlook' attribute column with the 'class' label column as a
# 2D array (column 0 = attribute value, column 1 = label)
outlook_class_arr = df[['outlook', 'class']].to_numpy()
display(outlook_class_arr)

# Last expression of the cell: the per-value label-count matrix
matrix_of_attrs_belonging_to_labels(outlook_class_arr)

array([['sunny', 'N'],
       ['sunny', 'N'],
       ['overcast', 'P'],
       ['rain', 'P'],
       ['rain', 'P'],
       ['rain', 'N'],
       ['overcast', 'P'],
       ['sunny', 'N'],
       ['sunny', 'P'],
       ['rain', 'P'],
       ['sunny', 'P'],
       ['overcast', 'P'],
       ['overcast', 'P'],
       ['rain', 'N']], dtype=object)

['overcast' 'rain' 'sunny']
['N' 'P']
[[0, 4], [2, 3], [3, 2]]


In [5]:
# Entropy for the label counts [3, 2] (the 'sunny' row of the outlook
# count matrix computed above)
tree.entropy([3, 2])

0.9709505944546686

In [6]:
# Expected information for the outlook attribute; attr_counts is the
# outlook count matrix from the helper-function cell.
# NOTE(review): label_counts is presumably the overall class counts of
# the sample data -- confirm against ID3DecisionTree.expected_information
tree.expected_information(label_counts=[9, 5], attr_counts=[[0, 4], [2, 3], [3, 2]])

0.6935361388961918

In [7]:
# Information gain for outlook, built from the two quantities computed
# in the previous cells: the entropy of the [9, 5] label counts and the
# expected information of the outlook count matrix
label_entropy = tree.entropy([9, 5])
outlook_expected_information = tree.expected_information(
    label_counts=[9, 5],
    attr_counts=[[0, 4], [2, 3], [3, 2]],
)
tree.information_gain(label_entropy, outlook_expected_information)

0.2467498197744391

In [32]:
# 3.3. Dealing with Continuous Valued Data
# Small hand-written sample: column 0 is a continuous attribute value,
# the last column is the class label read by the threshold cell below
outlook_sunny_arr = np.array([
    [0.68, 1],
    [0.72, 1],
    [0.87, 0],
    [0.9, 0],
    [0.91, 0],
])
print(outlook_sunny_arr.shape)
display(outlook_sunny_arr)

(5, 2)


array([[0.68, 1.  ],
       [0.72, 1.  ],
       [0.87, 0.  ],
       [0.9 , 0.  ],
       [0.91, 0.  ]])

In [34]:
# To pick thresholds, find every pair of adjacent rows whose class label
# (held in the right-most column) differs.
# NOTE(review): this presumes the rows are already ordered by the
# attribute value in column 0 -- confirm before reusing on other data
threshold_tup_indices = [
    (ix, ix + 1)
    for ix in range(len(outlook_sunny_arr) - 1)
    if outlook_sunny_arr[ix, -1] != outlook_sunny_arr[ix + 1, -1]
]

print(threshold_tup_indices)
for lower_ix, upper_ix in threshold_tup_indices:
    # Candidate threshold: midpoint of the two attribute values that
    # straddle the label change
    lower_value = outlook_sunny_arr[lower_ix, 0]
    upper_value = outlook_sunny_arr[upper_ix, 0]
    bin_ = (lower_value + upper_value) / 2

    print(bin_)
    # The two rows on either side of the threshold
    print(outlook_sunny_arr[lower_ix:upper_ix + 1])
    print()

[(1, 2)]
0.7949999999999999
[[0.72 1.  ]
 [0.87 0.  ]]



In [38]:
# Bin data against the threshold: attribute values > threshold map to 0,
# values <= threshold map to 1 (should be reversed for open lab)
bin_ = 0.795

# Work on a copy so the continuous-valued array stays untouched; only
# the attribute column (column 0) is overwritten with its bin
discretized_outlook_sunny_arr = outlook_sunny_arr.copy()
discretized_outlook_sunny_arr[:, 0] = np.where(
    outlook_sunny_arr[:, 0] > bin_, 0, 1)

print(discretized_outlook_sunny_arr)

[[1. 1.]
 [1. 1.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


In [None]:
# Print the label-count matrix of every attribute column, i.e. every
# column except the last one
unique_class_labels = np.unique(df['class'])
print(unique_class_labels)
for col in df.columns[:-1]:
    # 2D array: column 0 = attribute value, column 1 = class label
    col_arr = df[[col, 'class']].to_numpy()
    unique_col_values = np.unique(df[col])
    print(unique_col_values)

    # Reuse the helper defined at the top of the notebook instead of
    # repeating its counting loops inline
    n_i = matrix_of_attrs_belonging_to_labels(col_arr)

    print(col, n_i)

['N' 'P']
['overcast' 'rain' 'sunny']
outlook [[0, 4], [2, 3], [3, 2]]
['cool' 'hot' 'mild']
temperature [[1, 3], [2, 2], [2, 4]]
['high' 'normal']
humidity [[4, 3], [1, 6]]
[False  True]
windy [[2, 6], [3, 3]]


In [None]:
# NOTE(review): `arr` is not defined in any cell visible above, so this
# cell raises NameError on a fresh kernel -- define `arr` or drop the cell
# Keep only the first and last columns of `arr`
simple_arr = arr[:, [0, -1]]
print(simple_arr)
# Count the rows whose first column is 'overcast' and whose last column
# equals 1
np.count_nonzero(np.logical_and(simple_arr[:, 0] == 'overcast', 
    simple_arr[:, 1] == 1))

[['sunny' 1]
 ['sunny' 1]
 ['sunny' 1]
 ['sunny' 1]
 ['sunny' 1]
 ['overcast' 1]
 ['overcast' 1]
 ['overcast' 1]
 ['overcast' 1]
 ['rain' 0]
 ['rain' 0]
 ['rain' 0]
 ['rain' 0]
 ['rain' 0]]


4

In [None]:
# Class counts of the sample data, the same [9, 5] split passed to the
# expected-information and information-gain cells. Defined here so the
# cell does not depend on a name left over from an earlier kernel session.
labels_counts = [9, 5]
# NOTE(review): the other cells call `tree.entropy`; confirm that
# ID3DecisionTree still exposes `information`
tree.information(labels_counts)

0.9402859586706309