In [2]:
import numpy as np
import matplotlib.pyplot as plt
import math

In [3]:
clean_arr = np.loadtxt("wifi_db/clean_dataset.txt")
noisy_arr = np.loadtxt("wifi_db/noisy_dataset.txt")

In [4]:
LABEL_COL = clean_arr.shape[1] - 1 # LABEL_COL = 7
DATASET_SIZE = clean_arr.shape[0]
labels = [1, 2, 3, 4]

In [5]:
class Node():
    def __init__(self, **kwargs):
        self.value = kwargs.get('value', 0)
        self.attr = kwargs.get('attr', 0)
        self.left = kwargs.get('left', None)
        self.right = kwargs.get('right', None)
    
    def set_left(self, left):
        self.left = left
    
    def set_right(self, right):
        self.right = right

In [6]:
def calc_label_instances(dataset, label):
    return np.count_nonzero(dataset[:, LABEL_COL] == label)


In [24]:
# since the log operation may return nan (eg log 0), replace all nan occurences with 0
def entropy(dataset):
    res = np.zeros(len(labels))
    for i in range(len(labels)):
        res[i] = calc_label_instances(dataset, labels[i])
    res = res / DATASET_SIZE
    log_res = np.where(res > 0, np.log2(res), res)
    res = res * log_res
    return - np.sum(res)

In [25]:
# calculate info gain after splitting data set, assume dataset is sorted already
def remainder(dataset, split_index):
    left_info_gain = (split_index + 1) / DATASET_SIZE * entropy(dataset[: (split_index + 1)])
    right_info_gain = (DATASET_SIZE - split_index)/DATASET_SIZE * entropy(dataset[(split_index + 1):])
    return left_info_gain + right_info_gain


def calc_info_gain(start_entropy, dataset, split_index):
    return start_entropy - remainder(dataset, split_index)

In [51]:
def get_mid(a, b):
    return (a + b) / 2

def find_split(trng_data):
    split_attribute = -1
    split_value = 0
    split_index = 0
    overall_highest_info_gain = 0
    # start_entropy calculated at the start, saving recomputation
    start_entropy = entropy(trng_data)
    dataset_size, _ = trng_data.shape
    
    # sort all data amongst columns
    # loop through each attribute
    for i in range(LABEL_COL):
        sorted_dataset = trng_data[trng_data[:, i].argsort()]
#         print('sorted dataset')
#         print(sorted_dataset[:, [i, LABEL_COL]]);
        current_split_value = 0
        feature_highest_info_gain = 0
        current_split_index = 0

        for j in range(dataset_size - 1):
            if sorted_dataset[j][LABEL_COL] == sorted_dataset[j + 1][LABEL_COL]: continue

            mid = get_mid(sorted_dataset[j][i], sorted_dataset[j + 1][i])
            info_gain = calc_info_gain(start_entropy, sorted_dataset, j)

            if (not math.isnan(info_gain)) and info_gain > feature_highest_info_gain:
                current_split_value = mid
                feature_highest_info_gain = info_gain
                current_split_index = j

        if overall_highest_info_gain < feature_highest_info_gain:
            overall_highest_info_gain = feature_highest_info_gain
            split_attribute = i
            split_value = current_split_value
            split_index = current_split_index
#         print("Feature " + str(i) + " info gain: " + str(feature_highest_info_gain))

#     print("Start Entropy: " + str(start_entropy))
#     print("Info Gain: " + str(overall_highest_info_gain))
        
    return (split_attribute, split_value, split_index)

find_split(clean_arr)

  log_res = np.where(res > 0, np.log2(res), res)


(0, -54.5, 1011)

In [54]:
def decision_tree_learning(training_dataset, depth):
    first_label = training_dataset[0][LABEL_COL]
    if np.all(training_dataset[:, LABEL_COL] == first_label):
        return (Node(value=first_label), depth)
    else:
        split_attr, split_value, split_index = find_split(training_dataset)
        curr_node = Node(value=split_value, attr=split_attr)
        left_branch, left_depth = decision_tree_learning(training_dataset[:split_index + 1], depth + 1)
        right_branch, right_depth = decision_tree_learning(training_dataset[split_index + 1:], depth + 1)
        curr_node.set_left(left_branch)
        curr_node.set_right(right_branch)
    return (curr_node, max(left_depth, right_depth))

In [64]:
root, depth = decision_tree_learning(clean_arr, 0)

# print using py_plot
def print_nodes(node):
    if node.left != None:
        print_nodes(node.left)
    if node.attr != None:
        print(node.attr)
    print(node.value)
    if node.right != None:
        print_nodes(node.right)
        


  log_res = np.where(res > 0, np.log2(res), res)


In [63]:
print_nodes(root)

0
1.0
0
-48.5
0
2.0
5
-73.0
0
2.0
4
-58.0
0
2.0
3
-51.0
0
2.0
2
-50.0
0
2.0
2
-50.0
0
2.0
1
-55.5
0
3.0
6
-77.0
0
3.0
2
-48.0
0
3.0
4
-58.5
0
3.0
4
-56.5
0
3.0
4
-56.5
0
3.0
0
-54.5
0
3.0
4
-51.0
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-43.5
0
4.0
2
-44.0
0
4.0
2
-44.0
0
4.0


In [None]:
entropy(clean_arr)
calc_label_instances(clean_arr, 1)
remainder(1, clean_arr)
arr = clean_arr.transpose()
info_gain(arr[0], 1000)
len(arr[0])
clean_arr.shape

In [None]:
def evaluate(test_db, trained_tree):
    