In [147]:
import numpy as np
import matplotlib.pyplot as plt
import math

In [148]:
clean_arr = np.loadtxt("wifi_db/clean_dataset.txt")
noisy_arr = np.loadtxt("wifi_db/noisy_dataset.txt")

In [149]:
LABEL_COL = clean_arr.shape[1] - 1 # LABEL_COL = 7
DATASET_SIZE = clean_arr.shape[0]
labels = [1, 2, 3, 4]

In [150]:
class Node():
    def __init__(self, **kwargs):
        self.value = kwargs.get('value', 0)
        self.attr = kwargs.get('attr', 0)
        self.left = kwargs.get('left', None)
        self.right = kwargs.get('right', None)
        self.depth = kwargs.get('depth', 0)
    
    def set_left(self, left):
        self.left = left
    
    def set_right(self, right):
        self.right = right

In [151]:
def calc_label_instances(dataset, label):
    return np.count_nonzero(dataset[:, LABEL_COL] == label)


In [152]:
# since the log operation may return nan (eg log 0), replace all nan occurences with 0
def entropy(dataset):
    res = np.zeros(len(labels))
    for i in range(len(labels)):
        res[i] = calc_label_instances(dataset, labels[i])
    res = res / DATASET_SIZE
    log_res = np.log2(res)
    res = res * log_res
    res = list(map(lambda x: 0 if math.isnan(x) else x, res))
    return - np.sum(res)

In [153]:
# calculate info gain after splitting data set, assume dataset is sorted already
def remainder(dataset, split_index):
    left_info_gain = (split_index + 1) / DATASET_SIZE * entropy(dataset[: (split_index + 1)])
    right_info_gain = (DATASET_SIZE - split_index)/DATASET_SIZE * entropy(dataset[(split_index + 1):])
    return left_info_gain + right_info_gain


def calc_info_gain(start_entropy, dataset, split_index):
    return start_entropy - remainder(dataset, split_index)

In [154]:
def get_mid(a, b):
    return (a + b) / 2

def find_split(trng_data):
    split_attribute = -1
    split_value = 0
    split_index = 0
    overall_highest_info_gain = 0
    # start_entropy calculated at the start, saving recomputation
    start_entropy = entropy(trng_data)
    
    # sort all data amongst columns
    # loop through each attribute
    for i in range(LABEL_COL):
        sorted_dataset = trng_data[trng_data[:, i].argsort()]
        current_split_value = 0
        feature_highest_info_gain = 0
        current_split_index = 0

        for j in range(DATASET_SIZE - 1):
            if sorted_dataset[j][LABEL_COL] == sorted_dataset[j + 1][LABEL_COL]: continue

            mid = get_mid(sorted_dataset[j][i], sorted_dataset[j + 1][i])
            info_gain = calc_info_gain(start_entropy, sorted_dataset, j)

            if (not math.isnan(info_gain)) and info_gain > feature_highest_info_gain:
                current_split_value = mid
                feature_highest_info_gain = info_gain
                current_split_index = j

        if overall_highest_info_gain < feature_highest_info_gain:
            overall_highest_info_gain = feature_highest_info_gain
            split_attribute = i
            split_value = current_split_value
            split_index = current_split_index
        print("Feature " + str(i) + " info gain: " + str(feature_highest_info_gain))

    print("Start Entropy: " + str(start_entropy))
    print("Info Gain: " + str(overall_highest_info_gain))
        
    return (split_attribute, split_value, split_index)

find_split(clean_arr)

Feature 0 info gain: 0.9683275385070402
Feature 1 info gain: 0.5084116016763769
Feature 2 info gain: 0.659991086013672
Feature 3 info gain: 0.9120599467454866
Feature 4 info gain: 0.7440715061500625
Feature 5 info gain: 0.7034834916612516
Feature 6 info gain: 0.6938619921426792
Start Entropy: 2.0
Info Gain: 0.9683275385070402


(0, -54.5, 1011)

In [11]:
def decision_tree_learning(training_dataset, depth):
    return

In [12]:
entropy(clean_arr)
calc_label_instances(clean_arr, 1)
remainder(1, clean_arr)
arr = clean_arr.transpose()
info_gain(arr[0], 1000)
len(arr[0])
clean_arr.shape

nan


TypeError: remainder() missing 1 required positional argument: 'sorted_row_label'

In [None]:
def evaluate(test_db, trained_tree):
    