In [1]:
import tensorflow as tf
from tensorflow import keras
import kerastuner
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime
import dill
import tqdm
import bisect

plt.rcParams["figure.figsize"] = (20, 5)

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [2]:
a = [1, 3, 4, 5, 6, 7, 8]
bisect.bisect(a, 3)

2

In [3]:
bisect.insort(a, 2)

In [4]:
a

[1, 2, 3, 4, 5, 6, 7, 8]

In [5]:
def create_random_tree(output_dim):
    outputs = list(range(output_dim))
    #shuffle(outputs)

    while len(outputs) > 2:
        temp_outputs = []
        for i in range(0, len(outputs), 2):
            if len(outputs) - (i+1) > 0:
                temp_outputs.append([outputs[i], outputs[i+1]])
            else:
                temp_outputs.append(outputs[i])
        outputs = temp_outputs
    return outputs

In [6]:
create_random_tree(10)

[[[[0, 1], [2, 3]], [[4, 5], [6, 7]]], [8, 9]]

In [7]:
from random import shuffle
from copy import copy

class TreeTools:
    def __init__(self):
        #memoization for _count_nodes functions
        self._count_nodes_dict = {}
                
    def _get_subtrees(self, tree):
        yield tree
        for subtree in tree:
            if type(subtree) == list:
                for x in self._get_subtrees(subtree):
                    yield x

    # Returns pairs of paths and leaves of a tree
    def _get_leaves_paths(self, tree):
        for i, subtree in enumerate(tree):
            if type(subtree) == list:
                for path, value in self._get_leaves_paths(subtree):
                    yield [i] + path, value
            else:
                yield [i], subtree
    
    # Returns the number of nodes in a tree (not including root)
    def _count_nodes(self, tree):
        if id(tree) in self._count_nodes_dict:
            return self._count_nodes_dict[id(tree)]
        size = 0
        for node in tree:
            if type(node) == list:
                size += 1 + self._count_nodes(node)
        self._count_nodes_dict[id(self._count_nodes_dict)] = size
        return size


    # Returns all the nodes in a path
    def _get_nodes(self, tree, path):
        next_node = 0
        nodes = []
        for decision in path:
            nodes.append(next_node)
            next_node += 1 + self._count_nodes(tree[:decision])
            tree = tree[decision]
        return nodes


# turns a list to a binary tree
def random_binary_full_tree(outputs):
    outputs = copy(outputs)
    shuffle(outputs)

    while len(outputs) > 2:
        temp_outputs = []
        for i in range(0, len(outputs), 2):
            if len(outputs) - (i+1) > 0:
                temp_outputs.append([outputs[i], outputs[i+1]])
            else:
                temp_outputs.append(outputs[i])
        outputs = temp_outputs
    return outputs

In [8]:
tree = random_binary_full_tree(list(range(10)))
print('Our tree:',tree)

tree_tools = TreeTools()

print('All subtrees:')
for subtree in tree_tools._get_subtrees(tree):
    print('\t {} (Len : {})'.format(subtree, len(subtree)))

print('All paths and leaves:')
for subtree in tree_tools._get_leaves_paths(tree):
    print('\t',subtree)
    
print('Number of nodes in the tree:',tree_tools._count_nodes(tree))

print('all nodes in path [0, 0, 0, 0]:')
for nodes in tree_tools._get_nodes(tree, [0, 0, 0, 0]):
    print('\t',nodes)

print('all nodes in path [1, 0]:')
for nodes in tree_tools._get_nodes(tree, [1, 0]):
    print('\t',nodes)

Our tree: [[[[3, 9], [5, 1]], [[7, 0], [6, 8]]], [4, 2]]
All subtrees:
	 [[[[3, 9], [5, 1]], [[7, 0], [6, 8]]], [4, 2]] (Len : 2)
	 [[[3, 9], [5, 1]], [[7, 0], [6, 8]]] (Len : 2)
	 [[3, 9], [5, 1]] (Len : 2)
	 [3, 9] (Len : 2)
	 [5, 1] (Len : 2)
	 [[7, 0], [6, 8]] (Len : 2)
	 [7, 0] (Len : 2)
	 [6, 8] (Len : 2)
	 [4, 2] (Len : 2)
All paths and leaves:
	 ([0, 0, 0, 0], 3)
	 ([0, 0, 0, 1], 9)
	 ([0, 0, 1, 0], 5)
	 ([0, 0, 1, 1], 1)
	 ([0, 1, 0, 0], 7)
	 ([0, 1, 0, 1], 0)
	 ([0, 1, 1, 0], 6)
	 ([0, 1, 1, 1], 8)
	 ([1, 0], 4)
	 ([1, 1], 2)
Number of nodes in the tree: 8
all nodes in path [0, 0, 0, 0]:
	 0
	 1
	 2
	 3
all nodes in path [1, 0]:
	 0
	 8


In [9]:
class hier_softmax:
    def __init__(self, tree, contex_size, model):
        self._tree_tools = TreeTools()
        self.str2weight = {}
        #create a weight matrix and bias vector for each node in the tree
        for i, subtree in enumerate(self._tree_tools._get_subtrees(tree)):
            self.str2weight["softmax_node_"+str(i)+"_w"] = model.add_parameters((len(subtree), contex_size))
            self.str2weight["softmax_node_" + str(i) + "_b"] = model.add_parameters(len(subtree))
        
        #create a dictionary from each value to its path
        value_to_path_and_nodes_dict = {}
        for path, value in self._tree_tools._get_leaves_paths(tree):
            nodes = self._tree_tools._get_nodes(tree, path)
            value_to_path_and_nodes_dict[data.char2int[value]] = path, nodes
        self.value_to_path_and_nodes_dict = value_to_path_and_nodes_dict
        self.model = model
        self.tree = tree
    
    #get the loss on a given value (for training)
    def get_loss(self, context, value):
        loss = []
        path, nodes = self.value_to_path_and_nodes_dict[value]
        for p, n in zip(path, nodes):
            w = dy.parameter(self.str2weight["softmax_node_"+str(n)+"_w"])
            b = dy.parameter(self.str2weight["softmax_node_" + str(n) + "_b"])
            probs = tf.nn.softmax(w*context+b)
            #loss.append(-tf.math.log(dy.pick(probs, p)))
            print(probs)
            print(p)
        #return dy.esum(loss)

    #get the most likely
    def generate(self, context):
        best_value = None
        best_loss = float(100000)
        for value in self.value_to_path_and_nodes_dict:
            loss = self.get_loss(context, value)
            if loss < best_loss:
                best_loss = loss
                best_value = value
        return best_value

In [10]:
class HuffanBinaryTree:
    def __init__(self):
        #memoization for _count_nodes functions
        self.count_nodes_dict = {}
                
    def get_subtrees(self, tree):
        yield tree
        for subtree in tree:
            if type(subtree) == list:
                for x in self.get_subtrees(subtree):
                    yield x

    # Returns pairs of paths and leaves of a tree
    def get_leaves_paths(self, tree):
        for i, subtree in enumerate(tree):
            if type(subtree) == list:
                for path, value in self.get_leaves_paths(subtree):
                    yield [i] + path, value
            else:
                yield [i], subtree
    
    # Returns the number of nodes in a tree (not including root)
    def count_nodes(self, tree):
        if id(tree) in self.count_nodes_dict:
            return self.count_nodes_dict[id(tree)]
        size = 0
        for node in tree:
            if type(node) == list:
                size += 1 + self.count_nodes(node)
        self.count_nodes_dict[id(self.count_nodes_dict)] = size
        return size


    # Returns all the nodes in a path
    def get_nodes(self, tree, path):
        next_node = 0
        nodes = []
        for decision in path:
            nodes.append(next_node)
            next_node += 1 + self.count_nodes(tree[:decision])
            tree = tree[decision]
        return nodes

In [47]:
a = [1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 5, 5, 5, 6, 6, 6]
freq = pd.Series(a).value_counts(ascending=True).to_dict()
b = list(freq.keys())

In [48]:
b, freq

([3, 1, 2, 5, 6, 4], {3: 1, 1: 2, 2: 3, 5: 3, 6: 3, 4: 4})

In [55]:
# Creating tree nodes
class NodeTree(object):
    def __init__(self, left=None, right=None):
        self.left = left
        self.right = right

    def children(self):
        return (self.left, self.right)

    def nodes(self):
        return (self.left, self.right)

    def __str__(self):
        return '%s_%s' % (self.left, self.right)


# Main function implementing huffman coding
def huffman_code_tree(node, path, left=True):
    if type(node) is not NodeTree:
        return {node: path}
    (l, r) = node.children()
    d = dict()

    l_path = copy(path)
    l_path.append(0)
    r_path = copy(path)
    r_path.append(1)

    d.update(huffman_code_tree(l, l_path, True))
    d.update(huffman_code_tree(r, r_path, False))
    return d


In [56]:
freq

{3: 1, 1: 2, 2: 3, 5: 3, 6: 3, 4: 4}

In [58]:
nodes = list(freq.items())

while len(nodes) > 1:
    (key1, c1) = nodes[-1]
    (key2, c2) = nodes[-2]
    nodes = nodes[:-2]
    node = NodeTree(key1, key2)
    nodes.append((node, c1 + c2))

    nodes = sorted(nodes, key=lambda x: x[1], reverse=True)

huffmanCode = huffman_code_tree(nodes[0][0], [])

In [60]:
huffmanCode

{4: [0, 0],
 6: [0, 1],
 2: [1, 0],
 3: [1, 1, 0, 0],
 1: [1, 1, 0, 1],
 5: [1, 1, 1]}

In [61]:
freq

{3: 1, 1: 2, 2: 3, 5: 3, 6: 3, 4: 4}