In [11]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os
import json
import datetime
import dill
import tqdm
from queue import PriorityQueue

In [12]:
class hier_softmax:
    """Hierarchical softmax with one softmax classifier per tree node.

    Every leaf value is reached by a sequence of choices (its *path*) taken at
    a sequence of nodes; the loss of a value is the sum of the negative
    log-probabilities of those choices.

    NOTE(review): this class refers to `TreeTools`, `data` and `dy`, none of
    which are defined or imported in the visible part of the notebook. They
    must be supplied before the class can be instantiated or `get_loss` used.
    """

    def __init__(self, tree, contex_size, model):
        """Allocate per-node parameters and index every leaf value by its path.

        Args:
            tree: tree structure understood by `TreeTools`.
            contex_size: size of the context vector; second dimension of each
                node's weight matrix.
            model: object exposing `add_parameters(shape)`.
        """
        self._tree_tools = TreeTools()
        self.str2weight = {}
        #create a weight matrix and bias vector for each node in the tree
        for i, subtree in enumerate(self._tree_tools._get_subtrees(tree)):
            self.str2weight["softmax_node_"+str(i)+"_w"] = model.add_parameters((len(subtree), contex_size))
            self.str2weight["softmax_node_" + str(i) + "_b"] = model.add_parameters(len(subtree))

        #create a dictionary from each value to its path
        value_to_path_and_nodes_dict = {}
        for path, value in self._tree_tools._get_leaves_paths(tree):
            nodes = self._tree_tools._get_nodes(tree, path)
            # NOTE(review): `data.char2int` is a global that is not defined in
            # the visible notebook -- confirm where it comes from.
            value_to_path_and_nodes_dict[data.char2int[value]] = path, nodes
        self.value_to_path_and_nodes_dict = value_to_path_and_nodes_dict
        self.model = model
        self.tree = tree

    #get the loss on a given value (for training)
    def get_loss(self, context, value):
        """Return the negative log-likelihood of `value` given `context`.

        The loss is the sum, over every node on the path to `value`, of
        `-log(softmax(w*context+b)[p])` where `p` is the branch taken there.
        Previously this method only printed intermediates and returned None,
        which made `generate` fail when comparing the result with a float.

        Raises:
            KeyError: if `value` is not a known leaf value.
        """
        loss = []
        path, nodes = self.value_to_path_and_nodes_dict[value]
        for p, n in zip(path, nodes):
            w = dy.parameter(self.str2weight["softmax_node_"+str(n)+"_w"])
            b = dy.parameter(self.str2weight["softmax_node_" + str(n) + "_b"])
            # NOTE(review): for TensorFlow tensors `*` is elementwise, not a
            # matrix-vector product -- confirm the intended operand types.
            probs = tf.nn.softmax(w*context+b)
            loss.append(-tf.math.log(probs[p]))
        if not loss:
            # An empty path involves no decision, so it contributes no loss.
            return tf.constant(0.0)
        return tf.add_n(loss)

    #get the most likely
    def generate(self, context):
        """Return the value with the smallest loss for `context`.

        Returns None only when no values are registered.
        """
        best_value = None
        # Infinite sentinel: a finite cap would reject every candidate whose
        # loss happens to exceed it and wrongly return None.
        best_loss = float("inf")
        for value in self.value_to_path_and_nodes_dict:
            loss = self.get_loss(context, value)
            if loss < best_loss:
                best_loss = loss
                best_value = value
        return best_value

In [13]:
# Read the train and validation splits as int64 arrays, using the newline
# character as the field delimiter.
train_set = np.genfromtxt(
    "data/{}_train_set.csv".format("SEG_Wavenet"),
    delimiter="\n",
    dtype=np.int64,
)
val_set = np.genfromtxt(
    "data/{}_val_set.csv".format("SEG_Wavenet"),
    delimiter="\n",
    dtype=np.int64,
)

In [14]:
# Join the training and validation splits into one array (train first),
# then display it as the cell output.
dataset = np.r_[train_set, val_set]
dataset

array([  0,   0,   0, ..., 897, 242, 961], dtype=int64)

## NOTE
Intermediate (internal) nodes of the tree are numbered in **pre-order traversal** order.

In [15]:
# Number of output categories (leaves of the tree).
vocab_size = 16_293
# Number of intermediate nodes: one fewer than the number of categories.
num_of_nodes = vocab_size - 1

In [16]:
# The JSON file maps each category to [path, nodes in path].
with open("outputs/tree_mapping.json", "r") as mapping_file:
    raw_mapping = json.load(mapping_file)

# JSON object keys are always strings, so turn them back into integers.
tree_mapping = {int(category): entry for category, entry in raw_mapping.items()}

# Show the first five entries as a sanity check.
dict(list(tree_mapping.items())[:5])

{2192: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]],
 2396: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]],
 2647: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14]],
 1579: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14]],
 2247: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16]]}

In [17]:
# Display the [path, nodes] entries stored for categories 0 and 1.
tree_mapping[0], tree_mapping[1]

([[0, 1], [0, 1]], [[1, 0, 1], [0, 5436, 5437]])

In [18]:
# Dense lookup table filled with -1:
#   tree_matrix[category][0] -> path
#   tree_matrix[category][1] -> nodes
# NOTE(review): with vocab_size = 16293 this int32 array has
# 16293 * 2 * 16292 elements, i.e. roughly 2.1 GB of memory.
tree_matrix = np.full(
    (vocab_size, 2, num_of_nodes),
    fill_value=-1,
    dtype=np.int32,
)

In [19]:
# Copy every category's path and node list into the leading slots of its row;
# the remaining slots keep their existing values.
for category, (path, nodes) in tree_mapping.items():
    depth = len(path)
    tree_matrix[category, 0, :depth] = path
    tree_matrix[category, 1, :depth] = nodes

In [26]:
class HierachicalSoftmax(keras.Model):
    """Binary-tree hierarchical softmax over `vocab_size` categories.

    Each category is a leaf reached from the root through a sequence of binary
    decisions (`path`, 0 or 1) taken at a sequence of intermediate nodes
    (`nodes`). Every intermediate node owns one weight vector `w`, and the
    probability of a category given an input `x` is the product over its path
    of `sigmoid(d * dot(w, x))`, where `d` is +1 for decision 1 and -1 for
    decision 0.
    """

    def __init__(self, tree_mapping, vocab_size):
        """Store the tree as a padded integer table.

        Args:
            tree_mapping: dict mapping category id -> (path, nodes), two
                equally long sequences of ints.
            vocab_size: number of categories; category ids index the table,
                so they must lie in [0, vocab_size).
        """
        super().__init__()

        self.vocab_size = vocab_size
        # decision_function = {1:1, 0:-1}; any other key (the -1 padding) maps to -1
        self.decision_function = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(
                tf.constant([0, 1], dtype=tf.int32),
                tf.constant([-1, 1], dtype=tf.float32),
            ),
            -1,
        )

        # Pad only to the longest path. Padding every row to vocab_size - 1
        # costs vocab_size * 2 * (vocab_size - 1) int32 values, which is
        # about 2.1 GB for a vocabulary of 16293.
        max_depth = max((len(path) for path, _ in tree_mapping.values()), default=0)
        padded = np.full([vocab_size, 2, max_depth], -1, dtype=np.int32)
        for category, (path, nodes) in tree_mapping.items():
            padded[category, 0, :len(path)] = path
            padded[category, 1, :len(path)] = nodes
        # tree_mapping[c, 0] is the path of category c, tree_mapping[c, 1] its
        # nodes; unused trailing slots hold -1.
        self.tree_mapping = tf.constant(padded)

    def build(self, input_shape):
        """Create one weight vector per intermediate node.

        Args:
            input_shape: shape of the input; its last dimension is the
                feature size.
        """
        self.initializer = keras.initializers.GlorotNormal()
        # add_weight registers the variable with Keras so that it shows up in
        # `trainable_variables`.
        self.softmax_weights = self.add_weight(
            name="softmax_weights",
            shape=(self.vocab_size - 1, input_shape[-1]),
            initializer=self.initializer,
            trainable=True,
        )
        self.built = True

    def _path_log_prob(self, path_scores, paths):
        """Sum the log-probabilities of the decisions along each path.

        Args:
            path_scores: float tensor `[..., depth]` of `dot(w, x)` values.
            paths: int32 tensor of decisions, broadcastable to `path_scores`,
                with -1 in padded slots.

        Returns:
            Tensor of shape `path_scores.shape[:-1]`.
        """
        # Padded slots are not decisions and must not affect the result.
        mask = tf.cast(paths >= 0, path_scores.dtype)
        signs = self.decision_function.lookup(paths)
        return tf.reduce_sum(mask * tf.math.log_sigmoid(signs * path_scores), axis=-1)

    @tf.function
    def call(self, inputs):
        """Return the probability of every category for each input.

        Args:
            inputs: tensor of shape `[batch, features]` or `[features]`.

        Returns:
            Float tensor of shape `[batch, vocab_size]`, or `[vocab_size]`
            when a single feature vector was passed.
        """
        x = tf.cast(inputs, tf.float32)
        single_example = x.shape.rank == 1
        if single_example:
            x = x[tf.newaxis, :]

        paths = self.tree_mapping[:, 0, :]    # [vocab, depth]
        nodes = self.tree_mapping[:, 1, :]    # [vocab, depth]

        # Score every node once, then pick the scores lying on each path.
        node_scores = tf.matmul(x, self.softmax_weights, transpose_b=True)  # [batch, nodes]
        # Padded node ids are clamped to a valid index; they are masked later.
        path_scores = tf.gather(node_scores, tf.maximum(nodes, 0), axis=1)  # [batch, vocab, depth]

        probabilities = tf.exp(self._path_log_prob(path_scores, paths))
        if single_example:
            return probabilities[0]
        return probabilities

    def train_step(self, data):
        """Run one optimisation step on a batch.

        Only the nodes on each target's path are evaluated. The model must
        have been compiled with an optimizer.

        Args:
            data: sequence whose first item is the features
                `[batch, features]` and whose second item is the integer
                target categories `[batch]` (or `[batch, 1]`).

        Returns:
            dict with the mean negative log-likelihood under key "loss".
        """
        x = tf.cast(data[0], tf.float32)
        y = tf.reshape(tf.cast(data[1], tf.int32), [-1])
        if not self.built:
            self.build(x.shape)

        with tf.GradientTape() as tape:
            paths = tf.gather(self.tree_mapping[:, 0, :], y)    # [batch, depth]
            nodes = tf.gather(self.tree_mapping[:, 1, :], y)    # [batch, depth]
            node_vectors = tf.gather(self.softmax_weights, tf.maximum(nodes, 0))  # [batch, depth, features]
            path_scores = tf.einsum("bld,bd->bl", node_vectors, x)
            loss = -tf.reduce_mean(self._path_log_prob(path_scores, paths))

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return {"loss": loss}

In [27]:
# Instantiate the Keras model from the loaded tree description.
hsm = HierachicalSoftmax(tree_mapping=tree_mapping, vocab_size=vocab_size)

In [28]:
# Small float32 test input for the model.
# NOTE(review): the saved output of this cell shows a length-4 tensor
# ([0., 1., 2., 3.]), so it was produced by a different version of this cell;
# re-run the notebook to refresh it.
x = tf.constant([0, 1], dtype=tf.float32)
x

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0., 1., 2., 3.], dtype=float32)>

In [29]:
# Invoke the model on the test input.
# NOTE(review): no output is recorded for this cell -- confirm that it runs
# to completion on a fresh kernel.
hsm(x)

In [None]:
# Scratch check of the TensorArray API: every write returns the TensorArray
# to use for the next operation, so the writes can be chained.
t = (
    tf.TensorArray(tf.int32, size=2)
    .write(0, tf.constant(1))
    .write(1, tf.constant(3))
)

In [121]:
# Combine the written elements into a single tensor and display it.
t.stack()

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 3])>