In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os
import json
import datetime
import dill
import tqdm
from queue import PriorityQueue

In [2]:
class hier_softmax:
    def __init__(self, tree, contex_size, model):
        self._tree_tools = TreeTools()
        self.str2weight = {}
        #create a weight matrix and bias vector for each node in the tree
        for i, subtree in enumerate(self._tree_tools._get_subtrees(tree)):
            self.str2weight["softmax_node_"+str(i)+"_w"] = model.add_parameters((len(subtree), contex_size))
            self.str2weight["softmax_node_" + str(i) + "_b"] = model.add_parameters(len(subtree))
        
        #create a dictionary from each value to its path
        value_to_path_and_nodes_dict = {}
        for path, value in self._tree_tools._get_leaves_paths(tree):
            nodes = self._tree_tools._get_nodes(tree, path)
            value_to_path_and_nodes_dict[data.char2int[value]] = path, nodes
        self.value_to_path_and_nodes_dict = value_to_path_and_nodes_dict
        self.model = model
        self.tree = tree
    
    #get the loss on a given value (for training)
    def get_loss(self, context, value):
        loss = []
        path, nodes = self.value_to_path_and_nodes_dict[value]
        for p, n in zip(path, nodes):
            w = dy.parameter(self.str2weight["softmax_node_"+str(n)+"_w"])
            b = dy.parameter(self.str2weight["softmax_node_" + str(n) + "_b"])
            probs = tf.nn.softmax(w*context+b)
            #loss.append(-tf.math.log(dy.pick(probs, p)))
            print(probs)
            print(p)
        #return dy.esum(loss)

    #get the most likely
    def generate(self, context):
        best_value = None
        best_loss = float(100000)
        for value in self.value_to_path_and_nodes_dict:
            loss = self.get_loss(context, value)
            if loss < best_loss:
                best_loss = loss
                best_value = value
        return best_value

In [3]:
train_set = np.genfromtxt("data/{}_train_set.csv".format("SEG_Wavenet"), delimiter="\n", dtype=np.int64)
val_set = np.genfromtxt("data/{}_val_set.csv".format("SEG_Wavenet"), delimiter="\n", dtype=np.int64)

In [4]:
dataset = np.r_[train_set, val_set]
dataset

array([  0,   0,   0, ..., 897, 242, 961], dtype=int64)

## NOTE
Intermediate Nodes are indexed by **Preorder Traversal**

In [5]:
vocab_size = 16293
num_of_nodes = vocab_size - 1   # num of intermediate nodes

In [6]:
with open("outputs/tree_mapping.json", "r") as j:
    tree_mapping = json.load(j)     # category : [path, nodes in path]
tree_mapping = {int(i):j for i, j in tree_mapping.items()}      # As JSON converts key values to string
dict(list(tree_mapping.items())[:5]) 

{2192: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]],
 2396: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]],
 2647: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14]],
 1579: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14]],
 2247: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16]]}

In [7]:
tree_mapping[0], tree_mapping[1]

([[0, 1], [0, 1]], [[1, 0, 1], [0, 5436, 5437]])

In [8]:
path_matrix = np.full([vocab_size, num_of_nodes], -1, dtype=np.int32)
nodes_matrix = np.full([vocab_size, num_of_nodes], -1, dtype=np.int32)

In [9]:
for category, (path, nodes) in tree_mapping.items():
    path_matrix[category][:len(path)] = path
    nodes_matrix[category][:len(path)] = nodes

In [10]:
path_matrix.shape, nodes_matrix.shape

((16293, 16292), (16293, 16292))

In [104]:
x = tf.constant([[0, 1]], dtype=tf.float32)
x

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0., 1.]], dtype=float32)>

In [105]:
y = tf.constant([4], dtype=tf.int32)
y

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([4])>

    @tf.function
    def call(self, inputs, training=False):
        self.cache = tf.zeros([num_nodes])

        total_loss = tf.TensorArray(dtype=tf.float32, size=vocab_size)
        for i in tf.range(self.vocab_size):
            loss = self.get_loss(inputs, i)
            total_loss = total_loss.write(i, tf.reduce_prod(loss, axis=-1))
        return total_loss.stack()
    
    @tf.function
    def get_loss(self, x, category):
        path = tf.gather(self.path_matrix, category)
        nodes = tf.gather(self.nodes_matrix, category)

        loss = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        for i in tf.range(self.num_nodes):
            n = tf.gather(nodes, i)
            if n == -1:
                break
            w = tf.gather(self.softmax_weights, n)
            d = self.decision_function.lookup(tf.gather(path, i))
            sigma = tf.math.sigmoid(tf.tensordot(tf.matmul(x, w), d, axes=0))
            loss = loss.write(i, sigma)
        return tf.reduce_prod(loss.stack(), axis=0)

In [125]:
class HierachicalSoftmax(keras.Model):
    def __init__(self, tree_mapping, vocab_size):
        super().__init__()

        self.vocab_size = vocab_size
        self.num_nodes = self.vocab_size - 1
        # decision_function = {1:1, 0:-1}
        self.decision_function = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(tf.constant([0, 1], dtype=tf.int32), tf.constant([-1, 1], dtype=tf.float32)), -1
        )

        path_matrix = np.full([self.vocab_size, self.num_nodes], -1, dtype=np.int32)
        nodes_matrix = np.full([self.vocab_size, self.num_nodes], -1, dtype=np.int32)
        for category, (path, nodes) in tree_mapping.items():
            path_matrix[category][:len(path)] = path
            nodes_matrix[category][:len(path)] = nodes

        self.path_matrix = tf.constant(path_matrix)
        self.nodes_matrix = tf.constant(nodes_matrix)

    def build(self, input_shape):
        self.initializer = keras.initializers.GlorotNormal()
        self.softmax_weights = tf.Variable(self.initializer(shape=(self.num_nodes, input_shape[-1], 1)))
    
    @tf.function
    def call(self, inputs, training=False):
        total_loss = tf.TensorArray(dtype=tf.float32, size=vocab_size)
        cache = tf.zeros([self.num_nodes])

        for category in tf.range(self.vocab_size):
            path = tf.gather(self.path_matrix, category)
            nodes = tf.gather(self.nodes_matrix, category)

            loss = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
            for i in tf.range(self.num_nodes):
                n = tf.gather(nodes, i)
                if n == -1:
                    break

                c = tf.gather(cache, n)
                if c == 0:
                    w = tf.gather(self.softmax_weights, n)
                    d = self.decision_function.lookup(tf.gather(path, i))
                    sigma = tf.math.sigmoid(tf.tensordot(tf.matmul(inputs, w), d, axes=0))
                    tf.tensor_scatter_nd_update(cache, [[n]], tf.squeeze(sigma, axis=0))
                else:
                    sigma = tf.gather(cache, n)
                tf.print(sigma)
                #loss = loss.write(i, sigma)
            #total_loss = total_loss.write(i, tf.reduce_prod(loss, axis=-1))

        #return total_loss.stack()        
        '''
        total_loss = tf.TensorArray(dtype=tf.float32, size=vocab_size)
        for i in tf.range(self.vocab_size):
            loss = self.get_loss(inputs, i)
            total_loss = total_loss.write(i, tf.reduce_prod(loss, axis=-1))
        return total_loss.stack()
        '''
        

    @tf.function
    def get_loss(self, x, category):
        path = tf.gather(self.path_matrix, category)
        nodes = tf.gather(self.nodes_matrix, category)

        loss = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        for i in tf.range(self.num_nodes):
            n = tf.gather(nodes, i)
            if n == -1:
                break

            w = tf.gather(self.softmax_weights, n)
            d = self.decision_function.lookup(tf.gather(path, i))
            sigma = tf.math.sigmoid(tf.tensordot(tf.matmul(x, w), d, axes=0))
            loss = loss.write(i, sigma)
        return tf.reduce_prod(loss.stack(), axis=0)

    def train_step(self, data):
        x, y = data
        loss = self.get_loss(x, y)
        #with gradientTape
        return loss

In [126]:
hsm = HierachicalSoftmax(tree_mapping, vocab_size)

In [127]:
hsm(x)

41]]
[[0.501631856]]
[[0.496853948]]
[[0.497305632]]
[[0.50019908]]
[[0.498943776]]
[[0.499369651]]
[[0.502636]]
[[0.500157952]]
[[0.49922356]]
[[0.499372661]]
[[0.498524636]]
[[0.502120852]]
[[0.49806428]]
[[0.497920841]]
[[0.501631856]]
[[0.496853948]]
[[0.497305632]]
[[0.50019908]]
[[0.498943776]]
[[0.499369651]]
[[0.497363955]]
[[0.498404831]]
[[0.498687387]]
[[0.501571715]]
[[0.5012061]]
[[0.502925754]]
[[0.497489482]]
[[0.497920841]]
[[0.501631856]]
[[0.496853948]]
[[0.497305632]]
[[0.50019908]]
[[0.498943776]]
[[0.500630379]]
[[0.499760747]]
[[0.500125289]]
[[0.501387477]]
[[0.501607]]
[[0.499600023]]
[[0.499968529]]
[[0.497610241]]
[[0.497920841]]
[[0.501631856]]
[[0.496853948]]
[[0.497305632]]
[[0.50019908]]
[[0.498943776]]
[[0.499369651]]
[[0.497363955]]
[[0.498404831]]
[[0.501312613]]
[[0.497549325]]
[[0.501527905]]
[[0.499538958]]
[[0.500597477]]
[[0.497920841]]
[[0.501631856]]
[[0.496853948]]
[[0.497305632]]
[[0.50019908]]
[[0.498943776]]
[[0.499369651]]
[[0.497363955]]
[[

In [None]:
c = tf.zeros([10])
c

In [73]:
q = tf.constant([2], dtype=tf.int32)
q

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([2])>

In [75]:
q2 = tf.expand_dims(q, axis=-1)
q2

<tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[2]])>

In [83]:
tf.squeeze(q2, axis=0)

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([2])>

In [76]:
tf.tensor_scatter_nd_update(c, q2, tf.constant([2], dtype=tf.float32))

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 2., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>