## One-step sampled-softmax

An example of using sampled softmax to train large-vocabulary Seq2Seq models (e.g. text generation, where the number of class labels equals the vocabulary size of the document corpus).

In [1]:
# Install Keras with pip. No version is pinned, so the release that gets
# installed depends on when this cell is run.
!pip install keras

You are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [2]:
# -*- coding: utf-8 -*-
#__author__ = "@inimah"
#__date__ = "20.04.2018"

from keras.layers import Dense, Lambda, Reshape
from keras.layers import Embedding
from keras.layers import Input, Concatenate, Masking, Layer, Flatten
from keras.layers import LSTM
from keras.models import Model
import numpy as np
from keras.utils import to_categorical
import tensorflow as tf

Using TensorFlow backend.


### Define a custom layer that samples from the class-label distribution and computes the sampled softmax loss

In [0]:
class SamplingLayer(Layer):
    """Output projection layer that returns a softmax loss instead of logits.

    The layer owns a `(num_classes, hidden_dim)` kernel and a `(num_classes,)`
    bias and is called on a pair `[inputs, labels]`:

    * ``mode="train"`` returns `tf.nn.sampled_softmax_loss`, which scores the
      true class against `num_sampled` sampled classes.
    * ``mode="eval"`` returns the full softmax cross-entropy over all
      `num_classes` classes, computed with the same kernel and bias.

    Because the layer output already *is* the loss, a model built on it is
    expected to be compiled with a loss function that simply passes the
    model output through.

    # Arguments
        num_sampled: number of classes to sample per batch in "train" mode.
        num_classes: total number of target classes.
        mode: either "train" or "eval".

    # Raises
        ValueError: if `mode` is not "train" or "eval".
    """

    _VALID_MODES = ("train", "eval")

    def __init__(self, num_sampled, num_classes, mode, **kwargs):
        # Fail early with a clear message; otherwise an unknown mode would
        # only surface inside call() as an unbound local variable.
        if mode not in self._VALID_MODES:
            raise ValueError(
                "SamplingLayer mode must be one of %s, got %r"
                % (self._VALID_MODES, mode))
        self.num_sampled = num_sampled
        self.num_classes = num_classes
        self.mode = mode
        super(SamplingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection weights.

        `input_shape` is a pair `(dense_shape, classes_shape)`; only
        `dense_shape[1]` (the feature size of the first input) is used.
        """
        dense_shape, classes_shape = input_shape
        # Kernel is stored as (num_classes, dim), the layout passed directly
        # as `weights` to tf.nn.sampled_softmax_loss in call().
        self.kernel = self.add_weight(name='kernel',
                                      shape=(self.num_classes, dense_shape[1]),
                                      initializer='uniform',
                                      trainable=True)
        self.bias = self.add_weight(name='bias',
                                    shape=(self.num_classes,),
                                    initializer='uniform',
                                    trainable=True)

        super(SamplingLayer, self).build(input_shape)

    def call(self, inputs_and_labels):
        """Return the loss tensor for `[inputs, labels]` according to `mode`."""
        inputs, labels = inputs_and_labels

        if self.mode == "train":
            return tf.nn.sampled_softmax_loss(
                weights=self.kernel,
                biases=self.bias,
                labels=labels,
                inputs=inputs,
                num_sampled=self.num_sampled,
                num_classes=self.num_classes,
                num_true=1)

        if self.mode == "eval":
            # Full softmax: project onto every class, then cross-entropy
            # against the one-hot encoded labels.
            logits = tf.matmul(inputs, tf.transpose(self.kernel))
            logits = tf.nn.bias_add(logits, self.bias)
            # assumes `labels` holds integer class ids — TODO confirm
            labels_one_hot = tf.one_hot(labels, self.num_classes)
            return tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=labels_one_hot,
                logits=logits)

        # Only reachable if `mode` was reassigned after construction.
        raise ValueError(
            "SamplingLayer mode must be one of %s, got %r"
            % (self._VALID_MODES, self.mode))

    def compute_output_shape(self, input_shape):
        """Report the output shape as `(batch, num_classes)`.

        NOTE(review): the loss ops used in call() appear to yield one value
        per example rather than `num_classes` values; the shape reported here
        looks chosen to line up with one-hot shaped targets passed to
        `fit` — confirm before changing it.
        """
        dense_shape, classes_shape = input_shape
        return (dense_shape[0], self.num_classes)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = {'num_sampled': self.num_sampled,
                  'num_classes': self.num_classes,
                  'mode': self.mode}
        base_config = super(SamplingLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

### Create the model with one-step softmax prediction

In [4]:
# Toy problem dimensions and model hyper-parameters.
vocab_size = 30
n_features = 3
batch_size = 10
sequence_len = 5
embedding_size = 10
lstm_units = 256    # hidden size of the LSTM
num_sampled = 10    # classes sampled per batch by each SamplingLayer

# Inputs: token ids, auxiliary per-step features, and the target ids.
# The targets are fed as a model *input* because the loss is computed
# inside the graph by SamplingLayer.
x_input = Input((sequence_len,), dtype='int32', name='in_seq')
aux_features = Input((sequence_len, n_features,), dtype='float', name='in_aux')
labels = Input((sequence_len, 1), dtype='int32', name='labels_')

# Masking, then projection of the token ids into the embedding space.
# NOTE(review): the Embedding layer is created without `mask_zero`; confirm
# that this Masking layer has the intended effect on padded steps.
masked_x_input = Masking(mask_value=0, name='masking_layer')(x_input)
in_embed = Embedding(output_dim=embedding_size,
                     input_dim=vocab_size,
                     input_length=sequence_len,
                     name='embedding_layer')(masked_x_input)

# Append the auxiliary features to each step's embedding.
in_merged = Concatenate(name='merged_inputs')([in_embed, aux_features])

# LSTM layer returning the hidden state of every time step.
lstm_layer = LSTM(lstm_units, return_sequences=True, name='lstm_layer')(in_merged)

# One loss output per time step.
# NOTE(review): every SamplingLayer instance creates its own kernel/bias in
# build(), so the projections are not shared across time steps, nor between
# a train layer and an eval layer for the same step.
losses = []
#loss_evals = [] # use this for validation (uncomment)
for t in range(sequence_len):
    # Slice step `t` out of the layer's own input. `t=t` freezes the current
    # index in the lambda instead of looking the loop variable up later.
    lstm_t = Lambda(lambda x, t=t: x[:, t, :], name='lstm-%s' % t)(lstm_layer)
    label_t = Lambda(lambda x, t=t: x[:, t, :], name='label-%s' % t)(labels)
    loss = SamplingLayer(num_sampled, vocab_size, mode='train',
                         name='sampled_layer-%s' % t)([lstm_t, label_t])
    #eval_loss = SamplingLayer(num_sampled, vocab_size, mode='eval', name='eval_layer-%s'%t)([lstm_t, label_t]) # use this for validation (uncomment)
    losses.append(loss)
    #loss_evals.append(eval_loss) # use this for validation (uncomment)
#losses_ = losses + loss_evals # use this for validation (uncomment)

#model = Model(inputs=[x_input, aux_features, labels], outputs=losses_) (uncomment)
model = Model(inputs=[x_input, aux_features, labels], outputs=losses)
# Each model output already is a loss tensor, so the Keras loss function
# ignores y_true and passes the output through unchanged.
model.compile(loss=lambda y_true, y_pred: y_pred, optimizer='Adam')

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [5]:
# Print the per-layer table for the model: layer name/type, output shape,
# parameter count and incoming connections.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
in_seq (InputLayer)             (None, 5)            0                                            
__________________________________________________________________________________________________
masking_layer (Masking)         (None, 5)            0           in_seq[0][0]                     
__________________________________________________________________________________________________
embedding_layer (Embedding)     (None, 5, 10)        300         masking_layer[0][0]              
__________________________________________________________________________________________________
in_aux (InputLayer)             (None, 5, 3)         0                                            
__________________________________________________________________________________________________
merged_inp

In [0]:
# Local, seeded generator so the synthetic data is identical on every run
# without touching NumPy's global random state.
rng = np.random.RandomState(42)

# Random target ids with shape (batch_size, sequence_len).
y_ids = rng.randint(vocab_size, size=(batch_size, sequence_len))

# One-hot encoding of the target ids. These arrays are only passed to `fit`
# as targets; the compiled loss returns the model output and ignores them.
y_one_hot = to_categorical(y_ids, vocab_size)

# Target ids as a 3-D array (batch_size, sequence_len, 1), matching the
# shape of the `labels_` model input.
y = y_ids.reshape((batch_size, sequence_len, 1))

# Random input token ids (batch_size, sequence_len) and auxiliary features
# (batch_size, sequence_len, n_features).
x_in = rng.randint(vocab_size, size=(batch_size, sequence_len))
x_features = rng.rand(batch_size, sequence_len, n_features)

# One target array per time step: swap the batch and time axes, then split
# along the (new) leading time axis.
outputs = list(y_one_hot.swapaxes(0, 1))

## uncomment these to validate sampled softmax approach
# in this example, we duplicate output, each for sampling layer (mode='train') and eval / validation layer (mode='eval')
# for real use, one part can be training set labels, second part can be validation labels
# outs = outputs + outputs


In [7]:
# Train for 10 epochs on the synthetic batch. The target ids `y` are fed as
# the third model input; `outputs` supplies one target array per model output.
#
# To validate the sampled softmax approach, fit against the duplicated
# targets instead:
# model.fit([x_in, x_features, y], outs, epochs=10)
model.fit(x=[x_in, x_features, y], y=outputs, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3ecbab85c0>