## Example of Sequence-to-Sequence with sampled softmax

In [0]:
# -*- coding: utf-8 -*-
#__author__ = "@inimah"
#__date__ = "21.04.2018"

In [0]:
# Install Keras with pip
!pip install -q keras

In [3]:
import keras
# Display the installed Keras version so the environment of this run is recorded in the output.
keras.__version__

Using TensorFlow backend.


'2.1.5'

In [0]:
from __future__ import print_function

import os
import sys
import numpy as np
import nltk
import string
from string import punctuation
import re

In [0]:
import _pickle as cPickle

# reading file in pickle format
def readPickle(pickleFilename):
	"""Load and return the object stored in a pickle file.

	Args:
		pickleFilename: path of the pickle file to read.

	Returns:
		The unpickled Python object.

	Raises:
		OSError: if the file cannot be opened.
		Unpickling errors from the pickle module if the content is not valid.
	"""
	# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
	# The context manager closes the file even when load() raises.
	with open(pickleFilename, 'rb') as f:
		return cPickle.load(f)

def savePickle(dataToWrite,pickleFilename):
	"""Serialize an object to a pickle file, overwriting any existing file.

	Args:
		dataToWrite: the Python object to pickle.
		pickleFilename: path of the file to write.

	Raises:
		OSError: if the file cannot be opened for writing.
		Pickling errors from the pickle module if the object cannot be serialized.
	"""
	# The context manager flushes and closes the file even when dump() raises.
	with open(pickleFilename, 'wb') as f:
		cPickle.dump(dataToWrite, f)

In [0]:
# Directory holding the preprocessed pickle files, resolved under the current user's home directory.
local_download_path = os.path.expanduser('~/datatita')


### Reading preprocessed data

In [0]:
# Load the pickled object stored in wiki_doc_topics.pkl (its structure is not shown in this notebook).
wiki_doc_topics = readPickle(os.path.join(local_download_path,'wiki_doc_topics.pkl'))

In [0]:
# Training documents; each item is later passed to tokenizeWords, so presumably one raw text string per document.
train_docs = readPickle(os.path.join(local_download_path,'wiki_train_docs_bigru.pkl'))

In [0]:
# Training keyphrases; each item is later passed to tokenizeWords, so presumably one text string per entry.
train_keyphrases = readPickle(os.path.join(local_download_path,'wiki_train_keyphrases_bigru.pkl'))

In [0]:
# Document ids stored alongside the training data (presumably aligned with train_docs -- TODO confirm).
train_docids = readPickle(os.path.join(local_download_path,'wiki_train_docids_bigru.pkl'))

### Tokenization and vocabulary indexing

In [0]:
# Punctuation characters that must survive the punctuation-stripping step.
unlisted_punct = ['-', '_', '+', '#']
# All ASCII punctuation except the protected characters above, in their original order.
punct = string.punctuation.translate(str.maketrans('', '', ''.join(unlisted_punct)))

In [0]:
import re
def tokenizeWords(text):
  """Split a text into lowercase word tokens.

  Characters listed in the module-level `punct` are deleted, runs of '-' / '_'
  (plus any spaces that follow them) become a single space, and the result is
  split on whitespace.

  Args:
    text: the string to tokenize.

  Returns:
    A list of lowercase tokens.
  """
  without_punct = re.sub('[%s]' % re.escape(punct), '', text)
  spaced = re.sub(r"[\-\_]+\ *", " ", without_punct)
  return [token.lower() for token in spaced.split()]

In [0]:
def clean_keyphrases(keyphrase_list):
  """Normalize a list of keyphrases without splitting them into tokens.

  For every keyphrase, characters listed in the module-level `punct` are
  deleted, runs of '-' / '_' (plus any spaces that follow them) become a
  single space, and the text is lowercased.

  Args:
    keyphrase_list: iterable of keyphrase strings.

  Returns:
    A new list with one cleaned string per input keyphrase, in input order.
  """
  # Both patterns are the same for every keyphrase, so build them once
  # instead of on each loop iteration.
  punct_regex = re.compile('[%s]' % re.escape(punct))
  separator_regex = re.compile(r"[\-\_]+\ *")

  kp_list = []
  for kp in keyphrase_list:
    text = punct_regex.sub('', kp)
    text = separator_regex.sub(" ", text)
    kp_list.append(text.lower())

  return kp_list

In [0]:
def indexingVocabulary(array_of_words):
    """Build an index -> word vocabulary with reserved special tokens.

    Index 0 is '<pad>'; the distinct input words follow in order of first
    appearance; '<start>', '<end>' and '<unk>' take the last three indices.

    Args:
        array_of_words: iterable of word strings (duplicates are collapsed).

    Returns:
        dict mapping consecutive integer indices (from 0) to words.
    """
    # Only the distinct words are needed here, not their counts: dict.fromkeys
    # de-duplicates while keeping first-occurrence order.
    unique_words = list(dict.fromkeys(array_of_words))

    wordIndex = ['<pad>'] + unique_words + ['<start>', '<end>', '<unk>']

    # indexing word vocabulary : pairs of (index,word)
    return dict(enumerate(wordIndex))

In [0]:
# all_words collects every token (documents first, then keyphrases) for the
# vocabulary; the tokenized_* lists keep one token list per input text.
all_words = []
tokenized_train_docs = []
tokenized_train_keyphrases = []

# Tokenize each text once and reuse the result for both the flat word list
# and the per-text list, instead of running the tokenizer twice per text.
for doc in train_docs:
  doc_tokens = tokenizeWords(doc)
  all_words.extend(doc_tokens)
  tokenized_train_docs.append(doc_tokens)

for keyphrase in train_keyphrases:
  keyphrase_tokens = tokenizeWords(keyphrase)
  all_words.extend(keyphrase_tokens)
  tokenized_train_keyphrases.append(keyphrase_tokens)

In [0]:
# Count how often each token occurs across all documents and keyphrases.
term_freq = nltk.FreqDist(all_words)

In [17]:
# Prints the number of distinct tokens (len of the frequency table), not the tokens themselves.
print("unique words in corpus in descending order (according to their frequency): %s"%str(len(term_freq)))

unique words in corpus in descending order (according to their frequency): 7641


In [0]:
# All distinct tokens with their counts, requested via most_common for the full table size.
common_words = term_freq.most_common(len(term_freq))
# Turn the list of pairs into a 2-D array (presumably (word, count) rows, all cast to strings -- TODO confirm).
arr_common = np.array(common_words)
# Keep only the first column, i.e. the tokens without their counts.
words = arr_common[:,0]

In [0]:
# Forward map: integer index -> word (special tokens included).
indices_words = indexingVocabulary(words)
# Inverse map: word -> integer index, used to encode token sequences.
words_indices = {word: index for index, word in indices_words.items()}

In [20]:
# Vocabulary size = distinct corpus tokens plus the four special tokens added by indexingVocabulary.
print("vocabulary size: %s"%str(len(indices_words)))

vocabulary size: 7645


### Preparing training and validation set

In [0]:
import numpy as np

In [0]:
# Longer documents / keyphrases are truncated to these lengths when the arrays are filled below.
# The decoder arrays use decoder_length+1 columns to make room for the '<start>' / '<end>' marker.
encoder_length = 300 # maximum sequence length (number of words) in encoder layer
decoder_length = 5 # maximum sequence length (number of words) in decoder layer

Transform the tokenized data into integer-index sequences: X (encoder input) and y_in / y_out (decoder input / decoder target).

In [0]:
# One row per document; zero-initialised, and index 0 is '<pad>' in the vocabulary, so unfilled cells act as padding.
X = np.zeros((len(tokenized_train_docs), encoder_length), dtype=np.int32) 

In [0]:
# Decoder input and target index arrays; one extra column holds '<start>' (y_in) or '<end>' (y_out).
# Rows are allocated per document -- assumes train_keyphrases has one entry per document (TODO confirm).
y_in = np.zeros((len(tokenized_train_docs), decoder_length+1), dtype=np.int32) 
y_out = np.zeros((len(tokenized_train_docs), decoder_length+1), dtype=np.int32) 

In [0]:
for i, doc in enumerate(tokenized_train_docs):

    # Pairing tokens with range(encoder_length) stops at whichever is shorter:
    # long documents are truncated, short ones leave the row tail untouched.
    for t, word in zip(range(encoder_length), doc):
      X[i, t] = words_indices[word]

In [0]:
for i, doc in enumerate(tokenized_train_keyphrases):

    # Slicing clamps the keyphrase to at most decoder_length tokens.
    txt = doc[:decoder_length]

    # Decoder input is the keyphrase prefixed with '<start>';
    # decoder target is the same keyphrase suffixed with '<end>'.
    txt_in = ['<start>'] + list(txt)
    txt_out = list(txt) + ['<end>']

    # Write the encoded tokens into the leading columns; the rest of the row is left as is.
    y_in[i, :len(txt_in)] = [words_indices[word] for word in txt_in]
    y_out[i, :len(txt_out)] = [words_indices[word] for word in txt_out]

In [0]:
# Shuffle the order of the data pairs: a previously generated random permutation of row
# indices is loaded from disk, so the same train/validation split is reused across runs.
rand_ids = readPickle(os.path.join(local_download_path,'rand_idx_train'))

In [0]:
# The first 80% of the shuffled row indices form the training set, the rest the validation set.
n_train = int(0.8 * X.shape[0])

X_train, y_train_in, y_train_out = [], [], []
X_valid, y_valid_in, y_valid_out = [], [], []

for i, idx in enumerate(rand_ids):
  # Choose the destination lists by position in the permutation, then copy
  # the matching row of each source array into its list.
  if i < n_train:
    targets = (X_train, y_train_in, y_train_out)
  else:
    targets = (X_valid, y_valid_in, y_valid_out)
  for bucket, source in zip(targets, (X, y_in, y_out)):
    bucket.append(source[idx])
  

In [0]:
# Convert the per-row lists of the training split into numpy arrays.
X_train, y_train_in, y_train_out = (
    np.array(rows) for rows in (X_train, y_train_in, y_train_out)
)

In [0]:
# Convert the per-row lists of the validation split into numpy arrays.
X_valid, y_valid_in, y_valid_out = (
    np.array(rows) for rows in (X_valid, y_valid_in, y_valid_out)
)

### Model

In [0]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Embedding
from keras.layers import LSTM, GRU, concatenate
from keras.layers import Dense, Lambda, Reshape
import keras.backend as K
from keras.models import load_model

### Encoder model

In [0]:
# encoder input (a whole text without splitting into sentences)
# Each sample is a fixed-length row of encoder_length int32 values (word indices, as built in X).
in_encoder = Input(shape=(encoder_length,), dtype='int32', name='encoder-input')

In [0]:
# Trainable lookup table: one 100-dimensional vector per vocabulary index.
enc_embedding = Embedding(len(indices_words), 100, input_length=encoder_length, name='embedding_encoder')
# Embedded encoder sequence; the model summary reports it as (None, 300, 100).
input_embedded = enc_embedding(in_encoder)

In [0]:
# Hand-built bidirectional encoder: two independent 128-unit GRUs over the same embedded
# sequence, the second one reading it in reverse (go_backwards=True).
fwd_encoder = GRU(128, return_state=True, name='fwd-encoder')
bwd_encoder = GRU(128, return_state=True, name='bwd-encoder', go_backwards=True)
# With return_state=True each call yields (output, final hidden state).
encoder_outputs_1, state_h_1 = fwd_encoder(input_embedded)
encoder_outputs_2, state_h_2 = bwd_encoder(input_embedded)
# Join both directions along the feature axis (128 + 128 features).
bidir_encoder_out = concatenate([encoder_outputs_1, encoder_outputs_2],axis=-1)
# The concatenated state is what the decoder GRU receives as its initial state.
bidir_encoder_state = concatenate([state_h_1, state_h_2],axis=-1)

bidir_encoder = [bidir_encoder_out, bidir_encoder_state]

### Sampling class (sampled softmax)

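Modified to work with 3D sequence data: the layer is applied to one decoder time step at a time, so each call receives a 2D slice of the decoder output together with the labels for that step.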

In [0]:
from keras.layers import Layer
class SamplingLayer(Layer):
    """Output projection that returns a per-sample softmax loss instead of logits.

    The layer owns the projection weights (`kernel`, `bias`) and is called on a
    pair `[inputs, labels]`:

    * mode "train": approximates the softmax loss with
      `tf.nn.sampled_softmax_loss`, drawing `num_sampled` negative classes.
    * mode "eval": computes the exact softmax cross-entropy over all
      `num_classes` classes.

    Args:
        num_sampled: number of classes sampled per batch in "train" mode.
        num_classes: total number of classes (vocabulary size).
        mode: either "train" or "eval".
        **kwargs: forwarded to `keras.layers.Layer` (e.g. `name`).

    Raises:
        ValueError: if `mode` is not "train" or "eval".
    """

    _VALID_MODES = ("train", "eval")

    def __init__(self, num_sampled, num_classes, mode, **kwargs):
        # Reject unknown modes here; otherwise call() would reach `return loss`
        # without ever assigning it.
        if mode not in self._VALID_MODES:
            raise ValueError(
                "mode must be one of %s, got %r" % (self._VALID_MODES, mode))
        self.num_sampled = num_sampled
        self.num_classes = num_classes
        self.mode = mode
        super(SamplingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection weights from the shape of the first input."""
        dense_shape, classes_shape = input_shape
        # Kernel is stored as (num_classes, features), the layout that
        # tf.nn.sampled_softmax_loss expects for `weights`.
        self.kernel = self.add_weight(name='kernel',
                                      shape=(self.num_classes, dense_shape[1]),
                                      initializer='uniform',
                                      trainable=True)
        self.bias = self.add_weight(name='bias',
                                      shape=(self.num_classes,),
                                      initializer='uniform',
                                      trainable=True)

        super(SamplingLayer, self).build(input_shape)

    def call(self, inputs_and_labels):
        """Return the loss tensor for a `[inputs, labels]` pair.

        `inputs` is the 2-D feature tensor to project; `labels` holds one
        integer class id per sample (fed as shape (batch, 1) in this notebook).
        """
        inputs, labels = inputs_and_labels
        if self.mode == "train":
            loss = tf.nn.sampled_softmax_loss(
                weights=self.kernel,
                biases=self.bias,
                labels=labels,
                inputs=inputs,
                num_sampled=self.num_sampled,
                num_classes=self.num_classes,
                num_true=1)
        else:  # "eval" -- the only other value accepted by __init__
            logits = tf.matmul(inputs, tf.transpose(self.kernel))
            logits = tf.nn.bias_add(logits, self.bias)
            # Flatten the labels to (batch,) so the one-hot tensor is
            # (batch, num_classes), the same rank and shape as the logits.
            flat_labels = tf.reshape(labels, [-1])
            labels_one_hot = tf.one_hot(flat_labels, self.num_classes)
            loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=labels_one_hot,
                logits=logits)

        return loss

    def compute_output_shape(self, input_shape):
        # NOTE(review): the loss ops above appear to return one value per
        # sample, while this reports (batch, num_classes). The training cell
        # feeds targets of that declared shape, so it is kept as is -- confirm
        # before changing.
        dense_shape, classes_shape = input_shape
        return (dense_shape[0], self.num_classes)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = {'num_sampled': self.num_sampled,
                  'num_classes': self.num_classes,
                  'mode': self.mode}
        base_config = super(SamplingLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

### Decoder model with sampled softmax

In [0]:
# Decoder input: int32 sequences with an unspecified number of time steps (shape=(None,)).
in_decoder = Input(shape=(None, ), name='decoder-input', dtype='int32')
# Separate 100-dimensional embedding table for the decoder (not shared with the encoder embedding).
dec_embedding = Embedding(len(indices_words), 100, name='embedding_decoder')
dec_input_embedded = dec_embedding(in_decoder)

In [0]:
# Target word indices fed as a model input: one int32 id per decoder step, shape (decoder_length+1, 1) per sample.
# They are sliced per time step and handed to the sampling layer together with the decoder output.
labels = Input((decoder_length+1,1), dtype='int32', name='labels_')

In [38]:
# Decoder GRU (256 units = 128 + 128 of the concatenated encoder state), started from
# the encoder state and returning its output at every time step.
fwd_decoder = GRU(256, return_sequences=True, return_state=True, name='fwd-decoder')
dec_outputs, dec_state_h = fwd_decoder(dec_input_embedded, initial_state=bidir_encoder_state)


# One loss tensor per decoder time step; the list becomes the model's outputs.
# NOTE(review): every step gets its own SamplingLayer instance, i.e. its own projection
# weights -- confirm this is intended rather than one layer shared across steps.
losses = []
for t in range(decoder_length+1):
  # Each Lambda slices its own input `x` and binds the step index as a default argument.
  # Reading the globals `dec_outputs` / `labels` / `t` inside the function would tie the
  # layer to whatever those names hold whenever the function is invoked again.
  dec_outputs_t = Lambda(lambda x, t=t: x[:, t, :], name='dec_out-%s'%t)(dec_outputs)
  label_t = Lambda(lambda x, t=t: x[:, t, :], name='label-%s'%t)(labels)
  loss = SamplingLayer(500, len(indices_words), mode='train', name='sampled_layer-%s'%t)([dec_outputs_t, label_t])
  losses.append(loss)


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [0]:
# The model takes the labels as a third input and outputs the list of per-step loss tensors.
model = Model(inputs=[in_encoder, in_decoder, labels], outputs=losses)
# Each output already is a loss, so the compiled loss function just returns the
# model output (its second argument) and never reads y_true.
model.compile(loss=lambda y_true, loss: loss, optimizer='rmsprop')

In [40]:
# Print the layer-by-layer overview (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder-input (InputLayer)      (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_encoder (Embedding)   (None, 300, 100)     764500      encoder-input[0][0]              
__________________________________________________________________________________________________
decoder-input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
fwd-encoder (GRU)               [(None, 128), (None, 87936       embedding_encoder[0][0]          
__________________________________________________________________________________________________
bwd-encode

### Train sampled softmax

Transform the y_true labels to one-hot encoding.

In [41]:
# Sanity check: (number of training samples, decoder_length+1).
y_train_out.shape

(568, 6)

In [0]:
from keras.utils import to_categorical
# One-hot encode every target index over the full vocabulary; the result is a dense
# (samples, steps, vocabulary) array. It is passed to fit() as the y targets, while the
# compiled loss returns the model output and does not read y_true.
y_decoder_train_out = to_categorical(y_train_out, len(indices_words))

In [43]:
# Sanity check: (samples, decoder steps, vocabulary size).
y_decoder_train_out.shape

(568, 6, 7645)

In [0]:
# Append a trailing singleton axis so the integer targets become
# (batch_size, sequence_length, 1), the shape of the `labels_` model input.
y = y_train_out[:, :, np.newaxis]

In [45]:
# Sanity check: (samples, decoder steps, 1).
y.shape

(568, 6, 1)

Swap the first two axes of the one-hot encoded y labels, since the output of our sampled-softmax model is a list with one entry per decoder time step.

In [0]:
# Move the time-step axis to the front, then split along it: one (samples, vocabulary)
# array per decoder step, matching the model's list of per-step outputs.
outputs = list(y_decoder_train_out.swapaxes(0,1))

In [47]:
# Sanity check: (decoder steps, samples, vocabulary size).
np.array(outputs).shape

(6, 568, 7645)

In [48]:
# Inputs: encoder sequences, decoder input sequences and the integer labels; targets: the per-step list.
# validation_split=0.2 holds out part of these training arrays for validation (454 / 114 in the run below).
# NOTE(review): X_valid / y_valid_in / y_valid_out prepared earlier are not passed to this call -- confirm intent.
model.fit([X_train, y_train_in, y], outputs, validation_split=0.2, batch_size=32, epochs=10)

Train on 454 samples, validate on 114 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




<keras.callbacks.History at 0x7ff375ffec18>