### Recurrent Neural Networks 

In [1]:
#import packages
from __future__ import print_function
import math
import os
import time
import numpy as np

In [2]:
#Class for indexing the words of the input document
class Dictionary(object):
    """Two-way mapping between words and integer ids.

    Ids are assigned in first-seen order, starting at 0.
    """

    def __init__(self):
        self.word2idx = {}  # word -> id
        self.idx2word = []  # id -> word (list position is the id)

    def add_word(self, word):
        """Register ``word`` if it has not been seen yet and return its id."""
        known_id = self.word2idx.get(word)
        if known_id is not None:
            return known_id
        # New word: its id is the next free position in the id -> word list.
        fresh_id = len(self.idx2word)
        self.idx2word.append(word)
        self.word2idx[word] = fresh_id
        return fresh_id

    def __len__(self):
        """Return the number of distinct words registered so far."""
        return len(self.idx2word)

In [3]:
#Defining Corpus class to index the words of the input document
#Dictionary class is used by the Corpus class to index the words of the input document
class Corpus(object):
    """Token-id arrays for the train/valid/test splits, sharing one vocabulary.

    ``path`` is used as a plain string prefix: the files opened are
    ``path + 'train.txt'``, ``path + 'valid.txt'`` and ``path + 'test.txt'``.
    No separator is inserted, so any directory separator or filename prefix
    must already be part of ``path``.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(path + 'train.txt')
        self.valid = self.tokenize(path + 'valid.txt')
        self.test = self.tokenize(path + 'test.txt')

    def tokenize(self, path):
        """Tokenizes a text file.

        Each line is split on whitespace and terminated with an ``'<eos>'``
        token. Every token is registered in ``self.dictionary`` (new words get
        the next free id) and the resulting ids are returned in file order.

        Args:
            path: location of the text file to read.

        Returns:
            A 1-D ``int32`` numpy array with one id per token.

        Raises:
            AssertionError: if ``path`` does not exist.
        """
        assert os.path.exists(path), "corpus file not found: %s" % path
        # Single pass: add_word returns the id of the (possibly new) word, so
        # the file does not have to be read a second time to look the ids up.
        with open(path, 'r') as f:
            ids = np.fromiter(
                (self.dictionary.add_word(word)
                 for line in f
                 for word in line.split() + ['<eos>']),
                dtype='int32')

        return ids

### Load data as batches

To speed up the subsequent data flow in the RNN model, we pre-process the loaded data as batches. This procedure is defined in the following ``batchify`` function.

The dataset can be downloaded from http://goo.gl/vT4cEw.

In [4]:
# Prefix handed to Corpus, which appends 'train.txt' / 'valid.txt' / 'test.txt'
# to it directly (no path separator is inserted).
# NOTE(review): as written this resolves to e.g. "/tmp/nlp/ptbtrain.txt" —
# confirm the downloaded files use that naming, or add the missing
# separator/prefix to this string.
data_path = "/tmp/nlp/ptb"

# Build the shared vocabulary and the token-id arrays for the three splits.
corpus = Corpus(data_path)

def batchify(data, batch_size):
    """Fold a token stream into ``batch_size`` equal-length columns.

    Trailing elements that do not fill a complete row are dropped. With
    ``nbatch = data.shape[0] // batch_size`` the result has shape
    ``(nbatch, 1, batch_size)``; for 1-D input, entry ``[i, 0, j]`` is
    ``data[j * nbatch + i]``.
    """
    rows = data.shape[0] // batch_size
    usable = rows * batch_size
    columns = data[:usable].reshape((batch_size, 1, rows))
    # Reverse the axis order: (batch_size, 1, rows) -> (rows, 1, batch_size).
    return columns.transpose()

# Width of each batchified row; RNNModel's default input shape is derived
# from this name as well.
batch_size = 32

# Apply the same reshaping to every split, in train / valid / test order.
train_data, val_data, test_data = (
    batchify(split_ids, batch_size)
    for split_ids in (corpus.train, corpus.valid, corpus.test)
)

In [5]:
#RNN Model
from bigdl.nn.keras.layer import *
from bigdl.nn.keras.topology import Sequential

class RNNModel():
    """A model with a recurrent layer followed by a dense decoder.

    Args:
        mode: one of 'rnn_relu', 'rnn_tanh', 'lstm' or 'gru'; selects the
            recurrent layer type.
        vocab_size: output width of the decoder.
        num_hidden: number of hidden units in the recurrent layer.
        arg_input_shape: input shape passed to the recurrent layer. The default
            is computed once, when this class is defined, from the
            notebook-level ``batch_size``; later changes to ``batch_size`` do
            not affect it.
        dropout: accepted for interface compatibility; currently unused.

    Raises:
        ValueError: if ``mode`` is not one of the supported options.
    """
    #batch_size was defined 32
    def __init__(self, mode, vocab_size, num_hidden, arg_input_shape = (1, batch_size), dropout=0.5):
        # Resolve the layer classes through the module object instead of the
        # bare notebook globals: a later cell rebinds the name `LSTM` to a
        # model instance, which would otherwise make a second construction
        # with mode 'lstm' call that instance instead of the layer class.
        import bigdl.nn.keras.layer as keras_layers

        self.model = Sequential()
        if mode == 'rnn_relu':
            recurrent = keras_layers.SimpleRNN(num_hidden, activation = "relu", input_shape = arg_input_shape)
        elif mode == 'rnn_tanh':
            recurrent = keras_layers.SimpleRNN(num_hidden, input_shape = arg_input_shape)
        elif mode == 'lstm':
            recurrent = keras_layers.LSTM(num_hidden, input_shape = arg_input_shape)
        elif mode == 'gru':
            recurrent = keras_layers.GRU(num_hidden, input_shape = arg_input_shape)
        else:
            raise ValueError("Invalid mode %s. Options are rnn_relu, "
                             "rnn_tanh, lstm, and gru"%mode)
        self.model.add(recurrent)

        # NOTE(review): tanh bounds every decoder output to [-1, 1]. If these
        # outputs are consumed as class scores by a cross-entropy loss, a
        # linear decoder may be what was intended — left unchanged here.
        self.decoder = keras_layers.Dense(vocab_size, activation = "tanh")
        self.model.add(self.decoder)
        self.num_hidden = num_hidden

### Model Building

For demonstration, LSTM is the chosen RNN model type. To use another RNN variant, replace the 'lstm' string with 'rnn_relu', 'rnn_tanh', or 'gru'.

In [6]:
# Number of distinct tokens in the corpus vocabulary; used as the decoder width.
ntokens = len(corpus.dictionary)
model_type = 'lstm'
# Number of hidden units in the recurrent layer.
num_hid = 100
# NOTE(review): this rebinds the notebook-level name `LSTM`, which the star
# import from bigdl.nn.keras.layer had bound to the layer class. Any code that
# looks up the bare name `LSTM` after this cell gets this RNNModel wrapper
# instead; a distinct variable name would avoid the clash.
LSTM = RNNModel(model_type, ntokens, num_hid)

creating: createKerasSequential
creating: createKerasLSTM
creating: createKerasDense


In [7]:
# Sanity check on the assembled network: show the shapes it reports for its
# input and its output.
print(LSTM.model.get_input_shape())
print(LSTM.model.get_output_shape())

(None, 1, 32)
(None, 10000)


In [8]:
#Configure Model
from bigdl.nn.criterion import *
# Configure training: SGD optimizer, cross-entropy loss, accuracy as the metric.
# NOTE(review): the decoder ends in a tanh activation, so the loss receives
# values bounded to [-1, 1] — confirm that is intended for this criterion.
LSTM.model.compile(optimizer='sgd', loss=CrossEntropyCriterion(), metrics=['accuracy'])

creating: createCrossEntropyCriterion
creating: createDefault
creating: createSGD
creating: createTop1Accuracy


In [10]:
#Execute Training
# Inputs are the batchified training rows; targets are the first
# len(train_data) token ids of the raw training stream. Validation uses the
# first 500 batchified validation rows and the first 500 raw validation ids.
# NOTE(review): with this slicing the target for row i is corpus.train[i],
# which is also the first element of train_data[i] — presumably a next-token
# target was intended; verify the input/label alignment.
# NOTE(review): token ids start at 0 — confirm the criterion accepts
# zero-based class labels.
LSTM.model.fit(train_data, corpus.train[:len(train_data)], batch_size=8, nb_epoch=10,
validation_data=(val_data[:500], corpus.valid[:500]))