# British National Corpus LSTM

### Learns a language model capable of generating / autocompleting sentences.

---

## Create a generator which loads data from storage.

In [2]:
import os
import numpy as np
import tensorflow as tf
import json

In [3]:
# Load a minibatch from storage given its index
def load_minibatch(index, var):
    """Load one stored minibatch from disk and assign it to ``var``.

    The minibatch is stored under ``DATA/MINIBATCHES`` as three arrays
    (``{index}_shape.npy``, ``{index}_indices.npy``, ``{index}_values.npy``)
    describing a sparse tensor. It is densified, cast to float32 and
    assigned to ``var`` in place.

    Arguments:
    index -- identifier used as the filename prefix of the minibatch
    var -- variable that receives the dense minibatch; dimensions of
           ``var.shape`` that are None accept any size

    Raises:
    ValueError -- if the stored shape has a different rank than ``var`` or
                  disagrees with one of its fixed dimensions
    """
    minibatch_dir = os.path.join('DATA', 'MINIBATCHES')
    shape = np.load(os.path.join(minibatch_dir, f'{index}_shape.npy'))

    # Validate before reading the (much larger) index/value arrays.
    if len(shape) != len(var.shape):
        raise ValueError('Shape of minibatch and variable are incompatible')
    for i, size in enumerate(shape):
        expected = var.shape[i]
        # A None dimension in the variable is a wildcard.
        if expected is not None and expected != size:
            raise ValueError('Shape of minibatch and variable are incompatible')

    indices = np.load(os.path.join(minibatch_dir, f'{index}_indices.npy'))
    values = np.load(os.path.join(minibatch_dir, f'{index}_values.npy'))
    sparse_tensor = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=shape)
    var.assign(tf.cast(tf.sparse.to_dense(sparse_tensor), dtype=tf.float32))

In [4]:
# Batch Generator Class
class BatchGenerator(object):
    """Endlessly yields training batches read from the stored minibatches.

    Arguments:
    num_batches -- number of stored minibatches to cycle through
    hidden_size -- width of the zero initial hidden and cell states that
                   are yielded alongside every batch
    """

    def __init__(self, num_batches, hidden_size):
        self.num_batches = num_batches
        # Kept on the instance so that generate() does not depend on a
        # module-level `hidden_size` happening to exist when it runs.
        self.hidden_size = hidden_size
        self.current_idx = 0
        # Reusable buffer for the current minibatch; the leading (batch)
        # dimension is left unconstrained.
        self.x = tf.Variable(np.zeros([500, 30, 10002]), dtype=tf.float32, shape=[None, 30, 10002])

    def generate(self):
        """Yield ``[[X, hidden_state_0, cell_state_0], Y]`` forever.

        ``X`` is the minibatch shifted right by one time step, with an
        all-zero vector as the first step. ``Y`` is a list with one array
        per time step of the unshifted minibatch. Minibatch indices wrap
        around after ``num_batches``.
        """
        while True:
            load_minibatch(self.current_idx, self.x)
            self.current_idx = (self.current_idx + 1) % self.num_batches

            # Convert once instead of slicing the variable per time step.
            batch = self.x.numpy()
            batch_size, sequence_length = batch.shape[:2]

            # Sizes come from the loaded batch rather than a fixed 500, and
            # X keeps the batch's float32 dtype.
            X = np.concatenate([np.zeros_like(batch[:, :1, :]), batch[:, :-1, :]], axis=1)
            Y = [batch[:, i, :] for i in range(sequence_length)]
            hidden_state_0 = np.zeros((batch_size, self.hidden_size))
            cell_state_0 = np.zeros((batch_size, self.hidden_size))
            yield [[X, hidden_state_0, cell_state_0], Y]
            
# Number of stored minibatches the generator cycles through
num_batches = 6806
# 128 has to match the `hidden_size` the model is built with; that constant
# is only defined in a later cell, so it is spelled out here.
batch_generator = BatchGenerator(num_batches, 128).generate()

In [5]:
# Load our numberizing dictionary
# (explicit UTF-8 so decoding does not depend on the platform default)
with open(os.path.join('DATA', 'numberize.json'), 'r', encoding='utf-8') as f:
    numberize = json.load(f)

# Load our reverse-numberizing dictionary (looked up by str(token index))
with open(os.path.join('DATA', 'reverse_numberize.json'), 'r', encoding='utf-8') as f:
    reverse_numberize = json.load(f)

# Turns a sequence of token indices into a space-separated sentence string.
def verbalize_sentence(numberized_sentence):
    # Index 10001 is left out of the sentence
    # (presumably the padding token -- confirm against the preprocessing code).
    kept = filter(lambda word_number: word_number != 10001, numberized_sentence)
    return ' '.join(reverse_numberize[str(word_number)] for word_number in kept)

# Turns a tensor of vectorized sentences into a list of sentence strings,
# taking the argmax over the last (vocabulary) axis of each time step.
def verbalize(vectorized_sentences):
    token_indices = tf.argmax(vectorized_sentences, axis=2).numpy()
    return [verbalize_sentence(row) for row in token_indices]

In [6]:
# Demonstrate the helpers above on the first ten sentences of minibatch 0
var = tf.Variable(
    tf.zeros([500, 30, 10002]),
    dtype=tf.float32,
    shape=[None, 30, 10002],
)
load_minibatch(0, var)

verbalize(var[:10])

['AIDS ( <UNK> <UNK> <UNK> <UNK> ) is a condition caused by a virus called HIV ( Human <UNK> <UNK> <UNK> ) .',
 "This virus affects the body 's defence system so that it can not fight infection .",
 'through <UNK> sexual intercourse with an infected partner .',
 'through infected blood or blood products .',
 'from an infected mother to her baby .',
 'The medical aspects can be cancer , <UNK> , sudden <UNK> , <UNK> , dramatic weight loss or any combination of these .',
 'Often infected people are rejected by family and friends , leaving them to face this chronic condition alone .',
 'there is no <UNK> or cure currently available .',
 '10 million people worldwide are infected with HIV .',
 'you can be infected for between <UNK> years without <UNK> it .']

## Define the LSTM Model

In [7]:
from keras import Model, Input
from keras.layers import Dense, Activation, Dropout, LSTM, Lambda, Reshape
from keras.optimizers import Adam

# Hyperparameters shared by the training model and the generator model
sequence_length = 30
vocab_size = 10002
hidden_size = 128

# The layers are created once at module level so that every time step, and
# every model built from them, reuses the same layer objects.
Reshape_cell = Reshape(target_shape=(1, vocab_size))
LSTM_cell = LSTM(units=hidden_size, return_state=True)
Dense_cell = Dense(units=vocab_size, activation='softmax')

def LSTM_model(sequence_length, hidden_size, vocab_size):
    """
    Builds the training model by unrolling the shared LSTM cell over time.

    At every time step the t-th slice of the input sequence is passed through
    the module-level Reshape_cell, LSTM_cell and Dense_cell layers, and the
    hidden and cell states are carried over to the next step.

    Arguments:
    sequence_length -- length of input sequences
    hidden_size -- number of hidden units in LSTM cell
    vocab_size -- size of vocabulary in corpus 
    
    Returns:
    a keras model taking [sequence, initial hidden state, initial cell state]
    and producing one softmax output per time step
    """
    
    # Define inputs to LSTM
    X = Input(shape=(sequence_length, vocab_size), name='sequence_input')
    hidden_state_0 = Input(shape=(hidden_size,), name='hidden_state_0')
    cell_state_0 = Input(shape=(hidden_size,), name='cell_state_0')
    
    hidden_state = hidden_state_0
    cell_state = cell_state_0
    outputs = []
    for t in range(sequence_length):
        # Select t-th time slice. `t` is bound as a default argument so each
        # Lambda layer keeps its own index; a plain closure would see the
        # final value of `t` if the layer were ever invoked again.
        x = Lambda(lambda y, t=t: y[:, t, :])(X)
        # Reshape and apply LSTM_cell to time slice
        x = Reshape_cell(x)
        hidden_state, _, cell_state = LSTM_cell(inputs=x, initial_state=[hidden_state, cell_state])
        # Apply Dense_cell to output
        output = Dense_cell(hidden_state)
        outputs.append(output)
        
    model = Model(inputs=[X, hidden_state_0, cell_state_0], outputs=outputs)
    
    return model

# Build the training model and compile it with Adam; the same categorical
# cross-entropy loss and accuracy metric are given for the model's outputs.
model = LSTM_model(
    sequence_length=sequence_length,
    hidden_size=hidden_size,
    vocab_size=vocab_size,
)
optimizer = Adam(
    lr=0.01,
    beta_1=0.9,
    beta_2=0.999,
    decay=0.01,
)

model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

Using TensorFlow backend.


## Train the Model

In [None]:
# NOTE(review): 6706 steps per epoch, while the batch generator cycles over
# 6806 stored minibatches -- confirm the difference is intentional.
batches_per_epoch = 6706
num_epochs = 1

# Train the model on the batches produced by the batch generator
model.fit_generator(
    generator=batch_generator,
    steps_per_epoch=batches_per_epoch,
    epochs=num_epochs,
)

In [None]:
# Save the weights after training
# NOTE(review): this writes 'model2.h5', whereas the pre-trained weights
# loaded further down come from 'model.h5' -- confirm that is intentional.
model.save_weights('model2.h5')

## Load pre-trained weights

In [11]:
# Load previously trained weights from 'model.h5' into the training model.
# NOTE(review): the training section saves to 'model2.h5', not 'model.h5' --
# confirm which file is meant to be loaded here.
model.load_weights('model.h5')

## Generator Model

### Generates novel sentences according to the trained language model.

In [17]:
# Replaces a probability distribution with a one-hot vector 
# representing a random choice according to that distribution.

def one_hot_random_sample(distribution):
    """
    Samples one token per row of `distribution`, never choosing <UNK>.

    Arguments:
    distribution -- tensor indexed as [row, vocabulary entry] holding the
                    probabilities produced by the softmax layer

    Returns:
    the one-hot encoding (depth vocab_size) of the sampled token indices
    """
    # Zero out the <UNK> entry (index 10000) so that it can never be drawn
    unk_mask = 1 - tf.reshape(tf.one_hot(indices=10000, depth=vocab_size), shape=(1, vocab_size))
    modified_distribution = unk_mask * distribution

    # tf.random.categorical expects unnormalized log-probabilities, so the
    # masked distribution needs no rescaling. The masked entry becomes
    # log(0) = -inf, i.e. probability zero.
    sample_index = tf.random.categorical(tf.math.log(modified_distribution), 1)
    return tf.one_hot(indices=sample_index, depth=vocab_size)

# A cell to randomly sample
# (wraps one_hot_random_sample in a Lambda layer so it can be applied to
# layer outputs while a model is being built)
Random_sample_cell = Lambda(one_hot_random_sample)

# Builds a model that samples a whole sentence from the pretrained LSTM,
# feeding every sampled word back in as the input of the next time step.
def GeneratorModel(hidden_size, vocab_size, sequence_length):
    initial_input = Input(shape=(1, vocab_size), name='initial_input')
    initial_hidden = Input(shape=(hidden_size,), name='hidden_state_0')
    initial_cell = Input(shape=(hidden_size,), name='cell_state_0')

    # One generation step: LSTM -> softmax -> random one-hot sample
    def advance(step_input, hidden, cell):
        hidden, _, cell = LSTM_cell(inputs=step_input, initial_state=[hidden, cell])
        sample = Random_sample_cell(Dense_cell(hidden))
        return sample, hidden, cell

    # The first step goes through the same slice + reshape as in training
    first_slice = Lambda(lambda y: y[:, 0, :])(initial_input)
    sample, hidden, cell = advance(Reshape_cell(first_slice), initial_hidden, initial_cell)
    samples = [sample]

    # Every remaining step consumes the previous sample directly
    for _ in range(sequence_length - 1):
        sample, hidden, cell = advance(sample, hidden, cell)
        samples.append(sample)

    return Model(inputs=[initial_input, initial_hidden, initial_cell], outputs=samples)

# Instantiate the sentence generator with the same sizes as the training model
generator_model = GeneratorModel(
    hidden_size=hidden_size,
    vocab_size=vocab_size,
    sequence_length=sequence_length,
)

In [18]:
# Turns the generator model's output (one array per generated word) into a
# sentence string by taking the argmax of every array.
def decode(result):
    return verbalize_sentence(list(map(np.argmax, result)))

# Samples one sentence from the language model and returns it as a string.
def generate_sentence():
    # All-zero first input and all-zero initial states, for a batch of one
    model_inputs = [
        np.zeros((1, 1, vocab_size)),
        np.zeros((1, hidden_size)),
        np.zeros((1, hidden_size)),
    ]
    return decode(generator_model.predict(model_inputs))

In [19]:
# Print 100 sampled sentences, one per line
for _ in range(100):
    sentence = generate_sentence()
    print(sentence)

• .
, it is invented out boxes .
But one of the arrested within a brass year and contest noted .
I we convinced it to these is there .
For course that and these motor speaker is that is us to entirely all terms of abuse suffers .
And I , you was a bit good start .
Not one which has have a look .
I 've got my erm touched one .
And it 's on so , if if think in these people 'll start .
Among the telephone pound miles on error from endless Wednesday of Ireland 's a death of became , but they had them to send , think it .
The idea for the law like you can n't do now on the next talents or the meaning walking to yourself .
Mum Yes , I you her .
That 's need to she , yeah .
They do n't reduce them those of the function .
If you 're catch about the house , I was trying to the early , that manager 's statistical Grant is also improved .
She and Government the head like , in four , ways , especially distribution administrators and Germany , I 'm sure it seemed to trace of particular ( it .
‘ Oh 