# British National Corpus LSTM

### Learns a language model capable of generating / autocompleting sentences.

---

## Create a generator which loads data from storage.

In [2]:
import os
import numpy as np
import tensorflow as tf
import json

In [3]:
# Load a minibatch from storage given its index
def load_minibatch(index, var):
    """Load one stored minibatch from disk and assign it, densified, to `var`.

    The minibatch is read from three files under DATA/MINIBATCHES
    (`{index}_shape.npy`, `{index}_indices.npy`, `{index}_values.npy`),
    combined into a `tf.sparse.SparseTensor`, converted to a dense tensor,
    cast to float32 and written into `var` with `var.assign`.

    Arguments:
    index -- index of the minibatch; used as the file-name prefix
    var -- tf.Variable that receives the dense minibatch

    Raises:
    ValueError -- if the stored shape has a different rank than `var`, or
                  differs from `var` in a dimension that `var` fixes
    """
    batch_dir = os.path.join('DATA', 'MINIBATCHES')
    shape = np.load(os.path.join(batch_dir, f'{index}_shape.npy'))

    # A rank mismatch is an incompatibility too; report it the same way
    # instead of failing with an IndexError or passing the check silently.
    if len(shape) != len(var.shape):
        raise ValueError('Shape of minibatch and variable are incompatible')

    for i in range(len(shape)):
        declared = var.shape[i]
        # A dimension declared as None accepts any size.
        if declared is not None and declared != shape[i]:
            raise ValueError('Shape of minibatch and variable are incompatible')

    indices = np.load(os.path.join(batch_dir, f'{index}_indices.npy'))
    values = np.load(os.path.join(batch_dir, f'{index}_values.npy'))
    sparse_tensor = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=shape)
    var.assign(tf.cast(tf.sparse.to_dense(sparse_tensor), dtype=tf.float32))

In [4]:
# Batch Generator Class
class BatchGenerator(object):
    """Endlessly yields training batches built from the stored minibatches.

    Arguments:
    num_batches -- number of minibatches to cycle through; the index wraps
                   back to 0 after the last one
    hidden_size -- width of the all-zero initial hidden and cell states that
                   accompany every yielded batch
    """

    def __init__(self, num_batches, hidden_size):
        self.num_batches = num_batches
        # Keep the constructor argument: generate() must not depend on a
        # module-level `hidden_size` that is only defined in a later cell.
        self.hidden_size = hidden_size
        self.current_idx = 0
        # Reusable buffer; the leading None lets load_minibatch assign a
        # minibatch with any number of rows.
        self.x = tf.Variable(np.zeros([500, 30, 10002]), dtype=tf.float32, shape=[None, 30, 10002])

    def generate(self):
        """Yield `[[X, hidden_state_0, cell_state_0], Y]` forever.

        Y is the list of per-time-step slices of the loaded minibatch; X is
        the same sequence shifted right by one step, with an all-zero vector
        in the first position.
        """
        while True:
            load_minibatch(self.current_idx, self.x)
            self.current_idx = (self.current_idx + 1) % self.num_batches

            # Convert once, and take the sizes from the data that was actually
            # loaded rather than assuming 500 rows.
            batch = self.x.numpy()
            batch_size, sequence_length, vocab_size = batch.shape

            X = np.stack(
                [np.zeros([batch_size, vocab_size])]
                + [batch[:, i, :] for i in range(sequence_length - 1)],
                axis=1,
            )
            Y = [batch[:, i, :] for i in range(sequence_length)]
            hidden_state_0 = np.zeros((batch_size, self.hidden_size))
            cell_state_0 = np.zeros((batch_size, self.hidden_size))
            yield [[X, hidden_state_0, cell_state_0], Y]
            
# Number of minibatches the generator cycles through before wrapping around.
num_batches = 6806
# 128 is the size of the initial hidden/cell states handed to the generator.
batch_generator = BatchGenerator(num_batches, 128).generate()

In [5]:
# The "numberizing" dictionary, parsed from DATA/numberize.json.
with open(os.path.join('DATA', 'numberize.json'), mode='r') as f:
    numberize = json.loads(f.read())

# The reverse dictionary, parsed from DATA/reverse_numberize.json; it is
# looked up with stringified token indices to recover the words.
with open(os.path.join('DATA', 'reverse_numberize.json'), mode='r') as f:
    reverse_numberize = json.loads(f.read())

# Converts a list of token indices to a sentence string.
def verbalize_sentence(numberized_sentence):
    """Turn a sequence of token indices into a space-separated sentence.

    Each index is looked up (as a string key) in `reverse_numberize`.
    Index 10001 is dropped from the result (presumably the padding token --
    TODO confirm).
    """
    words = []
    for word_number in numberized_sentence:
        if word_number == 10001:
            continue
        words.append(reverse_numberize[str(word_number)])
    return ' '.join(words)

# Converts a tensor of vectorized sentences to a list of sentence strings.
def verbalize(vectorized_sentences):
    """Convert a tensor of vectorized sentences to a list of sentence strings.

    The token index at each position is the argmax along axis 2; every
    resulting row of indices is rendered with `verbalize_sentence`.
    """
    token_rows = tf.argmax(vectorized_sentences, axis=2).numpy()
    return [verbalize_sentence(row) for row in token_rows]

In [6]:
# Giving an example of the above functions
# Buffer with a free batch dimension, filled with minibatch 0 from storage.
var = tf.Variable(
    initial_value=tf.zeros([500, 30, 10002]),
    dtype=tf.float32,
    shape=[None, 30, 10002],
)
load_minibatch(0, var)

# Show the first ten sentences of the minibatch as text.
verbalize(var[:10])

['AIDS ( <UNK> <UNK> <UNK> <UNK> ) is a condition caused by a virus called HIV ( Human <UNK> <UNK> <UNK> ) .',
 "This virus affects the body 's defence system so that it can not fight infection .",
 'through <UNK> sexual intercourse with an infected partner .',
 'through infected blood or blood products .',
 'from an infected mother to her baby .',
 'The medical aspects can be cancer , <UNK> , sudden <UNK> , <UNK> , dramatic weight loss or any combination of these .',
 'Often infected people are rejected by family and friends , leaving them to face this chronic condition alone .',
 'there is no <UNK> or cure currently available .',
 '10 million people worldwide are infected with HIV .',
 'you can be infected for between <UNK> years without <UNK> it .']

## Define the LSTM Model

In [7]:
from keras import Model, Input
from keras.layers import Dense, Activation, Dropout, LSTM, Lambda, Reshape
from keras.optimizers import Adam

# Dimensions used by both the training model and the generator model.
sequence_length = 30
vocab_size = 10002
hidden_size = 128

# These layers are created once at module level and applied by both
# LSTM_model and GeneratorModel, so the two models use the same layer instances.
Reshape_cell = Reshape(target_shape=(1, vocab_size))
LSTM_cell = LSTM(units=hidden_size, return_state=True)
Dense_cell = Dense(units=vocab_size, activation='softmax')

def LSTM_model(sequence_length, hidden_size, vocab_size):
    """
    Build the training model by applying the shared layers one time step at a time.

    For every time step the corresponding slice of the input sequence is fed
    through Reshape_cell, LSTM_cell and Dense_cell, carrying the hidden and
    cell state from one step to the next. The model has one softmax output
    per time step.

    Arguments:
    sequence_length -- length of input sequences
    hidden_size -- number of hidden units in LSTM cell
    vocab_size -- size of vocabulary in corpus

    Returns:
    a keras model with inputs [sequence_input, hidden_state_0, cell_state_0]
    and a list of `sequence_length` outputs
    """

    # Define inputs to LSTM
    X = Input(shape=(sequence_length, vocab_size), name='sequence_input')
    hidden_state_0 = Input(shape=(hidden_size,), name='hidden_state_0')
    cell_state_0 = Input(shape=(hidden_size,), name='cell_state_0')

    hidden_state = hidden_state_0
    cell_state = cell_state_0
    outputs = []
    for t in range(sequence_length):
        # Select t-th time slice. The index is handed to the layer through
        # `arguments` so each Lambda keeps its own `t`; a plain closure would
        # look `t` up when called and see the loop's final value if the layer
        # were ever invoked again after the loop has finished.
        x = Lambda(lambda y, t: y[:, t, :], arguments={'t': t})(X)
        # Reshape and apply LSTM_cell to time slice
        x = Reshape_cell(x)
        hidden_state, _, cell_state = LSTM_cell(inputs=x, initial_state=[hidden_state, cell_state])
        # Apply Dense_cell to output
        output = Dense_cell(hidden_state)
        outputs.append(output)

    model = Model(inputs=[X, hidden_state_0, cell_state_0], outputs=outputs)

    return model

# Build the training model and compile it with an Adam optimizer.
model = LSTM_model(sequence_length, hidden_size, vocab_size)
optimizer = Adam(
    lr=0.01,
    beta_1=0.9,
    beta_2=0.999,
    decay=0.01,
)

model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

Using TensorFlow backend.


## Train the Model

In [26]:
# NOTE(review): the batch generator was created with 6806 batches, while an
# epoch here is 6706 steps -- confirm the difference is intended.
batches_per_epoch = 6706
num_epochs = 1

# Train the model on batches drawn from the batch generator
model.fit_generator(
    batch_generator,
    steps_per_epoch=batches_per_epoch,
    epochs=num_epochs,
)

Epoch 1/1


<keras.callbacks.callbacks.History at 0x664e036d8>

In [27]:
# Save the weights after training to 'model2.h5' in the working directory.
model.save_weights('model2.h5')

## Load pre-trained weights

In [11]:
# Restore weights from 'model.h5'.
# NOTE(review): the training section saves to 'model2.h5', so this loads a
# different file -- presumably weights from an earlier run; confirm.
model.load_weights('model.h5')

## Generator Model

### Generates novel sentences according to the trained language model.

In [17]:
# Replaces a probability distribution with a one-hot vector 
# representing a random choice according to that distribution.

def one_hot_random_sample(distribution):
    """Sample one token per row of `distribution` and return it one-hot encoded.

    The <UNK> token (index 10000) is given zero probability before sampling,
    so it can never be drawn.

    Arguments:
    distribution -- tensor of per-row probabilities over the vocabulary,
                    indexed as [row, token]

    Returns:
    one-hot tensor of the sampled indices with depth `vocab_size`
    (one sample per row)
    """
    unk_index = 10000

    # Zero out the <UNK> column for every row.
    keep_mask = 1 - tf.reshape(tf.one_hot(indices=unk_index, depth=vocab_size), shape=(1, vocab_size))
    masked_distribution = keep_mask * distribution

    # tf.random.categorical takes unnormalized log-probabilities, so the masked
    # distribution needs no rescaling. The previous rescale multiplied by
    # (1 - p_unk) instead of dividing, and took p_unk from row 0 only.
    sample_index = tf.random.categorical(tf.math.log(masked_distribution), 1)
    return tf.one_hot(indices=sample_index, depth=vocab_size)

# A cell to randomly sample: wraps one_hot_random_sample in a Keras Lambda
# layer so it can be applied to the Dense_cell output inside GeneratorModel.
Random_sample_cell = Lambda(one_hot_random_sample)

# This model generates random sentences according to the pretrained LSTM
def GeneratorModel(hidden_size, vocab_size, sequence_length):
    """Build a model that samples a token sequence with the shared layers.

    At every step the shared LSTM_cell and Dense_cell produce a distribution
    over the vocabulary, Random_sample_cell draws a one-hot token from it,
    and that token is the LSTM input of the following step.

    Arguments:
    hidden_size -- number of hidden units in LSTM cell
    vocab_size -- size of vocabulary in corpus
    sequence_length -- number of sampling steps after which generation stops

    Returns:
    a keras model with inputs [initial_input, hidden_state_0, cell_state_0]
    whose outputs are the sampled one-hot tokens, one per step
    """
    initial_input = Input(shape=(1, vocab_size), name='initial_input')
    state_h0 = Input(shape=(hidden_size,), name='hidden_state_0')
    state_c0 = Input(shape=(hidden_size,), name='cell_state_0')

    def sample_step(step_input, state_h, state_c):
        # One LSTM step followed by sampling from the predicted distribution.
        state_h, _, state_c = LSTM_cell(inputs=step_input, initial_state=[state_h, state_c])
        token = Random_sample_cell(Dense_cell(state_h))
        return token, state_h, state_c

    # First step: consume the externally supplied initial input.
    first_slice = Lambda(lambda y: y[:, 0, :])(initial_input)
    token, state_h, state_c = sample_step(Reshape_cell(first_slice), state_h0, state_c0)
    sampled_tokens = [token]

    # Remaining steps: feed the previously sampled token back in.
    for _ in range(1, sequence_length):
        token, state_h, state_c = sample_step(token, state_h, state_c)
        sampled_tokens.append(token)

    return Model(inputs=[initial_input, state_h0, state_c0], outputs=sampled_tokens)

# Instantiate the sampling model with the dimensions of the trained model.
generator_model = GeneratorModel(
    hidden_size=hidden_size,
    vocab_size=vocab_size,
    sequence_length=sequence_length,
)

In [18]:
# Decodes the output of the generator model into a sentence string.
def decode(result):
    """Turn the generator model's output into a sentence string.

    `result` is iterated element by element; the argmax of each element is
    taken as a token index, and the indices are rendered with
    `verbalize_sentence`.
    """
    word_indices = []
    for word_vector in result:
        word_indices.append(np.argmax(word_vector))
    return verbalize_sentence(word_indices)

# Generates a sentence using the language model
def generate_sentence():
    """Sample one sentence from the generator model and return it as a string.

    The model is run on all-zero inputs for a single sequence: a zero initial
    token vector and zero hidden and cell states.
    """
    zero_inputs = [
        np.zeros((1, 1, vocab_size)),   # initial input
        np.zeros((1, hidden_size)),     # initial hidden state
        np.zeros((1, hidden_size)),     # initial cell state
    ]
    return decode(generator_model.predict(zero_inputs))

In [29]:
# Generate 1000 sentences, printing each one on its own line.
# Sampling is random and unseeded, so the sentences differ from run to run.
for i in range(1000):
    print(generate_sentence())

see James as the black , London looking dead .
But this woman makes me the only person could now speak to Bristol Street , ’ he said .
Giles years old would increase in Queen Street , but soup and air calls .
In that bar or a turning home , they had to hold large , where I 'd of our impact , ’ watched born ; city .
Before the village colours in state parts of strong privilege , easier , schemes , say , do well do less have no answer at the end of the population
‘ My dear , ’ said Mrs her voice .
Open lay at his second home , Kate the top employer starting exposure to his brown .
We posed always differences about treating often good as mature , unless that East as you shall wish .
And then the meal who would address and that fail that the eighth task was impossible to spread to replacement anything staff in .
‘ When we mentioned Helen , I 'm a lengthy charge of sailing a piece of me .
Mr Brown began at my friend , the last half years .
In a future report the section winter council claim

Because of the room is even more complex but to be an interesting attitude .
Through life of self is among our early Department , teachers will be no cause so different and others or overseas .
It is interested most one of something pushed more than all properties .
In lines in a boundary , another , transfer teaching of a redundant towards 1993 .
It 's a growing part of one of this thirteen years and been tiny cleaning disturbed and the potential of statements .
The first factor Jean was served in the Oxfordshire Company .
However , not initiated in these differences with the project will act that the solution helps to work to her choice .
As it maintained that , one emphasis includes average judge and 20 miles into one 's career , Stuart of using spirit vessels and the valve husband spirit .
The simple diet of black activity but will not be very clearly debts as in their work .
For one reason , you used a nice thing that I have got to the smaller readers I told them and tanks is a ad

Alternatively she was on of 9 as Mr scientists said he said .
You climbed a pit her that , and the day grip him , I 'm erm the top .
The sink , then the train charged for pensioners were handed over and thus 10 times a Hughes with each other .
I can not compensate for the business and this promises of an combined skills and individual planning occurs , it found a photograph in a model .
The grandfather Panel resulted in his new face of view of computer where equity seemed to have bought any projects .
In the stage early England , it had made a fatal point in her party under straw and first won men .
These sound faced by the lack of art , then ‘ nice ’ information about my sister and given 21 years ago and the forecast of State .
For another of the rest of the document of electrical programming were advertising , and it is the main reason for town South African holdings .
By contrast ; Mr. involving one for D. amateur colleagues has risen to the coast 's back of the status of the old to

remarkable agent could say so .
‘ The the shops are trying to bring out on the household , which the arguments as one is itself .
Last week after a carpet conference with the window two weeks ago the reliable reveals of Hitler 's written as at average of the job and politics .
According to the West he was through the British following for manufacturing .
3 .
Their money is targeted .
The first band will be transferred to the United States of an enterprise version of all the community opposition .
He set up January by several male matters in divorce .
It is important to an average agreement between under the successful spending of the United States .
After the earliest days , they did work with Yes , waiting every night .
Indeed it was that nature degree of course ( follow it both payable as to gather strips of sit on the old hole .
‘ I ends at a used to rescue cash , your guys is not drinking prone to in the centre .
I asked myself ever when I would call my resting , .
In some cases th

It was a strong small beautiful , mechanical among the book on an their financial companies .
The response to make give how some 80 treatment is blamed on this position .
It is relatively very traditional defensive , and they drove and study .
Once to the fury , sales was found about .
It is perhaps the smell of the British Government of our three-year class officers have become most valuable sum .
A doubt and forth has chosen such as intention of foster .
served rats , when they experience the examples of butter and studied soil and soft levels questions .
If it will develop been certainty that they do feel necessary to be trivial at your school , not only it on flying disease funded from developing ( density )
Private factors occur to referendum and reducing perfect News of the home environment and any rate comment from the sale of the deficit .
Oh , ’ said repeated , admitted the time , where it was proud of his desk .
He was never felt that too readily to drive all into his advice 

The Islamic large as he threw in their own satisfaction , and he stopped for granted to both Iraq Aid , but she was calculated that language lies in .
On Monday , the basket are hold on each other and despair inside the wall physical records to the speed .
I see in an project as go to give into a number of fishing .
In a survey of this idea is less important .
Five years of Scotland , all these problems were formed and formed in mainstream student .
‘ This makes nobody is in the touch of an odd , while let answers pay ourselves .
Most fees are doing , on the local quality country being reached at the time again in .
The morning park and kicking pool over and another parts .
This is being crucial to moral some of disaster education , dedicated by the state society — the sponsorship of workers who were in advance in recent years .
When one of the moment old potential word , then there is so many ways of me , the world has used prospects .
Golden laughed .
My strange objective was chronic

The main method is in future may be claims of individual furniture , Belfast and a fall worth whole .
This traditional marketing contract actually wants that the reign of Word is a beautiful illustration of surveillance .
His mention of 20,000 and sea faces a few major factors were , smooth , range , steep , pond .
She found it themselves as a move to defend what he felt as we had the same day before anticipation and of the he was released the same fist .
‘ Why now ’ Cheltenham ? she commented briefly as the bright campaign built which is in the regular construction campaign .
‘ We are now in our talent , ’ caution the pleasant image or to consult a Turkish effect .
experience , as might have issued for steps to establish the pattern of migration and dividend .
Body are now which flies had an ‘ touched nature ’ sense .
This firm has been achieved by a review of alcohol and , waiting for different approaches to seek justice .
It was difficult to suffer from the trades of the habits prac

Often .
I can prove to be more difficult to answer so .
This time this is be growing 's unique in the distribution of activity in China and quality of reform , and it is important for two years .
Her work is more successful than one of when decided to suggest that ‘ I ca n't imagine that you was not ill .
The almost his request for the Ministry of words successfully that score vision capital appeared to take them in response to their De .
Yet the moment of the interest retail could strike share in public markets .
talk Christopher when .
‘ Today , and the problems are marvellous .
They were involved with individuals and like drugs , with over the bushes , no power .
Fine , and by British markets , they were familiar , the changes in the absence of the foundations .
No known we took free more hungry the confidence session in the code .
The spring , the court Revenue .
The price of the choice is due to be advanced under democracy was unlikely to a boot ; during the philosophical chapter 

The only location of the crisis ’ and bar appeared in front of each year of the transactions itself .
The wood , seminars from the top daily in 1914 .
Our weird luck is set that a committee .
If the photos are leading , would change by the police .
However , it inherited no , physical outlook , heat and industrial appropriate towards their use .
When he had discovered the year are poor food 's sending continent the almost undoubtedly engaged in the bridge possibly handle in a hours .
A teacher now will tears with the high tool in the present , and which others will need to be able to employees on programs .
Not all cows on tears with easily were total of different failure .
I 'm angry .
I might know how the job I thought and really should do best , he can , you was well standing too an ground off my knee and I knew
They say that far as that does give come down from their department late .
Her skin flushed .
( She also were sitting on the Christmas front of the doorway , the tea 's .
On

Yeah he could help the fun .
None of the subsequent experiences it is therefore that the student are reversed , and existed , their babies .
Sadly apart , can be too much of these way to a severe waste .
I went to in the conversion side to advance losing an passing .
There was a lecture to selection that evening would be nearly oneself that every day , young and somehow I could n't see myself .
I .
1992 has n't duty in an opportunity to remain about motion on the a page in that the longest Act past not perhaps .
The principal survey showed that competition rises must be lifted on periods revenues in improving its own craft dealing with the guests 's pleasure .
Yes , at least , eight men .
Street 26 squares and funding in both collections , and local Soviet team specialised exchange January .
‘ The parents and actually are capable of the way of assistance movements , responding to the job of justice .
Once you 're now straight on the corner .
He stepped out by the polished of the Societ

‘ Yes , I 've been a reverse of Peter 're now .
If this takes you how we are , the method is nothing to say you over it .
Oh settings , ‘ thirty years ago ’ he says .
It was for the reassuring night .
The District varieties of the first able tend to provide sophisticated growth in principle of art million in particular security .
Well , when the market is about a recommendation or a lot , or a positive gap .
* 1 .
Nothing almost is what he might suspect .
If you are n't sure , remarkable practitioners , when these thing are partial .
They directed joint recently operators are formidable in traditional motion .
charged collected in October , for the University of teachers , which refer to the machines at the way outside the field of America .
I 'm going on your right with so something about us .
Most clubs illustrate lists of fiscal study .
As much sensed it 's City he heard the period going this few minutes and he Smith , but now remember whether he had taken over and Jones .
converted