In [1]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import os

In [2]:
# Location of the training corpus (plain-text file on the mounted Drive).
path = "/content/drive/MyDrive/NLP_Project/1661-0.txt"
# Read through a context manager so the file handle is closed as soon as the
# text is in memory instead of being left open until garbage collection.
# NOTE(review): no encoding is passed, so the platform default is used -- confirm
# it matches the file.
with open(path) as corpus_file:
    # Lower-casing merges upper- and lower-case letters into one symbol each.
    data = corpus_file.read().lower()
# Keep only the first 180,000 characters of the corpus.
data = data[:180000]
raw_text = data


## Load Data

In [3]:
# Preview only the beginning of the corpus; echoing the full string would dump
# every character of it into the cell output.
raw_text[:1000]



## Preprocess Data (the text is converted to integers so that it can be fed to the neural network)

In [4]:
# Vocabulary: every distinct character of the corpus in sorted order, and a
# lookup table from each character to its position in that ordering.
chars = sorted(set(raw_text))
char_to_int = {symbol: position for position, symbol in enumerate(chars)}


In [5]:
# Corpus length in characters and number of distinct characters.
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)


Total Characters:  180000
Total Vocab:  65


In [6]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
# Encode the whole corpus once. The original looked every character up again
# for each of the (up to seq_length) windows that contain it.
encoded_text = [char_to_int[char] for char in raw_text]
# One training pattern per start position that still has a following character.
window_starts = range(n_chars - seq_length)
# Input: seq_length consecutive character codes (each slice is its own list).
dataX = [encoded_text[start:start + seq_length] for start in window_starts]
# Target: the code of the character immediately after the window.
dataY = [encoded_text[start + seq_length] for start in window_starts]
n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)


Total Patterns:  179900


Transform the list of input sequences into the form [samples, time steps, features] expected by an LSTM network.

Next, we rescale the integers to the range 0 to 1, which makes the patterns easier to learn for the LSTM network, whose gates use the sigmoid activation function by default.
Last, we convert the output patterns (single characters converted to integers) into a one-hot encoding.

In [7]:
# reshape X to be [samples, time steps, features]; each time step carries a
# single feature, the integer code of one character
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize the integer codes into the range [0, 1)
X = X / float(n_vocab)
# one hot encode the output variable.
# Fix the width to the vocabulary size explicitly: without num_classes the width
# is inferred from the largest target index, so the output layer would be
# narrower than the vocabulary whenever the highest-indexed character never
# occurs as a target.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [8]:
# Single-layer character model: one 256-unit LSTM over the input window,
# dropout of 0.2, then a softmax over the one-hot target classes.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:]),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')


The network is slow to train (about 300 seconds per epoch on an Nvidia K520 GPU). Because of the slowness and because of our optimization requirements, we will use model checkpointing to record all of the network weights to file each time an improvement in loss is observed at the end of the epoch. We will use the best set of weights (lowest loss) to instantiate our generative model in the next section.

In [None]:
# Checkpointing: monitor the training loss and save only when it reaches a new
# minimum. The epoch number and loss value are filled into the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Train the single-layer model; the checkpoint callback runs during training.
model.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=20,
    callbacks=callbacks_list,
)

Epoch 1/20
Epoch 00001: loss improved from inf to 2.96669, saving model to weights-improvement-01-2.9667.hdf5
Epoch 2/20
Epoch 00002: loss improved from 2.96669 to 2.76908, saving model to weights-improvement-02-2.7691.hdf5
Epoch 3/20
Epoch 00003: loss improved from 2.76908 to 2.67376, saving model to weights-improvement-03-2.6738.hdf5
Epoch 4/20
Epoch 00004: loss improved from 2.67376 to 2.59268, saving model to weights-improvement-04-2.5927.hdf5
Epoch 5/20
Epoch 00005: loss improved from 2.59268 to 2.52828, saving model to weights-improvement-05-2.5283.hdf5
Epoch 6/20
Epoch 00006: loss improved from 2.52828 to 2.46799, saving model to weights-improvement-06-2.4680.hdf5
Epoch 7/20
Epoch 00007: loss improved from 2.46799 to 2.41723, saving model to weights-improvement-07-2.4172.hdf5
Epoch 8/20
Epoch 00008: loss improved from 2.41723 to 2.36409, saving model to weights-improvement-08-2.3641.hdf5
Epoch 9/20
Epoch 00009: loss improved from 2.36409 to 2.31816, saving model to weights-impro

<keras.callbacks.History at 0x7f07e052b410>

# Generating Text with an LSTM Network

In [None]:
# load the network weights
# The epoch number and loss in this name belong to one specific training run,
# so after re-training this exact file will normally not exist.
filename = "weights-improvement-20-1.9346.hdf5"
if not os.path.exists(filename):
    # Fall back to the most recently written single-layer checkpoint. Because
    # checkpoints are only saved when the loss improves, the newest file of a
    # run is also its best one. Files ending in "-bigger.hdf5" come from the
    # two-layer model and are skipped.
    candidates = [
        name for name in os.listdir(".")
        if name.startswith("weights-improvement-")
        and name.endswith(".hdf5")
        and not name.endswith("-bigger.hdf5")
    ]
    if not candidates:
        raise FileNotFoundError(
            "No 'weights-improvement-*.hdf5' checkpoint found; run the training cell first."
        )
    filename = max(candidates, key=os.path.getmtime)
    print("Loading checkpoint:", filename)
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')


In [12]:
# Inverse lookup table: integer code -> character, used to decode predictions.
int_to_char = dict(enumerate(chars))

In [None]:
# pick a random seed
import sys
# randint's upper bound is exclusive, so passing len(dataX) makes every pattern
# (including the last one) eligible as a seed.
start = numpy.random.randint(0, len(dataX))
# Work on a copy: the loop appends to `pattern`, and appending to the list that
# lives inside dataX would leave that training pattern one element too long.
pattern = list(dataX[start])
print("Seed:")
print( "\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    # Shape and scale the current window exactly like the training inputs.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window: append the prediction and drop the oldest character.
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone")

Seed:
"  said--'

'i didn't!' the march hare interrupted in a great hurry.

'you did!' said the hatter.

'i  "
con't know it wou dan t tail to bet tee ' said the daterpillar.

'whll  she hadt  she dareepi oo tooe toine,  the huepets sase then she had not it an the could, and was sote to gev aeaun to tene the wan no the faree haree bare and the was no ao all oo the say  and the toins war in at the cirlt. 
'io she sas a latter on the soaes tu taek thet ' she taid to herself, 'i thsh toe coewe   the huepets sas  the dormouse sas in at ilcs, and the toin ho the woile aadk to the white 'and the was to meree to her hn an the rinee, and sas no the sore of the table, and the was not in a lore of tar oo the tan   the huehets or hrtn and the kuosre, and the toins on the woule she tas oo the table. 
'io a gan hatter then ' said the daterpillar.

'thll  she hadt, an cnl  a danter on the career  so soe toin, 

'ih a drod ont thingng,' said alice, 'in's al all fo a lorte 'and the toils to be a lettl

The fact that this character based model of the book produces output like this is very impressive. It gives you a sense of the learning capabilities of LSTM networks.
The results are not perfect. In the next section we look at improving the quality of results by developing a much larger LSTM network.

## Larger LSTM

We keep the number of memory units the same at 256, but add a second LSTM layer.

In [9]:
# Two stacked 256-unit LSTM layers, each followed by dropout of 0.2. The first
# LSTM returns its full output sequence so the second has a sequence to consume.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')


In [12]:
# Checkpoints for the larger model go to the assets folder and carry a
# "-bigger" suffix in their file names.
path = '/content/drive/MyDrive/NLP/NLP_Project/assets/'
filepath = os.path.join(path, "weights-improvement-{epoch:02d}-{loss:.4f}-bigger.hdf5")
# Monitor the training loss and save only when it reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [13]:
# Train the two-layer model; the checkpoint callback runs during training.
model.fit(
    x=X,
    y=y,
    batch_size=64,
    epochs=10,
    callbacks=callbacks_list,
)

Epoch 1/10
Epoch 00001: loss improved from inf to 2.72968, saving model to /content/drive/MyDrive/NLP/NLP_Project/assets/weights-improvement-01-2.7297-bigger.hdf5
Epoch 2/10
Epoch 00002: loss improved from 2.72968 to 2.49940, saving model to /content/drive/MyDrive/NLP/NLP_Project/assets/weights-improvement-02-2.4994-bigger.hdf5
Epoch 3/10
Epoch 00003: loss improved from 2.49940 to 2.35336, saving model to /content/drive/MyDrive/NLP/NLP_Project/assets/weights-improvement-03-2.3534-bigger.hdf5
Epoch 4/10
Epoch 00004: loss improved from 2.35336 to 2.25125, saving model to /content/drive/MyDrive/NLP/NLP_Project/assets/weights-improvement-04-2.2512-bigger.hdf5
Epoch 5/10
Epoch 00005: loss improved from 2.25125 to 2.16929, saving model to /content/drive/MyDrive/NLP/NLP_Project/assets/weights-improvement-05-2.1693-bigger.hdf5
Epoch 6/10
Epoch 00006: loss did not improve from 2.16929
Epoch 7/10
Epoch 00007: loss improved from 2.16929 to 2.11756, saving model to /content/drive/MyDrive/NLP/NLP_P

<keras.callbacks.History at 0x7fd58a053310>

In [16]:
# load the network weights
import sys
path='/content/drive/MyDrive/NLP/NLP_Project/assets/'
# The epoch number and loss in this name belong to one specific training run,
# so after re-training this exact file will normally not exist.
filename = path+"weights-improvement-10-1.9236-bigger.hdf5"
if not os.path.exists(filename):
    # Fall back to the most recently written "-bigger" checkpoint. Because
    # checkpoints are only saved when the loss improves, the newest file of a
    # run is also its best one.
    candidates = [
        os.path.join(path, name) for name in os.listdir(path)
        if name.startswith("weights-improvement-") and name.endswith("-bigger.hdf5")
    ]
    if not candidates:
        raise FileNotFoundError(
            "No 'weights-improvement-*-bigger.hdf5' checkpoint found in " + path
        )
    filename = max(candidates, key=os.path.getmtime)
    print("Loading checkpoint:", filename)
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')
# pick a random seed
# randint's upper bound is exclusive, so passing len(dataX) makes every pattern
# (including the last one) eligible as a seed.
start = numpy.random.randint(0, len(dataX))
# Work on a copy: the loop appends to `pattern`, and appending to the list that
# lives inside dataX would leave that training pattern one element too long.
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(100):
    # Shape and scale the current window exactly like the training inputs.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window: append the prediction and drop the oldest character.
    pattern.append(index)
    pattern = pattern[1:]
print( "\nDone.")

Seed:
" ur
reason breaks down under them and acknowledges me to be right. now, mr.
jabez wilson here has bee "
n so be to the coor of the coor of the coor of the coor of the coor of the coor of the coor of the c
Done.
