In [42]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

### a) b) Using all 4 books

### c)  i) Files are concatenated in 'Part1.txt'

In [43]:
file = open('Part1.txt',encoding='utf-8').read()
file = file.lower()

In [44]:
chars = sorted(list(set(file)))
c2i = dict((c, i) for i, c in enumerate(chars))

### c) ii) Using ASCII encoding

In [45]:
no_of_chars = len(file)
no_of_vocab = len(chars)
print ("Total Characters: ", no_of_chars)
print ("Total Vocab: ", no_of_vocab)

Total Characters:  1483924
Total Vocab:  98


### c) iii) Using window size 99
### c) iv) Inputs to the network will be the first W − 1 charachters in each sequence
### c) v) Output encoded using one-hot encoding scheme

In [51]:
win_size = 99
dataX = []
dataY = []
for i in range(0, n_chars - win_size, 1):
    try:
        seq_in = file[i:i + win_size]
        seq_out = file[i + win_size]
    except:
        continue
    dataX.append([c2i[char] for char in seq_in])
    dataY.append(c2i[seq_out])
no_of_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)
X = numpy.reshape(dataX, (no_of_patterns, win_size, 1))
X = X / float(n_vocab)
y = np_utils.to_categorical(dataY)

Total Patterns:  1595678


### c) vi) Using a single hidden layer (N = 256)
### c) vii) Using a softmax output layer
### c) viii) No test data set being used

In [52]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

### c) ix) As per available computational power, 15 epochs chosen
### c) x) Using model checkpoint to save weights

In [53]:
weights_file="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(weights_file, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [54]:
model.fit(X, y, epochs=15, batch_size=128, callbacks=callbacks_list)

Epoch 1/15

Epoch 00001: loss improved from inf to 2.74727, saving model to weights-improvement-01-2.7473.hdf5
Epoch 2/15

Epoch 00002: loss improved from 2.74727 to 2.46819, saving model to weights-improvement-02-2.4682.hdf5
Epoch 3/15

Epoch 00003: loss improved from 2.46819 to 2.28291, saving model to weights-improvement-03-2.2829.hdf5
Epoch 4/15

Epoch 00004: loss improved from 2.28291 to 2.16335, saving model to weights-improvement-04-2.1634.hdf5
Epoch 5/15

Epoch 00005: loss improved from 2.16335 to 2.08204, saving model to weights-improvement-05-2.0820.hdf5
Epoch 6/15

Epoch 00006: loss improved from 2.08204 to 2.02202, saving model to weights-improvement-06-2.0220.hdf5
Epoch 7/15

Epoch 00007: loss improved from 2.02202 to 1.97740, saving model to weights-improvement-07-1.9774.hdf5
Epoch 8/15

Epoch 00008: loss improved from 1.97740 to 1.94081, saving model to weights-improvement-08-1.9408.hdf5
Epoch 9/15

Epoch 00009: loss improved from 1.94081 to 1.91125, saving model to weig

<keras.callbacks.History at 0x12a67d5c0>

### c) xi) Generating text using LSTM

In [60]:
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [107]:
seed_sentence = [c2i[char] for char in "there are those who take mental phenomena naively, just as they would physical phenomena. this school of psychologists tends not to emphasize the object."]
print(len(seed_sentence))

153


In [84]:
import sys

### Ouput

In [110]:
pattern = seed_sentence[0:99]
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(no_of_vocab)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

nelic and the poents of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of the seisn of t

### Inference:
- We can see how the LSTM is repeating the words, this can mean:
 - The number of books were less
 - The number were epcohs were less
 - On using more of both of the above, we can definitely get better results