In [57]:
#!wget http://www.lawrence.edu/fast/greggj/CMSC490/shakespeare.zip
#!unzip shakespeare.zip

**Getting the words from the file**

In [58]:
def getWord(file):
    char = file.read(1)
    if not char:
        return ''
    char = char.lower()

    while char < 'a' or char > 'z':
        char = file.read(1)        
        if not char: 
            return ''
        char = char.lower()

    str = ''
    while char >= 'a' and char <= 'z':
        str = str + char
        char = file.read(1)        
        if not char: 
            return str
        char = char.lower()
    
    if char == 'â€™':
        str = ''
        char = file.read(1)
        if not char: 
            return str
        char = char.lower()
        while char >= 'a' and char <= 'z':
            char = file.read(1)        
            if not char: 
                return str
            char = char.lower()
        return getWord(file)
    return str

**Getting substrings**

In [59]:
import numpy as np

def makeSubstrings(word):
    problem_instances = []
    target_values = []
    
    if len(word) >= 4 and len(word) <= 9:
        for i in range(2,len(word)):
            if i == len(word)-1:
                subword = word
                target = ' '
            else:
                subword = word[0:i]
                target = word[i]
            problem_instances.append(subword)
            target_values.append(target)
           
    return np.array(problem_instances), np.array(target_values)

<span style="color:red">In the code above the statement target = word[i+1] should be target = word[i]</span>

**One-hot encoding**

In [60]:
import numpy as np

def makeSequence(str):
  if len(str) < 10:
    num_padding = 10 - len(str)
    for _ in range(0,num_padding):
      str = str + " "
      
  onehotarr = []
  for ch in str:
    onehot = np.array([0] * 27)
    
    if ch == ' ':
      position = 26
    else:
      position = ord(ch) - ord('a')
     
    onehot[position] = 1  
    onehotarr.append(onehot)
  
  onehotarr = np.array(onehotarr)

  return onehotarr

**Reading the text file into the arrays**

In [61]:
import numpy as np
n = 100000

rawWords = 0
with open('shakespeare.txt', 'r') as file:
    for line in file:
        num = len(line.split())
        rawWords += num
file.close()

words = []
problem_instances = []
problem_instances = np.array(problem_instances)
target_values = []
target_values = np.array(target_values)
onehot_problems = []
onehot_targets = []

file = open('shakespeare.txt', 'r')
for _ in range(0, rawWords):
    word = getWord(file)
    words.append(word)
file.close()

j=0
for i in range(0,n):
    word = words[j]
    problem, target = makeSubstrings(word)
    problem_instances = np.concatenate((problem_instances, problem), axis=0)  
    target_values = np.concatenate((target_values, target), axis=0)  
    j += 1
    i += len(problem_instances)
    
for i in range(0,len(problem_instances)):
    oh_prob = makeSequence(problem_instances[i])
    oh_tar = makeSequence(target_values[i])
    onehot_problems.append(oh_prob)
    onehot_targets.append(oh_tar)


onehot_problems = np.array(onehot_problems)
onehot_targets = np.concatenate(onehot_targets)

<span style="color:red">The way you assembled your targets is incorrect. Note that you are using makeSequence() on your targets. That function returns a list of 10 vectors, not 1. This means that you are making ten times as many targets as you need. Further, since your targets consist of a single letter, only the first letter
in that sequence of 10 will be a non-space character, while the other 9 will all be spaces. This means that a little over 90% of your target values are the space
character. All you will end up doing is training a network to output ' '. If it does that, it will be right more than 90% of the time.</span>

**Training, Validation, and Test Sets**

In [62]:
train_ratio = 0.6
validation_ratio = 0.2
test_ratio = 0.2

num_examples = len(onehot_problems)

train_end = int(train_ratio * num_examples)
validation_end = train_end + int(validation_ratio * num_examples)

train_input = onehot_problems[:train_end]
train_target = onehot_targets[:train_end]

validation_input = onehot_problems[train_end:validation_end]
validation_target = onehot_targets[train_end:validation_end]

test_input = onehot_problems[validation_end:]
test_target = onehot_targets[validation_end:]

**The Model**

In [63]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

model = Sequential()
model.add(LSTM(16, input_shape=(10, 27), return_sequences=False))
model.add(Dense(27, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit( 
    train_input, train_target,
    validation_data=(validation_input, validation_target),
    epochs=10,  
    batch_size=32  
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3d70171ea0>

In [64]:
p = model.predict(np.array([makeSequence('hous')]))
print(chr(ord('a')))
print(p)
#it seems to always pick the space as the most probable outcome. I don't know why so I'm just taking comfort in the fact that at least the second best p is the correct guess

a
[[1.4934370e-03 2.0491339e-04 1.1166770e-03 2.3804887e-03 1.5634406e-02
  7.5066154e-04 2.4627792e-03 3.4647384e-03 2.2556472e-03 4.2907773e-06
  6.4736319e-04 3.0154707e-03 1.2844884e-03 4.1956315e-03 1.4564706e-03
  7.4128521e-04 5.0564493e-05 6.0058879e-03 6.5865442e-03 7.5003323e-03
  3.0027020e-03 6.3747924e-04 6.5228558e-04 1.0580634e-05 1.5125592e-03
  2.1872482e-05 9.3291050e-01]]


<span style="color:red">Mostly correct, with one very big mistake. See the comment above for details. 77/80</span>