In [2]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation
from keras.utils import to_categorical

Using TensorFlow backend.


In [0]:
# load doc into memory
def load_doc(filename, encoding=None):
    """Read a whole text file into memory and return its contents.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default, i.e. the behaviour of calling
            ``open(filename, 'r')``.

    Returns:
        The complete file contents as a single string.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

# save document as txt file
def save_doc(lines, filename, encoding=None):
    """Write an iterable of strings to a text file, one entry per line.

    The entries are joined with a single newline; no trailing newline is
    appended after the last entry.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default, i.e. the behaviour of calling
            ``open(filename, 'w')``.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even when write() raises.
    with open(filename, 'w', encoding=encoding) as file:
        file.write(data)

In [4]:
# Read the corpus and split it on any whitespace, then re-join the tokens with
# single spaces so every run of whitespace (including newlines) becomes one space.
tokens = load_doc('shakespeare.txt').split()
raw_text = ' '.join(tokens)
# NOTE(review): this prints the entire normalised corpus; a slice such as
# raw_text[:200] would keep the cell output readable.
print(raw_text)



In [5]:
# organize into sequences of characters
# Each sample is a window of `length` input characters followed by the one
# character to predict, i.e. length + 1 characters in total.
length = 10
char_sequences = [
    raw_text[end - length:end + 1]
    for end in range(length, len(raw_text))
]
print('Total Sequences: %d' % len(char_sequences))

Total Sequences: 92450


In [0]:
# Persist the character windows to disk, one window per line
save_doc(lines=char_sequences, filename='sequences.txt')

In [7]:
# Read the saved windows back and split them into one string per line
raw_sequences = load_doc(filename='sequences.txt')
lines = raw_sequences.split(sep='\n')
# NOTE(review): this prints every sequence; lines[:5] would keep the output short.
print(lines)



In [0]:
# map letters to numbers
# Distinct characters are sorted so that the index given to each character is
# deterministic for a given corpus.
chars = sorted(set(raw_text))
mapping = {ch: idx for idx, ch in enumerate(chars)}

In [9]:
# Encode every sequence as the list of integer ids of its characters
sequences = [[mapping[ch] for ch in text] for text in lines]

# The vocabulary is the set of distinct characters seen in the corpus
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 70


In [0]:
# Turn the encoded rows into a 2-D integer array, then separate inputs from targets:
# every column but the last is model input, the last column is the id to predict.
sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]

In [0]:
# One-hot encode targets and inputs with keras.utils.to_categorical
# NOTE(review): X and y are overwritten in place, so running this cell a second
# time would one-hot encode data that is already one-hot encoded.
y = to_categorical(y, num_classes=vocab_size)
sequences = [to_categorical(row, num_classes=vocab_size) for row in X]
X = np.array(sequences)

In [12]:
# Show the input and target tensor shapes, one per line
print(X.shape, y.shape, sep='\n')

(92450, 10, 70)
(92450, 70)


In [16]:
# model with 150 units

# A single LSTM layer reads one-hot windows shaped (X.shape[1], X.shape[2]);
# a softmax Dense layer then scores every entry of the vocabulary.
model = Sequential()
model.add(LSTM(150, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# The trailing ';' suppresses the notebook repr of fit()'s return value.
model.fit(X, y, epochs=30, batch_size=20, verbose=2, shuffle=False);

# 'accuracy' is the only compiled metric, so scores is [loss, accuracy].
# The evaluation uses the same X, y the model was fitted on, so the number is a
# training accuracy rather than a held-out estimate.
scores = model.evaluate(X, y, verbose=1, batch_size=20)
print('Training accuracy: {}'.format(scores[1]))

Epoch 1/30
 - 121s - loss: 2.2881 - acc: 0.3541
Epoch 2/30
 - 120s - loss: 1.9068 - acc: 0.4400
Epoch 3/30
 - 121s - loss: 1.7623 - acc: 0.4738
Epoch 4/30
 - 119s - loss: 1.6671 - acc: 0.4976
Epoch 5/30
 - 118s - loss: 1.5939 - acc: 0.5180
Epoch 6/30
 - 120s - loss: 1.5318 - acc: 0.5340
Epoch 7/30
 - 118s - loss: 1.4752 - acc: 0.5484
Epoch 8/30
 - 117s - loss: 1.4224 - acc: 0.5632
Epoch 9/30
 - 119s - loss: 1.3711 - acc: 0.5778
Epoch 10/30
 - 119s - loss: 1.3212 - acc: 0.5917
Epoch 11/30
 - 118s - loss: 1.2727 - acc: 0.6056
Epoch 12/30
 - 118s - loss: 1.2271 - acc: 0.6203
Epoch 13/30
 - 118s - loss: 1.1830 - acc: 0.6331
Epoch 14/30
 - 118s - loss: 1.1413 - acc: 0.6471
Epoch 15/30
 - 118s - loss: 1.1024 - acc: 0.6581
Epoch 16/30
 - 118s - loss: 1.0660 - acc: 0.6690
Epoch 17/30
 - 120s - loss: 1.0325 - acc: 0.6794
Epoch 18/30
 - 118s - loss: 1.0009 - acc: 0.6878
Epoch 19/30
 - 117s - loss: 0.9725 - acc: 0.6966
Epoch 20/30
 - 117s - loss: 0.9475 - acc: 0.7035
Epoch 21/30
 - 117s - loss: 0