In [2]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

## Load the training text (UTF-8 encoded, taken from a book)

In [3]:
# Read the entire training corpus into memory as a single lower-cased string.
filename = "/Users/pranavbarot/Desktop/DataSci/wonderland.txt"
# Use a context manager so the file handle is closed as soon as the read
# finishes, and state the encoding explicitly (the text is UTF-8) instead of
# depending on the platform's default encoding.
with open(filename, encoding="utf-8") as text_file:
    raw_text = text_file.read()
# Fold case so upper- and lower-case letters are treated as the same character.
raw_text = raw_text.lower()

## Map each character to a number

In [7]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(raw_text))
# Each character's integer id is its position in the sorted vocabulary.
char_to_int = {symbol: index for index, symbol in enumerate(chars)}

In [10]:
# Corpus length (in characters) and number of distinct characters, printed as
# a quick sanity check on the loaded text.
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  144413
Total Vocab:  48


In [11]:
# Length (in characters) of each input window fed to the network.
seq_length = 100

# Translate the whole corpus to integer ids once; every training window is
# then just a slice of this list.
encoded_text = [char_to_int[symbol] for symbol in raw_text]

# Slide a window of seq_length characters over the text one step at a time:
# the window is the input pattern, the character right after it is the target.
window_starts = range(n_chars - seq_length)
dataX = [encoded_text[start:start + seq_length] for start in window_starts]
dataY = [encoded_text[start + seq_length] for start in window_starts]

n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)


Total Patterns:  144313


## Reshape and normalize the input sequences; one-hot encode the target characters

In [12]:
# Arrange the integer-encoded windows as a 3-D array of shape
# (n_patterns, seq_length, 1): one feature per time step.
X = np.reshape(dataX, (n_patterns, seq_length, 1))

# Scale the integer ids by the vocabulary size so every input lies in [0, 1).
X = X/float(n_vocab)

# One-hot encode the target characters. The number of classes is pinned to the
# vocabulary size: left to be inferred, it would be max(dataY) + 1, which is
# smaller than n_vocab whenever the highest-numbered character never occurs as
# a target (e.g. it only appears within the first seq_length characters), and
# the network's output layer would then silently lose that class.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

## Keras LSTM with dropout = 0.2 and cross-entropy loss; the 'adam' optimizer is used for speed

In [13]:
# Network: one 256-unit LSTM over each (time steps, features) window, 20%
# dropout, then a softmax layer with one output per target class.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:]),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [14]:

# Checkpoint callback: watch the training loss and, whenever an epoch reaches
# a new minimum, save the model to a file named after the epoch and the loss.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]


## Not going to fit it on my own computer: training takes too long and is too expensive (the run below was interrupted)

In [15]:
# Train for 20 epochs in mini-batches of 128 patterns; the checkpoint callback
# runs at the end of every epoch.
model.fit(
    X,
    y,
    batch_size=128,
    epochs=20,
    callbacks=callbacks_list,
)

Epoch 1/20
Epoch 00001: loss improved from inf to 2.99722, saving model to weights-improvement-01-2.9972.hdf5
Epoch 2/20
 20608/144313 [===>..........................] - ETA: 9:05 - loss: 2.8362

KeyboardInterrupt: 

## Once the model is trained, the character mapping from above can be used to generate a sequence of text

## The generated characters would be based on the predicted probabilities for each of the 48 character classes

## If the model was good, an intelligent sentence would be generated, similar to the writing style of the input text
