In [1]:
# import dependencies
import numpy
import sys
import urllib
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [41]:
# Load the raw corpus text.
# A context manager guarantees the file handle is closed as soon as the text
# has been read, instead of leaving it open until garbage collection.
with open('frankstein.txt') as corpus_file:
    file = corpus_file.read()

In [62]:
# Tokenization and standardization
def tokenize_words(input):
    """Lower-case ``input``, split it into word tokens and drop English stopwords.

    Parameters
    ----------
    input : str
        Raw text to normalise.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # r'\w+' keeps runs of word characters, so punctuation and whitespace
    # never make it into a token.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input.lower())
    # Fetch the stopword list once and keep it in a set. Calling
    # stopwords.words('english') inside the per-token filter re-evaluated the
    # list for every token and then scanned it linearly.
    english_stopwords = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in english_stopwords)

# Normalise the whole corpus once; the following cells work on this cleaned string
processed_inputs = tokenize_words(file)

In [63]:
# Map every distinct character of the cleaned text to an integer id
# (ids follow the sorted order of the characters)
chars = sorted(set(processed_inputs))
char_to_num = {char: idx for idx, char in enumerate(chars)}


In [64]:
# Sanity check: report the size of the cleaned text and of the character vocabulary
vocab_len = len(chars)
input_len = len(processed_inputs)
print('Total number of characters', input_len)
print('Total vocab:', vocab_len)

Total number of characters 83157
Total vocab: 41


In [65]:
# Window length (in characters) of each training input, plus the
# containers that will hold the encoded inputs and their targets
seq_length = 100
x_data, y_data = [], []


In [66]:
# Build the training samples: each input is a window of `seq_length` encoded
# characters and its target is the single character that follows the window.
# Both lists are rebuilt from scratch here rather than appended to, so
# re-running this cell does not duplicate every pattern.
# Encode the text once instead of re-encoding each overlapping window.
encoded_inputs = [char_to_num[char] for char in processed_inputs]
x_data = [encoded_inputs[i:i + seq_length] for i in range(input_len - seq_length)]
# Each target stays wrapped in a one-element list, matching the original layout.
y_data = [[encoded_inputs[i + seq_length]] for i in range(input_len - seq_length)]

n_patterns = len(x_data)
print('Total Patterns : ',n_patterns)

Total Patterns :  83057


In [67]:
# Arrange the samples as (patterns, timesteps, 1) and divide the integer ids
# by the vocabulary size so every value lies in [0, 1)
X = numpy.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [68]:
# One-hot encode the targets.
# num_classes is pinned to the vocabulary size; otherwise the width of y is
# inferred from the largest id that happens to occur as a target, which can
# be smaller than the vocabulary.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [69]:
# Stacked LSTM network: three recurrent layers, each followed by 20% dropout,
# ending in a softmax layer with one unit per one-hot target class
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [70]:
# Configure training: Adam optimizer with categorical cross-entropy loss
model.compile(optimizer='adam', loss='categorical_crossentropy')


In [71]:
# Checkpoint callback: write the model to `filepath` whenever the training
# loss reaches a new minimum
filepath = 'model_weights_saved.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [72]:
# Train the network for 4 epochs; the checkpoint callback runs during training
model.fit(X, y, batch_size=256, epochs=4, callbacks=desired_callbacks)


Epoch 1/4

Epoch 00001: loss improved from inf to 2.96239, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.96239 to 2.92280, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.92280 to 2.88093, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.88093 to 2.70765, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7f5be5289e48>

In [73]:
# Restore the weights saved by the checkpoint callback, then compile the
# model again with the same loss and optimizer
filename = 'model_weights_saved.hdf5'
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [74]:
# Inverse lookup (integer id -> character) used to decode the model's output
num_to_char = dict(enumerate(chars))

In [75]:
# Pick a random training pattern to seed the text generator.
# numpy.random.randint excludes its upper bound, so len(x_data) (not
# len(x_data) - 1) is needed for the last pattern to be selectable.
start = numpy.random.randint(0, len(x_data))
# Work on a copy so that later changes to `pattern` cannot alter x_data.
pattern = list(x_data[start])
print('Random Seed :')
print("\"",''.join([num_to_char[value] for value in pattern]) , "\"")

Random Seed :
" es nature show works hiding places ascend heavens discovered blood circulates nature air breathe acq "


In [76]:
# Generate 1000 characters, one at a time, feeding each prediction back in
for i in range(1000):
    # Present the current window as (1, timesteps, 1), scaled by the
    # vocabulary size exactly like the training inputs.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x/float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Slide the window by building a new list. Appending in place would also
    # grow the x_data entry whenever `pattern` still refers to it.
    pattern = pattern[1:] + [index]

e sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere sere ser