In [17]:
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense , Dropout , LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# load data
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read instead of leaving it open.
with open("data.txt") as corpus_file:
    file = corpus_file.read()

In [20]:
# tokenization - process of breaking a stream of text up into words, phrases, symbols or other meaningful elements
# standardization
def tokenize_words(input):
    """Lower-case the text, drop English stopwords and return the remaining tokens joined together.

    Args:
        input: Raw text to clean (a string).

    Returns:
        A single string made of the surviving ``\\w+`` tokens concatenated
        with no separator between them.
    """
    input = input.lower()
    # instantiating the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # tokenizing the text into tokens
    tokens = tokenizer.tokenize(input)
    # Build the stopword collection once, as a set, instead of re-evaluating
    # stopwords.words('english') and scanning a list for every single token.
    stop_words = set(stopwords.words('english'))
    filtered = (token for token in tokens if token not in stop_words)
    # NOTE(review): joining with "" removes every word boundary, so the
    # downstream model never sees spaces - confirm this is intended.
    return "".join(filtered)


# preprocessing the input data
processed_inputs = tokenize_words(file)

In [21]:
# Vocabulary: every distinct character of the cleaned text, in sorted order,
# mapped to its position in that ordering.
chars = sorted(set(processed_inputs))
char_to_num = {symbol: position for position, symbol in enumerate(chars)}

In [22]:

# Sanity check on the encoding step: report the length of the cleaned text
# and how many distinct characters it contains.
vocab_len = len(chars)
input_len = len(processed_inputs)
print(" Total number of characters:", input_len)
print("Toatl vocab:", vocab_len)

 Total number of characters: 23999
Toatl vocab: 35


In [23]:
# Sliding-window setup: x_data collects the encoded input windows and
# y_data the encoded character that follows each window.
x_data, y_data = [], []
# number of characters per input window
seq_length = 100

In [24]:
# Build the training windows: each sample is seq_length consecutive encoded
# characters, and its target is the encoded character that comes right after.
# Both lists are rebuilt from scratch here, so re-running this cell cannot
# append a second copy of every pattern to lists created in another cell.
encoded_inputs = [char_to_num[char] for char in processed_inputs]  # encode the corpus once
window_starts = range(0, input_len - seq_length)
x_data = [encoded_inputs[i:i + seq_length] for i in window_starts]
y_data = [encoded_inputs[i + seq_length] for i in window_starts]

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 23899


In [25]:
# Convert the encoded windows into a (n_patterns, seq_length, 1) array and
# divide by the vocabulary size so every value lies in [0, 1).
X = numpy.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [27]:

# one hot encoding
# Pin the number of classes to the vocabulary size. Without num_classes the
# width is inferred from the largest value in y_data, which would come out
# too small if the highest-indexed character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [28]:
# Network definition: three stacked LSTM layers (256, 256, 128 units), each
# followed by 20% dropout, ending in a softmax layer with one output per
# column of the one-hot target matrix.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [29]:
# Training configuration: categorical cross-entropy against the one-hot
# targets, optimised with Adam.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [30]:
# Checkpointing: watch the training loss and write to `filepath` only when
# the loss reaches a new minimum, logging each save.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [31]:
# Train for 4 epochs in batches of 256 samples; the checkpoint callback
# saves the model whenever the training loss improves.
model.fit(
    X,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.98430, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.98430 to 2.92501, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.92501 to 2.91787, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.91787 to 2.91323, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x1ff163e2640>

In [32]:
# Restore the checkpointed weights from disk, then compile the model again
# with the same loss and optimizer used for training.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [33]:

# Reverse lookup (index -> character) used to turn model predictions back
# into text.
num_to_char = dict(enumerate(chars))

In [34]:
# random seed to help generate
# numpy.random.randint excludes its upper bound, so passing len(x_data)
# makes every window - including the last one - eligible as the seed.
start = numpy.random.randint(0, len(x_data))
# Work on a copy: generation extends `pattern`, and aliasing x_data[start]
# would append predicted indices to the training data itself.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" apparitionsoonexplainedpermissionmotherprevailedrusticguardiansyieldchargefondsweetorphanpresencesee "


In [36]:

# Generate 1000 characters: at every step feed the current window to the
# model, emit the most probable next character, then slide the window
# forward by one position.
for _ in range(1000):
    # shape the window as a single-sample batch and scale it like the training data
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Build a new list rather than appending in place: `pattern` may still be
    # the very list object stored in x_data, and an in-place append would
    # lengthen that training row.
    pattern = pattern[1:] + [index]

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee