In [38]:
# Importing Dependencies
import numpy as np
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,LSTM,Flatten
from keras.utils import np_utils
from tensorflow.keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Welcome\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read, and the explicit encoding makes
# decoding independent of the platform's locale default.
with open("frankenstein.txt", encoding="utf-8") as corpus_file:
    file = corpus_file.read()

In [40]:
def tokenize_words(input):
    """Lowercase text, split it into word tokens and drop English stop words.

    Args:
        input: Raw text to normalise (str).

    Returns:
        A single string containing the remaining tokens, joined by one space.
    """
    # lowercase everything to standardize it
    lowered = input.lower()

    # \w+ keeps runs of word characters, so punctuation and whitespace are dropped
    tokens = RegexpTokenizer(r'\w+').tokenize(lowered)

    # Build the stop-word set once. Calling stopwords.words() per token builds
    # a fresh list on every call and then scans it linearly; a set gives a
    # constant-time membership test.
    stop_words = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in stop_words)


# preprocess the input data, make tokens
processed_inputs = tokenize_words(file)

In [41]:
# Build the character vocabulary: every distinct character of the processed
# text, in sorted order, mapped to its position as an integer id.
chars = sorted(set(processed_inputs))
char_to_num = {symbol: idx for idx, symbol in enumerate(chars)}

In [42]:
# Sanity check: report the length of the processed text and the number of
# distinct characters found in it.
vocab_len = len(chars)
input_len = len(processed_inputs)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 269566
Total vocab: 38


In [43]:
# Length of each input window, plus the (initially empty) containers for the
# encoded input sequences and their target values.
seq_length = 100
x_data, y_data = [], []

In [44]:
# Slide a window of seq_length characters over the text one step at a time;
# the character immediately after each window is its prediction target.
# The lists are rebuilt from scratch here so that re-running this cell cannot
# append a second copy of every pattern to x_data / y_data.
x_data = []
y_data = []
for i in range(input_len - seq_length):
    in_seq = processed_inputs[i:i + seq_length]
    out_seq = processed_inputs[i + seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 269466


In [45]:
# Arrange the encoded windows as an array of shape (n_patterns, seq_length, 1)
# and scale the integer ids by the vocabulary size.
X = np.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [46]:
# One-hot encode the targets. num_classes is pinned to the vocabulary size
# because to_categorical otherwise infers the width as max(y_data) + 1, which
# would come out too narrow if the highest-numbered character never occurs as
# a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [47]:
# Define the network: three stacked LSTM layers, each followed by dropout,
# ending in a softmax layer with one output per column of y.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [48]:
# Configure training: Adam optimizer with categorical cross-entropy loss
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [49]:
# Checkpoint callback: save to `filepath` whenever the monitored training
# loss reaches a new minimum.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [50]:
# Train the model. The callback list prepared with the checkpoint is reused
# rather than building a second, identical list, and the returned History
# object is kept so the per-epoch losses remain available afterwards (this
# also stops the cell from echoing the object's repr).
history = model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks)

Epoch 1/4
Epoch 00001: loss improved from inf to 2.87244, saving model to model_weights_saved.hdf5
Epoch 2/4
Epoch 00002: loss improved from 2.87244 to 2.57808, saving model to model_weights_saved.hdf5
Epoch 3/4
Epoch 00003: loss improved from 2.57808 to 2.41125, saving model to model_weights_saved.hdf5
Epoch 4/4
Epoch 00004: loss improved from 2.41125 to 2.29203, saving model to model_weights_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x1a30596bf08>

In [51]:
# Restore the checkpointed weights from disk, then compile the model again
filename = 'model_weights_saved.hdf5'
model.load_weights(filename)
model.compile(optimizer='Adam', loss='categorical_crossentropy')

In [52]:
# Inverse lookup (integer id -> character) for decoding model output
num_to_char = dict(enumerate(chars))

In [53]:
# Pick a random training window to seed the generator.
# randint's upper bound is exclusive, so passing len(x_data) lets every window
# be chosen; len(x_data) - 1 would make the last one unreachable.
start = np.random.randint(0, len(x_data))
# Work on a copy so that later in-place changes to `pattern` cannot alter the
# sample stored in x_data.
pattern = list(x_data[start])
print('Random Seed : ')
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed : 
" death knell appeared like dream yet distinct oppressive reality sun far descended still sat shore sa "


In [54]:
# Generate the text: repeatedly predict the next character from the current
# window, emit it, and slide the window forward by one character.
for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable character
    index = int(np.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new list instead of appending in place: on the first pass
    # `pattern` may still be the very list stored in x_data, and append()
    # would lengthen that training sample.
    pattern = pattern[1:] + [index]

tsed serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter serter s





