## Import libraries

In [None]:
import numpy as np
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# from keras.utils import np_utils

In [None]:
# Load the raw corpus text.
# A context manager closes the file handle as soon as the text is read (the
# previous one-liner left it open until garbage collection), and the explicit
# encoding keeps decoding independent of the platform default.
with open("/content/frankenstein.txt", encoding="utf-8") as corpus_file:
  file = corpus_file.read()

In [None]:
# Tokenization and standardization
def tokenize_words(input):
  """Lower-case the text, tokenize it, drop English stop words and join.

  Args:
    input: Raw text to normalize. The name shadows the builtin but is kept
      so that existing keyword callers keep working.

  Returns:
    One string made of the surviving tokens concatenated with no separator.
  """
  lowered = input.lower()
  tokens = RegexpTokenizer(r'\w+').tokenize(lowered)
  # Build the stop-word set once. Previously stopwords.words('english') was
  # called inside the filter lambda, so the list was rebuilt and scanned
  # linearly for every single token.
  english_stopwords = frozenset(stopwords.words('english'))
  return "".join(token for token in tokens if token not in english_stopwords)

processed_inputs = tokenize_words(file)

In [None]:
# Assign every distinct character an integer id, following sorted order
chars = sorted(set(processed_inputs))
char_to_num = {char: index for index, char in enumerate(chars)}

In [None]:
# Sanity check: report the length of the processed text and the number of
# distinct characters found in it
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total numbers of characters:", input_len)
print("Total vocab:", vocab_len)

Total numbers of characters: 232972
Total vocab: 37


In [None]:
# Number of characters per input window, plus the containers that will hold
# the encoded input windows and their target characters
seq_length = 100
x_data, y_data = [], []

In [None]:
# Slide a window of seq_length characters over the text: each window is one
# input sequence and the character immediately after it is the target.
# Start from empty lists so that re-running this cell does not append a
# second copy of every pattern.
x_data = []
y_data = []
# Encode the whole text once instead of re-mapping the same characters for
# every overlapping window.
encoded_text = [char_to_num[char] for char in processed_inputs]
for i in range(input_len - seq_length):
  x_data.append(encoded_text[i:i + seq_length])
  y_data.append(encoded_text[i + seq_length])

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 232872


In [None]:
# Convert the input sequences to an array of shape
# (n_patterns, seq_length, 1), then divide by the vocabulary size so every
# character id falls in [0, 1).
# Uses the `np` alias: the bare name `numpy` is never imported, so the
# previous call raised NameError on a fresh kernel.
X = np.reshape(x_data, (n_patterns, seq_length, 1))
X = X / float(vocab_len)

In [None]:
# One-hot encode the targets.
# The width is the full vocabulary size rather than max(y_data) + 1, so the
# output layer always has one unit per entry of the character mapping even if
# the highest-numbered character never occurs as a target.
y = np.eye(vocab_len)[y_data]

In [None]:
# Build the network: three stacked LSTM layers, each followed by dropout,
# ending in a softmax layer with one unit per one-hot target column
model = Sequential([
  LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
  Dropout(0.2),
  LSTM(256, return_sequences=True),
  Dropout(0.2),
  LSTM(128),
  Dropout(0.2),
  Dense(y.shape[1], activation='softmax'),
])

In [None]:
# Configure training: categorical cross-entropy loss with the Adam optimizer
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
)

In [None]:
# Checkpoint callback: write to `filepath` whenever the training loss reaches
# a new minimum
filepath = 'model_weights_saved.hdf5'
checkpoint = ModelCheckpoint(
  filepath,
  monitor='loss',
  mode='min',
  save_best_only=True,
  verbose=1,
)
desired_callbacks = [checkpoint]

In [None]:
# Train for 4 epochs in batches of 256, checkpointing through the callbacks
model.fit(
  X,
  y,
  batch_size=256,
  epochs=4,
  callbacks=desired_callbacks,
)

Epoch 1/4
Epoch 1: loss improved from inf to 2.93031, saving model to model_weights_saved.hdf5
Epoch 2/4


  saving_api.save_model(


Epoch 2: loss improved from 2.93031 to 2.91173, saving model to model_weights_saved.hdf5
Epoch 3/4
Epoch 3: loss improved from 2.91173 to 2.89797, saving model to model_weights_saved.hdf5
Epoch 4/4
Epoch 4: loss improved from 2.89797 to 2.85912, saving model to model_weights_saved.hdf5


<keras.src.callbacks.History at 0x7fc97a84af50>

In [None]:
# Restore the checkpointed weights from disk, then compile the model again
# with the same loss and optimizer as before
filename = 'model_weights_saved.hdf5'
model.load_weights(filename)
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
)

In [None]:
# Reverse lookup: integer id -> character, used to decode model output
num_to_char = dict(enumerate(chars))

In [None]:

# Pick a random training window to seed text generation.
# randint's upper bound is exclusive, so len(x_data) makes every window
# eligible (the previous `len(x_data) - 1` silently excluded the last two).
start = np.random.randint(0, len(x_data))
# Copy the window: generation appends to `pattern`, and aliasing the list
# stored in x_data would corrupt the training data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", "".join(num_to_char[value] for value in pattern), "\"")

Random Send:
" eredseveralmisfortunessincedepartureclervalgenevaalreadyrecoveredspiritsreportedpointmarryinglivelyp "


In [None]:
# Generate 1000 characters: repeatedly predict the next character from the
# current window, emit it, and slide the window forward by one.
for i in range(1000):
  # `np`, not `numpy`: only the alias is imported.
  x = np.reshape(pattern, (1, len(pattern), 1))
  # Scale exactly as the training inputs were scaled; feeding raw ids gives
  # the network values far outside the range it was trained on.
  x = x / float(vocab_len)
  prediction = model.predict(x, verbose=0)
  # Greedy decoding: take the most probable character id as a plain int.
  index = int(np.argmax(prediction))
  result = num_to_char[index]
  sys.stdout.write(result)
  # Build a new list instead of appending in place, so the seed window stored
  # in x_data is never mutated.
  pattern = pattern[1:] + [index]

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee