In [50]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Load the raw corpus and lower-case it. The context manager guarantees the
# file handle is closed as soon as the text has been read, instead of leaving
# that to the garbage collector. The platform's default text encoding is used.
with open('data/input.txt') as corpus_file:
    inputs = corpus_file.read().lower()

# Split the corpus into entries wherever two consecutive newlines occur.
inputs_separated = inputs.split('\n\n')

In [9]:
def _clean_entry(text):
    """Return `text` with newlines turned into spaces and every "[wp] " removed."""
    return text.replace("\n", " ").replace("[wp] ", "")


# One cleaned, single-line string per corpus entry.
inputs_treated = list(map(_clean_entry, inputs_separated))

In [19]:
# Character vocabulary: every distinct character of the corpus (newlines
# counted as spaces) in sorted order, each mapped to its position in that order.
chars = sorted(set(inputs.replace('\n', ' ')))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}

In [28]:
# Encode each cleaned entry as the list of integer ids of its characters.
inputs_exploded = [
    list(map(char_to_int.__getitem__, entry))
    for entry in inputs_treated
]

In [27]:
# Report the size of the raw lower-cased corpus (in characters) and the
# number of distinct characters in the vocabulary.
n_chars, n_vocab = len(inputs), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  6378570
Total Vocab:  122


In [32]:
# Sliding-window dataset: every run of `training_length` consecutive character
# ids becomes one feature row, and the id that immediately follows it becomes
# that row's label.
training_length = 120

features = []
labels = []

for seq in inputs_exploded:
    # Sequences no longer than `training_length` produce no windows at all
    # (the range below is empty for them).
    for start in range(len(seq) - training_length):
        stop = start + training_length
        features.append(seq[start:stop])  # the window itself
        labels.append(seq[stop])          # the character right after it

# Stack the windows into a single array, one window per row.
features = np.array(features)

In [33]:
# Sanity check: one row per training window, `training_length` ids per row.
features.shape

(1399396, 120)

In [34]:
# One-hot encode the labels: row k gets a single 1 in the column labels[k].
# NOTE(review): the width is the vocabulary size plus one; the ids produced by
# enumerate() start at 0, so the last column looks unused — confirm before
# changing, since the model's output layer is sized from this array.
num_words = 1 + len(char_to_int)
label_array = np.zeros((len(features), num_words), dtype=np.int8)

# Set all the "hot" entries in one indexed assignment; the explicit integer
# dtype keeps the indexing valid even when there are no labels.
label_columns = np.asarray(labels, dtype=np.intp)
label_array[np.arange(len(label_columns)), label_columns] = 1

In [42]:
# Add a trailing feature axis of size 1 to the windows, then divide the
# character ids by the largest id in the vocabulary so values lie in [0, 1].
largest_char_id = len(char_to_int) - 1
X = features.reshape(len(features), training_length, 1) / float(largest_char_id)

In [46]:
# Hold out 10% of the examples. Splitting index positions (rather than the
# arrays themselves) lets the same partition be applied to both X and
# label_array. A fixed random_state makes the shuffle, and therefore the
# train/test partition, reproducible across kernel restarts.
train_set_idx, test_set_idx = train_test_split(
    np.arange(len(features)),
    test_size=0.1,
    random_state=42,
)

In [47]:
# Apply the index partition to the inputs and to the one-hot targets alike,
# so that row k of each X_* array lines up with row k of the matching y_*.
X_train, y_train = X[train_set_idx], label_array[train_set_idx]
X_test, y_test = X[test_set_idx], label_array[test_set_idx]

In [57]:
# Network: two stacked 128-unit LSTM layers (the first returns the full
# sequence so the second can consume it), a 64-unit ReLU layer, and a softmax
# over the label columns. Dropout of 0.2 follows every hidden layer.
model = Sequential([
    # Input is (timesteps, features-per-step), taken from the training array.
    LSTM(128, input_shape=X_train.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(y_train.shape[1], activation='softmax'),
])

# Targets are one-hot rows, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [58]:
# Checkpoint file name pattern; Keras fills in the epoch number and the
# training loss of that epoch.
checkpoint_filepath = "checkpoints/weights-lstm-{epoch:02d}-{loss:.4f}.hdf5"

# Create the target directory up front so that saving cannot fail on a
# missing directory after the first (long) epoch has already been trained.
os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)

# Write a checkpoint only when the training loss reaches a new minimum.
checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [None]:
# Train for 50 epochs in mini-batches of 32; the checkpoint callback is
# invoked by Keras during training.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=50,
    callbacks=callbacks_list,
)

Epoch 1/50
  15712/1259456 [..............................] - ETA: 6:44:20 - loss: 3.1393 - acc: 0.1419