Text Generation

Associated Medium Post: https://medium.com/@kevin_guo/text-generation-7bfbb76ec854
Try it: http://kevins.fun/projects/text-gen/

In [None]:
import numpy as np
import string

from keras.models import Sequential
from keras.layers import LSTM, Dropout, Activation, Dense
from keras.callbacks import ModelCheckpoint

# Hyper-parameters shared by the batch generator and the model definition.
batch_size = 512
sequence_length = 60

# Vocabulary: every character in string.printable, in that order.
characters = list(string.printable)

# Lookup from a character to its position in `characters` (its one-hot index).
char_dict = dict(zip(characters, range(len(characters))))
print("Number of Characters = ", len(characters))

def batch_generator(text, count):
    """Yield one-hot encoded (X, y) next-character batches from `text`, forever.

    Sample `k` of batch `b` starts at offset `b * batch_size + k`. Its input is
    the `sequence_length` characters beginning at that offset, and its target
    is the single character that immediately follows them.

    Args:
        text: String whose characters are all keys of `char_dict`.
        count: Number of batches in one pass over `text`. After `count`
            batches the generator starts again from the beginning. `text`
            must contain at least `count * batch_size + sequence_length`
            characters.

    Yields:
        Tuple `(X, y)`. `X` has shape
        `(batch_size, sequence_length, len(characters))` and `y` has shape
        `(batch_size, len(characters))`. Every `X[i, t]` row and every `y[i]`
        row is one-hot.

    Raises:
        ValueError: If `count` is less than 1 (raised when the generator is
            first advanced). Without this check the loop below would spin
            forever without ever yielding.
        KeyError: If `text` contains a character missing from `char_dict`.
        IndexError: If `text` is too short for the requested `count`.
    """
    if count < 1:
        raise ValueError("count must be at least 1, got {}".format(count))

    vocab_size = len(characters)

    while True:
        for batch_ix in range(count):
            X = np.zeros((batch_size, sequence_length, vocab_size))
            y = np.zeros((batch_size, vocab_size))

            batch_offset = batch_size * batch_ix

            for sample_ix in range(batch_size):
                sample_start = batch_offset + sample_ix
                for s in range(sequence_length):
                    X[sample_ix, s, char_dict[text[sample_start + s]]] = 1
                # Exactly one target per sample: the character right after the
                # input window. Setting it per timestep would mark several
                # classes and break the one-hot target the softmax expects.
                y[sample_ix, char_dict[text[sample_start + sequence_length]]] = 1
            yield X, y


In [None]:
train_file = "chemprot_training_abstracts.tsv"
val_file = "chemprot_test_abstracts.tsv"


def _read_allowed_text(path, allowed):
    """Return the contents of `path` with every character not in `allowed` removed.

    Args:
        path: Path of the text file to read.
        allowed: Iterable of the single characters to keep.

    Returns:
        The file's text, in original order, restricted to `allowed` characters.
    """
    allowed = set(allowed)  # constant-time membership instead of a list scan
    with open(path, "r") as f:
        raw = f.read()
    # One join is linear in the file size, unlike repeated `s += ch`.
    return "".join(ch for ch in raw if ch in allowed)


# Both splits go through the same helper, so each one is filtered over its own
# full length; only characters present in `characters` are kept.
text_train = _read_allowed_text(train_file, characters)
text_val = _read_allowed_text(val_file, characters)

print("Total of", len(text_train) + len(text_val), "characters")

# Every sample consumes `sequence_length` input characters plus one target
# character, so only full batches that fit inside the text are counted.
train_batch_count = (len(text_train) - sequence_length) // batch_size
val_batch_count = (len(text_val) - sequence_length) // batch_size
print("Training batch count: ", train_batch_count)
print("Validation batch count: ",  val_batch_count)

In [None]:
# Character-level language model: four stacked 512-unit LSTM layers, each
# followed by 20% dropout, ending in a softmax over the vocabulary.
model = Sequential()

# Only the first layer declares the input shape (timesteps, vocabulary size);
# subsequent layers take their input from the previous layer's output.
model.add(LSTM(512, return_sequences=True, input_shape=(sequence_length, len(characters))))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.2))
# The last recurrent layer returns only its final output, so the dense head
# produces one prediction per input sequence rather than one per timestep.
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))

# One unit per character; softmax turns the scores into a distribution.
model.add(Dense(len(characters)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer="adam")

In [None]:
# Train for 25 epochs, running one full pass of each generator per epoch and
# writing a weights-only checkpoint whose file name embeds the epoch number and
# validation loss.
# NOTE(review): `fit_generator` is deprecated in newer Keras releases in favour
# of `fit` -- confirm against the installed version before upgrading.
model.fit_generator(
    batch_generator(text_train, count=train_batch_count),
    steps_per_epoch=train_batch_count,
    epochs=25,
    callbacks=[
        ModelCheckpoint(
            "./epoch{epoch:02d}-val-loss{val_loss:.4f}_weights",
            save_weights_only=True,
        ),
    ],
    validation_data=batch_generator(text_val, count=val_batch_count),
    validation_steps=val_batch_count,
)