## LSTM Generator Model

*Based on http://nadbordrozd.github.io/blog/2016/09/17/text-generation-with-keras-char-rnns/*

In [23]:
import os, sys, codecs
import numpy as np

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.callbacks import ModelCheckpoint

In [2]:
# Vocabulary of the character-level model. Every character in `chars` gets
# its own one-hot slot, in the order listed here.
# Capital letters are not stored directly: an uppercase letter is written as
# the marker 'U' followed by its lowercase form
# - a trick proposed by Zygmunt Zajac http://fastml.com/one-weird-trick-for-training-char-rnns/
chars = (
    u'\n !"#$%&\'()*+,-./0123456789:;<=>?@[\\]abcdefghijklmnopqrstuvwxyzU'
    u'\xa0\xbb\xbf\xc1\xc9\xcd\xd1\xd3\xda\xe0\xe1\xe9\xed\xf1\xf3\xfa\xfc\u2015'
)
# Fast membership test used when filtering input text.
charset = set(chars)
# Two-way lookup between a character and its one-hot position.
char_indices = {symbol: position for position, symbol in enumerate(chars)}
indices_char = dict(enumerate(chars))


def fix_char(c):
    """Translate one input character into the model's restricted alphabet.

    The result is a string that may be empty or longer than one character:

    * an uppercase letter becomes the marker 'U' followed by its lowercase
      form, provided that lowercase form is part of ``charset``;
    * a character already in ``charset`` is returned unchanged;
    * a few quote-like characters become '"';
    * several dash-like characters become '-';
    * the one-character ellipsis becomes '...';
    * everything else is dropped (empty string).

    Every character of the returned string is a member of ``charset``, so the
    encoded text can always be looked up in ``char_indices``.
    """
    if c.isupper():
        lowered = c.lower()
        # Only emit the marker when the lowercase letter is itself encodable.
        # Otherwise (e.g. an uppercase letter from an unsupported alphabet)
        # the output would contain a character missing from char_indices and
        # the one-hot encoding would later fail with a KeyError.
        if lowered in charset:
            return 'U' + lowered
        return ''
    if c in charset:
        return c
    if c in [u'\xa1', u'\xb0', u'\u201c', u'\u201d']:
        return '"'
    if c in [u'\u2013', u'\u2014', u'\u2212', u'\u2500']:
        return '-'
    if c == u'\u2026':
        return '...'
    return ''


def encode(text):
    """Return `text` rewritten character by character through fix_char().

    The pieces returned by fix_char() are concatenated in input order.
    """
    pieces = [fix_char(symbol) for symbol in text]
    return ''.join(pieces)


def decode(chars):
    """Lazily undo the 'U' uppercase marker in a stream of characters.

    Each 'U' is swallowed and causes the next non-'U' character to be yielded
    in upper case; all other characters are yielded as they come. Accepts any
    iterable of characters (a string or a generator).
    """
    capitalize_next = False
    for symbol in chars:
        if symbol == 'U':
            # Marker only: remember it, emit nothing.
            capitalize_next = True
            continue
        if capitalize_next:
            capitalize_next = False
            symbol = symbol.upper()
        yield symbol

In [30]:
def generate_text_slices(path, seqlen=40, step=3):
    """Endlessly yield (window, next_char) training pairs cut from a text file.

    The file at `path` is read as UTF-8, double quotes are removed and the
    text is passed through encode().

    The first item yielded is a header: ``(len(encoded_text),
    encoded_text[:seqlen])``. After that the generator cycles forever over
    the text, yielding ``(sentence, next_char)`` where `sentence` is a
    `seqlen`-character window starting every `step` characters and
    `next_char` is the character right after it.

    Raises:
        ValueError: when asked for the first pair although no window fits,
            i.e. the encoded text is not longer than `seqlen` (or `step` is
            negative). Without this check the loop below would spin forever
            without yielding anything.
    """
    with codecs.open(path, "r", "utf-8") as f:
        text = f.read().replace('"', '')

    # limit the charset, encode uppercase etc
    text = encode(text)
    yield len(text), text[:seqlen]

    # The start offsets are the same on every pass, so compute them once.
    starts = range(0, len(text) - seqlen, step)
    if not starts:
        raise ValueError(
            'encoded text from %r has %d chars, which is too short to cut '
            '%d-char windows with step %d' % (path, len(text), seqlen, step))

    while True:
        for i in starts:
            yield text[i: i + seqlen], text[i + seqlen]


def generate_arrays_from_file(path, seqlen=40, step=3, batch_size=10):
    """Endlessly yield one-hot encoded (X, y) batches built from a text file.

    The first item yielded is a header ``(samples, seed)``: `samples` is the
    number of windows generate_text_slices() produces in one pass over the
    text, and `seed` is the first `seqlen` encoded characters.

    Every following item is a pair of boolean arrays:
      X of shape (batch_size, seqlen, len(chars)) - the one-hot windows,
      y of shape (batch_size, len(chars))         - the one-hot next character.
    """
    slices = generate_text_slices(path, seqlen, step)
    # next(gen) instead of gen.next(): same behaviour, works on Python 2 and 3.
    text_len, seed = next(slices)
    # Ceiling of (text_len - seqlen) / step; floor division keeps it an int.
    samples = (text_len - seqlen + step - 1) // step
    yield samples, seed

    vocab_size = len(chars)
    while True:
        # The builtin `bool` is what the old `np.bool` alias pointed to; the
        # alias itself no longer exists in recent NumPy releases.
        X = np.zeros((batch_size, seqlen, vocab_size), dtype=bool)
        y = np.zeros((batch_size, vocab_size), dtype=bool)
        for i in range(batch_size):
            sentence, next_char = next(slices)
            for t, char in enumerate(sentence):
                X[i, t, char_indices[char]] = 1
            y[i, char_indices[next_char]] = 1
        yield X, y


In [4]:
def sample(a, temperature=1.0):
    """Draw one index from the probability array `a`, reshaped by `temperature`.

    The probabilities are raised to the power ``1 / temperature`` and
    renormalised: values below 1 sharpen the distribution, values above 1
    flatten it. One index is then drawn with np.random.multinomial.

    The computation is done in float64 with the largest log-probability
    subtracted before exponentiating. That keeps the result finite for small
    temperatures (where a plain exp() underflows every entry to 0 and the
    normalisation becomes 0/0) and makes the probabilities sum to 1 closely
    enough for np.random.multinomial to accept them.
    """
    probs = np.asarray(a, dtype=np.float64)
    # log(0) is -inf, which is exactly what a zero probability should map to.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature
    # Shift so the largest term is exp(0) == 1; this does not change the
    # normalised result but rules out total underflow.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    return np.argmax(np.random.multinomial(1, probs, 1))


def generate(model, seed, diversity):
    """Endless generator of characters sampled from `model`.

    The window length is read from the second entry of model.input_shape.
    The last that-many characters of `seed` form the initial context. On each
    step the context is one-hot encoded, the model's prediction is sampled
    with temperature `diversity`, the chosen character is yielded, and the
    context slides forward by one character.
    """
    _, window, _ = model.input_shape
    assert len(seed) >= window
    context = seed[len(seed) - window:]
    vocab_size = len(chars)
    while True:
        one_hot = np.zeros((1, window, vocab_size))
        for position, symbol in enumerate(context):
            one_hot[0, position, char_indices[symbol]] = 1.

        probabilities = model.predict(one_hot, verbose=0)[0]
        picked = indices_char[sample(probabilities, diversity)]
        yield picked
        # Drop the oldest character and append the one just produced.
        context = context[1:] + picked


In [32]:
def generate_and_print(model, seed, diversity, n):
    """Print the decoded seed, then stream `n` generated characters to stdout.

    The characters come from generate() passed through decode(), so the 'U'
    markers are already turned back into capitals. Each character is written
    and flushed as soon as it is produced. Returns the generated text
    (without the seed) as one string.
    """
    out = sys.stdout
    out.write(' with seed: ')
    readable_seed = ''.join(decode(seed))
    out.write(readable_seed)
    out.write(':\n')

    char_stream = decode(generate(model, seed, diversity))
    # Repeat the seed so the generated text reads as its continuation.
    out.write(readable_seed)

    produced = []
    for _ in range(n):
        piece = next(char_stream)
        out.write(piece.encode("utf-8"))
        out.flush()
        produced.append(piece)

    return ''.join(produced)


In [38]:
# S3 credentials handed to save_model() by train_lstm().
# They are read from the standard AWS environment variables so that no secret
# has to be typed into (and saved with) the notebook. Each entry falls back
# to an empty string when its variable is not set.
aws_access = {
    "Key": os.environ.get("AWS_ACCESS_KEY_ID", ""),
    "Secret": os.environ.get("AWS_SECRET_ACCESS_KEY", ""),
}

In [40]:
import tinys3
# save the model definition and weights to files and optionally upload them to S3
def save_model(model, local_path, bucket, access_data, upload_to_s3=False):
    """Write the model's weights and YAML definition to disk, optionally to S3.

    Args:
        model: object providing save_weights(path) and to_yaml().
        local_path: prefix prepended verbatim to the two file names, so a
            directory must end with a path separator.
        bucket: name of the S3 bucket to upload to (only used when uploading).
        access_data: mapping with the "Key" and "Secret" credentials (only
            used when uploading).
        upload_to_s3: when true, upload both files with tinys3 after saving.

    The files are always called 'model_weights.h5' and
    'model_definition.yaml', both locally and in the bucket.
    """
    weights_name = 'model_weights.h5'
    definition_name = 'model_definition.yaml'
    weights_path = local_path + weights_name
    definition_path = local_path + definition_name

    model.save_weights(weights_path)
    with open(definition_path, "w") as yaml_file:
        yaml_file.write(model.to_yaml())

    if upload_to_s3:
        conn = tinys3.Connection(access_data["Key"], access_data["Secret"], tls=True)
        uploads = ((weights_name, weights_path), (definition_name, definition_path))
        for remote_name, path in uploads:
            # Close each file once its upload call returns instead of
            # leaking the handle.
            with open(path, 'rb') as f:
                conn.upload(remote_name, f, bucket)

In [42]:
# main train function
def train_lstm(model, input_path, validation_path, save_dir, step=3, batch_size=512, iters=10, save_every=1,
               bucket="juandoso-microtales", access_data=None, upload_to_s3=True):
    """Train `model` for `iters` more epochs, saving and sampling as it goes.

    Args:
        model: compiled Keras model carrying a `metadata` dict. The sequence
            length is read from the second entry of model.input_shape.
        input_path: text file used for training.
        validation_path: text file used for validation.
        save_dir: path prefix passed to save_model().
        step: distance in characters between consecutive training windows.
        batch_size: number of windows per batch.
        iters: number of epochs to run in this call.
        save_every: save the model whenever the epoch number is a multiple
            of this value.
        bucket: S3 bucket passed to save_model().
        access_data: credentials mapping passed to save_model(); defaults to
            the module-level `aws_access`.
        upload_to_s3: whether save_model() should also upload the files.

    Epoch numbering continues from model.metadata['epoch'], so calling this
    function again resumes the count. After every epoch the training and
    validation loss are appended to model.metadata, and a 100-character
    sample generated from the training seed is printed.
    """
    if access_data is None:
        access_data = aws_access

    _, seqlen, _ = model.input_shape
    train_gen = generate_arrays_from_file(input_path, seqlen=seqlen,
                                          step=step, batch_size=batch_size)
    # The first item of the generator is a (sample count, seed text) header.
    samples, seed = next(train_gen)

    print('samples per epoch %s' % samples)
    last_epoch = model.metadata.get('epoch', 0)

    for epoch in range(last_epoch + 1, last_epoch + iters + 1):
        # A fresh validation generator each epoch, so validation always
        # starts from the beginning of the validation text.
        val_gen = generate_arrays_from_file(
            validation_path, seqlen=seqlen, step=step, batch_size=batch_size)
        val_samples, _ = next(val_gen)

        # Floor division: the step counts must be whole numbers of batches.
        hist = model.fit_generator(
            train_gen,
            validation_data=val_gen,
            validation_steps=val_samples // batch_size,
            steps_per_epoch=samples // batch_size, epochs=1,
            verbose=1)

        # -1 stands in for a validation loss the history did not record.
        val_loss = hist.history.get('val_loss', [-1])[0]
        loss = hist.history['loss'][0]
        model.metadata.setdefault('loss', []).append(loss)
        model.metadata.setdefault('val_loss', []).append(val_loss)
        model.metadata['epoch'] = epoch

        message = 'loss = %.4f   val_loss = %.4f' % (loss, val_loss)
        print(message)
        print('done fitting epoch %s' % epoch)

        if epoch % save_every == 0:
            print("saving model")
            save_model(model, save_dir, bucket=bucket, access_data=access_data, upload_to_s3=upload_to_s3)

        print("generating a sample")
        generate_and_print(model, seed, 0.5, 100)
        print("\n_____")

In [8]:
# Length, in encoded characters, of each window the texts are cut into.
# It is used below as the time dimension of the model's input_shape, which is
# where train_lstm() and generate() read the sequence length from.
maxlen = 80

In [16]:
# LSTM model definition

# Three stacked 256-unit LSTM layers, each followed by 25% dropout. The first
# two return the full sequence so the next LSTM can consume it; the last one
# returns only its final output, which a softmax layer turns into a
# distribution over the vocabulary.
model = Sequential([
    LSTM(256, input_shape=(maxlen, len(chars)), return_sequences=True),
    Dropout(0.25),
    LSTM(256, return_sequences=True),
    Dropout(0.25),
    LSTM(256, return_sequences=False),
    Dropout(0.25),
    Dense(len(chars)),
    Activation('softmax'),
])

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# Training bookkeeping read and updated by train_lstm().
model.metadata = dict(epoch=0, loss=[], val_loss=[])


In [47]:
# training parameters

# Directory holding the corpora; the trained model is saved here as well.
# The trailing slash matters: save_model() appends file names to it directly.
main_path = "/home/ubuntu/data/"
model_dir = main_path

train_path = os.path.join(main_path, "tales_2.txt")
test_path = os.path.join(main_path, "tales_val.txt")

# Window stride, batch size, and number of epochs per train_lstm() call.
step, batch_size, max_epochs = 1, 512, 30

In [48]:
# Run the training. train_lstm() continues the epoch count stored in
# model.metadata, so re-running this cell trains for another `max_epochs`
# epochs. The model is saved after every second epoch.
train_lstm(
    model=model,
    input_path=train_path, validation_path=test_path,
    save_dir=model_dir,
    step=step, batch_size=batch_size,
    iters=max_epochs, save_every=2,
)

samples per epoch 224580
Epoch 1/1
loss = 1.2031   val_loss = 1.3907
done fitting epoch 27
generating a sample
 with seed: 
Creyó que se repondría, pero lo amaba de verdad.#
Con una vida no tenían sufi:

Creyó que se repondría, pero lo amaba de verdad.#
Con una vida no tenían suficiente.
Su propia sombra la sombra de preciosa. Ella era un humano,#
No conozco el tiempo.
El final,
_____
Epoch 1/1
loss = 1.1929   val_loss = 1.4048
done fitting epoch 28
saving model
generating a sample
 with seed: 
Creyó que se repondría, pero lo amaba de verdad.#
Con una vida no tenían sufi:

Creyó que se repondría, pero lo amaba de verdad.#
Con una vida no tenían suficiente.#
Ella se contó en el mar. Y el menos se le dejarás escribir su perro.#
Ella te estaba tiene 
_____
