In [1]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf
tf.enable_eager_execution()

import numpy as np
import functools
import os
import time

Source texts (Franz Kafka):
* The Trial: http://www.gutenberg.org/cache/epub/7849/pg7849.txt
* Metamorphosis: http://www.gutenberg.org/files/5200/5200-h/5200-h.htm
* A Hunger Artist: https://archive.org/details/AHungerArtistByFranzKafka
* The Castle: https://archive.org/details/in.ernet.dli.2015.149543/page/n11

In [50]:
# Un-wrap the hard-wrapped source texts: lines that do not end a sentence are
# joined to the following line, while blank lines and lines ending in '.' keep
# their line break. Each result is written to 'texts/<name>_cleaned.txt'.
# NOTE(review): the cell that builds 'texts/combined.txt' reads the original
# files, not these '_cleaned' outputs -- confirm which one is intended.
filenames = ['the castle.txt', '5200-h.txt', '7849-8.txt', 'the hunger artist.txt']

for file in filenames:
    pieces = []
    with open('texts/'+file) as infile:
        for line in infile:
            stripped = line.strip()
            if not stripped:
                # Blank or whitespace-only line: keep it as a paragraph break.
                # (Indexing stripped[-1] here would raise IndexError.)
                pieces.append('\n')
            elif stripped.endswith('.'):
                # Sentence ends on this line: keep the original line break.
                pieces.append(line)
            else:
                # Wrapped line: join with the next one, separated by a space
                # so the words on either side of the break are not fused.
                pieces.append(stripped + ' ')
    # Build the text once instead of repeated string concatenation.
    entire_text = ''.join(pieces)
    # splitext only removes the final extension, so names containing extra
    # dots are not truncated.
    with open('texts/'+os.path.splitext(file)[0]+'_cleaned.txt','w') as outfile:
        outfile.write(entire_text)

In [2]:
# Concatenate the individual source files, in list order, into a single
# training corpus at 'texts/combined.txt'.
filenames = ['5200-h.txt', '7849-8.txt', 'the castle.txt', 'the hunger artist.txt']
with open('texts/combined.txt', 'w') as outfile:
    for fname in filenames:
        with open('texts/'+fname) as infile:
            # A text file iterates line by line, so this copies every line.
            outfile.writelines(infile)

In [3]:
# Load the combined corpus as one unicode string. The file is read in binary
# mode and decoded explicitly, so line endings are kept exactly as stored;
# the context manager guarantees the handle is closed.
with open('texts/combined.txt', 'rb') as combined_file:
    text = combined_file.read().decode(encoding='utf-8')

Preparing the dataset

In [4]:
def split_input_target(chunk):
    """Split a sequence into an (input, target) pair shifted by one position.

    The input is `chunk` without its last element and the target is `chunk`
    without its first element, so target[i] is the element that follows
    input[i].

    Args:
        chunk: Any sliceable sequence (string, list, tensor, ...).

    Returns:
        Tuple `(chunk[:-1], chunk[1:])`.
    """
    return chunk[:-1], chunk[1:]

In [5]:
# Character vocabulary: every distinct character in the corpus, sorted, with
# lookup tables in both directions.
vocab = sorted(set(text))
char2idx = {u: i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

# The whole corpus encoded as integer character ids.
text_as_int = np.array([char2idx[c] for c in text])

# Each training example is cut from a chunk of seq_length + 1 characters
# (seq_length inputs plus the one-step-shifted targets), so that is the number
# of characters one example consumes.
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [6]:
# Number of sequences per training batch and the derived steps per epoch.
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Size of the buffer tf.data shuffles within.
BUFFER_SIZE = 10000

# Build the batched pipeline from `sequences` rather than re-assigning
# `dataset` from itself: re-running this cell would otherwise batch an
# already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [7]:
# Model hyper-parameters.
# Number of distinct characters in the corpus.
vocab_size = len(vocab)
# Size of the embedding vector learned for each character.
embedding_dim = 256
# Number of units in the recurrent (GRU) layer.
rnn_units = 1024

In [8]:
# GRU layer constructor with recurrent_activation pre-set to 'sigmoid';
# build_model calls this to create its recurrent layer.
# NOTE(review): presumably chosen for compatibility with the CuDNN GRU
# implementation -- confirm.
rnn = functools.partial(tf.keras.layers.GRU, recurrent_activation='sigmoid')

In [9]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Number of distinct characters; used as the embedding
            input size and as the width of the final Dense layer.
        embedding_dim: Size of each character embedding.
        rnn_units: Number of units in the recurrent layer.
        batch_size: Fixed batch size baked into the model's input shape
            (the recurrent layer is stateful).

    Returns:
        An uncompiled `tf.keras.Sequential` model.
    """
    # Layers are created in the same order as they are stacked.
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = rnn(
        rnn_units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
        stateful=True,
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [10]:
# Training model, built for batches of BATCH_SIZE sequences.
model = build_model(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

Instructions for updating:
Colocations handled automatically by placer.


In [12]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw logits.

    Args:
        labels: Integer class ids (the target characters).
        logits: Unnormalised model outputs.

    Returns:
        The result of `sparse_categorical_crossentropy` with
        `from_logits=True`.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [13]:
# Train with Adam (default settings) against the logits-based loss above.
model.compile(optimizer=tf.train.AdamOptimizer(), loss=loss)

In [14]:
# Save the model weights after every epoch as
# './training_checkpoints_kafka/ckpt_<epoch>'.
checkpoint_dir = './training_checkpoints_kafka'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [15]:
# Number of training epochs passed to model.fit.
EPOCHS=3

In [None]:
# The dataset is repeated indefinitely, so steps_per_epoch defines where each
# epoch ends; weights are checkpointed by checkpoint_callback.
history = model.fit(
    dataset.repeat(),
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    callbacks=[checkpoint_callback],
)


Epoch 1/3
Epoch 2/3
Epoch 3/3

In [16]:
# Show the most recent checkpoint found under checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints_kafka/ckpt_3'

In [17]:
# Rebuild the network with batch_size=1 for text generation and restore the
# trained weights from the latest checkpoint.
model = build_model(
    vocab_size,
    embedding_dim,
    rnn_units,
    batch_size=1,
)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [18]:
# Print the layer-by-layer summary of the rebuilt (batch size 1) model.
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            28160     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3935232   
_________________________________________________________________
dense_1 (Dense)              (1, None, 110)            112750    
Total params: 4,076,142
Trainable params: 4,076,142
Non-trainable params: 0
_________________________________________________________________


In [30]:
def generate_text(model, start_string, num_generate=500, temperature=0.8):
    """Generate text one character at a time, seeded with `start_string`.

    Uses the module-level `char2idx` / `idx2char` tables to convert between
    characters and integer ids.

    Args:
        model: Stateful model built for batch size 1 that maps a batch of
            character ids to per-position logits over the vocabulary.
        start_string: Non-empty seed text; every character must be a key of
            `char2idx`.
        num_generate: Number of characters to generate.
        temperature: Positive divisor applied to the logits before sampling.
            Higher values give less predictable text.

    Returns:
        `start_string` followed by the `num_generate` generated characters.

    Raises:
        ValueError: If `start_string` is empty or `temperature` is not
            positive.
        KeyError: If `start_string` contains a character missing from
            `char2idx`.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Encode the seed and add a leading batch dimension of size 1.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension, leaving one row of logits per position.
        predictions = tf.squeeze(predictions, 0)

        # Scale the logits, then sample; only the sample for the last
        # position is used as the next character.
        predictions = predictions / temperature
        predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()

        # The model is stateful, so only the newly sampled character is fed
        # back in on the next step.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [32]:
# Generate and print a sample seeded with "The first".
print(generate_text(model, start_string=u"The first"))

The first 
mare with me. But which were her heardos. 
Weren't the man when she of sace the very remarker waiting on them word, were to achioned 
in a very give 
her could brough that as me all to her that he had pays one of they were opened the door.  "You must the viinder 
sitting of the itseirer at all, gut not should go mother such a pecauries of the others was the judget of Land peaceening, strulfing in the otherwise she made no becaued if he gared to himself for him to say.  "I've done I have told a
