""" A clean, no-frills character-level generative language model.
Created by Danijar Hafner (danijar.com), edited by Chip Huyen
for the class CS 20SI: "TensorFlow for Deep Learning Research"

Based on Andrej Karpathy's blog: 
http://karpathy.github.io/2015/05/21/rnn-effectiveness/
"""

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import sys
sys.path.append('..')

import time

import tensorflow as tf

import examples.utils

# Text file read line by line during training.
DATA_PATH = 'examples/data/arvix_abstracts.txt'
# Number of units in the GRU cell.
HIDDEN_SIZE = 200
# Number of sequences grouped into one training batch.
BATCH_SIZE = 64
# Length (in characters) of each training window cut from a line.
NUM_STEPS = 50
# Report the loss, sample text and save a checkpoint every SKIP_STEP iterations.
SKIP_STEP = 40
# Sampling temperature fed to the model when generating text.
TEMPRATURE = 0.7
# Learning rate handed to the Adam optimizer.
LR = 0.003
# Number of characters generated per sample during training.
LEN_GENERATED = 300

In [2]:
def vocab_encode(text, vocab):
    """Encode ``text`` as a list of 1-based vocabulary positions.

    Symbols of ``text`` that do not occur in ``vocab`` are dropped, so the
    result may be shorter than ``text``. Ids start at 1, which leaves 0 free
    for use as a padding value.

    Args:
        text: iterable of symbols (typically a string of characters).
        vocab: ordered collection of known symbols (typically a string).

    Returns:
        A list of ints, one per symbol of ``text`` found in ``vocab``.
    """
    # Build the symbol -> id table once instead of scanning ``vocab`` twice
    # per character. ``setdefault`` keeps the first position for a repeated
    # symbol, matching what ``vocab.index`` would return.
    ids = {}
    for position, symbol in enumerate(vocab, 1):
        ids.setdefault(symbol, position)
    return [ids[symbol] for symbol in text if symbol in ids]

def vocab_decode(array, vocab):
    """Turn a sequence of 1-based vocabulary ids back into a string.

    Each id ``i`` is looked up as ``vocab[i - 1]``. An id of 0 therefore
    resolves to ``vocab[-1]`` (the last entry) through negative indexing.

    Args:
        array: iterable of integer ids.
        vocab: ordered collection of symbols the ids refer to.

    Returns:
        The decoded symbols joined into a single string.
    """
    symbols = []
    for code in array:
        symbols.append(vocab[code - 1])
    return ''.join(symbols)

In [3]:
def read_data(filename, vocab, window=NUM_STEPS, overlap=NUM_STEPS//2):
    """Yield fixed-length windows of encoded characters from a text file.

    Every line of the file is encoded with ``vocab_encode`` and then cut into
    windows of ``window`` ids, with consecutive windows starting ``overlap``
    ids apart. A line whose encoded length does not exceed ``window``
    produces no windows at all.

    Args:
        filename: path of the text file to read.
        vocab: vocabulary passed through to ``vocab_encode``.
        window: number of ids per yielded chunk.
        overlap: step between the start positions of consecutive chunks.

    Yields:
        Lists of ``window`` integer ids.
    """
    # The context manager closes the file once the generator is exhausted or
    # closed, instead of leaving the handle open indefinitely.
    with open(filename) as lines:
        for line in lines:
            encoded = vocab_encode(line, vocab)
            for start in range(0, len(encoded) - window, overlap):
                chunk = encoded[start: start + window]
                # Zero-pad short chunks up to ``window``. With the range bound
                # above every slice is already full, so this is a safeguard.
                chunk += [0] * (window - len(chunk))
                yield chunk

def read_batch(stream, batch_size=BATCH_SIZE):
    """Group the elements of ``stream`` into lists of ``batch_size`` items.

    Args:
        stream: iterable of elements to batch.
        batch_size: number of elements per full batch.

    Yields:
        Lists of ``batch_size`` elements, followed by one shorter list holding
        the leftover elements when the stream length is not a multiple of
        ``batch_size``. No empty batch is ever yielded.
    """
    batch = []
    for element in stream:
        batch.append(element)
        if len(batch) == batch_size:
            yield batch
            batch = []
    # Only emit the remainder when there is one: an empty batch cannot be
    # fed to a model expecting a 2-D [batch, time] input.
    if batch:
        yield batch

dynamic_rnn(
    cell,
    inputs,
    sequence_length=None,
    initial_state=None,
    dtype=None,
    parallel_iterations=None,
    swap_memory=False,
    time_major=False,
    scope=None
)

A pair (outputs, state) where:

outputs: The RNN output `Tensor`.

  If time_major == False (default), this will be a `Tensor` shaped:
    `[batch_size, max_time, cell.output_size]`.

  If time_major == True, this will be a `Tensor` shaped:
    `[max_time, batch_size, cell.output_size]`.

  Note, if `cell.output_size` is a (possibly nested) tuple of integers
  or `TensorShape` objects, then `outputs` will be a tuple having the
  same structure as `cell.output_size`, containing Tensors having shapes
  corresponding to the shape data in `cell.output_size`.

state: The final state.  If `cell.state_size` is an int, this
  will be shaped `[batch_size, cell.state_size]`.  If it is a
  `TensorShape`, this will be shaped `[batch_size] + cell.state_size`.
  If it is a (possibly nested) tuple of ints or `TensorShape`, this will
  be a tuple having the corresponding shapes.

In [4]:
def create_rnn(seq, hidden_size=HIDDEN_SIZE):
    """Build a single-layer GRU over ``seq`` with a feedable initial state.

    Args:
        seq: rank-3 float tensor; axis 0 is the batch, axis 1 is time and
            axis 2 holds the per-step features.
        hidden_size: number of units in the GRU cell.

    Returns:
        A tuple ``(output, in_state, out_state)``: the per-step RNN outputs,
        the initial-state placeholder (defaults to the cell's zero state when
        not fed) and the final state.
    """
    gru = tf.contrib.rnn.GRUCell(hidden_size)
    batch_size = tf.shape(seq)[0]
    zero_state = gru.zero_state(batch_size, tf.float32)
    in_state = tf.placeholder_with_default(zero_state, [None, hidden_size])
    # Sequences are padded to a common length; a time step counts towards the
    # real length when at least one of its features is positive.
    step_is_used = tf.reduce_max(tf.sign(seq), 2)
    length = tf.reduce_sum(step_is_used, 1)
    output, out_state = tf.nn.dynamic_rnn(gru, seq, length, in_state)
    return output, in_state, out_state

tf.one_hot converts a set of sparse labels to a dense one-hot representation. This is in addition to tf.nn.sparse_softmax_cross_entropy_with_logits, which can in some cases let you compute the cross entropy directly on the sparse labels instead of converting them to one-hot.

In [5]:
def create_model(seq, temp, vocab, hidden=HIDDEN_SIZE):
    """Build the character-level language model graph.

    Args:
        seq: integer tensor of character ids, indexed as [batch, time].
        temp: scalar float tensor; sampling temperature.
        vocab: vocabulary; its length sets the one-hot depth and the number
            of output logits.
        hidden: number of units in the recurrent cell.

    Returns:
        A tuple ``(loss, sample, in_state, out_state)``:
        ``loss`` is the summed softmax cross-entropy between the logits at
        every step but the last and the one-hot input one step ahead;
        ``sample`` holds one sampled id per batch row, drawn from the logits
        of the last step; ``in_state`` / ``out_state`` are the initial-state
        placeholder and the final state of the RNN.
    """
    # NOTE(review): vocab_encode in this file yields ids 1..len(vocab) and
    # read_data pads with 0, while the one-hot depth here is len(vocab);
    # tf.one_hot emits an all-zero row for the id len(vocab). Confirm whether
    # the depth should be len(vocab) + 1 (this would change variable shapes).
    seq = tf.one_hot(seq, len(vocab))
    output, in_state, out_state = create_rnn(seq, hidden)
    # fully_connected creates the weights and bias for us; passing None as
    # the activation keeps the layer linear, so it outputs raw logits.
    logits = tf.contrib.layers.fully_connected(output, len(vocab), None)
    # The target at step t is the input character at step t + 1.
    loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(
        logits=logits[:, :-1], labels=seq[:, 1:]))
    # Sample the next character from the Boltzmann distribution
    # p(i) proportional to exp(logit_i / temp). tf.multinomial already takes
    # unnormalized log-probabilities, so the scaled logits are passed as-is;
    # exponentiating them first would sample from a different distribution.
    sample = tf.multinomial(logits[:, -1] / temp, 1)[:, 0]
    return loss, sample, in_state, out_state

In [6]:
def training(vocab, seq, loss, optimizer, global_step, temp, sample, in_state, out_state):
    """Run one pass over the training data, sampling and checkpointing as it goes.

    Restores the latest checkpoint under ``checkpoints/arvix`` when one
    exists, then feeds batches from ``DATA_PATH`` to the optimizer. Every
    ``SKIP_STEP`` iterations it prints the batch loss and elapsed time,
    generates a text sample and saves a checkpoint.

    Args:
        vocab: vocabulary used to encode the data and decode samples.
        seq: placeholder for the batch of character ids.
        loss: loss tensor to evaluate.
        optimizer: training op to run on each batch.
        global_step: variable holding the number of steps taken so far.
        temp: temperature placeholder (used only when sampling).
        sample: tensor producing sampled character ids.
        in_state: initial-state placeholder of the RNN.
        out_state: final-state tensor of the RNN.
    """
    saver = tf.train.Saver()
    start = time.time()
    with tf.Session() as sess:
        writer = tf.summary.FileWriter('graphs/gist', sess.graph)
        try:
            sess.run(tf.global_variables_initializer())

            ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/arvix/checkpoint'))
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

            # Resume the iteration counter from the (possibly restored) step.
            iteration = global_step.eval()
            for batch in read_batch(read_data(DATA_PATH, vocab)):
                batch_loss, _ = sess.run([loss, optimizer], {seq: batch})
                if (iteration + 1) % SKIP_STEP == 0:
                    print('Iter {}. \n    Loss {}. Time {}'.format(iteration, batch_loss, time.time() - start))
                    online_inference(sess, vocab, seq, sample, temp, in_state, out_state)
                    start = time.time()
                    saver.save(sess, 'checkpoints/arvix/char-rnn', iteration)
                iteration += 1
        finally:
            # Flush pending events and release the event file even when
            # training is interrupted by an error.
            writer.close()

In [7]:
def online_inference(sess, vocab, seq, sample, temp, in_state, out_state, seed='T'):
    """Generate and print text one character at a time.

    Starting from ``seed``, the last character produced so far is fed to the
    model together with the previous RNN state, and the sampled character is
    appended. ``LEN_GENERATED`` characters are generated, then the whole
    text is printed.

    Args:
        sess: session used to run the graph.
        vocab: vocabulary used to encode inputs and decode samples.
        seq: placeholder for the input character ids.
        sample: tensor producing the sampled character ids.
        temp: temperature placeholder; fed with ``TEMPRATURE``.
        in_state: initial-state placeholder of the RNN.
        out_state: final-state tensor of the RNN.
        seed: text the generation starts from.
    """
    generated = seed
    previous_state = None
    for _ in range(LEN_GENERATED):
        last_char_ids = vocab_encode(generated[-1], vocab)
        feed = {seq: [last_char_ids], temp: TEMPRATURE}
        # On the first step there is no previous state; the placeholder then
        # falls back to its default value.
        if previous_state is not None:
            feed[in_state] = previous_state
        index, previous_state = sess.run([sample, out_state], feed)
        generated += vocab_decode(index, vocab)
    print(generated)

In [10]:
def main():
    """Build the model graph, make sure checkpoint folders exist, and train."""
    # Characters the model knows about; anything else is dropped on encoding.
    charset = (
            " $%'()+,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            "\\^_abcdefghijklmnopqrstuvwxyz{|}")
    char_ids = tf.placeholder(tf.int32, [None, None])
    temperature = tf.placeholder(tf.float32)
    loss, sample, in_state, out_state = create_model(char_ids, temperature, charset)
    step_counter = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
    train_op = tf.train.AdamOptimizer(LR).minimize(loss, global_step=step_counter)
    for directory in ('checkpoints', 'checkpoints/arvix'):
        examples.utils.make_dir(directory)
    training(charset, char_ids, loss, train_op, step_counter, temperature,
             sample, in_state, out_state)
    
# When run as a script: clear the default graph, then build the model and train.
if __name__ == '__main__':
    tf.reset_default_graph()
    main()

Iter 39. 
    Loss 9528.40625. Time 6.846832752227783
Tki                                                                                                                                                                                                                                                                                                          
Iter 79. 
    Loss 8567.9150390625. Time 6.672576904296875
TeL on ther on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on ther on ther on ther on ther on ther on ther on ther on ther on ther on ther on ther on ther on ther on ther on ther on ther on ther on ther on ther on ther on 
Iter 119. 
    Loss 7860.015625. Time 6.746322393417358
The the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the

Iter 959. 
    Loss 3835.907958984375. Time 6.585753917694092
The computation of the network convergence reconstrated to a convex optimization processing and the network deconstrated and the network deconstrated and the network deconstrated and the network deconstrated and the network deconstrated and the network deconstrated and the network decoder and standar
Iter 999. 
    Loss 3426.80615234375. Time 6.586920499801636
The and the simple experimental results on a state-of-the-art results on a state-of-the-art results on a state-of-the-art results on a state-of-the-art results on a state-of-the-art results on a state-of-the-art results on a state-of-the-art results on a state-of-the-art results on a state-of-the-art
Iter 1039. 
    Loss 3630.8857421875. Time 6.6226770877838135
The algorithm for the results in search for the recurtent describution in the proposed deep neural networks. We ored to the results in searca essive the network properties the parameters the network are state-of

Iter 1879. 
    Loss 3034.108154296875. Time 6.616097927093506
The activation of the prediction and sumpoct an active to an extension of the and outperform decoder need to be trained using an and training on the and the accuracy of the and the distivatious to adversarial samps, ate accesses and the computational and compared to the input of the are and computati
Iter 1919. 
    Loss 2891.8447265625. Time 6.5822913646698
The accuracy for the activation function of a deep network are autometriction and output for the network architectures by a fixed framework for the proposed method for the accurate computation of the network prodietional on a deep network are at the maximpo than 12 learning and analyzing and output f
Iter 1959. 
    Loss 2795.4521484375. Time 6.607222318649292
The approach for the accuracy of neural network architecture can be trained on the model strategy that allows for the accuracy of neural network architecture can be trained on the model strategy that allows for th

Iter 2799. 
    Loss 2336.48828125. Time 6.635299444198608
The proposed deep RNNs that iterations and implements the model structure that are the model not orithan the parallel computations for the model not orith neuronal models which as experiments show that the model state-of-the-art performance on a speech recognition in such as deep neural network (DNN)
Iter 2839. 
    Loss 2715.3408203125. Time 6.6512980461120605
The better for the autoencoders of choracter learning task in the training of the speed up to improve the superior to deep neural networks in a deep learning in deep neural networks in a deep learning in deep neural networks in a deep learning in deep neural networks in a deep learning in deep neural
Iter 2879. 
    Loss 2346.62841796875. Time 6.62807559967041
The framework are available for larger neural networks. We report of computation in the performance of an error information of a meanable of model performance on the sequence layers that the success of our probabil

Iter 3719. 
    Loss 2270.46826171875. Time 6.6002020835876465
The features used for training deep learning in derive the distance units introduce a method for Quantitative disting fullys (DAC) models have allows GFound the desirable to a single layers and the different descrapsively using stochastic normsst characted and transformation for sparse constrained an
Iter 3759. 
    Loss 2475.840087890625. Time 6.6235270500183105
The framework are property of new argeency of the recently proposed methods of a deep neural network (DNN) to empirical investigation in sparse consistency reduction to be trained using a new approach for the input of the recently proposed methods of the recently proposed and network (RNN) architectu
Iter 3799. 
    Loss 2218.86083984375. Time 6.603944540023804
The accuracy of parameter that are computation in the context of neural networks with recent approaches are computation in the context of neural networks with recent approaches are computation in the context