In [None]:
# Recurrent Neural Networks

GRUs are simpler than LSTMs

# TO create a GRU cell with a certain size:
cell = tf.nn.rnn_cell.GRUCell(hidden_size)
#Layers of cells
rnn_cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers)

# TF can construct RNN dynamically
#tf.nn.dynamic_rnn(cell, seq, length, initial_state)
# adjust sequence length by padding with zeros

# can ignore loss from padded label with boolean mask
full_loss = tf.nn.softmax_cross_entropy_loss_with_logits(labels=labels, logits=logits)
loss = tf.reduce_mean(tf.boolean_mask(full_loss, mask))
# or adjust the length
tf.reduce_sum(tf.reduce_max(tf.sign(seq), 2), 1)


# Deal with vanishing gradients -- with activation units
# Exploding gradients - use tf.clip_by_global_norm
gradients = tf.gradients(cost, tf.trainable_variables()) # finds gradient for all trainable variables
clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_grad_norm) # clips by a max norm
optimizer = tf.AdamOptimizer(learning_rate)
train_op = optimizer.apply_gradients(zip(gradients, trainables))  # feeds clipped to optimizer, (why not use clipped_gradients?)

In [None]:
Language Generation

-- Word-level (limited by the vocabulary)
-- Character-level (may produce nonsense words)
-- Hybrid (combined word and character)
-- Subword (most common words and syllables)


In [None]:
Phase 1: Assemble the graph

    Define placeholders for Input and Output
    Define Weights
    Define Inference Model
    Define Loss Function
    Define Optimizer (e.g. GradientDescentOptimizer, AdamOptimizer)

Phase 2: Train the model

    Initialize all model variables (e.g. tf.global_variables_initializer())
    Feed in the training data (e.g. batching, randomization)
    Execute inference model on the training inputs with the current model parameters
    Compute the cost
    Adjust model parameters accordingly

In [1]:
## CS20SI Github's Character-Level Generative Language Model

from __future__ import print_function
import os
import time

import tensorflow as tf

DATA_PATH = 'data/arxiv_abstracts.txt'
HIDDEN_SIZE = 200

# training params
BATCH_SIZE = 64
NUM_STEPS = 50
SKIP_STEP = 40
# lower temperature -> more conservative, higher temperature -> more risk/diversity
TEMPERATURE = 0.7
LR = 0.003
LEN_GEN = 300

def vocab_encode(text, vocab):
    # assigns numerical value depending on position in vocab dict
    return [vocab.index(x) + 1 for x in text if x in vocab]

def vocab_decode(array, vocab):
    # given array of numbers, decode into words
    return ''.join([vocab[x - 1] for x in array])

def read_data(filename, vocab, window=NUM_STEPS, overlap=NUM_STEPS/2):
    # encode words in a file
    for text in open(filename):
        text = vocab_encode(text, vocab)
        for start in range(0, len(text) - window, overlap):
        # generates word chunks of length window, followed by [0]'s if padding needed
            chunk = text[start: start + window]
            chunk += [0] * (window - len(chunk))
            yield chunk
        # yield creates a Generator object;
        # only when that Generator is run are those operators in the for loop performed
        
def read_batch(stream, batch_size=BATCH_SIZE):
    batch = []
    # creates a batch composed of elements from stream
    for element in stream:
        batch.append(element)
        if len(batch) == batch_size:
            yield batch
            batch = []
    yield batch
    
def create_rnn(seq, hidden_size=HIDDEN_SIZE):
    # makes an RNN with GRU cells
    cell = tf.nn.rnn_cell.GRUCell(hidden_size)
    default_input = cell.zero_state(tf.shape(seq)[0], tf.float32)
    # state to feed into RNN
    in_state = tf.placeholder_with_default(default_input, [None, hidden_size])    
    
    # finds total sum --> considers padding of seq
    # reduce_max --> gives max length
    # reduce_sum --> total of the lengths
    length = tf.reduce_sum(tf.reduce_max(tf.sign(seq), 2), 1)
    
    # creates RNN based on cell, feeds in the sequence as input, with initial_state=in_state
    output, out_state = tf.nn.dynamic_rnn(cell, seq, length, in_state)
    
    return output, in_state, out_state
    

def create_model(seq, temp, vocab, hidden=HIDDEN_SIZE):
    # seq is the ARRAY of INDICES (representing the characters)
    # creates one_hot tensor with 1 at seq, with HxW - [#indices x len(vocab)]
    # -- array depicting character-level
    seq = tf.one_hot(seq, len(vocab))
    # load the above one_hot tensor into RNN with #units = hidden
    output, in_state, out_state = create_rnn(seq, hidden)
    
    # adds a fully connected layer -- feed in "outputs", w/ #outputunits = len(vocab)
    # returns the resulting tensor (predictions from going through the RNN)
    logits = tf.contrib.layers.fully_connected(output, len(vocab), None)
    # None to skip default ReLU and keep linear
    
    # logits are the predict
    loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=logits[:, :-1], labels=seq[:, 1:]))
    sample = tf.multinomial(tf.exp(logits[:,-1] / temp), 1)[:, 0]
    # gives the softmax cross-entropy loss, a single character, and RNN in_state and out_state tensors
    return loss, sample, in_state, out_state
    
# Step 1: Define inference model
#(weights, etc. administered by TF RNN implementation)
def online_inference(sess, vocab, seq, sample, temp, in_state, out_state, seed="T"):
    """ Dynamically generates the seq based on previous characters """
    sentence = seed # begins with first letter
    state = None
    for _ in range(LEN_GEN):
        batch = [vocab_encode(sentence[-1], vocab)]
        feed_dict = {seq: batch, temp: TEMPERATURE}
        if state is not None:
            # feeds in the state in feed_dict
            feed_dict.update({in_state: state})
        # returns index and new state from running the out_state op
        # out_state from the dynamic RNN in createRNN -> createModel -> main -> online_inference ... phew!
        # Recall RNN cell has in_state, out_state, and output
        index, state = sess.run([sample, out_state], feed_dict)
        sentence += vocab_decode(index, vocab)
    print(sentence)
    
def training(vocab, seq, loss, optimizer, global_step, temp, sample, in_state, out_state):
    saver = tf.train.Saver()
    start = time.time()
    with tf.Session() as sess:
        writer = tf.summary.FileWriter('graphs/gist', sess.graph)
        sess.run(tf.global_variables_initializer())
        
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/arxiv/checkpoint'))
        # if checkpoint exists and path is given, restore sess from path
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            
        iteration = global_step.eval() # gets current iteration
        for batch in read_batch(read_data(DATA_PATH, vocab)):
            # this is the training part -> uses optimizer against loss in the model defined in above functions
            batch_loss, _ = sess.run([loss, optimizer], {seq: batch})
            # status update
            if (iteration + 1) % SKIP_STEP == 0:
                print('Iter {}. \n Loss {}. Time {}'.format(iteration, batch_loss, time.time() - start))
                online_inference(sess, vocab, seq, sample, temp, in_state, out_state)
                start = time.time()
                saver.save(sess, 'checkpoints/arxiv/char-rnn', iteration)
            iteration += 1
    
def main():
    vocab = (
            " $%'()+,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            "\\^_abcdefghijklmnopqrstuvwxyz{|}")
    # Input Placeholder
    seq = tf.placeholder(tf.int32, [None, None])
    # Temp
    temp = tf.placeholder(tf.float32)
    loss, sample, in_state, out_state = create_model(seq, temp, vocab)
    global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
    optimizer = tf.train.AdamOptimizer(LR).minimize(loss, global_step=global_step)
    # after initializing optimizer, inputs, etc.
    training(vocab, seq, loss, optimizer, global_step, temp, sample, in_state, out_state)

In [2]:
main()

Iter 39. 
 Loss 9315.3046875. Time 6.67547893524
Thi e  e  e  e  e  e  e   e  e   e   e     e             e                       e             e                                                                                                        e                                                                                                    
Iter 79. 
 Loss 7994.96337891. Time 6.89592599869
The the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the t
Iter 119. 
 Loss 7350.33837891. Time 6.50276684761
The the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere the sere

KeyboardInterrupt: 