# Building a Char RNN using Tensorflow

This is sample code from the Stanford [TensorFlow course](http://web.stanford.edu/class/cs20si/syllabus.html) that builds a character-level RNN for generating arXiv abstracts.

In [1]:
## imports 
from __future__ import print_function, division
import numpy as np
import tensorflow as tf

# Dataset Preparation
## Static preparing of datasets 

- No TensorFlow is involved, so there is no need to run a session for this part.
- Reads arXiv abstracts line by line from a dataset file and returns sequences of vocabulary ids.

In [2]:
# Character vocabulary: every symbol the model knows about.
# vocab_encode maps a character to its 1-based position in this string.
vocab = (
    " $%'()+,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "\\^_abcdefghijklmnopqrstuvwxyz{|}"
)

# Batching hyper-parameters.
BATCH_SIZE = 64  # number of chunks per batch
SEQ_LENGTH = 50  # number of characters per chunk

# Number of distinct characters in the vocabulary.
VOCAB_SIZE = len(vocab)

def vocab_encode(text, vocab, oov=0):
    """
    Map every character of `text` to its 1-based position in `vocab`.

    :param text: iterable of characters to encode
    :param vocab: sequence of known characters; the item at position i gets id i + 1
    :param oov: id emitted for characters that do not appear in `vocab`
    :return: list of ids, one per character of `text`
    """
    # Build the lookup once instead of scanning `vocab` twice per character.
    # setdefault keeps the first position of a repeated symbol, which is the
    # position vocab.index() would report.
    char_to_id = {}
    for position, char in enumerate(vocab, 1):
        char_to_id.setdefault(char, position)

    return [char_to_id.get(char, oov) for char in text]

def vocab_decode(array, vocab, oov="_oov_"):
    """
    Turn a sequence of ids back into a string.

    :param array: iterable of integer ids
    :param vocab: sequence of characters; id i (for i > 0) maps to vocab[i - 1]
    :param oov: text emitted for every id that is not positive
    :return: the decoded pieces joined into one string
    """
    pieces = []
    for token_id in array:
        if token_id > 0:
            pieces.append(vocab[token_id - 1])
        else:
            pieces.append(oov)
    return ''.join(pieces)

In [3]:
def read_data(filename, vocab=vocab, window=SEQ_LENGTH, overlap=SEQ_LENGTH/2):
    """
    Read the abstract file line by line and yield each line as chunks of ids.

    :param filename: path of the text file, one abstract per line
    :param vocab: vocabulary handed to vocab_encode
    :param window: number of ids in every chunk
    :param overlap: distance between the starts of two consecutive chunks
                    (truncated to an int before use)
    :return: generator of id lists, each of length `window`

    NOTE(review): chunk starts stop strictly before len(line) - window, so every
    slice is already `window` long and the zero padding below never adds
    anything; lines of at most `window` ids yield no chunk at all. Confirm
    whether short lines and line tails were meant to be padded and kept.
    """
    step = int(overlap)

    # `with` closes the file once the generator is exhausted or discarded;
    # the bare open() used before left the handle open.
    with open(filename) as lines:
        for line in lines:
            ids = vocab_encode(line, vocab)
            for start in range(0, len(ids) - window, step):
                chunk = ids[start: start + window]
                # pad with zeros if the chunk is shorter than the window
                chunk += [0] * (window - len(chunk))
                yield chunk
            

def read_batch(stream, batch_size=BATCH_SIZE):
    """
    Group the chunks coming from read_data into batches to feed an RNN.

    :param stream: iterable of chunks
    :param batch_size: number of chunks per batch
    :return: generator of lists of chunks; every batch holds `batch_size`
             chunks except possibly the last one, which holds the leftovers
    """
    batch = []

    for chunk in stream:
        batch.append(chunk)
        if len(batch) == batch_size:
            yield batch
            batch = []

    # Only emit the leftovers when there are any: an empty stream, or one whose
    # length is a multiple of batch_size, must not produce an empty batch.
    if batch:
        yield batch

### TF dataset preparation 

- Dataset preparation that is done with TensorFlow ops and therefore runs inside a session.

In [4]:
def create_onehot(seq):
    """
    Convert a batch of character ids into one-hot vectors and count, for each
    sequence, the time steps that have a non-zero one-hot row.

    :param seq: input sequence of size BATCH_SIZE X SEQ_LENGTH
    :return: tuple (one_hot, lengths)
             one_hot: BATCH_SIZE X SEQ_LENGTH X VOCAB_SIZE
             lengths: one value per item in the batch, computed at runtime

    NOTE(review): vocab_encode emits ids from 0 (padding / out-of-vocabulary)
    up to VOCAB_SIZE, while the one-hot depth here is VOCAB_SIZE. Presumably
    id 0 still gets a non-zero row (so padding is counted in `lengths`) and
    id VOCAB_SIZE gets an all-zero row -- confirm against the tf.one_hot
    documentation before relying on `lengths` as the true sequence length.
    """
    # one-hot representation of size BATCH_SIZE X SEQ_LENGTH X VOCAB_SIZE
    one_hot = tf.one_hot(seq, VOCAB_SIZE)

    # per time step: the largest sign over the vocabulary axis, i.e. 1 when
    # the row has a positive entry and 0 when the row is all zeros
    step_is_set = tf.reduce_max(tf.sign(one_hot), 2)

    # summing over the time axis gives one count per sequence
    lengths = tf.reduce_sum(step_is_set, 1)

    return one_hot, lengths

# Building Models

## Char RNN, loss function and Optimizer

In [5]:
def create_rnn(seq, length, state_sizes, num_layer = 3):
    """
    Build a stack of GRU layers and unroll it over the input batch.

     :param seq: batch of sequences of 1 hot representations of chars, of size
                 BATCH_SIZE X SEQ_LENGTH X VOCAB_SIZE
     :param length: per-sequence lengths, passed to dynamic_rnn as the sequence length
     :param state_sizes: hidden state size of each GRU layer, one entry per layer
     :param num_layer: not used by this function; the number of layers is
                       len(state_sizes)
     :return: tuple (rnn_outputs, initial_state_placeholder, output_state)
    """
    # one GRU cell per requested state size
    cells = []
    for size in state_sizes:
        cells.append(tf.nn.rnn_cell.GRUCell(size))

    # one state placeholder per layer; when nothing is fed it falls back to
    # the zero state of that layer, sized to the batch at runtime
    state_inputs = []
    for layer, size in zip(cells, state_sizes):
        zero_state = layer.zero_state(tf.shape(seq)[0], tf.float32)
        state_inputs.append(
            tf.placeholder_with_default(input=zero_state, shape=[None, size])
        )
    initial_state_placeholder = tuple(state_inputs)

    stacked_cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)

    rnn_outputs, output_state = tf.nn.dynamic_rnn(
        stacked_cell,
        seq,
        sequence_length=length,
        initial_state=initial_state_placeholder,
    )

    return rnn_outputs, initial_state_placeholder, output_state

In [6]:
def create_model(seq_placeholder, global_step, vocab_size, state_sizes, learning_rate):
    """
    Assemble the char RNN: one-hot inputs, RNN, output layer, loss and optimizer.

    :param seq_placeholder: placeholder holding a batch of character ids
    :param global_step: variable passed to the optimizer as its global step
    :param vocab_size: number of outputs of the final fully connected layer
    :param state_sizes: hidden state sizes forwarded to create_rnn
    :param learning_rate: learning rate of the Adam optimizer
    :return: tuple (loss, optimizer, initial_state_placeholder, out_state, logits)
    """
    # one hot vectors of size BATCH_SIZE X SEQ_LENGTH X VOCAB_SIZE
    one_hot_seq, lengths = create_onehot(seq_placeholder)

    rnn_outputs, initial_state_placeholder, out_state = create_rnn(
        one_hot_seq, lengths, state_sizes=state_sizes
    )

    # logits of size BATCH_SIZE X SEQ_LENGTH X VOCAB_SIZE (no activation)
    logits = tf.contrib.layers.fully_connected(rnn_outputs, vocab_size, activation_fn=None)

    # The RNN is supposed to predict the next character, i.e. the next 1 hot
    # vector over the vocabulary: the prediction at step t is compared with
    # the input at step t + 1, so the last prediction and the first input
    # have no counterpart and are dropped.
    predicted = logits[:, :-1]
    labels = one_hot_seq[:, 1:]

    # `softmax_cross_entropy_with_logits` expects unscaled logits, since it
    # performs a `softmax` on `logits` internally for efficiency. Do not call
    # it with the output of `softmax`, as it will produce incorrect results.
    step_losses = tf.nn.softmax_cross_entropy_with_logits(logits=predicted, labels=labels)

    # the loss is the sum of the cross entropy over every position of the batch
    loss = tf.reduce_sum(step_losses)

    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)

    return loss, train_step, initial_state_placeholder, out_state, logits

### Char Generator 
Generate the next character given an input character.

In [7]:
def create_generator(logits):
    """
    Build the op that samples the next character id from the model output.

    :param logits: the logits of the output of the RNN model; only the last
                   time step of every sequence is used
    :return: tuple (sample, temp_placeholder)
             sample: one sampled id per item in the batch
             temp_placeholder: float placeholder for the sampling temperature
    """
    temp_placeholder = tf.placeholder(tf.float32, name="temperature")

    # temperature-scaled scores of the last time step
    last_step_logits = logits[:, -1] / temp_placeholder

    # tf.multinomial takes unnormalized log-probabilities and applies the
    # softmax itself, so the scaled logits are passed as they are.
    # Exponentiating them first (as was done before) samples from
    # softmax(exp(logits / T)) instead of softmax(logits / T).
    sample = tf.multinomial(last_step_logits, 1)[:, 0]

    return sample, temp_placeholder

## Training and Generation functions 
### Create Training functions for the RNN Model

In [8]:
# Print the loss and a generated sample once every LOG_EVERY batches.
LOG_EVERY = 40


def train_model(datafeeder, seq_placeholder, init_state_placeholder, out_state, loss, optimizer, sample, temp_placeholder):
    """
    Run one pass over `datafeeder`, applying the optimizer on every batch.

    :param datafeeder: iterable of batches fed into `seq_placeholder`
    :param seq_placeholder: placeholder receiving a batch of character ids
    :param init_state_placeholder: passed through to generate_sentence
    :param out_state: passed through to generate_sentence
    :param loss: loss tensor evaluated together with the optimizer
    :param optimizer: training op run on every batch
    :param sample: passed through to generate_sentence
    :param temp_placeholder: passed through to generate_sentence
    :return: None; the session is closed when the function returns
    """
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for iteration, batch in enumerate(datafeeder):
            _, batch_loss = sess.run([optimizer, loss], feed_dict={seq_placeholder: batch})

            # only report on every LOG_EVERY-th batch
            if (iteration + 1) % LOG_EVERY != 0:
                continue

            print('iter{} : loss{}'.format(iteration, batch_loss))
            print(generate_sentence(sess, seq_placeholder, init_state_placeholder, sample, out_state, temp_placeholder))

### Create Generation function of the RNN Model

In [9]:
# Sampling temperature fed to the generator on every step.
TEMPRATURE = 0.7
# Default number of characters to generate.
LEN_GENERATED = 300


def generate_sentence(sess, seq_placeholder, init_state_placeholder, sample, out_state, temp_placeholder, len_generated=LEN_GENERATED, vocab=vocab):
    """
    Generate text one character at a time, feeding each sampled character
    back in as the next input.

    :param sess: session used to run the graph
    :param seq_placeholder: placeholder receiving the current input character
    :param init_state_placeholder: placeholder(s) for the input state of the rnn
    :param sample: op sampling from the distribution of the output
    :param out_state: output state of the rnn, fed back in on the next step
    :param temp_placeholder: placeholder for the sampling temperature
    :param len_generated: number of sampling steps to run
    :param vocab: vocabulary used to encode the input and decode the samples
    :return: the generated text; the initial seed character is not included
    """
    sentence = ""

    # todo : select a random seed from the vocabulary instead of a fixed one
    seed = "T"

    # no state before the first step
    state = None

    for _ in range(len_generated):
        # a batch holding a single sequence: the encoded seed
        feed = {
            seq_placeholder: [vocab_encode(seed, vocab)],
            temp_placeholder: TEMPRATURE,
        }

        # From the second step on, carry the previous output state over.
        # On the first step nothing is fed for the state, so whatever default
        # the graph defines for it applies.
        if state is not None:
            feed[init_state_placeholder] = state

        index, state = sess.run([sample, out_state], feed_dict=feed)
        sentence += vocab_decode(index, vocab)

        # the last generated character becomes the next input
        seed = sentence[-1]

    return sentence

## RUN the RNN MODEL 

In [10]:
### Variables
# Learning rate handed to the optimizer.
LR = 3e-3
# Hidden state size of each RNN layer, one entry per layer.
STATE_SIZES = [200]

In [11]:
### Run the whole model
DATA_PATH = '../data/arvix_abstracts.txt'

# Lazy input pipeline: nothing is read from disk until training iterates it.
datafeeder = read_batch(read_data(filename=DATA_PATH))

# Step counter handed to the optimizer; not a trainable variable.
global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

# Batch of character ids; both dimensions are left unspecified.
seq_placeholder = tf.placeholder(tf.int32, [None, None], name="sequence")

loss, optimizer, init_state_placeholder, out_state, logits = create_model(
    seq_placeholder=seq_placeholder,
    global_step=global_step,
    vocab_size=VOCAB_SIZE,
    state_sizes=STATE_SIZES,
    learning_rate=LR,
)

sample, temp_placeholder = create_generator(logits)

# train_model has no return statement, so `sess` ends up being None.
sess = train_model(
    datafeeder=datafeeder,
    seq_placeholder=seq_placeholder,
    init_state_placeholder=init_state_placeholder,
    out_state=out_state,
    loss=loss,
    optimizer=optimizer,
    sample=sample,
    temp_placeholder=temp_placeholder,
)

iter39 : loss9567.9296875
ITt                                                              e     e   e   e  e  e  e  e  e  e  e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e 
iter79 : loss8543.015625
$je the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
iter119 : loss7483.62744141
he the s an the the the the the s an the the the the s an the the s an the the the the the the s an the the the the the the the s an the the s an the the the s an the the the the the the the the the s an the the the s an the the s an the s an the the the s an the the the the the the the the s an the
iter159 : loss6994

KeyboardInterrupt: 