In [None]:
#standard libraries
import csv
import os
import time
import re
from functools import reduce

#custom libraries
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def read_file_format(filename_queue):
    """Read one CSV line from the queue and return its second column.

    The first line of each file is skipped as a header. Every row must have
    exactly two fields: a float (discarded) and an int32. Neither column has
    a default value, so a missing field is an error.

    Args:
        filename_queue: queue of file names, as consumed by
            ``tf.TextLineReader.read``.

    Returns:
        The int32 column wrapped by ``tf.stack`` into a one-element tensor.
    """
    line_reader = tf.TextLineReader(skip_header_lines=1)
    # The record key returned alongside the line is not needed.
    csv_line = line_reader.read(filename_queue)[1]

    # Empty default tensors mark both columns as required.
    float_column = tf.constant([], dtype=tf.float32)
    int_column = tf.constant([], dtype=tf.int32)
    columns = tf.decode_csv(csv_line, record_defaults=[float_column, int_column])

    # Only the second (integer) column is used as the example.
    return tf.stack([columns[1]])

In [None]:
def input_pipeline(filenames, batch_size = 3, seq_length=3,
                   num_epochs = None, evaluation = False):   
    """Build queue-based input tensors for next-token prediction.

    Rows are read in file order (no shuffling), ``batch_size * seq_length``
    at a time, and reshaped to ``(batch_size, seq_length)``.

    Args:
        filenames: list of CSV paths in the layout expected by
            ``read_file_format``.
        batch_size: number of sequences per batch.
        seq_length: number of tokens per sequence.
        num_epochs: passes over ``filenames``; ``None`` cycles forever.
        evaluation: accepted for interface compatibility; currently unused.

    Returns:
        ``(example_batch, label_batch)``, both shaped
        ``(batch_size, seq_length)``. ``label_batch`` is ``example_batch``
        shifted left by one token. The final position has no successor inside
        the batch, so it repeats the last token.
    """
    filename_queue = tf.train.string_input_producer(
        filenames, num_epochs=num_epochs, shuffle=False)

    example = read_file_format(filename_queue)

    # One dequeue pulls this many single-token rows.
    tokens_per_batch = batch_size * seq_length

    # Size the queue from the number of rows actually dequeued per batch.
    # It was previously derived from batch_size alone, which is smaller than
    # a single dequeue.
    min_after_dequeue = 10
    capacity = min_after_dequeue + 3 * tokens_per_batch
    example_batch = tf.train.batch(
        [example], batch_size=tokens_per_batch, capacity=capacity
    )

    # example_batch is (tokens_per_batch, 1). Labels are the inputs shifted
    # left by one, padded with the last token. The previous order put the last
    # token first, which made label[i] == input[i] for every i >= 1.
    label_batch = tf.concat(
        [example_batch[1:, 0], example_batch[-1]],
        axis=0)

    example_batch = tf.reshape(example_batch, (batch_size, seq_length))
    label_batch = tf.reshape(label_batch, (batch_size, seq_length))

    return example_batch, label_batch

In [None]:
def _add_loss_summaries(total_loss, averager=None, include_averaged_loss=False):
    """Register scalar summaries for every loss in the graph.

    The tracked losses are everything in the ``'losses'`` collection plus
    ``total_loss`` (added once, even if it is already in the collection).
    Each loss is reduced with ``tf.reduce_sum`` before being summarised, so
    non-scalar losses produce a valid scalar summary.

    Args:
        total_loss: loss tensor to summarise in addition to the collection.
        averager: object providing ``apply(list)`` and ``average(tensor)``
            (e.g. ``tf.train.ExponentialMovingAverage``). Required when
            ``include_averaged_loss`` is true.
        include_averaged_loss: when true, also maintain and summarise a
            moving average of each loss.

    Returns:
        The op that updates the loss moving averages when
        ``include_averaged_loss`` is true, otherwise ``total_loss``. The
        caller must run the returned op for the averages to change.

    Raises:
        ValueError: if ``include_averaged_loss`` is true and ``averager`` is
            ``None``.
    """
    if include_averaged_loss and averager is None:
        raise ValueError("averager must be provided when include_averaged_loss is True")

    tracked_losses = list(tf.get_collection('losses'))
    # Identity comparison: tensors are compared as graph nodes, not by value.
    # Without this guard a total_loss that is already in the collection is
    # summarised twice and passed twice to averager.apply().
    if not any(loss is total_loss for loss in tracked_losses):
        tracked_losses.append(total_loss)

    loss_averages_op = None
    if include_averaged_loss:
        loss_averages_op = averager.apply(tracked_losses)

    for l in tracked_losses:
        l_name = l.name.replace(":", "_")

        # The raw value is summarised exactly once. A second '_raw_' summary
        # used to be registered here with the unreduced tensor.
        tf.summary.scalar(l_name + '_raw_', tf.reduce_sum(l))
        if include_averaged_loss:
            tf.summary.scalar(l_name, tf.reduce_sum(averager.average(l)))

    if include_averaged_loss:
        return loss_averages_op
    else:
        return total_loss

In [None]:
def nll(y_, logits, vocab_size):
    """Summed softmax cross-entropy of ``logits`` against integer targets.

    Args:
        y_: target ids of any shape; cast to int32 and flattened.
        logits: unnormalised scores, one row per flattened target.
        vocab_size: depth of the one-hot encoding of the targets.

    Returns:
        The cross-entropy summed over all elements.
    """
    flat_targets = tf.reshape(tf.cast(y_, tf.int32), [-1])
    one_hot_targets = tf.one_hot(flat_targets, depth=vocab_size)
    per_target_loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=one_hot_targets, logits=logits)
    return tf.reduce_sum(per_target_loss, axis=None)
    

In [None]:
class LSTM_Cell:
    """Single LSTM layer whose weight matrix can be plain, noisy or supplied.

    ``args.weight_noise_type`` selects where the weight matrix ``W`` comes
    from:

    * ``None``: a trainable variable.
    * ``"static"``: a trainable variable plus truncated-normal noise.
    * ``"adaptive"``: ``current_weights`` reshaped to the weight shape.

    The state is a single tensor ``[c, h]`` of width ``2 * rnn_size``.
    """

    def __init__(self, args, scope_name, current_weights=None):
        """Create the cell's weights inside ``scope_name``.

        Args:
            args: settings object providing ``rnn_size``, ``vocab_size``,
                ``batch_size``, ``seq_length``, ``forget_bias``,
                ``weight_noise_type`` and, for noisy modes,
                ``weight_prior_variance``.
            scope_name: variable scope for ``W`` and ``b``.
            current_weights: flat tensor holding this layer's weights;
                required when ``weight_noise_type`` is ``"adaptive"``.

        Raises:
            ValueError: for an unrecognised ``weight_noise_type``, or for
                adaptive mode without ``current_weights``.
        """
        self.rnn_size = args.rnn_size
        self.num_proj = args.vocab_size
        # NOTE(review): __call__ multiplies [input, h_prev] by W, so the input
        # width must equal batch_size * seq_length and the sliced h_prev width
        # is vocab_size. With embeddings of width rnn_size this presumably
        # only lines up when batch_size * seq_length == rnn_size and
        # vocab_size <= rnn_size. Confirm before changing hyper-parameters.
        self.input_size = args.batch_size * args.seq_length
        self.state_size = self.rnn_size * 2
        # Captured here so __call__ does not read a module-level `args`.
        self.forget_bias = args.forget_bias

        weight_shape = [self.input_size + self.num_proj, 4 * self.rnn_size]

        with tf.variable_scope(scope_name):
            if args.weight_noise_type is None:
                self.W = tf.get_variable('W', weight_shape,
                                         tf.float32, tf.random_normal_initializer(), trainable=True)

            elif args.weight_noise_type == "static":
                self.W = tf.get_variable('W', weight_shape,
                                         tf.float32, tf.random_normal_initializer(), trainable=True)
                # NOTE(review): weight_prior_variance is used as a standard
                # deviation here. Confirm which one the setting is meant to be.
                weight_noise = tf.truncated_normal(weight_shape,
                                                   stddev=args.weight_prior_variance)
                self.W = self.W + weight_noise

            elif args.weight_noise_type == "adaptive":
                if current_weights is None:
                    raise ValueError("current_weights is required when "
                                     "weight_noise_type is 'adaptive'.")
                self.W = tf.reshape(current_weights, weight_shape)

            else:
                # Previously fell through and left self.W undefined.
                raise ValueError("Unrecognized value for weight_noise_type; " +
                                 "recognized values are: None, 'static', and 'adaptive'.")

            self.b = tf.get_variable('b', [1, self.rnn_size * 4], tf.float32,
                                     tf.constant_initializer(0.0), trainable=True)

    def __call__(self, i, state):
        """Advance the cell by one time step.

        Args:
            i: input tensor for this step.
            state: previous state ``[c, h]`` concatenated along axis 1.

        Returns:
            ``(h, state)``: the new hidden output and the new state.
        """
        self.c_prev = tf.slice(state, [0, 0], [-1, self.rnn_size])
        # Takes num_proj (= vocab_size) columns starting at offset rnn_size.
        self.h_prev = tf.slice(state, [0, self.rnn_size], [-1, self.num_proj])

        data = tf.concat([i, self.h_prev], 1)

        weighted = tf.matmul(data, self.W)

        self.i, self.j, self.f, self.o = tf.split(weighted, num_or_size_splits=4, axis=1)
        # The forget gate uses the constant forget_bias instead of a learned
        # bias, so the third slice of b is ignored.
        self.i_b, self.j_b, _, self.o_b = tf.split(self.b, num_or_size_splits=4, axis=1)

        self.c = (tf.sigmoid(self.f + self.forget_bias) * self.c_prev +
                  tf.sigmoid(self.i + self.i_b) * tf.tanh(self.j + self.j_b))
        self.h = tf.sigmoid(self.o + self.o_b) * tf.tanh(self.c)

        self.state = tf.concat([self.c, self.h], axis=1)
        return self.h, self.state

    def zero_state(self, batch_size, dtype):
        """Return an all-zero state of shape ``[batch_size, state_size]``."""
        return tf.zeros([batch_size, self.state_size], dtype=dtype)

In [None]:
class Model():
    """Multi-layer LSTM language model with optional weight noise.

    ``args.weight_noise_type`` selects how weights are obtained:

    * ``None``: ordinary trainable variables.
    * ``"static"``: trainable variables plus truncated-normal noise.
    * ``"adaptive"``: all LSTM, softmax and embedding weights are one sample
      from a learned diagonal Normal with mean ``mu`` and scale
      ``exp(S_hat)``.

    The graph inputs are the placeholders ``x`` (token ids) and ``y_``
    (target ids), both shaped ``[batch_size, seq_length]``.
    """

    def __init__(self, args):
        """Build the forward graph up to ``self.logits``.

        Args:
            args: settings object providing ``batch_size``, ``seq_length``,
                ``vocab_size``, ``rnn_size``, ``num_layers``,
                ``weight_noise_type`` and ``weight_prior_variance``, plus
                whatever ``LSTM_Cell`` reads.

        Raises:
            Exception: if ``args.weight_noise_type`` is not ``None``,
                ``"static"`` or ``"adaptive"``.
        """
        # Reject unknown modes before anything is added to the graph.
        if args.weight_noise_type not in (None, "static", "adaptive"):
            raise Exception("Unrecognized value for weight_noise_type; " +
                            "recognized values are: None, 'static', and 'adaptive'.")

        self.batch_size = args.batch_size
        self.seq_length = args.seq_length

        self.x = tf.placeholder(tf.float32, shape=[args.batch_size, args.seq_length])
        self.y_ = tf.placeholder(tf.float32, shape=[args.batch_size, args.seq_length])

        with tf.variable_scope("weights"):
            if args.weight_noise_type == "adaptive":
                # Sizes must match the weight shape used by LSTM_Cell.
                lstm_weight_size = (args.batch_size * args.seq_length + args.vocab_size) * (4 * args.rnn_size) * args.num_layers
                softmax_weight_size = (args.rnn_size * args.vocab_size)
                embedding_weight_size = (args.rnn_size * args.vocab_size)
                weight_size = lstm_weight_size + softmax_weight_size + embedding_weight_size

                prior_loc = [0.] * weight_size
                # NOTE(review): weight_prior_variance is passed as the second
                # positional argument of Normal, presumably its scale
                # (standard deviation). Confirm the intended meaning.
                prior_scale_diag = [args.weight_prior_variance] * weight_size
                self.weight_prior = tf.contrib.distributions.Normal(
                        prior_loc,
                        prior_scale_diag
                )

                # NOTE(review): S_hat starts at the prior scale but is
                # exponentiated below, so the initial posterior scale is
                # exp(weight_prior_variance) rather than the prior scale.
                # Confirm this is intended.
                S_hat = tf.get_variable("S_hat", initializer=prior_scale_diag, trainable=True)
                S = tf.exp(S_hat)

                mu = tf.get_variable("mu", initializer=prior_loc, trainable=True)

                self.weight_dist = tf.contrib.distributions.Normal(mu, S)

                self.weight_st = tf.contrib.bayesflow.stochastic_tensor.StochasticTensor(self.weight_dist)

                current_weights = tf.squeeze(self.weight_st, name="W")

                # Carve the flat sample into LSTM / softmax / embedding parts.
                lstm_W, softmax_W_flat, embedding_flat = tf.split(
                    current_weights,
                    num_or_size_splits=[lstm_weight_size,
                                        softmax_weight_size,
                                        embedding_weight_size])
                lstm_W = tf.split(lstm_W, num_or_size_splits=args.num_layers)

            else:
                lstm_W = [None for i in range(args.num_layers)]

            self.lstm_cells = []
            for i in range(args.num_layers):
                self.lstm_cells.append(LSTM_Cell(args, "lstm_{0}".format(i), lstm_W[i]))
            self.lstm = tf.contrib.rnn.MultiRNNCell(self.lstm_cells)

            self.initial_state = self.lstm.zero_state(args.batch_size, tf.float32)

            if args.weight_noise_type is None:
                self.softmax_W = tf.get_variable('softmax_W', [args.rnn_size, args.vocab_size],
                                            tf.float32, tf.random_normal_initializer(), trainable=True)
                self.embedding_mat = tf.get_variable('embedding', [args.vocab_size, args.rnn_size],
                                            tf.float32, tf.random_normal_initializer(), trainable=True)

            elif args.weight_noise_type == "static":
                softmax_W = tf.get_variable('softmax_W', [args.rnn_size, args.vocab_size],
                                            tf.float32, tf.random_normal_initializer(), trainable=True)
                softmax_noise = tf.truncated_normal([args.rnn_size, args.vocab_size],
                                                    stddev=args.weight_prior_variance)
                self.softmax_W = softmax_W + softmax_noise

                embedding_mat = tf.get_variable('embedding', [args.vocab_size, args.rnn_size],
                                            tf.float32, tf.random_normal_initializer(), trainable=True)
                embedding_noise = tf.truncated_normal([args.vocab_size, args.rnn_size],
                                                      stddev=args.weight_prior_variance)
                self.embedding_mat = embedding_mat + embedding_noise

            else:  # "adaptive" (validated above)
                self.softmax_W = tf.reshape(softmax_W_flat, [args.rnn_size, args.vocab_size])
                self.embedding_mat = tf.reshape(embedding_flat, [args.vocab_size, args.rnn_size])

            self.b = tf.get_variable('b', [args.vocab_size], tf.float32,
                                tf.constant_initializer(0.0), trainable=True)

        embedding_output = tf.nn.embedding_lookup(self.embedding_mat, tf.cast(self.x, tf.int32))

        # One [batch, rnn_size] tensor per time step.
        rnn_inputs = tf.split(axis=1, num_or_size_splits=self.seq_length, value=embedding_output)
        rnn_inputs = [tf.squeeze(x, [1]) for x in rnn_inputs]

        outputs, last_state = tf.contrib.legacy_seq2seq.rnn_decoder(rnn_inputs,
                                                                    self.initial_state,
                                                                    self.lstm,
                                                                    scope='lstm')
        output = tf.reshape(tf.concat(outputs,1), [-1, args.rnn_size])

        # Always project with self.softmax_W. The former local `softmax_W`
        # did not exist in the noise-free mode and held the un-noised variable
        # in static mode.
        self.logits = tf.matmul(output, self.softmax_W) + self.b

    @staticmethod
    def _clip_gradients(grads_and_vars, limit=1.):
        """Clip each gradient to ``[-limit, limit]``, leaving ``None`` gradients as they are."""
        return [(grad if grad is None else tf.clip_by_value(grad, -limit, limit), var)
                for grad, var in grads_and_vars]

    def train(self, args):
        """Add the loss, optimiser and summaries; return the training op.

        Relies on a module-level ``global_step`` variable and on the
        functions ``nll`` and ``_add_loss_summaries`` defined in this
        notebook.

        Args:
            args: same settings object given to ``__init__``; additionally
                reads ``learning_rate`` and ``compute_variable_averages``.

        Returns:
            A no-op that, when run, applies one optimisation step (and the
            moving-average updates if enabled).
        """
        if args.weight_noise_type == "adaptive":
            # NOTE(review): elbo() is documented as log_likelihood minus KL,
            # to be maximised. Here a negative log-likelihood is passed and
            # the result is minimised. Verify the sign convention and the
            # keep_batch_dim choice against the bayesflow docs.
            self.loss = tf.contrib.bayesflow.variational_inference.elbo(
                nll(self.y_, self.logits, args.vocab_size),
                {self.weight_st: self.weight_prior},
                keep_batch_dim=True
            )
        else:
            # None and "static" both use the plain sequence loss. "static"
            # has no stochastic tensor or prior to build an ELBO from. The
            # cast lives in a local so self.y_ stays the feedable placeholder.
            targets = tf.cast(self.y_, tf.int32)
            self.loss = tf.contrib.legacy_seq2seq.sequence_loss(
                logits=[self.logits],
                targets=[targets],
                weights=[tf.ones([args.batch_size * args.seq_length], dtype=tf.float32)]
            )

        tf.add_to_collection('losses', self.loss)
        total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')

        opt = tf.train.AdamOptimizer(args.learning_rate)

        if args.weight_noise_type in [None, "static"]:
            grads = opt.compute_gradients(self.loss)
            trunc_grads = self._clip_gradients(grads)
            apply_gradient_op = [opt.apply_gradients(trunc_grads, global_step=global_step)]

            for grad, var in trunc_grads:
                if grad is not None:
                    tf.summary.histogram(var.op.name + '/gradients', grad)

        else:
            with tf.variable_scope("weights", reuse=True):
                means = tf.get_variable("mu")
                variances = tf.get_variable("S_hat")
                biases = [tf.get_variable("b")]
                for i in range(args.num_layers):
                    with tf.variable_scope("lstm_{0}".format(i)):
                        biases.append(tf.get_variable("b"))

            # var_list must be a list of variables.
            var_grads = opt.compute_gradients(self.loss, var_list=[variances])
            self.trunc_var_grads = self._clip_gradients(var_grads)

            mean_grads = opt.compute_gradients(self.loss, var_list=[means])
            self.trunc_mean_grads = self._clip_gradients(mean_grads)

            # Bias gradients are applied unclipped, as before.
            self.bias_grads = opt.compute_gradients(self.loss, var_list=biases)

            # A single apply_gradients call, so global_step advances once per
            # training step. Three separate calls advanced it by three.
            apply_gradient_op = [opt.apply_gradients(
                self.trunc_mean_grads + self.trunc_var_grads + self.bias_grads,
                global_step=global_step)]

            for grad, var in self.trunc_mean_grads:
                if grad is not None:
                    tf.summary.histogram(var.op.name + '/mean_gradients', grad)
            for grad, var in self.trunc_var_grads:
                if grad is not None:
                    tf.summary.histogram(var.op.name + '/variance_gradients', grad)

        for var in tf.global_variables():
            tf.summary.histogram(var.op.name, var)

        if args.compute_variable_averages:
            moving_averager = tf.train.ExponentialMovingAverage(0.9, name='avg')
            variables_averages_op = moving_averager.apply(tf.trainable_variables())
            # The returned op updates the loss averages. It has to be part of
            # the training step or the averaged-loss summaries never change.
            loss_averages_op = _add_loss_summaries(total_loss, moving_averager,
                                                   args.compute_variable_averages)

            with tf.control_dependencies(
                    apply_gradient_op + [variables_averages_op, loss_averages_op]):
                self.train_op = tf.no_op(name='train')
        else:
            _add_loss_summaries(total_loss)
            with tf.control_dependencies(apply_gradient_op):
                self.train_op = tf.no_op(name='train')

        return self.train_op

    def sample(self):
        """Pair each target id with an id sampled from the model.

        Returns:
            An int64 tensor with one row per flattened target: column 0 is
            the target id, column 1 is a draw from the model's predicted
            distribution.
        """
        flat_y_ = tf.expand_dims(tf.reshape(self.y_, [-1]), 1)
        flat_y_ = tf.cast(flat_y_, tf.int64)
        # tf.multinomial expects unnormalised log-probabilities. Passing
        # softmax output would sample from softmax(softmax(logits)).
        samples = tf.multinomial(self.logits, 1)

        self.sampled_results = tf.concat([flat_y_, samples], axis=1)
        return self.sampled_results

In [None]:
# Directory that holds the input CSV files; the checkpoint directory is
# created beneath it.
data_path = "data"

# Vocabulary CSV, read without a header row.
vocab_file = "vocab1.csv"

# Training CSV consumed by input_pipeline().
train_file = "train1.csv"

# Sub-directory of data_path where model checkpoints are saved.
model_path = 'VanillaLSTM'

In [None]:
# Checkpoints are written to <data_path>/<model_path>
full_model_dir = os.path.join(data_path, model_path)

# Create the checkpoint directory if it does not exist yet
if not os.path.exists(full_model_dir):
    os.makedirs(full_model_dir)

In [None]:
# Load the vocabulary table. The file has no header row, so columns are
# labelled 0 and 1. Its row count sets vocab_size, and the training cell maps
# column 1 values to column 0 values when decoding samples.
vocab = pd.read_csv("{0}/{1}".format(data_path, vocab_file),
                    header=None)

In [None]:
class ArgStruct:
    """Attribute-style wrapper around a set of keyword arguments.

    ``ArgStruct(a=1).a == 1``; every keyword becomes an instance attribute.
    """

    def __init__(self, **entries):
        # Write straight into the instance dict, one key at a time.
        attributes = self.__dict__
        for key in entries:
            attributes[key] = entries[key]

In [None]:
# Hyper-parameters and paths for the run; wrapped into an ArgStruct below.
arg_dict = dict(
    # locations
    data_path=data_path,
    model_path=model_path,
    logdir='TF_Logs',
    # network size
    rnn_size=256,
    num_layers=2,
    vocab_size=len(vocab)+1,
    # batching
    batch_size=16,
    seq_length=16,
    num_epochs=1,
    # optimisation
    forget_bias=1.,
    learning_rate=0.0001,
    momentum=0.9,
    compute_variable_averages=True,
    # weight noise: None, "static" or "adaptive"
    weight_noise_type="adaptive",
    weight_prior_variance=0.05,
    # reporting cadence, in global steps
    save_every=1000,
    print_every=250,
)

In [None]:
# Expose the settings as attributes (args.rnn_size, args.batch_size, ...).
args = ArgStruct(**arg_dict)

In [None]:
# Build the graph, then train until the input queue is exhausted.
with tf.Graph().as_default():
    
    # Model.train() reads this variable through its module-level name.
    global global_step
    global_step = tf.Variable(0, name='global_step', trainable=False)
    
    example_feed, label_feed = input_pipeline(
        ["{0}/{1}".format(args.data_path, train_file)],
        batch_size=args.batch_size,
        seq_length=args.seq_length,
        num_epochs=args.num_epochs)
    
    # `with tf.Session()` installs the session as default and closes it on
    # exit, including when an exception escapes the block.
    with tf.Session() as sess:
        print("Adding model to graph.")
        model = Model(args=args)
        
        print("Adding training and sampling ops to graph.")
        train_op = model.train(args=args)
        sample_op = model.sample()
        
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        
        config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
        
        # NOTE(review): nothing in this notebook writes metadata.tsv, and
        # checkpoints are saved under full_model_dir rather than logdir.
        # Confirm the projector can locate both.
        embedding = config.embeddings.add()
        embedding.tensor_name = model.embedding_mat.name
        embedding.metadata_path = os.path.join(args.logdir, 'metadata.tsv')

        tf.contrib.tensorboard.plugins.projector.visualize_embeddings(writer, config)
        
        merged = tf.summary.merge_all()
        
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        sess.run(init_op)
        
        saver = tf.train.Saver(tf.global_variables())
        saver.save(sess, os.path.join(full_model_dir, "model.ckpt"), global_step.eval())
        
        coord = tf.train.Coordinator()  
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
                
        global queue_stopped
        queue_stopped = coord.should_stop()

        # Defined up front so the progress line never needs a bare `except`
        # to cover a missing value.
        duration = 0.0

        # The trace options never change, so build them once.
        # NOTE(review): a full trace is recorded on every step, which is
        # slow and grows the log quickly. Consider tracing only occasionally.
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

        try:
            while not coord.should_stop():
                try:
                    start_time = time.time()                

                    example_batch, label_batch = sess.run([example_feed,
                                                          label_feed])
                    feed = {model.x: example_batch,
                            model.y_: label_batch}

                    run_metadata = tf.RunMetadata()

                    result, summary = sess.run(
                        [train_op, merged],
                        feed_dict=feed,
                        options=run_options,
                        run_metadata=run_metadata
                    )

                    # Read the step counter once per iteration instead of
                    # issuing a session call for every use.
                    step = global_step.eval()

                    writer.add_summary(summary, step)
                    writer.add_run_metadata(run_metadata,
                                            tag="step{0}".format(step),
                                            global_step=step)

                    if step % args.print_every == 0:
                        latest_loss = sess.run([model.loss], feed_dict=feed)
                        summary_nums = (step, duration, np.mean(latest_loss))
                        print('Iteration: {0}, Last Step Duration: {1}, Loss: {2}'.format(*summary_nums))

                        results = sess.run([sample_op], feed_dict=feed)
                        results = pd.DataFrame(results[0]).T
                        recode_results = results.replace(vocab.set_index(1).to_dict().get(0))
                        # Append mode creates the file when it is missing, so
                        # no separate existence check is needed.
                        recode_results.to_csv("translation.txt", header=False, index=False, sep="\t", mode="a")

                    # Save the model
                    if step % args.save_every == 0:
                        model_file_name = os.path.join(full_model_dir, 'model.ckpt')
                        saver.save(sess, model_file_name, global_step=global_step)
                        print('Model Saved To: {}'.format(model_file_name))

                    queue_stopped = coord.should_stop()

                    duration = time.time() - start_time

                    writer.flush()

                except tf.errors.OutOfRangeError:
                    # The input queue ran dry: all epochs have been consumed.
                    print('Done training for %d epochs, %d steps.' % (args.num_epochs, global_step.eval()))
                    coord.request_stop()
                    queue_stopped = coord.should_stop()

                except tf.errors.InvalidArgumentError as e:
                    # Still stops the loop gracefully, as before, but the
                    # error is now shown instead of being reported as a
                    # normal end of training.
                    print('Stopping after InvalidArgumentError: {0}'.format(e))
                    print('Done training for %d epochs, %d steps.' % (args.num_epochs, global_step.eval()))
                    coord.request_stop()
                    queue_stopped = coord.should_stop()
        finally:
            # Always stop and join the queue threads and close the event
            # file, even if an unexpected exception ends the loop.
            coord.request_stop()
            coord.join(threads)
            writer.close()