In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

import os

from tensorflow.contrib import rnn

# from sklearn.feature_extraction import DictVectorizer
# from sklearn import svm
# from sklearn.metrics import accuracy_score  # gt, pred

from utils.utils import user_opt_gen, nice_dict, seed, init_data, pcp1, pcp2, pcp3, pcp4
from utils.utils_baseline_svm import filter_dict_by_val_atleast, char_freq_map

# from collections import Counter
# from math import isnan

# import matplotlib.pyplot as plt

# import re

In [2]:
def remove_dir_content(path):
    """Delete `path` recursively if it exists and report the outcome on stdout.

    Args:
        path: location handed to tf.gfile.Exists / tf.gfile.DeleteRecursively.
    """
    if not tf.gfile.Exists(path):
        print('Log directory was not found.')
        return

    tf.gfile.DeleteRecursively(path)
    print('Log directory was deleted.')


# pad a list to max_length with the pad_symbol
def pad_list(*, input_list, max_length, pad_symbol):
    """Return a new list: `input_list` followed by `pad_symbol` up to `max_length`.

    A list that is already `max_length` long (or longer) comes back as an
    unpadded copy, because a non-positive repeat count yields no padding.
    """
    n_missing = max_length - len(input_list)
    padding = [pad_symbol] * n_missing
    return input_list + padding


def reset_graph():
    """Close the notebook-level `sess` (if one exists and is truthy), then
    replace the default TensorFlow graph with a fresh one."""
    active_session = globals().get('sess')
    if active_session:
        active_session.close()
    tf.reset_default_graph()
    

def index_to_dense(index, length):
    """Build a one-hot list of floats: `length` zeros with a 1.0 at `index`.

    Plain list indexing is used, so a negative `index` counts from the end
    and an out-of-range `index` raises IndexError.
    """
    dense = [0.0 for _ in range(length)]
    dense[index] = 1.0
    return dense

In [3]:
# Load the data set through the project helper. Judging from how the results
# are used further down, x holds the input lines, y the labels and n the number
# of examples (it is used as the batch dimension) -- TODO confirm against
# utils.utils.init_data. The fourth return value is discarded.
x, y, n, _ = init_data()

# Seed NumPy's global RNG with the project-wide seed.
# NOTE(review): this runs after init_data(), so any NumPy randomness used inside
# init_data() is not covered by this seed -- confirm that is intended.
np.random.seed(seed())

In [12]:
# Options for the simple character-level LSTM experiment. The whole mapping is
# later splatted into lstm_simple_model(); keys that function does not name
# explicitly (del_log, char_filter, batch_size, activate_bool) end up in its
# **kwargs and are not read there.
kwargs_simple_lstm = nice_dict(dict(
    # logging
    log_dir='logdir/',
    del_log=True,
    # preprocessing and data
    char_filter=100,   # minimum number of occurrences for a character to be kept
    n=n,
    batch_size=n,      # full-batch: one batch holds every example
    # learning hyper-parameters
    # NOTE(review): a step size of 1 is very large for Adam; the original
    # inline comment suggested 1E-4 -- confirm which value is intended.
    learn_rate=1,
    char_embed_dim=4,
    one_hot=False,
    hidden_state_size=4,
    activate_bool=True,
    keep_prob=1.0,
    epochs=500,
    summary_step=5,
    save_step=100,
))

# Start from an empty log directory when requested.
if kwargs_simple_lstm.del_log:
    remove_dir_content(kwargs_simple_lstm.log_dir)

Log directory was not found.


In [5]:
# Characters that appear at least 'char_filter' times in the input are kept;
# every other character is replaced by a single "unknown" symbol below.
filter_keys_chars = list(
    filter_dict_by_val_atleast(
        input_dict=char_freq_map(input_data=x),
        value=kwargs_simple_lstm.char_filter)
    .keys())
# Hash-based copy of the kept characters: the filtering pass tests membership
# once per input character, which is a linear scan on a list.
frequent_chars = frozenset(filter_keys_chars)

# One list of characters per input line.
x_char = [list(line) for line in x]

# Replace every character that is not frequent enough with 'unknown'.
unknown = '<unk-char>'
x_char_filtered = [
    [char if char in frequent_chars else unknown for char in line]
    for line in x_char
]

# Pad every line with 'pad' up to the length of the longest input line, so
# that all lines have the same length.
max_line_len = max(len(line) for line in x)
pad = '<pad-char>'
x_char_filtered_pad = [
    pad_list(input_list=line,
             max_length=max_line_len,
             pad_symbol=pad)
    for line in x_char_filtered
]

In [6]:
# Label statistics: the distinct labels as returned by y.unique(), and their count.
label_set = y.unique()
n_label = len(label_set)

# Character vocabulary of the filtered *and padded* input, i.e. every distinct
# symbol found in 'x_char_filtered_pad', and its size.
char_set = {char for line in x_char_filtered_pad for char in line}
n_char = len(char_set)

In [13]:
# Extend the option set with the sizes derived from the data: padded sequence
# length, number of classes and vocabulary size. Existing keys are carried over.
kwargs_simple_lstm = nice_dict({
    **kwargs_simple_lstm,
    'seq_len': max_line_len,
    'n_class': n_label,
    'n_char': n_char,
})

In [8]:
# Lookup tables character -> integer id and back.
# 'char_set' is a set of strings, and the iteration order of such a set is not
# stable between interpreter runs (string hashes are randomised per process
# unless PYTHONHASHSEED is fixed). Enumerating the *sorted* vocabulary gives
# every character the same id on every run, so seeded runs are comparable.
vocabulary = sorted(char_set)
char_int = {char: i for i, char in enumerate(vocabulary)}
char_int_inv = dict(enumerate(vocabulary))

# Same for the labels; 'label_set' keeps the order in which it was built.
label_int = {label: i for i, label in enumerate(label_set)}
label_int_inv = dict(enumerate(label_set))
# Y is a list of single-element lists, one [label id] per example.
Y = [[label_int[label]] for label in y]

# Convert to the array form that is fed to TensorFlow:
# X      -> one row of character ids per (padded) input line
# Y_dense -> one one-hot row of length n_class per example
X = np.array([[char_int[char] for char in line]
              for line in x_char_filtered_pad])
Y_dense = np.array(
    [index_to_dense(label[0],
                    kwargs_simple_lstm.n_class) for label in Y])

In [14]:
def embed_matrix(index_size, 
                 embedding_dim, 
                 one_hot, 
                 stddev=0.1, 
                 seed=seed(), 
                 name="embedding_matrix"):
    """Create the character embedding matrix and attach a histogram summary.

    Args:
        index_size: number of rows, i.e. the size of the character set.
        embedding_dim: number of columns of the trainable matrix; not used
            when `one_hot` is true.
        one_hot: if true, return a constant float32 identity matrix of shape
            [index_size, index_size] (one one-hot row per character);
            otherwise return a trainable variable initialised from a
            truncated normal distribution.
        stddev: standard deviation of the truncated normal initialiser.
        seed: op-level seed of the initialiser. The default is computed once,
            when the function is defined.
        name: name scope in which the ops are created.

    Returns:
        The embedding matrix tensor/variable.
    """
    with tf.name_scope(name):
        if one_hot:
            # A one-hot code per character is exactly the identity matrix.
            embedding_matrix = tf.constant(value=np.identity(index_size),
                                           name='embedding_matrix',
                                           dtype=tf.float32)
        else:
            initial_values = tf.truncated_normal([index_size, embedding_dim],
                                                 stddev=stddev,
                                                 seed=seed)
            embedding_matrix = tf.get_variable('embedding_matrix',
                                               initializer=initial_values,
                                               trainable=True)

        tf.summary.histogram('embedding_matrix', embedding_matrix)

    return embedding_matrix


def lstm_unit(input, 
              embeddings, 
              seq_length, 
              hidden_state_size, 
              keep_prob, 
              seed=seed(), 
              name='LSTM'):
    """Run a single-layer LSTM over embedded id sequences; return the last output.

    Args:
        input: integer tensor of ids; as called in this notebook it is a
            [batch, seq_length] placeholder.
        embeddings: embedding matrix the ids are looked up in.
        seq_length: number of time steps (size of axis 1 of `input`).
        hidden_state_size: number of units of the LSTM cell.
        keep_prob: output keep probability of the dropout wrapper.
        seed: seed handed to the dropout wrapper. The default is computed
            once, when the function is defined.
        name: name scope in which the ops are created.

    Returns:
        The output of the last time step of the unrolled RNN.
    """
    with tf.name_scope(name):
        embedded = tf.nn.embedding_lookup(embeddings, input)
        # One tensor per time step, obtained by removing exactly the time axis.
        # A split followed by an axis-less tf.squeeze would also collapse the
        # batch or embedding axis whenever one of them has size 1.
        rnn_inputs = tf.unstack(embedded, num=seq_length, axis=1)

        cell = rnn.BasicLSTMCell(num_units=hidden_state_size)
        cell = rnn.DropoutWrapper(cell,
                                  output_keep_prob=tf.constant(keep_prob),
                                  seed=seed)

        # The final cell state is not needed here.
        outputs, _ = rnn.static_rnn(cell, rnn_inputs, dtype=tf.float32)
        last_output = outputs[-1]
        tf.summary.histogram('outputs', last_output)
        return last_output
        

def logit(*, 
          input, 
          size_in, 
          size_out, 
          stddev=0.1, 
          seed=seed(), 
          name='logit'):
    """Affine output layer: `input @ W + B`, with histogram summaries.

    Args:
        input: 2-D tensor whose last dimension is `size_in`.
        size_in: number of input features.
        size_out: number of output units.
        stddev: standard deviation of the truncated normal used to initialise W.
        seed: op-level seed of that initialiser. The default is computed
            once, when the function is defined.
        name: name scope in which the ops are created.

    Returns:
        The logits tensor (no activation is applied).
    """
    with tf.name_scope(name):
        initial_weights = tf.truncated_normal([size_in, size_out],
                                              stddev=stddev,
                                              seed=seed)
        weights = tf.Variable(initial_weights, name='W')
        # Every bias starts at 0.1.
        biases = tf.Variable(tf.constant(0.1, shape=[size_out]), name='B')
        logits = tf.matmul(input, weights) + biases

        for tag, tensor in (('weights', weights),
                            ('biases', biases),
                            ('logits', logits)):
            tf.summary.histogram(tag, tensor)

    return logits
                        

def lstm_simple_model(feed_dict, 
                      hparam_str, 
                      n, 
                      seq_len, 
                      n_class, 
                      n_char, 
                      char_embed_dim, 
                      one_hot, 
                      hidden_state_size, 
                      keep_prob, 
                      learn_rate, 
                      epochs, 
                      log_dir, 
                      summary_step, 
                      save_step, *args, **kwargs):
    """Build, train and log a character-level LSTM classifier (full batch).

    The graph is: embedding lookup -> LSTM (last output) -> affine layer ->
    softmax cross-entropy, optimised with Adam. Summaries are written to
    `log_dir + hparam_str`; checkpoints go to `<log_dir>/model.ckpt-<step>`.

    Args:
        feed_dict: mapping with the keys 'x' (character ids, fed to an int32
            placeholder of shape [n, seq_len]) and 'y' (one-hot labels, fed
            to an int32 placeholder of shape [n, n_class]).
        hparam_str: run name, appended to `log_dir` for the summary writer.
        n: number of examples per fed batch (static batch dimension).
        seq_len: length of every (padded) input sequence.
        n_class: number of classes.
        n_char: size of the character vocabulary.
        char_embed_dim: embedding width (ignored by embed_matrix when one_hot).
        one_hot: use a constant one-hot embedding instead of a trained one.
        hidden_state_size: number of LSTM units.
        keep_prob: dropout keep probability on the LSTM output.
        learn_rate: Adam learning rate.
        epochs: the loop runs for steps 0..epochs inclusive.
        log_dir: base directory for summaries and checkpoints.
        summary_step: evaluate, print and write summaries every this many steps.
        save_step: save a checkpoint every this many steps.
        *args, **kwargs: accepted and ignored, so that a larger option
            mapping can be splatted into the call.

    Returns:
        None.
    """
    tf.reset_default_graph()
    sess = tf.Session()
    writer = None

    # The session and the summary writer hold resources (and buffered events);
    # release them even if building or training the model fails.
    try:
        # NOTE(review): no graph-level seed is set here (the call below was
        # already disabled), so ops without an explicit op-level seed are
        # presumably not reproducible between runs -- confirm.
        # tf.set_random_seed(seed())

        # Placeholders with a fully static shape: the whole data set is one batch.
        x_ = tf.placeholder(tf.int32, [n, seq_len])
        y_ = tf.placeholder(tf.int32, [n, n_class])

        embedding_matrix = embed_matrix(index_size=n_char,
                                        embedding_dim=char_embed_dim,
                                        one_hot=one_hot)
        outputs = lstm_unit(input=x_,
                            embeddings=embedding_matrix,
                            hidden_state_size=hidden_state_size,
                            keep_prob=keep_prob,
                            seq_length=seq_len)

        logits = logit(input=outputs,
                       size_in=hidden_state_size,
                       size_out=n_class)

        with tf.name_scope('cross_entropy'):
            cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits, labels=y_), name='cross_entropy')
            tf.summary.scalar('cross_entropy', cost)

        with tf.name_scope('train'):
            train_step = tf.train.AdamOptimizer(
                learn_rate).minimize(cost)

        with tf.name_scope('accuracy'):
            correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(y_, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            tf.summary.scalar('accuracy', accuracy)

        summ = tf.summary.merge_all()
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(log_dir + hparam_str)
        writer.add_graph(sess.graph)

        # Bind the caller's data to the placeholders. The data comes from the
        # `feed_dict` argument, not from a notebook-level global.
        placeholder_feed = {x_: feed_dict['x'],
                            y_: feed_dict['y']}

        for i in range(epochs + 1):
            if i % summary_step == 0:
                # Evaluate (before this step's update) for logging purposes.
                [train_accuracy, train_cost, s] = sess.run(
                    [accuracy, cost, summ], feed_dict=placeholder_feed)
                writer.add_summary(s, i)
                print('Iteration number {}, '.format(i) +
                      'accuracy is {:.5f} and '.format(train_accuracy) + 
                      'cost is {:.5f}'.format(train_cost))
            if i % save_step == 0:
                saver.save(sess, os.path.join(log_dir, "model.ckpt"), i)
            sess.run(train_step, feed_dict=placeholder_feed)
    finally:
        if writer is not None:
            writer.close()
        sess.close()

    print('Training is done!')

In [15]:
# Train on the full data set: the encoded inputs and the one-hot labels are
# bundled under the keys 'x' and 'y' and bound to a notebook-level name.
kwargs_feed_dict = dict(x=X, y=Y_dense)

# All remaining settings come from the option mapping built above.
lstm_simple_model(hparam_str='testrun',
                  feed_dict=kwargs_feed_dict,
                  **kwargs_simple_lstm)

Iteration number 0, accuracy is 0.00197 and cost is 6.32400
Iteration number 5, accuracy is 0.03895 and cost is 5.92907
Iteration number 10, accuracy is 0.03895 and cost is 6.18828
Iteration number 15, accuracy is 0.02416 and cost is 5.81941
Iteration number 20, accuracy is 0.03895 and cost is 5.82102
Iteration number 25, accuracy is 0.02416 and cost is 5.79322
Iteration number 30, accuracy is 0.04043 and cost is 5.76409
Iteration number 35, accuracy is 0.03994 and cost is 5.74919
Iteration number 40, accuracy is 0.04093 and cost is 5.74058
Iteration number 45, accuracy is 0.04043 and cost is 5.73562
Iteration number 50, accuracy is 0.04093 and cost is 5.73277
Iteration number 55, accuracy is 0.04093 and cost is 5.73014
Iteration number 60, accuracy is 0.04093 and cost is 5.72813
Iteration number 65, accuracy is 0.04093 and cost is 5.72675
Iteration number 70, accuracy is 0.04093 and cost is 5.72578
Iteration number 75, accuracy is 0.04093 and cost is 5.72509
Iteration number 80, accur

In [None]:
# kwargs_tf_simple.log_dir
# os.path.join(os.path.curdir + '/logdir/')

In [None]:
### MNIST EMBEDDINGS ###
# urllib is not provided by the notebook's import cell; without this import
# the downloads below fail with a NameError.
import urllib.request

# NOTE(review): kwargs_tf_simple (read here for .log_dir and .GIST_URL) is not
# defined in any visible cell of this notebook -- confirm where it comes from
# before running this cell on a fresh kernel.
mnist = tf.contrib.learn.datasets.mnist.read_data_sets(train_dir=kwargs_tf_simple.log_dir + 'data', one_hot=True)
### Get a sprite and labels file for the embedding projector ###
# Each asset is fetched from GIST_URL and stored under the same file name in log_dir.
for asset_name in ('labels_1024.tsv', 'sprite_1024.png'):
    urllib.request.urlretrieve(kwargs_tf_simple.GIST_URL + asset_name,
                               kwargs_tf_simple.log_dir + asset_name)

pcp1()

def conv_layer(input, size_in, size_out, name="conv"):
    """5x5 convolution (stride 1, SAME) + bias + ReLU, then 2x2 max pooling.

    Args:
        input: 4-D input tensor for tf.nn.conv2d.
        size_in: number of input channels.
        size_out: number of output channels.
        name: name scope in which the ops are created.

    Returns:
        The max-pooled activations (2x2 window, stride 2, SAME padding).
    """
    with tf.name_scope(name):
        kernel = tf.Variable(tf.truncated_normal([5, 5, size_in, size_out], stddev=0.1), name="W")
        bias = tf.Variable(tf.constant(0.1, shape=[size_out]), name="B")

        convolved = tf.nn.conv2d(input, kernel, strides=[1, 1, 1, 1], padding="SAME")
        activation = tf.nn.relu(convolved + bias)

        for tag, tensor in (("weights", kernel),
                            ("biases", bias),
                            ("activations", activation)):
            tf.summary.histogram(tag, tensor)

        pooled = tf.nn.max_pool(activation, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
    return pooled


def fc_layer(input, size_in, size_out, name="fc"):
    """Fully connected layer with ReLU: relu(input @ W + B), plus histograms.

    Args:
        input: 2-D tensor whose last dimension is `size_in`.
        size_in: number of input features.
        size_out: number of output units.
        name: name scope in which the ops are created.

    Returns:
        The ReLU activations.
    """
    with tf.name_scope(name):
        weights = tf.Variable(tf.truncated_normal([size_in, size_out], stddev=0.1), name="W")
        biases = tf.Variable(tf.constant(0.1, shape=[size_out]), name="B")
        activation = tf.nn.relu(tf.matmul(input, weights) + biases)

        for tag, tensor in (("weights", weights),
                            ("biases", biases),
                            ("activations", activation)):
            tf.summary.histogram(tag, tensor)
    return activation


def mnist_model(learning_rate, use_two_conv, use_two_fc, hparam):
    """Build and train a small MNIST convnet, logging summaries and an
    embedding for the TensorBoard projector.

    Reads the notebook-level names `mnist` (data set) and `kwargs_tf_simple`
    (for `.log_dir`).

    Args:
        learning_rate: Adam learning rate.
        use_two_conv: if true, stack two conv layers (1->32->64 channels);
            otherwise use one conv layer (1->64) followed by an extra max pool.
        use_two_fc: if true, use a 1024-unit hidden fc layer before the
            10-unit output layer and embed its activations; otherwise embed
            the flattened conv output.
        hparam: run name, appended to the log directory for the summary writer.

    Returns:
        None.
    """
    tf.reset_default_graph()
    sess = tf.Session()
    writer = None

    # The session and the summary writer hold resources (and buffered events);
    # release them even if building or training the model fails.
    try:
        tf.set_random_seed(seed())

        # Setup placeholders, and reshape the data
        x = tf.placeholder(tf.float32, shape=[None, 784], name="x")
        x_image = tf.reshape(x, [-1, 28, 28, 1])
        tf.summary.image('input', x_image, 3)
        y = tf.placeholder(tf.float32, shape=[None, 10], name="labels")

        if use_two_conv:
            conv1 = conv_layer(x_image, 1, 32, "conv1")
            conv_out = conv_layer(conv1, 32, 64, "conv2")
        else:
            conv1 = conv_layer(x_image, 1, 64, "conv")
            # conv_layer pools once; pool a second time so that both branches
            # end with 7x7 feature maps.
            conv_out = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

        flattened = tf.reshape(conv_out, [-1, 7 * 7 * 64])

        # NOTE(review): fc_layer applies a ReLU, so the logits below are
        # clamped to be non-negative; a linear output layer is the usual
        # choice -- consider changing.
        if use_two_fc:
            fc1 = fc_layer(flattened, 7 * 7 * 64, 1024, "fc1")
            embedding_input = fc1
            embedding_size = 1024
            logits = fc_layer(fc1, 1024, 10, "fc2")
        else:
            embedding_input = flattened
            embedding_size = 7*7*64
            logits = fc_layer(flattened, 7*7*64, 10, "fc")

        with tf.name_scope("xent"):
            xent = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits, labels=y), name="xent")
            tf.summary.scalar("xent", xent)

        with tf.name_scope("train"):
            train_step = tf.train.AdamOptimizer(learning_rate).minimize(xent)

        with tf.name_scope("accuracy"):
            correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            tf.summary.scalar("accuracy", accuracy)

        summ = tf.summary.merge_all()

        # Variable that stores the embedded representation of 1024 test images.
        embedding = tf.Variable(tf.zeros([1024, embedding_size]), name="test_embedding")
        assignment = embedding.assign(embedding_input)
        saver = tf.train.Saver()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(kwargs_tf_simple.log_dir + hparam)
        writer.add_graph(sess.graph)

        config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
        embedding_config = config.embeddings.add()
        embedding_config.tensor_name = embedding.name
        embedding_config.sprite.image_path = kwargs_tf_simple.log_dir + 'sprite_1024.png'
        embedding_config.metadata_path = kwargs_tf_simple.log_dir + 'labels_1024.tsv'
        # Specify the width and height of a single thumbnail.
        embedding_config.sprite.single_image_dim.extend([28, 28])
        tf.contrib.tensorboard.plugins.projector.visualize_embeddings(writer, config)

        pcp3()

        for i in range(1000 + 1):
            batch = mnist.train.next_batch(100)
            if i % 5 == 0:
                [train_accuracy, s] = sess.run([accuracy, summ], feed_dict={x: batch[0], y: batch[1]})
                writer.add_summary(s, i)
                print('Iteration number {}, Accuracy is currently {}'.format(i, train_accuracy))
            if i % 500 == 0:
                # Refresh the stored embedding from the first 1024 test
                # images, then checkpoint it together with the model.
                sess.run(assignment, feed_dict={x: mnist.test.images[:1024], y: mnist.test.labels[:1024]})
                saver.save(sess, os.path.join(kwargs_tf_simple.log_dir, "model.ckpt"), i)
            sess.run(train_step, feed_dict={x: batch[0], y: batch[1]})
    finally:
        if writer is not None:
            writer.close()
        sess.close()

def make_hparam_string(learning_rate, use_two_fc, use_two_conv):
    """Return a run label of the form "lr_<rate>,conv=<1|2>,fc=<1|2>".

    The learning rate is rendered in exponent notation without decimals
    (for example 1E-04); note that the conv part comes before the fc part.
    """
    conv_param = ("conv=1", "conv=2")[bool(use_two_conv)]
    fc_param = ("fc=1", "fc=2")[bool(use_two_fc)]
    return "lr_%.0E,%s,%s" % (learning_rate, conv_param, fc_param)

def main():
    """Train one MNIST model per combination of the hyper-parameters listed below."""

#     tf.set_random_seed(seed())

    # You can try adding some more learning rates
    for learning_rate in [1E-4]:

        # Include "False" as a value to try different model architectures
        for use_two_fc in [False]:
            for use_two_conv in [True]:

                pcp2()

                # Construct a hyperparameter string for each one (example: "lr_1E-03,conv=2,fc=2")
                hparam = make_hparam_string(learning_rate, use_two_fc, use_two_conv)
                print('Starting run for %s' % hparam)

                # Actually run with the new settings. The flags are passed by
                # keyword: mnist_model takes use_two_conv *before* use_two_fc,
                # so passing them positionally in (fc, conv) order trains a
                # different architecture than the one named in `hparam`.
                mnist_model(learning_rate,
                            use_two_conv=use_two_conv,
                            use_two_fc=use_two_fc,
                            hparam=hparam)


# Run the MNIST hyper-parameter sweep when this code is executed as the main
# module.
if __name__ == '__main__':
    main()