In [84]:
import tensorflow as tf
import numpy as np
import pandas as pd

import os

# from sklearn.feature_extraction import DictVectorizer
# from sklearn import svm
# from sklearn.metrics import accuracy_score  # gt, pred

from utils.utils import user_opt_gen, nice_dict, seed, pcp1, pcp2, pcp3, pcp4
from utils.utils_baseline_svm import filter_dict_by_val_atleast, char_freq_map

# from collections import Counter
# from math import isnan

# import matplotlib.pyplot as plt

# import re

In [85]:
def remove_dir_content(path):
    if tf.gfile.Exists(path):
#         tf.gfile.DeleteRecursively(path)
        print(path)

In [86]:
user_opt = user_opt_gen()

main_data = pd.read_csv(user_opt['data_path'], 
                         sep=';', 
                         header=0, 
                         encoding='cp850')

# only observations with ATC labels
main_data_labeled = main_data.loc[[isinstance(k, str) for k in main_data['ATC']],:]

In [87]:
# import urllib

n = len(main_data_labeled)

x = main_data_labeled['FREETXT'][:n]
y = main_data_labeled['ATC'][:n]

kwargs_tf_simple = nice_dict({'log_dir': 'logdir/', 
                              'GIST_URL': 'https://gist.githubusercontent.com/dandelionmane/4f02ab8f1451e276fea1f165a20336f1/raw/dfb8ee95b010480d56a73f324aca480b3820c180/', 
                              'del_log': True, 
                              'char_filter': 100})

if kwargs_tf_simple.del_log: remove_dir_content(kwargs_tf_simple.log_dir)

logdir/


In [88]:
# filter by character, appear at least 'char_filter' times in the input
filter_keys_chars = list(
    filter_dict_by_val_atleast(
        input_dict=char_freq_map(input_data=x), 
        value=kwargs_tf_simple.char_filter)
    .keys())

In [89]:
# create list of character lists
x_char = [list(line) for line in x]
x_char_filtered = []
unknown = '<unk-char>'
# replace chars not in 'filter_keys_chars' with 'unknown'
for line in x_char:
    x_char_filtered.append([char if (char in filter_keys_chars) else unknown for char in line])

# can check
# for ind in range(10):
#     print(x_char_filtered[ind])

In [90]:
n_labels = len(y.unique())
# number of unique characters iin input ('x_char_filtered')
n_char = len(set([char for line in x_char_filtered for char in line]))

In [96]:
def lstm_unit(*, 
              input, 
              size_in, 
              size_out, 
              name="LSTM", 
              activate = tf.nn.relu, 
              n_input, 
              n_steps, 
              n_hidden):
    # activate can be (commonly):
    # tf.sigmoid
    # tf.nn.relu
    # tf.tanh
    # tf.nn.relu6
    assert activate in [tf.nn.relu, 
                        tf.nn.relu6, 
                        tf.sigmoid, 
                        tf.tanh], 'Please choose activation function from the given set'
    with tf.name_scope(name):
        w = tf.Variable(tf.truncated_normal([size_in, size_out], stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0.1, shape=[size_out]), name="B")
        
        
        # Permuting batch_size and n_steps
        x = tf.transpose(input, [1, 0, 2])
        # Reshape to (n_steps*batch_size, n_input)
        x = tf.reshape(x, [-1, n_input])
        # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
        x = tf.split(0, n_steps, x)
        
#         lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0)
        lstm_fw_cell = tf.contrib.rnn.core_rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0)
        
        output, state = lstm_fw_cell(x, state, dtype=tf.float32)        
        
        lin_activations = tf.matmul(output[-1], w) + b
        act = activate(lin_activations)
        
#         act = activate(tf.matmul(input, w) + b)
        
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)
        tf.summary.histogram("activations", act)
        return act

    
def make_hparam_string(learning_rate, fc, conv):
    conv_param = 'conv={:d}'.format(conv)
    fc_param = 'conv={:d}'.format(fc)
#     return "lr_%.0E,%s,%s" % (learning_rate, conv_param, fc_param)
    return 'lr_{:.0E},{},{}'.format(learning_rate, conv_param, fc_param)


def simple_lstm_model(learn_rate, conv, lstm, hparam_str):
    tf.reset_default_graph()
    sess = tf.Session()
    
    tf.set_random_seed(seed())
    
    
# https://www.tensorflow.org/tutorials/recurrent
# https://www.tensorflow.org/programmers_guide/reading_data#preloaded_data
# https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup
# https://www.tensorflow.org/api_guides/python/nn#Embeddings
# https://github.com/dhwajraj/deep-siamese-text-similarity/blob/master/siamese_network.py

# source code, from lines 125, 141
# https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/ptb_word_lm.py

In [None]:
# kwargs_tf_simple.log_dir
# os.path.join(os.path.curdir + '/logdir/')

In [None]:
### MNIST EMBEDDINGS ###
mnist = tf.contrib.learn.datasets.mnist.read_data_sets(train_dir=kwargs_tf_simple.log_dir + 'data', one_hot=True)
### Get a sprite and labels file for the embedding projector ###
urllib.request.urlretrieve(kwargs_tf_simple.GIST_URL + 'labels_1024.tsv', kwargs_tf_simple.log_dir + 'labels_1024.tsv')
urllib.request.urlretrieve(kwargs_tf_simple.GIST_URL + 'sprite_1024.png', kwargs_tf_simple.log_dir + 'sprite_1024.png')

pcp1()

def conv_layer(input, size_in, size_out, name="conv"):
    with tf.name_scope(name):
        w = tf.Variable(tf.truncated_normal([5, 5, size_in, size_out], stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0.1, shape=[size_out]), name="B")
        conv = tf.nn.conv2d(input, w, strides=[1, 1, 1, 1], padding="SAME")
        act = tf.nn.relu(conv + b)
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)
        tf.summary.histogram("activations", act)
        return tf.nn.max_pool(act, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")


def fc_layer(input, size_in, size_out, name="fc"):
    with tf.name_scope(name):
        w = tf.Variable(tf.truncated_normal([size_in, size_out], stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0.1, shape=[size_out]), name="B")
        act = tf.nn.relu(tf.matmul(input, w) + b)
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)
        tf.summary.histogram("activations", act)
        return act


def mnist_model(learning_rate, use_two_conv, use_two_fc, hparam):
    tf.reset_default_graph()
    sess = tf.Session()
    
    tf.set_random_seed(seed())

    # Setup placeholders, and reshape the data
    x = tf.placeholder(tf.float32, shape=[None, 784], name="x")
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    tf.summary.image('input', x_image, 3)
    y = tf.placeholder(tf.float32, shape=[None, 10], name="labels")

    if use_two_conv:
        conv1 = conv_layer(x_image, 1, 32, "conv1")
        conv_out = conv_layer(conv1, 32, 64, "conv2")
    else:
        conv1 = conv_layer(x_image, 1, 64, "conv")
        conv_out = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

    flattened = tf.reshape(conv_out, [-1, 7 * 7 * 64])


    if use_two_fc:
        fc1 = fc_layer(flattened, 7 * 7 * 64, 1024, "fc1")
        embedding_input = fc1
        embedding_size = 1024
        logits = fc_layer(fc1, 1024, 10, "fc2")
    else:
        embedding_input = flattened
        embedding_size = 7*7*64
        logits = fc_layer(flattened, 7*7*64, 10, "fc")

    with tf.name_scope("xent"):
        xent = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=y), name="xent")
    tf.summary.scalar("xent", xent)

    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(xent)

    with tf.name_scope("accuracy"):
        correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        tf.summary.scalar("accuracy", accuracy)

    summ = tf.summary.merge_all()


    embedding = tf.Variable(tf.zeros([1024, embedding_size]), name="test_embedding")
    assignment = embedding.assign(embedding_input)
    saver = tf.train.Saver()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(kwargs_tf_simple.log_dir + hparam)
    writer.add_graph(sess.graph)

    config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
    embedding_config = config.embeddings.add()
    embedding_config.tensor_name = embedding.name
    embedding_config.sprite.image_path = kwargs_tf_simple.log_dir + 'sprite_1024.png'
    embedding_config.metadata_path = kwargs_tf_simple.log_dir + 'labels_1024.tsv'
    # Specify the width and height of a single thumbnail.
    embedding_config.sprite.single_image_dim.extend([28, 28])
    tf.contrib.tensorboard.plugins.projector.visualize_embeddings(writer, config)

    pcp3()
    
    for i in range(1000 + 1):
        batch = mnist.train.next_batch(100)
        if i % 5 == 0:
            
#             pcp4()
            
            [train_accuracy, s] = sess.run([accuracy, summ], feed_dict={x: batch[0], y: batch[1]})
            writer.add_summary(s, i)
            print('Iteration number {}, Accuracy is currently {}'.format(i, train_accuracy))
        if i % 500 == 0:
            sess.run(assignment, feed_dict={x: mnist.test.images[:1024], y: mnist.test.labels[:1024]})
            saver.save(sess, os.path.join(kwargs_tf_simple.log_dir, "model.ckpt"), i)
        sess.run(train_step, feed_dict={x: batch[0], y: batch[1]})

def make_hparam_string(learning_rate, use_two_fc, use_two_conv):
    conv_param = "conv=2" if use_two_conv else "conv=1"
    fc_param = "fc=2" if use_two_fc else "fc=1"
    return "lr_%.0E,%s,%s" % (learning_rate, conv_param, fc_param)

def main():
    
#     tf.set_random_seed(seed())
    
    # You can try adding some more learning rates
    for learning_rate in [1E-4]:

        # Include "False" as a value to try different model architectures
        for use_two_fc in [False]:
            for use_two_conv in [True]:
                # Construct a hyperparameter string for each one (example: "lr_1E-3,fc=2,conv=2)
                
                pcp2()
                
                hparam = make_hparam_string(learning_rate, use_two_fc, use_two_conv)
                print('Starting run for %s' % hparam)

                # Actually run with the new settings
                mnist_model(learning_rate, use_two_fc, use_two_conv, hparam)


if __name__ == '__main__':
    main()

In [None]:
print('Done!')

In [None]:
# Iteration number 925, Accuracy is currently 0.5
# Iteration number 930, Accuracy is currently 0.6000000238418579
# Iteration number 935, Accuracy is currently 0.5699999928474426
# Iteration number 940, Accuracy is currently 0.550000011920929
# Iteration number 945, Accuracy is currently 0.5400000214576721
# Iteration number 950, Accuracy is currently 0.5799999833106995
# Iteration number 955, Accuracy is currently 0.6200000047683716
# Iteration number 960, Accuracy is currently 0.5199999809265137
# Iteration number 965, Accuracy is currently 0.5400000214576721
# Iteration number 970, Accuracy is currently 0.6700000166893005
# Iteration number 975, Accuracy is currently 0.5899999737739563
# Iteration number 980, Accuracy is currently 0.46000000834465027
# Iteration number 985, Accuracy is currently 0.5699999928474426
# Iteration number 990, Accuracy is currently 0.5899999737739563
# Iteration number 995, Accuracy is currently 0.550000011920929
# Iteration number 1000, Accuracy is currently 0.5799999833106995