In [None]:
import numpy as np

from utils.utils import *
from utils.utils_nn import *

np.random.seed(seed())

import tensorflow as tf
tf.reset_default_graph()

import pandas as pd

import os

from tensorflow.contrib import rnn
from tensorflow.contrib.tensorboard.plugins import projector  # embeddings visualizer

# from collections import OrderedDict
# from math import ceil

import random
random.seed(seed())

# import matplotlib.pyplot as plt

# import re

In [None]:
# load the main (original) CSV data
x, y, n, main_data = init_data()
# frequencies of the first n rows of the 'CNT' column, as a plain list
freq = list(main_data['CNT'][:n])
# load the suggestions CSV data
x_suggest, y_suggest, freq_suggest = init_data_suggest()

In [None]:
# all settings of the experiment, grouped by purpose; the keys are later
# unpacked as keyword arguments, so their spelling must stay as is
kwargs_simple_lstm = nice_dict(dict(
    # log
    log_dir='logdir/',
    del_log=True,
    # preprocessing and data
    char_filter=100,
    scale_func=unscale,  # or: log_scale
    to_permute=True,
    label_count_thresh=10,  # threshold for train-test split
    valid_ratio=0.25,  # ratio for train-test split
    keep_rare_labels=False,
    top_k=5,
    seed=seed(),
    # learning hyper-params
    learn_rate=1E-1,  # 1E-4
    dynamic_learn_rate=True,
    bidirection=False,
    char_embed_dim=4,
    one_hot=False,
    hidden_state_size=32,
    keep_prob=0.7,
    l2_wieght_reg=1E-4,
    target_rep=True,
    target_rep_weight=0.1,
    epochs=200,
    summary_step=10,
    save_step=np.inf,  # i % inf is never 0, so no checkpoint is written
    verbose_summary=False,
))

# optionally start from an empty log directory
if kwargs_simple_lstm.del_log:
    remove_dir_content(kwargs_simple_lstm.log_dir)

In [None]:
# filter the characters according to 'char_filter', then bring every
# sequence to the same (max) length by padding with the 'unknown' character
x_char_filtered_pad, statistics_dict = text_filter_pad(
    text=x, y=y, **kwargs_simple_lstm)
# fold the newly calculated figures into the main settings dict
kwargs_simple_lstm = nice_dict({**kwargs_simple_lstm, **statistics_dict})

# transform x_suggest in the same manner; the settings dict passed here
# already carries the figures calculated above
x_suggest_char_filtered_pad, statistics_dict = text_filter_pad(
    text=x_suggest, y=y_suggest, **kwargs_simple_lstm)

# the suggestions must not make any "new" statistics pop up
assert_no_stats_change(new_dict=statistics_dict, kwargs=kwargs_simple_lstm)

# merge original and suggested data
x_merge = x_char_filtered_pad + x_suggest_char_filtered_pad
y_merge = y + y_suggest
freq_merge = freq + freq_suggest

In [None]:
# split the merged data into validation and training sets
(x_val, x_train,
 y_val, y_train,
 freq_val, freq_train,
 valid_index, statistics_dict) = train_validation_split(
    x=x_merge,
    y=y_merge,
    freq=freq_merge,
    label_count_thresh=kwargs_simple_lstm.label_count_thresh,
    valid_ratio=kwargs_simple_lstm.valid_ratio,
    keep_rare_labels=kwargs_simple_lstm.keep_rare_labels)
n_train = len(y_train)
n_test = len(y_val)
# fold the statistics returned by the split into the main settings dict
kwargs_simple_lstm = nice_dict({**kwargs_simple_lstm, **statistics_dict})

# look-up dictionaries (and their inverses) for an index representation
char_int, char_int_inv, label_int, label_int_inv = lookup_dicts_chars_labels(
    **kwargs_simple_lstm)

In [None]:
# scale the data (proportional to frequency); the third returned value is
# stored in the settings dict under 'n_train' / 'n_test'

# training data
x_train_scaled, y_train_scaled, kwargs_simple_lstm['n_train'] = scale_permute_data(
    x=x_train, y=y_train, freq=freq_train,
    scale_func=kwargs_simple_lstm.scale_func,
    to_permute=kwargs_simple_lstm.to_permute)

# validation data
x_val_scaled, y_val_scaled, kwargs_simple_lstm['n_test'] = scale_permute_data(
    x=x_val, y=y_val, freq=freq_val,
    scale_func=kwargs_simple_lstm.scale_func,
    to_permute=kwargs_simple_lstm.to_permute)

In [None]:
# aliases for the scaled sets, so that the cells below run unchanged
# regardless of which variant of the data is fed to the model
x_feed_train = x_train_scaled
y_feed_train = y_train_scaled
x_feed_val = x_val_scaled
y_feed_val = y_val_scaled

In [None]:
# index-transform the data into np.arrays that can be fed into the tf model
# (the middle return value is not used here)

# training data
X_train, _, Y_train = index_transorm_xy(
    x=x_feed_train, y=y_feed_train,
    char_int=char_int, label_int=label_int,
    **kwargs_simple_lstm)

# validation data
X_val, _, Y_val = index_transorm_xy(
    x=x_feed_val, y=y_feed_val,
    char_int=char_int, label_int=label_int,
    **kwargs_simple_lstm)

# write the metadata file for the embeddings visualizer; keep its path string
embed_vis_path = write_embeddings_metadata(
    log_dir=kwargs_simple_lstm.log_dir,
    dictionary=char_int,
    file_name='metadata.tsv')

In [None]:
class Lstm_model(object):
    """Character-level LSTM classifier built as a TensorFlow 1.x static graph.

    Constructing an instance resets the default graph, builds the whole model
    (embedding -> static (bi)LSTM -> per-step logits -> loss / metrics /
    optimizer), opens a session, initialises the variables and sets up a
    TensorBoard writer with an embedding projector config.

    The whole training set is fed at every step (full-batch training), so an
    "epoch" in `train` is a single optimizer step.
    """

    def __init__(self, 
                 *args, 
                 hparam_str, 
                 seq_len, 
                 n_class, 
                 n_char, 
                 char_embed_dim, 
                 one_hot, 
                 hidden_state_size, 
                 keep_prob, 
                 learn_rate, 
                 dynamic_learn_rate, 
                 bidirection, 
                 top_k, 
                 epochs, 
                 log_dir, 
                 embed_vis_path, 
                 summary_step, 
                 save_step, 
                 seed, 
                 l2_wieght_reg, 
                 target_rep, 
                 target_rep_weight, 
                 verbose_summary, 
                 feed_dict_train, 
                 feed_dict_test, 
                 **kwargs):
        """Build the graph, start the session and set up logging.

        All parameters are keyword-only; extra positional and keyword
        arguments are accepted and ignored so that a larger settings dict can
        be unpacked into the call.

        Args:
            hparam_str: name of the run; used as sub-directory of `log_dir`.
            seq_len: length of every (padded) input sequence.
            n_class: number of labels (width of the one-hot label rows).
            n_char: size of the character set (rows of the embedding matrix).
            char_embed_dim: embedding width; overwritten with `n_char` when
                `one_hot` is set.
            one_hot: use a constant identity matrix instead of a trainable
                embedding.
            hidden_state_size: number of units of each LSTM cell.
            keep_prob: dropout keep probability fed during training steps
                (evaluation always feeds 1.0).
            learn_rate: initial learning rate.
            dynamic_learn_rate: if true, plain gradient descent is used and
                `train` replaces the rate before every step via `update_lr`;
                otherwise Adam is used with the fixed `learn_rate`.
            bidirection: use a bidirectional LSTM.
            top_k: k of the "in top k" metric.
            epochs: number of training steps performed by `train`.
            log_dir: parent directory for summaries and checkpoints.
            embed_vis_path: path of the metadata file for the projector.
            summary_step: evaluate and write summaries every this many steps.
            save_step: save a checkpoint every this many steps.
            seed: seed for tf, numpy and `random`.
            l2_wieght_reg: weight of the L2 regularization term.
            target_rep: if true, logits are produced for every time step and
                the target-replication loss is mixed into the cost.
            target_rep_weight: weight of the target-replication loss (forced
                to 0.0 when `target_rep` is false).
            verbose_summary: also write histogram summaries.
            feed_dict_train: dict with keys 'x' and 'y' (training data).
            feed_dict_test: dict with keys 'x' and 'y' (validation data).
        """
        self.hparam_str = hparam_str
        self.seq_len = seq_len 
        self.n_class = n_class 
        self.n_char = n_char
        self.char_embed_dim = char_embed_dim
        self.one_hot = one_hot
        self.hidden_state_size = hidden_state_size
        # NOTE: self.keep_prob is the placeholder created below, not the float
        self.learn_rate = learn_rate
        self.dynamic_learn_rate = dynamic_learn_rate
        self.bidirection = bidirection
        self.top_k = top_k
        self.epochs = epochs
        self.log_dir = log_dir
        self.embed_vis_path = embed_vis_path
        self.summary_step = summary_step 
        self.save_step = save_step
        self.seed = seed
        self.l2_wieght_reg = l2_wieght_reg
        self.target_rep = target_rep
        self.verbose_summary = verbose_summary
        self.target_rep_weight = target_rep_weight if self.target_rep else 0.0
        self.embedding_matrix = None

        # clear tf graph and set seeds
        tf.reset_default_graph()
        tf.set_random_seed(self.seed)
        np.random.seed(self.seed)
        random.seed(self.seed)
        
        # Setup placeholders
        self.x_ = tf.placeholder(tf.int32, [None, self.seq_len], 
                            name='Examples')
        self.y_ = tf.placeholder(tf.int32, [None, self.n_class], 
                            name='Lables')
        self.keep_prob = tf.placeholder(tf.float32, [], 
                            name='Keep_probability')

        # feed dicts: training step (with dropout), evaluation on the
        # training data (no dropout) and evaluation on the validation data
        self.feed_dict_train = {self.x_: feed_dict_train['x'], 
                                self.y_: feed_dict_train['y'], 
                                self.keep_prob: keep_prob}

        self.feed_dict_train_eval = {**self.feed_dict_train, 
                                     self.keep_prob: 1.0}

        self.feed_dict_test = {self.x_: feed_dict_test['x'], 
                               self.y_: feed_dict_test['y'], 
                               self.keep_prob: 1.0}

        self.embedding_matrix = self.embed_matrix()

        self.outputs = self.lstm_unit(input=self.x_)
        with tf.name_scope('logits_seq'):
            # a bidirectional RNN concatenates forward and backward outputs
            if self.bidirection: logit_in_size = 2 * self.hidden_state_size
            else: logit_in_size = self.hidden_state_size
            # one (independent) logit layer per retained time step
            self.logits = [self.logit(input=out, 
                                      size_in=logit_in_size, 
                                      size_out=self.n_class) 
                           for out in self.outputs]

        with tf.name_scope('Cost_function'):
            # cross entropy loss with target replication and
            # regularization terms based on the weights' L2 norm
            with tf.name_scope('target_replication_loss'):
                self.cost_targetrep = tf.reduce_mean(
                    [tf.nn.softmax_cross_entropy_with_logits(
                        logits=log, labels=self.y_) 
                     for log in self.logits])
            with tf.name_scope('cross_entropy'):
                self.cost_crossent = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(
                        logits=self.logits[-1], labels=self.y_))
            with tf.name_scope('L2_norm_reg'):
                self.cost_l2reg = tf.reduce_mean([tf.nn.l2_loss(weight) 
                                                  for weight in tf.trainable_variables()])
            with tf.name_scope('total_cost'):
                self.cost = self.target_rep_weight * self.cost_targetrep + \
                    (1 - self.target_rep_weight) * self.cost_crossent + \
                    self.l2_wieght_reg * self.cost_l2reg
            # add summaries
            tf.summary.scalar('Total_cost_train', 
                              self.cost, collections=['train'])
            tf.summary.scalar('Total_cost_test', 
                              self.cost, collections=['test'])
            
        with tf.name_scope('Cost_function_additional_metrics'):
            tf.summary.scalar('Target_rep_cost_train', 
                              self.cost_targetrep, collections=['train'])
            tf.summary.scalar('Target_rep_cost_test', 
                              self.cost_targetrep, collections=['test'])
            tf.summary.scalar('Cross_entropy_train', 
                              self.cost_crossent, collections=['train'])
            tf.summary.scalar('Cross_entropy_test', 
                              self.cost_crossent, collections=['test'])
            tf.summary.scalar('L2_norm_train', 
                              self.cost_l2reg, collections=['train'])
            tf.summary.scalar('L2_norm_test', 
                              self.cost_l2reg, collections=['test'])            
            
        with tf.name_scope('Train'):
            # The learning rate enters the graph as a tensor so that the value
            # set by update_lr() actually reaches the optimizer; passing the
            # Python float would freeze it at its construction-time value.
            # Created here (after all randomly initialised ops) so that the
            # op-count based seeding of the earlier layers is not shifted.
            self.lr_ = tf.placeholder_with_default(
                tf.constant(self.learn_rate, dtype=tf.float32), 
                shape=[], 
                name='Learning_rate')
            if self.dynamic_learn_rate:
                self.optimizer = tf.train.GradientDescentOptimizer(self.lr_)
            else:
                self.optimizer = tf.train.AdamOptimizer(self.lr_)
            self.train_step = self.optimizer.minimize(self.cost)

        with tf.name_scope('Accuracy'):  # uses the last element of logits
            self.correct_prediction = tf.equal(tf.argmax(self.logits[-1], 1), tf.argmax(self.y_, 1))
            self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))
            tf.summary.scalar('Accuracy_train', self.accuracy, collections=['train'])
            tf.summary.scalar('Accuracy_test', self.accuracy, collections=['test'])

        with tf.name_scope('In_top_{}'.format(self.top_k)):  # uses the last element of logits
            self.y_targets = tf.argmax(self.y_, 1)
            self.top_k_res = tf.reduce_mean(tf.cast(
                tf.nn.in_top_k(self.logits[-1], self.y_targets, self.top_k), 
                tf.float32))
            tf.summary.scalar('In_top_{}_train'.format(self.top_k), self.top_k_res, collections=['train'])
            tf.summary.scalar('In_top_{}_test'.format(self.top_k), self.top_k_res, collections=['test'])

        # summaries per collection and saver object
        self.summ_train = tf.summary.merge_all('train')
        self.summ_test = tf.summary.merge_all('test')
        self.saver = tf.train.Saver()
        self.init_op = tf.global_variables_initializer()
        
        # init vars and setup writer; the run directory is built the same way
        # as the checkpoint path in train(), so both end up side by side even
        # when log_dir has no trailing separator
        self.sess = tf.Session()
        self.sess.run(self.init_op)
        self.writer = tf.summary.FileWriter(os.path.join(self.log_dir, self.hparam_str))
        self.writer.add_graph(self.sess.graph)
        
        # Add embedding tensorboard visualization
        self.config = projector.ProjectorConfig()
        self.embed = self.config.embeddings.add()
        self.embed.tensor_name = self.embedding_matrix.name
        self.embed.metadata_path = os.path.join(self.embed_vis_path)
        projector.visualize_embeddings(self.writer, self.config)
        
        
    def embed_matrix(self, stddev=0.1, name='embeddings'):
        """Create the character embedding matrix and store it on the instance.

        Without `one_hot` this is a trainable [n_char, char_embed_dim]
        variable drawn from a truncated normal; with `one_hot` it is a
        constant identity matrix and `char_embed_dim` is set to `n_char`.

        Args:
            stddev: standard deviation of the truncated normal initializer.
            name: name scope for the created ops.

        Returns:
            The embedding matrix tensor (also kept as `self.embedding_matrix`).
        """
        with tf.name_scope(name):
            if not self.one_hot:
                embedding_matrix = tf.get_variable(
                    'embedding_matrix', 
                    initializer=tf.truncated_normal([self.n_char, self.char_embed_dim], 
                                                    stddev=stddev, 
                                                    seed=self.seed), 
                    trainable=True)
            else:
                # creating a one-hot for each character corresponds to the identity matrix
                embedding_matrix = tf.constant(value=np.identity(self.n_char), 
                                               name='embedding_matrix', 
                                               dtype=tf.float32)
                self.char_embed_dim = self.n_char
            if self.verbose_summary:
                tf.summary.histogram('embedding_matrix', embedding_matrix, collections=['train'])
            self.embedding_matrix = embedding_matrix
            return self.embedding_matrix
        
        
    def lstm_unit(self, 
                  input, 
                  name='LSTM'):
        """Embed the index sequences and run them through a static (bi)LSTM.

        Args:
            input: int tensor of character indices, [batch, seq_len].
            name: name/variable scope; replaced by 'LSTM_bidir' when the
                model is bidirectional.

        Returns:
            A list of output tensors: one per time step when `target_rep` is
            set, otherwise a one-element list holding only the last output.
        """
        if self.bidirection: name= 'LSTM_bidir'
        with tf.name_scope(name):
            input = tf.nn.embedding_lookup(self.embedding_matrix, input)
            # reshaping
            # Permuting batch_size and n_steps
            input = tf.transpose(input, [1, 0, 2])
            # Reshaping to (n_steps*batch_size, n_input)
            input = tf.reshape(input, [-1, self.char_embed_dim])
            # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
            rnn_inputs = tf.split(input, self.seq_len, 0)

            cell_fw = rnn.BasicLSTMCell(num_units=self.hidden_state_size, 
                                     forget_bias=1.0)
            cell_fw = rnn.DropoutWrapper(cell_fw, 
                                         output_keep_prob=self.keep_prob, 
                                         seed=self.seed)
            
            if self.bidirection:
                # add another cell for backwards direction and a dropout wrapper
                cell_bw = rnn.BasicLSTMCell(num_units=self.hidden_state_size, 
                                         forget_bias=1.0)
                cell_bw = rnn.DropoutWrapper(cell_bw, 
                                             output_keep_prob=self.keep_prob, 
                                             seed=self.seed)
                outputs, _, _ = rnn.static_bidirectional_rnn(
                    cell_fw, cell_bw, rnn_inputs, dtype=tf.float32, scope=name)
            else:
                outputs, _ = rnn.static_rnn(cell_fw, rnn_inputs, dtype=tf.float32, scope=name)
            
            if not self.target_rep:  # take only last output (list for structure consistency)
                outputs = [outputs[-1]]
            if self.verbose_summary:
                tf.summary.histogram('outputs', outputs, collections=['train'])
            return outputs


    def logit(self, 
              input, 
              size_in, 
              size_out, 
              stddev=0.1, 
              name='logit'):
        """Create a fully connected layer `input @ W + B` with fresh variables.

        Args:
            input: 2-D float tensor whose second dimension is `size_in`.
            size_in: number of input features.
            size_out: number of output units.
            stddev: standard deviation of the truncated normal used for W.
            name: name scope for the created ops.

        Returns:
            The logits tensor with `size_out` columns.
        """
        with tf.name_scope(name):
            w = tf.Variable(tf.truncated_normal([size_in, size_out], 
                                                stddev=stddev, 
                                                seed=self.seed), 
                            name='W')
            b = tf.Variable(tf.constant(0.1, 
                                        shape=[size_out]), 
                            name='B')
            logits = tf.matmul(input, w) + b
            if self.verbose_summary:
                tf.summary.histogram('weights', w, collections=['train'])
                tf.summary.histogram('biases', b, collections=['train'])
                tf.summary.histogram('logits', logits, collections=['train'])
            return logits
    
    
    def _evaluate(self, feed_dict, summary_op, step):
        """Run the metrics and `summary_op` on `feed_dict`, log the summary.

        Returns:
            Tuple (accuracy, cost, top_k fraction).
        """
        accuracy, cost, top_k, summary = self.sess.run(
            [self.accuracy, self.cost, self.top_k_res, summary_op], 
            feed_dict=feed_dict)
        self.writer.add_summary(summary, step)
        return accuracy, cost, top_k
    
        
    def train(self):
        """Run `epochs` full-batch training steps.

        With `dynamic_learn_rate` the learning rate is recomputed before each
        step and fed into the graph. Every `summary_step` steps the model is
        evaluated (dropout disabled) on the training and validation data,
        summaries are written and the metrics printed. Every `save_step`
        steps a checkpoint is saved under `log_dir/hparam_str`.
        """
        print('Starting to train model {:s}'.format(self.hparam_str))
        for i in range(1, self.epochs+1):
            # update learning rate, if it is dynamic
            if self.dynamic_learn_rate: self.update_lr(epoch=i)
            # train step; the current learning rate is fed explicitly so that
            # the update above takes effect in the optimizer
            self.sess.run(self.train_step, 
                          feed_dict={**self.feed_dict_train, 
                                     self.lr_: self.learn_rate})
            if i % self.summary_step == 0:
                # train summary (keep probability set to 1.0)
                train_accuracy, train_cost, train_top_k = self._evaluate(
                    feed_dict=self.feed_dict_train_eval, 
                    summary_op=self.summ_train, 
                    step=i)
                print('{:.3f} of observations in the top is {}'.format(train_top_k, self.top_k))
                # test summary
                test_accuracy, test_cost, _ = self._evaluate(
                    feed_dict=self.feed_dict_test, 
                    summary_op=self.summ_test, 
                    step=i)
                
                print('Epoch number {}, '.format(i) +
                      'training accuracy is {:.5f} and '.format(train_accuracy) + 
                      'test accuracy is {:.5f}, '.format(test_accuracy))
                print('training cost is {:.5f} and '.format(train_cost) + 
                      'test cost is {:.5f} and '.format(test_cost))
                
            # with save_step == inf the remainder is never 0: nothing is saved
            if i % self.save_step == 0:
                print('Saving step {}'.format(i))
                self.saver.save(self.sess, os.path.join(self.log_dir, 
                                                        self.hparam_str, 
                                                        'model.ckpt'), i)
            
        print('Training the model is done! ({:s})'.format(self.hparam_str))
    
    
    def restore(self, cp_path, feed_dict = None):
        """Load the latest checkpoint found in `cp_path` into the session.

        Args:
            cp_path: directory holding the checkpoint state.
            feed_dict: currently unused; kept for interface compatibility.

        Raises:
            Exception: if no checkpoint is found in `cp_path`.
        """
        print('Loading variables from {:s}'.format(cp_path))

        ckpt = tf.train.get_checkpoint_state(cp_path)
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            raise Exception("no checkpoint found")

        print('Loading successful')
    
    def close_session(self):
        """Flush and close the summary writer, then close the session."""
        # closing the writer flushes pending events and releases the file
        self.writer.close()
        self.sess.close()
    
    def update_lr(self, epoch):
        """Set the learning rate to 1 / sqrt(epoch) (epoch counted from 1).

        The configured initial `learn_rate` is overwritten; `train` feeds the
        new value into the graph at the next step.
        """
        self.learn_rate = 1.0 / np.sqrt(epoch)

In [None]:
# kwargs_feed_dict_train = {'x': X_train, 'y': Y_train}
# kwargs_feed_dict_test = {'x': X_val, 'y': Y_val}

# hparam_str = make_hparam_string(**kwargs_simple_lstm)

# lstm = Lstm_model(hparam_str=hparam_str, 
#                   embed_vis_path=embed_vis_path, 
#                   feed_dict_train=kwargs_feed_dict_train, 
#                   feed_dict_test=kwargs_feed_dict_test, 
#                   **{**kwargs_simple_lstm, 
#                      **{'epochs': 100, 'dynamic_learn_rate': True}})

# lstm.train()
# lstm.close_session()

In [None]:
# topk= tf.nn.top_k(lstm.logits, 
#                   k=lstm.n_class, 
#                   sorted=True)
# topk.indices

# mod_y_targets = lstm.n_class - 1 - lstm.y_targets
# mod_y_targets

In [None]:
# lstm.restore(cp_path=os.path.join(lstm.log_dir, hparam_str))

In [None]:
# lstm.sess.run(lstm.outputs, feed_dict=lstm.feed_dict_train_eval)

In [None]:
# trying out a LOT of hyper-parameters configurations: every combination of
# the value lists below is built, trained and closed in turn
kwargs_feed_dict_train = {'x': X_train, 'y': Y_train}
kwargs_feed_dict_test = {'x': X_val, 'y': Y_val}

lstm_models = {}  # run name -> model (sessions are closed after training)
for dynamic_learn_rate, learn_rate in [(True, 0.1)] + list(
    zip([False] * 2 ,list(np.logspace(-1, -2, 2)))):
    for keep_prob in [0.5, 0.7, 1.0]:
        for one_hot, char_embed_dim in [(True, 4)] + list(zip([False] * 1 , [4])):
            for hidden_state_size in [8, 32]:
                # base settings overridden by the current combination
                current_kw_simple_lstm = {
                    **kwargs_simple_lstm, 
                    **{'dynamic_learn_rate': dynamic_learn_rate, 
                       'learn_rate': learn_rate, 
                       'keep_prob': keep_prob, 
                       'one_hot': one_hot, 
                       'char_embed_dim': char_embed_dim, 
                       'hidden_state_size': hidden_state_size
                      }}
                # the tf graph is reset inside Lstm_model.__init__
                # create hyperparameter string
                hparam_str = make_hparam_string(**current_kw_simple_lstm)
                var = 'lstm_{}'.format(hparam_str)
                lstm_models[var] = Lstm_model(feed_dict_train=kwargs_feed_dict_train, 
                                              feed_dict_test=kwargs_feed_dict_test, 
                                              hparam_str=hparam_str, 
                                              embed_vis_path=embed_vis_path, 
                                              **current_kw_simple_lstm)
                # release the session even if training fails part-way, so a
                # crashed run does not leave it open for the rest of the sweep
                try:
                    lstm_models[var].train()
                finally:
                    lstm_models[var].close_session()

In [None]:
# getting data directly from a tensorboard log dir
# NOTE(review): this import sits mid-notebook and targets an internal TF
# module path — presumably tied to the installed TF version; confirm
from tensorflow.python.summary import event_multiplexer
# specify path (for parent log dir)
# NOTE(review): this is not the 'logdir/' used by the training cells above —
# presumably logs of an earlier experiment; confirm the directory exists
log_parent_dir = './logdir_exper_4_3/'
ea = event_multiplexer.EventMultiplexer().AddRunsFromDirectory(log_parent_dir)
ea.Reload()  # load

# names of the immediate sub-directories of the parent log dir (one per run)
child_dir = next(os.walk(log_parent_dir))[1]
# NOTE(review): Lstm_model in this notebook writes this scalar as
# 'Accuracy_test' under the name scope 'Accuracy'; the lower-case tag below
# may only match logs written by a different version — confirm
print(ea.Scalars(child_dir[0], 'accuracy/accuracy_test'))  # specify run, scalar_name