In [None]:
# Imports and random-number seeding for the whole notebook.
# The order below is kept as-is on purpose: every seeding call uses seed(),
# which is not defined in this cell.
# NOTE(review): seed() is presumably provided by one of the star imports
# below -- confirm, and consider importing it by name.
import numpy as np

from utils.utils import *
from utils.utils_nn import *

# seed numpy's global RNG
np.random.seed(seed())

import tensorflow as tf
# start from an empty default graph, then set its graph-level seed
tf.reset_default_graph()
tf.set_random_seed(seed())

import pandas as pd

import os

from tensorflow.contrib import rnn
from tensorflow.contrib.tensorboard.plugins import projector  # embeddings visualizer

# currently unused imports, kept disabled:
# from collections import Counter
# from math import ceil

import random
# seed Python's built-in RNG
random.seed(seed())

# import matplotlib.pyplot as plt

# import re

In [None]:
# load the main (original) CSV data set
x, y, n, main_data = init_data()
# frequencies: the first n entries of the 'CNT' column, as a plain list
freq = list(main_data['CNT'][:n])
# load the suggestions CSV data set
x_suggest, y_suggest, freq_suggest = init_data_suggest()

In [None]:
# All settings of the simple LSTM experiment, gathered in one attribute-style dict.
kwargs_simple_lstm = nice_dict(dict(
    # --- logging ---
    log_dir='logdir/',
    del_log=True,
    # --- preprocessing and data ---
    char_filter=100,
    scale_func=unscale,  # alternative: log_scale
    to_permute=True,
    top_k=5,
    seed=seed(),
    # --- learning hyper-params ---
    learn_rate=0.1,  # alternative tried: 1E-4
    char_embed_dim=4,
    one_hot=False,
    hidden_state_size=8,
    keep_prob=0.7,
    epochs=20,
    summary_step=5,
    save_step=10,
))

# optionally wipe the content of the log directory before a new run
if kwargs_simple_lstm.del_log:
    remove_dir_content(kwargs_simple_lstm.log_dir)

In [None]:
# Filter characters according to 'char_filter', bring every sequence to the
# same (max) length and pad with the 'unknown' character.
x_char_filtered_pad, statistics_dict = text_filter_pad_to_index(
    text=x, y=y, **kwargs_simple_lstm)
# fold the newly calculated figures into the main settings dict
kwargs_simple_lstm = nice_dict({**kwargs_simple_lstm, **statistics_dict})

# look-up dictionaries (and their inverses) for the index representation
(char_int, char_int_inv,
 label_int, label_int_inv) = lookup_dicts_chars_labels(**kwargs_simple_lstm)

# apply the same transformation to x_suggest,
# taking the given character set into consideration
x_suggest_char_filtered_pad, statistics_dict = text_filter_pad_to_index(
    text=x_suggest, y=y_suggest, **kwargs_simple_lstm)

# make sure no "new" statistics pop out of the suggestions data
assert_no_stats_change(new_dict=statistics_dict, kwargs=kwargs_simple_lstm)

# merge original and suggested data
x_merge = x_char_filtered_pad + x_suggest_char_filtered_pad
y_merge = y + y_suggest
freq_merge = freq + freq_suggest

In [None]:
# Split the merged data into validation and training parts.
(x_val, x_train,
 y_val, y_train,
 freq_val, freq_train,
 valid_index) = train_validation_split(
    x=x_merge,
    y=y_merge,
    freq=freq_merge,
    label_count_thresh=10,
    valid_ratio=0.25)
# sizes of the two parts, before any scaling
n_train = len(y_train)
n_test = len(y_val)

In [None]:
# Scale the data (proportional to frequency); the third returned value of each
# call is stored in the settings dict under 'n_train' / 'n_test'.

# training part
(x_train_scaled,
 y_train_scaled,
 kwargs_simple_lstm['n_train']) = scale_permute_data(
    x=x_train,
    y=y_train,
    freq=freq_train,
    scale_func=kwargs_simple_lstm.scale_func,
    to_permute=kwargs_simple_lstm.to_permute)

# validation part
(x_val_scaled,
 y_val_scaled,
 kwargs_simple_lstm['n_test']) = scale_permute_data(
    x=x_val,
    y=y_val,
    freq=freq_val,
    scale_func=kwargs_simple_lstm.scale_func,
    to_permute=kwargs_simple_lstm.to_permute)

In [None]:
# Aliases for the scaled data, so the cells below run unchanged from here.
x_feed_train, y_feed_train = x_train_scaled, y_train_scaled
x_feed_val, y_feed_val = x_val_scaled, y_val_scaled

In [None]:
# Turn the data into np.arrays that can be fed into the tf model
# (the middle return value is not needed here).

# training data
X_train, _, Y_train = index_transorm_xy(
    x=x_feed_train,
    y=y_feed_train,
    char_int=char_int,
    label_int=label_int,
    **kwargs_simple_lstm)

# validation data
X_val, _, Y_val = index_transorm_xy(
    x=x_feed_val,
    y=y_feed_val,
    char_int=char_int,
    label_int=label_int,
    **kwargs_simple_lstm)

# write the metadata file for the embeddings visualizer; keep its path string
embed_vis_path = write_embeddings_metadata(
    log_dir=kwargs_simple_lstm.log_dir,
    dictionary=char_int,
    file_name='metadata.tsv')

In [None]:
class Lstm_model(object):
    """Character-level LSTM classifier built on the TensorFlow 1.x graph API.

    Creating an instance resets the default graph, opens a session and wires
    embedding lookup -> one dropout-wrapped LSTM cell -> linear logits.  It
    also attaches cross-entropy, accuracy and in-top-k metrics (each with a
    'train' and a 'test' summary), initialises all variables and registers
    the embedding matrix with the TensorBoard projector.  The training and
    validation data are captured once, at construction time, as feed dicts.
    """

    def __init__(self,
                 *args,
                 hparam_str,
                 seq_len,
                 n_class,
                 n_char,
                 char_embed_dim,
                 one_hot,
                 hidden_state_size,
                 keep_prob,
                 learn_rate,
                 top_k,
                 epochs,
                 log_dir,
                 embed_vis_path,
                 summary_step,
                 save_step,
                 seed,
                 feed_dict_train,
                 feed_dict_test,
                 **kwargs):
        """Build the graph, the session and the summary / checkpoint plumbing.

        All named parameters are keyword-only.  Extra positional and keyword
        arguments are accepted and ignored, so a complete settings dict can
        be unpacked into the call.

        Args:
            hparam_str: run identifier; names the sub-directory of
                ``log_dir`` that receives event files and checkpoints.
            seq_len: width of the int32 input placeholder and number of
                unrolled RNN steps.
            n_class: width of the label placeholder and of the logits.
            n_char: number of rows of the embedding matrix.
            char_embed_dim: number of embedding columns (replaced by
                ``n_char`` when ``one_hot`` is true).
            one_hot: if true, use a constant identity matrix instead of a
                trainable embedding.
            hidden_state_size: number of units of the LSTM cell.
            keep_prob: dropout keep probability fed with the training data
                (1.0 is always fed with the test data).
            learn_rate: learning rate of the Adam optimizer.
            top_k: k of the in-top-k metric.
            epochs: number of optimisation steps performed by ``train``.
            log_dir: root directory for logs.
            embed_vis_path: metadata path handed to the projector config.
            summary_step: write summaries every this many steps.
            save_step: save a checkpoint every this many steps.
            seed: seed for the graph and for the explicitly seeded ops.
            feed_dict_train: mapping with keys 'x' and 'y' (training data).
            feed_dict_test: mapping with keys 'x' and 'y' (validation data).
        """
        # clear the tf graph
        tf.reset_default_graph()
        # Resetting the graph also discards any graph-level seed set before,
        # so re-apply it; otherwise ops created without an explicit seed
        # (e.g. the LSTM cell weights) are not reproducible.
        tf.set_random_seed(seed)

        self.hparam_str = hparam_str
        self.seq_len = seq_len
        self.n_class = n_class
        self.n_char = n_char
        self.char_embed_dim = char_embed_dim
        self.one_hot = one_hot
        self.hidden_state_size = hidden_state_size
        self.learn_rate = learn_rate
        self.top_k = top_k
        self.epochs = epochs
        self.log_dir = log_dir
        self.embed_vis_path = embed_vis_path
        self.summary_step = summary_step
        self.save_step = save_step
        self.seed = seed

        # filled in by embed_matrix() below
        self.embedding_matrix = None

        self.sess = tf.Session()

        # placeholders for the inputs, the labels and the dropout keep probability
        self.x_ = tf.placeholder(tf.int32, [None, self.seq_len],
                                 name='Examples')
        self.y_ = tf.placeholder(tf.int32, [None, self.n_class],
                                 name='Lables')
        # note: self.keep_prob is the placeholder, not the numeric argument
        self.keep_prob = tf.placeholder(tf.float32, [],
                                        name='Keep_probability')

        # dropout is active only when the training feed dict is used
        self.feed_dict_train = {self.x_: feed_dict_train['x'],
                                self.y_: feed_dict_train['y'],
                                self.keep_prob: keep_prob}
        self.feed_dict_test = {self.x_: feed_dict_test['x'],
                               self.y_: feed_dict_test['y'],
                               self.keep_prob: 1.0}

        # model: embedding -> LSTM (last output) -> linear layer
        self.embedding_matrix = self.embed_matrix()
        self.outputs = self.lstm_unit(input=self.x_)
        self.logits = self.logit(input=self.outputs,
                                 size_in=self.hidden_state_size,
                                 size_out=self.n_class)

        with tf.name_scope('cross_entropy'):
            # The op is documented to need labels of the same float dtype as
            # the logits, so cast the integer label placeholder explicitly.
            self.cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits,
                    labels=tf.cast(self.y_, tf.float32)))
            self._train_test_scalar('cross_entropy', self.cost)

        with tf.name_scope('train'):
            self.train_step = tf.train.AdamOptimizer(
                self.learn_rate).minimize(self.cost)

        with tf.name_scope('accuracy'):
            self.correct_prediction = tf.equal(tf.argmax(self.logits, 1),
                                               tf.argmax(self.y_, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(self.correct_prediction, tf.float32))
            self._train_test_scalar('accuracy', self.accuracy)

        with tf.name_scope('in_top_{}'.format(self.top_k)):
            self.y_targets = tf.argmax(self.y_, 1)
            self.top_k_res = tf.reduce_mean(tf.cast(
                tf.nn.in_top_k(self.logits, self.y_targets, self.top_k),
                tf.float32))
            self._train_test_scalar('in_top_{}'.format(self.top_k),
                                    self.top_k_res)

        # summaries per collection and saver object
        self.summ_train = tf.summary.merge_all('train')
        self.summ_test = tf.summary.merge_all('test')
        self.saver = tf.train.Saver()

        # init vars and set up the writer; event files go to the same
        # directory train() uses for checkpoints, whether or not log_dir
        # ends with a path separator
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(
            os.path.join(self.log_dir, self.hparam_str))
        self.writer.add_graph(self.sess.graph)

        # register the embedding matrix with the TensorBoard projector
        self.config = projector.ProjectorConfig()
        self.embed = self.config.embeddings.add()
        self.embed.tensor_name = self.embedding_matrix.name
        self.embed.metadata_path = os.path.join(self.embed_vis_path)
        projector.visualize_embeddings(self.writer, self.config)

    def _train_test_scalar(self, name, tensor):
        """Add '<name>_train' and '<name>_test' scalar summaries of `tensor`."""
        tf.summary.scalar(name + '_train', tensor, collections=['train'])
        tf.summary.scalar(name + '_test', tensor, collections=['test'])

    def embed_matrix(self, stddev=0.1, name='embeddings'):
        """Create the character embedding matrix, store it and return it.

        A trainable [n_char, char_embed_dim] variable initialised from a
        truncated normal is used unless ``one_hot`` is set.  In that case the
        matrix is a constant identity and ``char_embed_dim`` is overwritten
        with ``n_char``.
        """
        with tf.name_scope(name):
            if not self.one_hot:
                embedding_matrix = tf.get_variable(
                    'embedding_matrix',
                    initializer=tf.truncated_normal(
                        [self.n_char, self.char_embed_dim],
                        stddev=stddev,
                        seed=self.seed),
                    trainable=True)
            else:
                # a one-hot vector per character is a row of the identity matrix
                embedding_matrix = tf.constant(value=np.identity(self.n_char),
                                               name='embedding_matrix',
                                               dtype=tf.float32)
                self.char_embed_dim = self.n_char

            tf.summary.histogram('embedding_matrix', embedding_matrix,
                                 collections=['train'])
            self.embedding_matrix = embedding_matrix
            return self.embedding_matrix

    def lstm_unit(self,
                  input,
                  name='LSTM'):
        """Embed `input`, run a statically unrolled LSTM, return its last output.

        Args:
            input: integer tensor of character indices; the constructor
                passes the [None, seq_len] input placeholder.
            name: name scope for the created ops.
        """
        with tf.name_scope(name):
            embedded = tf.nn.embedding_lookup(self.embedding_matrix, input)
            # swap the first two axes, so the step axis comes first
            step_major = tf.transpose(embedded, [1, 0, 2])
            # flatten to rows of size char_embed_dim
            flat = tf.reshape(step_major, [-1, self.char_embed_dim])
            # split along axis 0 into a list of seq_len tensors, one per step
            rnn_inputs = tf.split(flat, self.seq_len, 0)

            cell = rnn.BasicLSTMCell(num_units=self.hidden_state_size)
            cell = rnn.DropoutWrapper(cell,
                                      output_keep_prob=self.keep_prob,
                                      seed=self.seed)

            outputs, _ = rnn.static_rnn(cell, rnn_inputs, dtype=tf.float32)
            # only the output of the last step is used for classification
            last_output = outputs[-1]
            tf.summary.histogram('outputs', last_output, collections=['train'])
            return last_output

    def logit(self,
              input,
              size_in,
              size_out,
              stddev=0.1,
              name='logit'):
        """Linear layer ``input @ W + B`` producing the logits.

        W is a [size_in, size_out] variable initialised from a truncated
        normal; B is a [size_out] variable initialised to 0.1.
        """
        with tf.name_scope(name):
            w = tf.Variable(tf.truncated_normal([size_in, size_out],
                                                stddev=stddev,
                                                seed=self.seed),
                            name='W')
            b = tf.Variable(tf.constant(0.1,
                                        shape=[size_out]),
                            name='B')
            logits = tf.matmul(input, w) + b
            tf.summary.histogram('weights', w, collections=['train'])
            tf.summary.histogram('biases', b, collections=['train'])
            tf.summary.histogram('logits', logits, collections=['train'])
            return logits

    def _evaluate(self, summary_op, feed_dict, step):
        """Run the metrics and `summary_op` on `feed_dict`; log the summary.

        Returns:
            The tuple (accuracy, cost, in-top-k rate).
        """
        accuracy, cost, top_k_rate, summary = self.sess.run(
            [self.accuracy, self.cost, self.top_k_res, summary_op],
            feed_dict=feed_dict)
        self.writer.add_summary(summary, step)
        return accuracy, cost, top_k_rate

    def train(self):
        """Run ``epochs`` optimisation steps on the stored training feed dict.

        Every step feeds the whole training feed dict once.  Every
        ``summary_step`` steps the metrics are evaluated on the training feed
        dict (dropout keep probability as used for training) and on the test
        feed dict, written as summaries and printed.  Every ``save_step``
        steps a checkpoint is saved.  The session is left open afterwards.
        """
        print('Starting to train model {:s}'.format(self.hparam_str))

        checkpoint_prefix = os.path.join(self.log_dir,
                                         self.hparam_str,
                                         'model.ckpt')
        for i in range(1, self.epochs + 1):
            # train step
            self.sess.run(self.train_step, feed_dict=self.feed_dict_train)

            if i % self.summary_step == 0:
                # train summary
                train_accuracy, train_cost, train_top_k = self._evaluate(
                    self.summ_train, self.feed_dict_train, i)
                # the measured rate comes first, k second
                print('{:.3f} of observations are in the top {}'.format(
                    train_top_k, self.top_k))
                # test summary
                test_accuracy, test_cost, _ = self._evaluate(
                    self.summ_test, self.feed_dict_test, i)

                print('Epoch number {}, '.format(i) +
                      'training accuracy is {:.5f} and '.format(train_accuracy) +
                      'test accuracy is {:.5f}, '.format(test_accuracy))
                print('training cost is {:.5f} and '.format(train_cost) +
                      'test cost is {:.5f} and '.format(test_cost))

            if i % self.save_step == 0:
                print('Saving step {}'.format(i))
                self.saver.save(self.sess, checkpoint_prefix, i)

        # push buffered summaries to disk, so they are not lost
        self.writer.flush()
        print('Training the model is done! ({:s})'.format(self.hparam_str))

    def restore(self, cp_path, feed_dict = None):
        """Load the variables of the latest checkpoint found under `cp_path`.

        Args:
            cp_path: directory that holds the checkpoint state.
            feed_dict: accepted for compatibility; currently unused.

        Raises:
            Exception: if no checkpoint is found under `cp_path`.
        """
        print('Loading variables from {:s}'.format(cp_path))

        ckpt = tf.train.get_checkpoint_state(cp_path)
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            raise Exception("no checkpoint found")

        print('Loading successful')

    def close(self):
        """Close the event writer and the session held by this model."""
        self.writer.close()
        self.sess.close()

In [None]:
# kwargs_feed_dict_train = {'x': X_train, 'y': Y_train}
# kwargs_feed_dict_test = {'x': X_val, 'y': Y_val}

# hparam_str = make_hparam_string(**kwargs_simple_lstm)

# lstm = Lstm_model(hparam_str=hparam_str, 
#                   embed_vis_path=embed_vis_path, 
#                   feed_dict_train=kwargs_feed_dict_train, 
#                   feed_dict_test=kwargs_feed_dict_test, 
#                   **{**kwargs_simple_lstm, 
#                      **{'epochs': 20}})

# lstm.train()

In [None]:
# lstm.restore(cp_path=os.path.join(lstm.log_dir, hparam_str))

In [None]:
# with lstm.sess.as_default() as sess:
# with lstm.sess as sess:
#     with lstm.Graph().as_default():
#         print(lstm.logits.eval(feed_dict=lstm.feed_dict))
#     print(lstm.embedding_matrix.eval())

In [None]:
# Hyper-parameter sweep: build and train one model per configuration and keep
# every trained model in lstm_models, keyed by its hyper-parameter string.
kwargs_feed_dict_train = dict(x=X_train, y=Y_train)
kwargs_feed_dict_test = dict(x=X_val, y=Y_val)

lstm_models = {}
for learn_rate in list(np.logspace(-1, -2, 2)):
    for keep_prob in (0.7, 1.0):
        # (one_hot, char_embed_dim) pairs
        for one_hot, char_embed_dim in [(True, 4), (False, 4)]:
            for hidden_state_size in (4, 32):
                # base settings, overridden by the current grid point
                current_kw_simple_lstm = {
                    **kwargs_simple_lstm,
                    'learn_rate': learn_rate,
                    'one_hot': one_hot,
                    'keep_prob': keep_prob,
                    'char_embed_dim': char_embed_dim,
                    'hidden_state_size': hidden_state_size,
                }
                # string that identifies this configuration
                hparam_str = make_hparam_string(learn_rate,
                                                one_hot,
                                                keep_prob,
                                                char_embed_dim,
                                                hidden_state_size)
                var = 'lstm_{}'.format(hparam_str)
                # the constructor resets the tf graph itself
                lstm_models[var] = Lstm_model(
                    feed_dict_train=kwargs_feed_dict_train,
                    feed_dict_test=kwargs_feed_dict_test,
                    hparam_str=hparam_str,
                    embed_vis_path=embed_vis_path,
                    **current_kw_simple_lstm)
                lstm_models[var].train()