In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

%matplotlib inline

<br><br><br>

# Model
- Transformer with denseNet
    - Encoding된 vector를 stack하는 방식
    - 마지막에 1x1 CNN
    
    
- TODO: parts that still need to be customized
    - attention score: scaled `QK^T / sqrt(d_k)` -> unscaled `QK^T`
    - sinusoidal positional encoding (PE) -> other PE
    - pooling
    
    
- https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/4f4a192f0fd272102c8852b00b1007dffd292b90/transformer/Models.py#L11
- https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/layers/common_attention.py
- http://nlp.seas.harvard.edu/2018/04/03/attention.html


In [None]:
class Transformer_Classifer() :
    """Transformer-encoder sequence classifier written against the TF 1.x graph API.

    Call `build` once to create the placeholders, the encoder stack, the
    classifier head, the loss and the optimizer. Afterwards `train`,
    `predict`, `get_accuracy` and `evaluate` run that graph through the
    session handed to the constructor.
    """

    def __init__(self, sess, name):
        """Keep the session used to run the graph and the variable-scope name."""
        self.sess = sess
        self.name = name

    def embedding(self, X_input, emb_dim, emb_activation) :
        """Project a rank-3 input to `emb_dim` features.

        The input is returned untouched when its last dimension already equals
        `emb_dim`; otherwise a dense layer with `emb_activation` is applied.
        """
        if emb_dim == X_input.get_shape().as_list()[2] :
            return X_input

        return tf.layers.dense(X_input, emb_dim, activation=emb_activation)

    @staticmethod
    def _sinusoid_table(position, dim) :
        """Return a (position, dim) float array holding the sinusoidal encoding.

        Even columns hold sin and odd columns hold cos of
        pos / 10000^(2*(i//2)/dim), so each sin/cos pair shares one frequency.
        Row 0 is left as all zeros, as in the original implementation.
        """
        pos = np.arange(position, dtype=np.float64)[:, np.newaxis]
        pair = np.arange(dim) // 2
        table = pos / np.power(10000.0, 2.0 * pair / dim)[np.newaxis, :]

        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])
        table[:1] = 0.0  # slice (not index) so position == 0 does not raise
        return table

    def positional_encoding(self, X_input) :
        """Build the positional-encoding tensor matching `X_input`'s shape.

        The sequence length and feature size must be statically known; the
        batch size is read at run time, so a `None` batch dimension works.
        """
        _, position, dim = X_input.get_shape().as_list()
        table = self._sinusoid_table(position, dim)

        single = tf.constant(table[np.newaxis], dtype=tf.float32, shape=[1, position, dim])
        # Replicate along the batch axis dynamically instead of baking the
        # batch size into a NumPy constant.
        multiples = tf.stack([tf.shape(X_input)[0], 1, 1])
        return tf.tile(single, multiples)

    def scaled_dot_product_attention(self, Q, K, V, dk) :
        """Return softmax(Q K^T / sqrt(dk)) V, with the softmax over the last axis."""
        scores = tf.matmul(Q, K, transpose_b=True) / tf.sqrt(dk)
        weights = tf.nn.softmax(scores)
        return tf.matmul(weights, V)

    @staticmethod
    def _project(X_input, W) :
        """Apply the 2-D weight `W` to the last axis of a rank-3 tensor.

        The input is flattened to 2-D for the matmul and reshaped back, so the
        same weights are shared by every example in the batch.
        """
        position, dim_in = X_input.get_shape().as_list()[1:]
        dim_out = W.get_shape().as_list()[-1]
        flat = tf.reshape(X_input, [-1, dim_in])
        return tf.reshape(tf.matmul(flat, W), [-1, position, dim_out])

    def multihead_attention(self, Q, K, V, h) :
        """Multi-head attention with `h` heads followed by an output projection.

        Each head projects Q/K/V to dim//h features, runs scaled dot-product
        attention, and the heads are concatenated on the feature axis. The
        projection weights carry no batch axis, so they do not depend on the
        batch size and are shared across examples.
        """
        dim_q = Q.get_shape().as_list()[-1]
        dim_k = K.get_shape().as_list()[-1]
        dim_v = V.get_shape().as_list()[-1]
        dq, dk, dv = dim_q // h, dim_k // h, dim_v // h

        W_Q = tf.Variable(tf.random_normal([h, dim_q, dq]))
        W_K = tf.Variable(tf.random_normal([h, dim_k, dk]))
        W_V = tf.Variable(tf.random_normal([h, dim_v, dv]))
        # The concatenated heads are dv*h wide (each head outputs dv features).
        W_O = tf.Variable(tf.random_normal([dv * h, dim_k]))

        scale = tf.constant(dk, tf.float32)
        heads = [
            self.scaled_dot_product_attention(self._project(Q, W_Q[idx]),
                                              self._project(K, W_K[idx]),
                                              self._project(V, W_V[idx]),
                                              scale)
            for idx in range(h)
        ]

        concatenated = tf.concat(heads, axis=2)
        return self._project(concatenated, W_O)

    def normarlization(self, X_input) :
        """Apply layer normalization (method name kept as-is for existing callers)."""
        return tf.contrib.layers.layer_norm(X_input)

    def feedforward_network(self, X_input, ffn_dim) :
        """Position-wise feed-forward block: dense(ffn_dim, leaky ReLU) -> dropout -> dense.

        The output has the same feature size as the input. Dropout is
        controlled by the `self.training` placeholder created in `build`.
        """
        dim = X_input.get_shape().as_list()[-1]

        hidden = tf.layers.dense(X_input, ffn_dim, activation=tf.nn.leaky_relu)
        hidden = tf.layers.dropout(hidden, training=self.training)
        return tf.layers.dense(hidden, dim)

    def resnet(self, X_input1, X_input2) :
        """Residual connection: element-wise sum of the two inputs."""
        return X_input1 + X_input2

    def encoder_layer(self, X_input, h, ffn_dim) :
        """One encoder layer: self-attention and feed-forward, each with residual + layer norm."""
        attended = self.multihead_attention(X_input, X_input, X_input, h)
        attn_out = self.normarlization(self.resnet(X_input, attended))

        ffn = self.feedforward_network(attn_out, ffn_dim)
        return self.normarlization(self.resnet(attn_out, ffn))

    def classifier(self, X_input, fc_dim, fc_activation, output_class) :
        """Map the encoder output to class logits of shape (batch, output_class).

        Two position-wise dense layers shrink the features to fc_dim//10, the
        sequence is flattened, and two more dense layers produce the logits.
        """
        layer1 = tf.layers.dense(inputs=X_input, units=fc_dim, activation=fc_activation)
        dropout1 = tf.layers.dropout(layer1, training=self.training)
        layer2 = tf.layers.dense(inputs=dropout1, units=fc_dim//10)

        _, position, dim = layer2.get_shape().as_list()
        dimension = position*dim
        # -1 lets the batch size be inferred at run time (None / partial batches).
        flat = tf.reshape(layer2, [-1, dimension])

        layer3 = tf.layers.dense(inputs=flat, units=dimension*2, activation=fc_activation)
        dropout2 = tf.layers.dropout(layer3, training=self.training)
        return tf.layers.dense(inputs=dropout2, units=output_class)

    def build(self, batch_size, input_length, input_dim, N, emb_dim, emb_activation, h, ffn_dim, fc_dim, fc_activation, output_class) :
        """Create the whole graph inside the variable scope `self.name`.

        Args:
            batch_size: static batch dimension of the placeholders; None
                leaves it unspecified so any batch size can be fed.
            input_length: sequence length of the input.
            input_dim: feature size of each input step.
            N: number of stacked encoder layers.
            emb_dim, emb_activation: size / activation of the embedding layer.
            h: number of attention heads.
            ffn_dim: hidden size of the feed-forward block.
            fc_dim, fc_activation: size / activation of the classifier head.
            output_class: number of classes (labels are fed one-hot, as floats).

        Sets self.X, self.Y, self.learning_rate, self.training,
        self.encoder_output, self.logit, self.softmax_logit, self.cost,
        self.optimizer and self.accuracy.
        """
        with tf.variable_scope(self.name) :

            ## Setting ##
            # input  : ? x input_length x input_dim
            self.X = tf.placeholder(tf.float32, [batch_size, input_length, input_dim])
            self.Y = tf.placeholder(tf.float32, [batch_size, output_class])
            self.learning_rate =  tf.placeholder(tf.float32)
            self.training = tf.placeholder(tf.bool)
            #############


            ## Encoder ##
            # input  : ? x input_length x input_dim
            # output : ? x input_length x emb_dim
            embedded_vector = self.embedding(self.X, emb_dim, emb_activation)
            encoder_input = embedded_vector + self.positional_encoding(embedded_vector)

            for _ in range(N) :
                encoder_input = self.encoder_layer(encoder_input, h, ffn_dim)
            self.encoder_output = encoder_input
            #############


            ## Classifier ##
            # input  : ? x input_length x emb_dim
            # output : ? x output_class
            self.logit = self.classifier(self.encoder_output, fc_dim, fc_activation, output_class)
            # NOTE: despite its name this holds the per-example cross-entropy
            # loss, not softmax probabilities; the attribute is kept unchanged.
            self.softmax_logit = tf.nn.softmax_cross_entropy_with_logits(logits=self.logit, labels=self.Y)
            ################


            ## Learning ##
            # Reuse the op above instead of building the cross-entropy twice.
            self.cost =  tf.reduce_mean(self.softmax_logit)

            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=self.name)
            with tf.control_dependencies(update_ops):
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)

            correct_prediction = tf.equal(tf.argmax(self.logit, 1), tf.argmax(self.Y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            ##############

    def predict(self, X_input, batch_size, training=False):
        """Return the logits for `X_input`, computed in chunks of `batch_size`.

        `training` is fed to the training placeholder (default False).
        """
        total = X_input.shape[0]
        logits = []

        for start in range(0, total, batch_size):
            feed_dict = {self.X: X_input[start:start + batch_size], self.training: training}
            logits.append(self.sess.run(self.logit, feed_dict=feed_dict))

        return np.concatenate(logits, axis=0)

    def get_accuracy(self, X_input, Y_input, batch_size, training=False):
        """Return the accuracy over the whole input, weighting each batch by its size.

        `training` is fed to the training placeholder (default False).
        """
        total = X_input.shape[0]
        weighted_acc = 0

        for start in range(0, total, batch_size):
            X_batch = X_input[start:start + batch_size]
            Y_batch = Y_input[start:start + batch_size]
            feed_dict = {self.X: X_batch, self.Y: Y_batch, self.training: training}

            step_acc = self.sess.run(self.accuracy, feed_dict=feed_dict)
            weighted_acc += step_acc * X_batch.shape[0]

        return weighted_acc / total

    def train(self, X_input, Y_input, learning_rate, training=True):
        """Run one optimizer step on the given batch.

        Returns the session result for [cost, optimizer]; the first element is
        the batch loss.
        """
        feed_dict = {
            self.X: X_input,
            self.Y: Y_input,
            self.learning_rate: learning_rate,
            self.training: training,
        }
        return self.sess.run([self.cost, self.optimizer], feed_dict=feed_dict)

    def evaluate(self, X_input, Y_input, batch_size):
        """Return (mean loss, mean accuracy) over the input with dropout disabled.

        Both values are averaged per example, weighting each batch by its size.
        """
        total = X_input.shape[0]

        weighted_loss = 0
        weighted_acc = 0

        for start in range(0, total, batch_size):
            X_batch = X_input[start:start + batch_size]
            Y_batch = Y_input[start:start + batch_size]
            feed_dict = {self.X: X_batch, self.Y: Y_batch, self.training: False}

            step_loss, step_acc = self.sess.run([self.cost, self.accuracy], feed_dict=feed_dict)

            weighted_loss += step_loss * X_batch.shape[0]
            weighted_acc += step_acc * X_batch.shape[0]

        return weighted_loss / total, weighted_acc / total