In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

%matplotlib inline

# Model
- Transformer with denseNet
    - Encoding된 vector를 stack하는 방식
    - 마지막에 1x1 CNN
    
    
- need to customize
    - QK_T_dk -> QK_T
    - sinusoid PE -> other PE
    - pooling
    - one-hot을 한 후 cost 계산
    
    
- https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/4f4a192f0fd272102c8852b00b1007dffd292b90/transformer/Models.py#L11
- https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/layers/common_attention.py
- http://nlp.seas.harvard.edu/2018/04/03/attention.html


In [None]:
class Transformer_Classifer() :
    """Transformer-style encoder/decoder classifier on the TensorFlow 1.x graph API.

    Usage: construct with a ``tf.Session`` and a variable-scope name, call
    ``build(...)`` once to create the graph, then use ``train`` / ``predict`` /
    ``evaluate``.

    The graph is built for a fixed ``batch_size``: the placeholders and every
    hand-made weight tensor carry a leading ``batch_size`` dimension, so each
    ``sess.run`` must be fed exactly ``batch_size`` rows.  ``predict`` and
    ``evaluate`` therefore zero-pad a trailing partial batch and discard the
    padded rows from their results.
    """

    def __init__(self, sess, name):
        """Store the session used for every ``run`` call and the variable-scope name."""
        self.sess = sess
        self.name = name

    def enc_embedding(self, X_input, emb_dim, emb_activation) :
        """Project the encoder input to ``emb_dim`` features.

        Args:
            X_input: rank-3 tensor ``[batch, position, dim]`` with a static shape.
            emb_dim: target size of the last axis.
            emb_activation: accepted for interface compatibility; not used.

        Returns:
            ``X_input`` itself when its last axis already equals ``emb_dim``,
            otherwise ``X_input @ W_emb`` with shape ``[batch, position, emb_dim]``.
        """
        if emb_dim == X_input.get_shape().as_list()[2] :
            emb_vector = X_input

        else :
            batch, position, dim = X_input.get_shape().as_list()
            # The weight has a leading batch axis: one projection matrix per batch row.
            W_emb = tf.Variable(tf.random_normal([batch, dim, emb_dim]), name="W_emb_enc")
            emb_vector = tf.matmul(X_input, W_emb)

        return emb_vector

    def dec_embedding(self, X_input, emb_dim, emb_activation) :
        """Flatten the whole input sequence and project it to one ``emb_dim`` vector.

        Args:
            X_input: rank-3 tensor ``[batch, position, dim]`` with a static shape.
            emb_dim: size of the produced embedding.
            emb_activation: accepted for interface compatibility; not used.

        Returns:
            Tensor of shape ``[batch, 1, emb_dim]``.
        """
        batch, position, dim = X_input.get_shape().as_list()
        dimension = position*dim

        flat = tf.reshape(X_input, [batch, 1, dimension])
        W_emb = tf.Variable(tf.random_normal([batch, dimension, emb_dim]), name="W_emb_dec")

        emb_vector = tf.matmul(flat, W_emb)
        return emb_vector

    def positional_encoding(self, X_input) :
        """Build a constant sinusoidal position table matching ``X_input``'s shape.

        Position 0 is left as all zeros; for later positions even feature
        indices hold a sine and odd feature indices a cosine.

        NOTE(review): the exponent is ``2*i/dim`` for every feature index ``i``,
        whereas the original Transformer paper shares one frequency per
        sin/cos pair (``2*(i//2)/dim``).  Left as-is so existing models keep
        their behaviour.

        Returns:
            ``tf.float32`` constant of shape ``[batch, position, dim]``.
        """
        batch, position, dim = X_input.get_shape().as_list()
        position_enc = np.array([list([[pos / np.power(10000, 2*i/dim) for i in range(dim)]
                                if pos != 0 else np.zeros(dim) for pos in range(position)])]*batch)

        # Row 0 is skipped on purpose so the first position stays all zeros.
        position_enc[:, 1:, 0::2] = np.sin(position_enc[:, 1:, 0::2])
        position_enc[:, 1:, 1::2] = np.cos(position_enc[:, 1:, 1::2])

        pos_enc_vector = tf.constant(position_enc, dtype=tf.float32, shape=[batch, position, dim])
        return pos_enc_vector

    def scaled_dot_product_attention(self, Q, K, V, dk) :
        """Return ``softmax(Q @ K^T / dk) @ V``.

        NOTE(review): the scores are divided by ``dk`` itself rather than
        ``sqrt(dk)`` as in the original Transformer paper; kept unchanged
        because altering it would change the behaviour of trained models.
        """
        QKT_dk = tf.matmul(Q,K, transpose_b=True) / dk
        attention = tf.nn.softmax(QKT_dk)
        attended_vector = tf.matmul(attention,V)

        return attended_vector

    def multihead_attention(self, Q, K, V, h) :
        """Run ``h`` attention heads and merge them with an output projection.

        Each head projects Q/K/V with its own weights (which, like the other
        hand-made weights here, carry a batch axis), the head outputs are
        concatenated on the feature axis and multiplied by ``W_O``.

        Args:
            Q, K, V: rank-3 tensors with static shapes.
            h: number of heads.

        Returns:
            Tensor whose last axis has the size of K's last axis.
        """
        batch, position, dim_q = Q.get_shape().as_list()
        batch, position, dim_k = K.get_shape().as_list()
        batch, position, dim_v = V.get_shape().as_list()
        dq = dim_q//h
        dk = dim_k//h
        dv = dim_v//h

        W_Q = tf.Variable(tf.random_normal([h, batch, dim_q, dq]), name="W_Q")
        W_K = tf.Variable(tf.random_normal([h, batch, dim_k, dk]), name="W_K")
        W_V = tf.Variable(tf.random_normal([h, batch, dim_v, dv]), name="W_V")
        # W_O's input width is dk*h while the concatenated heads are dv*h wide;
        # assumes dim_k == dim_v so the two agree -- TODO confirm.
        W_O = tf.Variable(tf.random_normal([batch, dk*h, dim_k]), name="W_O")

        head_lst = []
        for idx in range(h) :
            head_lst.append(self.scaled_dot_product_attention(tf.matmul(Q, W_Q[idx]),
                                                              tf.matmul(K, W_K[idx]),
                                                              tf.matmul(V, W_V[idx]),
                                                              tf.constant(dk, tf.float32)))

        multihead_attention = tf.concat(head_lst, axis=2)
        linear_projection= tf.matmul(multihead_attention, W_O)

        return linear_projection

    def normarlization(self, X_input) :
        """Apply ``tf.contrib.layers.layer_norm`` to ``X_input``."""
        return tf.contrib.layers.layer_norm(X_input)

    def feedforward_network(self, X_input, ffn_dim) :
        """Dense(ReLU, ``ffn_dim``) -> dropout -> Dense back to the input width.

        Dropout is switched by the ``self.training`` placeholder created in ``build``.
        """
        batch, position, dim =  X_input.get_shape().as_list()

        ff_vector1 = tf.layers.dense(X_input, ffn_dim, activation=tf.nn.relu)
        dropout = tf.layers.dropout(ff_vector1, training=self.training)
        ff_vector2 = tf.layers.dense(dropout, dim)

        return ff_vector2

    def resnet(self, X_input1, X_input2) :
        """Residual connection: element-wise sum of the two inputs."""
        return X_input1 + X_input2

    def encoder_layer(self, X_input, h, ffn_dim) :
        """Self-attention and feed-forward sub-layers, each followed by residual add + layer norm."""
        attended_vector = self.multihead_attention(X_input, X_input, X_input, h)
        sublayer1 = self.resnet(X_input, attended_vector)
        norm1 = self.normarlization(sublayer1)

        ffn_vector = self.feedforward_network(norm1, ffn_dim)
        sublayer2 = self.resnet(norm1, ffn_vector)
        norm2 = self.normarlization(sublayer2)

        return norm2

    def decoder_layer(self, X_input, Y_input, h, ffn_dim) :
        """Attend from ``Y_input`` (queries) over ``X_input`` (keys and values).

        The residual connection adds ``X_input`` -- not ``Y_input`` -- to the
        attention output before layer norm, followed by the same feed-forward
        sub-layer as in ``encoder_layer``.
        """
        attended_vector = self.multihead_attention(Y_input, X_input, X_input, h)
        sublayer1 = self.resnet(X_input, attended_vector)
        norm1 = self.normarlization(sublayer1)

        ffn_vector = self.feedforward_network(norm1, ffn_dim)
        sublayer2 = self.resnet(norm1, ffn_vector)
        norm2 = self.normarlization(sublayer2)

        return norm2

    def Encode(self) :
        """Embed ``self.X``, add the positional encoding and apply ``self.N`` encoder layers."""
        enc_embedded_vector = self.enc_embedding(self.X, self.emb_dim, self.emb_activation)
        enc_pos_encoded_vector = enc_embedded_vector + self.positional_encoding(enc_embedded_vector)
        encoder_input = enc_pos_encoded_vector

        for _ in range(self.N) :
            encoder_input = self.encoder_layer(encoder_input, self.h, self.ffn_dim)
        encoder_output = encoder_input

        return encoder_output

    def Decode(self, encoder_output) :
        """Produce one decoder tensor per output step.

        The first step starts from the flattened-input embedding; each later
        step first multiplies the previous step's result by its own ``W_D``
        slice, then runs ``self.N`` decoder layers against ``encoder_output``.

        Returns:
            List of ``self.output_length`` tensors.
        """
        dec_embedded_vector = self.dec_embedding(self.X, self.emb_dim, self.emb_activation)
        decoder_input = dec_embedded_vector

        W_D = tf.Variable(tf.random_normal([self.output_length-1, self.batch_size, self.emb_dim, self.emb_dim]), name="W_D")
        decode_lst = []

        for idx in range(self.output_length) :
            if idx != 0 :
                decoder_input = tf.matmul(decoder_input, W_D[idx-1])

            for _ in range(self.N) :
                decoder_input = self.decoder_layer(encoder_output, decoder_input, self.h, self.ffn_dim)

            decoder_output = decoder_input
            decode_lst.append(decoder_output)

        return decode_lst

    def Classify(self, decoder_output) :
        """Reduce every decoder step to a single logit.

        Args:
            decoder_output: list of per-step tensors as returned by ``Decode``.

        Returns:
            Tensor of shape ``[batch, length]`` with one logit per output step.
        """
        classifier_input = tf.stack(decoder_output, axis=1)
        batch, length, position, dim = classifier_input.get_shape().as_list()

        dimension = position*dim
        flat = tf.reshape(classifier_input, [batch, length, dimension])
        layer = tf.layers.dense(inputs=flat, units=1)

        return tf.reshape(layer, [batch, length])

    def build(self, batch_size, input_length, output_length, input_dim, N, emb_dim, emb_activation, h, ffn_dim, fc_activation) :
        """Create placeholders, the encoder/decoder/classifier graph, loss and optimizer.

        Args:
            batch_size: fixed number of rows every fed batch must have.
            input_length: number of positions in each input example.
            output_length: number of output steps (width of ``self.Y`` / the logits).
            input_dim: feature size of each input position.
            N: number of stacked encoder layers, and of decoder layers per step.
            emb_dim: embedding width.
            emb_activation: stored and passed to the embedding methods (unused there).
            h: number of attention heads.
            ffn_dim: hidden width of the feed-forward sub-layer.
            fc_activation: stored on the instance; not read anywhere in this class.
        """
        with tf.variable_scope(self.name) :

            ## Setting ##
            # input  : batch_size x input_length x input_dim
            self.X = tf.placeholder(tf.float32, [batch_size, input_length, input_dim])
            self.Y = tf.placeholder(tf.float32, [batch_size, output_length])
            self.learning_rate =  tf.placeholder(tf.float32)
            self.training = tf.placeholder(tf.bool)

            self.batch_size = batch_size
            self.input_length = input_length
            self.output_length = output_length
            self.input_dim = input_dim
            self.N = N
            self.emb_dim = emb_dim
            self.emb_activation = emb_activation
            self.h = h
            self.ffn_dim = ffn_dim
            self.fc_activation = fc_activation
            #############


            ## Encoder and Decoder ##
            self.Encoder = self.Encode()
            self.Decoder = self.Decode(self.Encoder)
            #########################


            ## Classifier ##
            self.logit = self.Classify(self.Decoder)
            self.softmax = tf.nn.softmax(self.logit)
            # Per-example cross entropy, shape [batch_size]; also used by evaluate().
            self.softmax_logit = tf.nn.softmax_cross_entropy_with_logits(logits=self.logit, labels=self.Y)
            ################


            ## Learning ##
            # Reuse the per-example loss instead of building the same op twice.
            self.cost = tf.reduce_mean(self.softmax_logit)

            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=self.name)
            with tf.control_dependencies(update_ops):
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)

            # Per-example hit/miss (bool, shape [batch_size]) and its batch mean.
            self.prediction = tf.equal(tf.argmax(self.logit, 1), tf.argmax(self.Y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(self.prediction, tf.float32))
            ##############


    def train(self, X_input, Y_input, learning_rate, training=True):
        """Run one optimizer step on a single batch of exactly ``batch_size`` rows.

        Returns:
            Tuple ``(optimizer_result, cost)`` exactly as returned by ``sess.run``.
        """
        feed_dict = {self.X: X_input, self.Y: Y_input, self.learning_rate: learning_rate, self.training: training}
        step_result, cost = self.sess.run([self.optimizer, self.cost], feed_dict=feed_dict)

        return step_result, cost

    def _pad_batch(self, batch):
        """Zero-pad ``batch`` along axis 0 up to ``self.batch_size`` rows.

        Returns:
            Tuple ``(padded_batch, n_valid)`` where ``n_valid`` is the number of
            real rows at the front of ``padded_batch``.
        """
        batch = np.asarray(batch)
        n_valid = batch.shape[0]
        n_missing = self.batch_size - n_valid
        if n_missing > 0:
            filler = np.zeros((n_missing,) + batch.shape[1:], dtype=batch.dtype)
            batch = np.concatenate([batch, filler], axis=0)
        return batch, n_valid

    def predict(self, X_input, training=False):
        """Return the logits for every row of ``X_input``.

        The input is processed in chunks of ``self.batch_size``; a trailing
        partial chunk is zero-padded for the fixed-shape graph and the padded
        rows are dropped from the result.

        Args:
            X_input: array of shape ``[n, input_length, input_dim]``.
            training: kept for interface compatibility; currently ignored --
                inference always feeds ``training=False``.

        Returns:
            Array with ``n`` rows of logits (an empty ``(0, output_length)``
            array when ``n == 0``).
        """
        size = X_input.shape[0]
        if size == 0:
            return np.zeros((0, self.output_length), dtype=np.float32)

        chunks = []
        for start in range(0, size, self.batch_size):
            # self.batch_size (not a notebook-level global) defines the chunk width.
            X_batch, n_valid = self._pad_batch(X_input[start:start + self.batch_size])
            feed_dict = {self.X: X_batch, self.training: False}

            logits = self.sess.run([self.logit], feed_dict=feed_dict)[0]
            chunks.append(logits[:n_valid])

        return np.concatenate(chunks, axis=0)

    def evaluate(self, X_input, Y_input):
        """Compute mean loss and accuracy over a whole dataset.

        Per-example loss and correctness are fetched so that the rows added to
        pad a trailing partial batch can be excluded from the averages.

        Args:
            X_input: array of shape ``[n, input_length, input_dim]``.
            Y_input: array of shape ``[n, output_length]``.

        Returns:
            Tuple ``(mean_loss, mean_accuracy)`` over the ``n`` examples.

        Raises:
            ValueError: if ``X_input`` has no rows.
        """
        size = X_input.shape[0]
        if size == 0:
            raise ValueError("evaluate() needs at least one example")

        total_loss = 0.0
        total_acc = 0.0

        for start in range(0, size, self.batch_size):
            stop = start + self.batch_size
            X_batch, n_valid = self._pad_batch(X_input[start:stop])
            Y_batch, _ = self._pad_batch(Y_input[start:stop])
            feed_dict = {self.X: X_batch, self.Y: Y_batch, self.training: False}

            example_loss, example_hit = self.sess.run([self.softmax_logit, self.prediction],
                                                      feed_dict=feed_dict)

            # Only the first n_valid rows are real data.
            total_loss += float(np.sum(example_loss[:n_valid]))
            total_acc += float(np.sum(example_hit[:n_valid]))

        total_loss /= size
        total_acc /= size

        return total_loss, total_acc
        
        

In [None]:
# Clear the default TensorFlow graph so that anything built afterwards starts from an empty graph.
tf.reset_default_graph() 

<br></br><br></br><br></br> 