In [5]:
import random
import numpy as np
import tensorflow as tf
# from konlpy.tag import Okt

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Lambda, Layer, LayerNormalization, Embedding

In [6]:
class ScaleDotProductAttention(layer):
    def __init__(self, d_emb, d_reduced, masked=False):
        super().__init__()
        
        self.q = Dense(d_reduced, input_shape=(-1, d_emb))        # linear projection
        self.k = Dense(d_reduced, input_shape=(-1, d_emb))        # linear projection
        self.v = Dense(d_reduced, input_shape=(-1, d_emb))        # linear projection
        
        self.scaled = Lambda(lambda x: x/np.sqrt(d_reduced))    #1번 공식 활용
        self.masked = masked
    
    def call(self, x, training=None, masked=None): # x shape = (q, k, v)
        q = self.q(x[0])
        k = self.k(x[1])
        v = self.v(x[2])
        
        k_t = tf.transpose(k, perm=[0, 2, 1])      #[0,1,2] ->[0,2,1]로 transpose
        product = tf.matmul(q, k_t)
        
        scaled = self.scaled(product)
        
        if masked:                                #add the mask
            length = tf.shape(scaled)[-1]
            mask = tf.fill((length, length), -np.inf)                   #inf = infinity
            mask = tf.linalg.band_part(mask, 0, -1)                     # upper triangle
            mask = tf.linalg.set_diag(mask, tf.zeros(length))
            scaled += mask
        
        scaled = tf.nn.softmax(scaled, axis=-1)    #가장 낮은 차원에서 붙이기
        
        return tf.matmul(scaled, v)

NameError: name 'layer' is not defined

In [7]:
class MultiHeadAttention(Layer):
    def __init__(self, h, d_emb, d_reduced, masked=False):
        super().__init__()
        
        self.attention_list = list()
        
        for _ in range(h):
            self.attention_list.append(ScaledDotProductAttention(d_emb, d_reduced, masked)) #DotProduct에서 받아온 값들을 attentionlist에 저장
        
        self.linear = Dense(d_emb, input_shape=((-1, h * d_reduced)))
                            
    def call(self, x, training=None):
        attention_list = [a(x) for a in self.attention_list]            #attention_list에 저장된 값들을 리스트로 만들어
        concat = tf.concat(attention_list, axis=-1)                      # concat하기
                            
        return self.linear(concat)

In [8]:
class Encoder(Layer):
    def __init__(self, num_head, d_reduced):
        super().__init__()
        self.num_head = num_head
        self.d_reduced = d_reduced
        
    def build(self, input_shape):
        self.multihead_attention = MultiHeadAttention(self.num_head, input_shape[-1], self.d_reduced)
        self.layer_normalization1 = LayerNormalization(input_shape=input_shape)
        self.dense1 = Dense(input_shape[-1] * 4, input_shape=input_shape, activation='relu') #FNN에서 두 레이어 선형레이어 사이에 ReLU를 넣
        self.dense2 = Dense(input_shape[-1])
        self.layer_normalization2 = LayerNormalization(input_shape=input_shape)
        super().build(input_shape)
        
    def call(self, x, training=None, masked=None):
        h = self.multihead_attention((x, x, x))
        ln1 = self.layer_normalization1(x + h)
        
        h = self.dense2(self.dense1(ln1))
        return self.layer_normalization2(ln1 + h)
        
    def compute_output_shape(self, input_shape):
        return input_shape   

In [9]:
class Decoder(Layer):
    def __init__(self, num_head, d_reduced):
        super().__init__()
        self.num_head = num_head
        self.d_reduced = d_reduced
    
    def build(self, input_shape):
        self.self_attention = MultiHeadAttention(self.num_head, input_shape[0][-1], self.d_reduced, masked=True)
        self.layer_normalization1 = LayerNormalization(input_shape=input_shape)
        
        self.multihead_attention = MultiHeadAttention(self.num_head, input_shape[0][-1], self.d_reduced)
        self.layer_normalization2 = LayerNormalization(input_shape=input_shape)
        
        self.dense1 = Dense(input_shape[0][-1] * 4, input_shape=input_shape[0], activation='relu')
        self.dense2 = Dense(input_shape[0][-1])
        self.layer_normalization3 = LayerNormalization(input_shape=input_shape)
    
    def call(self, inputs, training=None, masked=None): #inputs (x, context)
        x, context = inputs
        h = self.self_attention((x, x, x))
        ln1 = self.layer_normalization1(x + h)
        
        h = self.multihead_attention((ln1, context, context))
        ln2 = self.layer_normalization2(ln1 + h)
        
        h = self.dense2(self.dense1(ln2))
        return self.layer_normalization3(ln2 + h)
        
        
    def compute_output_shape(self, input_shape):
        return input_shape

In [None]:
class PositionalEncoding(Layer):  # Referred from https://github.com/LastRemote/Transformer-TF2.0
    def __init__(self, max_len, d_emb):
        super().__init__()
        self.sinusoidal_encoding = np.array([self.get_positional_angle(pos, d_emb) for pos in range(max_len)], dtype=np.float32)
        self.sinusoidal_encoding[:, 0::2] = np.sin(self.sinusoidal_encoding[:, 0::2])  #sin 0~짝수
        self.sinusoidal_encoding[:, 1::2] = np.cos(self.sinusoidal_encoding[:, 1::2])  #cos 1~홀수
        self.sinusoidal_encoding = tf.cast(self.sinusoidal_encoding, dtype=tf.float32)

    def call(self, x, training=None, mask=None):
        return x + self.sinusoidal_encoding[:tf.shape(x)[1]]

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_angle(self, pos, dim, d_emb):
        return pos / np.power(10000, 2 * (dim // 2) / d_emb)

    def get_positional_angle(self, pos, d_emb):
        return [self.get_angle(pos, dim, d_emb) for dim in range(d_emb)]