# Transformer (translation)

In [4]:
import random
import numpy as np
import tensorflow as tf
from konlpy.tag import Okt

from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Lambda, Layer, Embedding, LayerNormalization

In [5]:
%pwd

'C:\\Users\\ashgh\\해오라기'

In [11]:
# Unify the working directory: switch the process to one fixed folder.
# NOTE(review): this is a hard-coded, machine-specific absolute Windows path;
# os.chdir raises if the folder does not exist on the machine running the cell.
import os
os.chdir('C:\\Users\\ashgh\\해오라기')

In [8]:
# presumably the number of training epochs — usage is outside this chunk; TODO confirm
EPOCHS = 200
# presumably the vocabulary size limit for the tokenizer — usage is outside this chunk; TODO confirm
NUM_WORDS = 2000

'C:\\Users\\ashgh\\해오라기'

### Scaled Dot-Product Attention

![scaled dot product](https://cdn-images-1.medium.com/freeze/max/1000/1*ke6i8PgYamVVKedNbcmjVA.png?q=20)

In [14]:
class DotScaledAttention(Layer):
    """Single-head scaled dot-product attention.

    Projects the three inputs to queries, keys and values with separate
    Dense layers and returns ``softmax(Q @ K^T / sqrt(d_reduced)) @ V``.

    Args:
        d_emb: Size of the last axis of the incoming tensors. Only used as a
            shape hint for the projection layers.
        d_reduced: Output width of the query/key/value projections.
        masked: When True, a causal mask is added to the scores so that a
            position cannot attend to later positions.
    """

    def __init__(self, d_emb, d_reduced, masked=False):
        super().__init__()
        self.q = Dense(d_reduced, input_shape=(-1, d_emb))
        self.k = Dense(d_reduced, input_shape=(-1, d_emb))
        self.v = Dense(d_reduced, input_shape=(-1, d_emb))
        self.masked = masked

        # Dividing the query by sqrt(d_reduced) before the dot product gives
        # the same scores as dividing the product afterwards.
        self.scale = Lambda(lambda x: x / np.sqrt(d_reduced))

    def call(self, x, training=None, mask=None):
        """Apply attention to a (query_source, key_source, value_source) triple.

        Args:
            x: Indexable of three tensors; ``x[0]`` feeds the query
                projection, ``x[1]`` the key projection and ``x[2]`` the
                value projection. The transpose below requires rank-3
                tensors (presumably (batch, length, d_emb) — TODO confirm).
            training: Accepted for Keras compatibility; not used here.
            mask: Accepted for Keras compatibility; not used here. The causal
                mask is controlled by the ``masked`` constructor flag.

        Returns:
            The attention-weighted combination of the projected values.
        """
        q = self.scale(self.q(x[0]))
        k = self.k(x[1])
        v = self.v(x[2])

        # Transpose the keys and multiply with the queries: every entry of
        # the result is the inner product of one query with one key.
        k_T = tf.transpose(k, perm=[0, 2, 1])
        scores = tf.matmul(q, k_T)

        if self.masked:
            # Self-attention must not look at future positions. Entries above
            # the diagonal are set to -inf so that softmax maps them to 0.
            # The mask is square in the key length, so this path assumes the
            # query and key sequences have the same length.
            length = tf.shape(scores)[-1]
            causal_mask = tf.fill((length, length), -np.inf)
            causal_mask = tf.linalg.band_part(causal_mask, 0, -1)  # keep upper triangle
            causal_mask = tf.linalg.set_diag(causal_mask, tf.zeros((length,)))  # allow self
            scores += causal_mask

        # Normalize over the key axis (the original referenced an undefined
        # name `axis`, which raised NameError on the first call).
        weights = tf.nn.softmax(scores, axis=-1)

        return tf.matmul(weights, v)

### Multi-Head Attention

In [15]:
class MultiHeadAttention(Layer):
    """Multi-head attention built from independent DotScaledAttention heads.

    Every head receives the same input triple; the head outputs are
    concatenated along the last axis and projected back to ``d_emb``.

    Args:
        num_head: Number of attention heads to create.
        d_emb: Width of the final output projection; also forwarded to each
            head as its input-width hint.
        d_reduced: Projection width used inside each head.
        masked: Forwarded to every head; enables the causal mask.
    """

    def __init__(self, num_head, d_emb, d_reduced, masked=False):
        super().__init__()
        # One independent attention layer per head.
        self.attention_list = [
            DotScaledAttention(d_emb, d_reduced, masked) for _ in range(num_head)
        ]

        # Projects the concatenated head outputs back to d_emb.
        self.linear = Dense(d_emb, input_shape=(-1, num_head * d_reduced))

    def call(self, x, training=None, mask=None):
        """Run all heads on ``x`` and merge their outputs.

        Args:
            x: The (query_source, key_source, value_source) triple that is
                passed unchanged to every head.
            training: Accepted for Keras compatibility; not used here. (The
                original spelled this parameter ``traning``.)
            mask: Accepted for Keras compatibility; not used here.

        Returns:
            The output of the final Dense projection applied to the
            concatenated head outputs.
        """
        head_outputs = [head(x) for head in self.attention_list]
        merged = tf.concat(head_outputs, axis=-1)
        return self.linear(merged)
    
    
        
        

### Encoder

![encoder](https://miro.medium.com/max/329/1*4HJt3iD5tbtf9wZFuDrM-Q.png)

In [16]:
class Encoder(Layer):
    """Transformer encoder block.

    Self-attention followed by a two-layer feed-forward network, each
    wrapped in a residual connection and layer normalization.

    Args:
        num_head: Number of attention heads.
        d_reduced: Projection width used inside each attention head.
    """

    def __init__(self, num_head, d_reduced):
        super().__init__()

        self.num_head = num_head
        self.d_r = d_reduced

    def build(self, input_shape):
        """Create the sub-layers once the input shape is known.

        ``input_shape[-1]`` is the feature width of the input; it is used as
        the ``d_emb`` argument of MultiHeadAttention and as the output width
        of the feed-forward network so the residual additions line up.
        """
        d_emb = input_shape[-1]

        self.multi_attention = MultiHeadAttention(self.num_head, d_emb, self.d_r)
        self.layer_norm1 = LayerNormalization(input_shape=input_shape)

        # Feed-forward network: expand to 4 * d_emb with ReLU, then project back.
        self.dense1 = Dense(d_emb * 4, input_shape=input_shape, activation='relu')
        self.dense2 = Dense(
            d_emb,
            input_shape=self.dense1.compute_output_shape(input_shape),
        )
        # Separate normalization for the second residual branch (the original
        # assigned layer_norm1 twice, so this layer was never created).
        self.layer_norm2 = LayerNormalization(input_shape=input_shape)

        super().build(input_shape)

    def call(self, x, training=None, mask=None):
        """Process ``x`` and return a tensor of the same shape.

        Args:
            x: Input tensor; its last axis must match the width seen in build.
            training: Accepted for Keras compatibility; not used here.
            mask: Accepted for Keras compatibility; not used here.
        """
        # The encoder uses self-attention, so query, key and value sources are
        # all x. MultiHeadAttention expects them as one triple, not as three
        # positional arguments.
        h = self.multi_attention((x, x, x))
        # Residual connection + normalization around the attention.
        ln1 = self.layer_norm1(x + h)

        # Residual connection + normalization around the feed-forward network.
        h = self.dense2(self.dense1(ln1))
        ln2 = self.layer_norm2(h + ln1)
        return ln2

    def compute_output_shape(self, input_shape):
        """Return ``input_shape`` unchanged: the block preserves the shape."""
        return input_shape

### Decoder