# 8-7. 프로젝트: 한영 번역기 만들기

In [1]:
import tensorflow as tf
import numpy as np
import sentencepiece as spm
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
import matplotlib.ticker as ticker
import matplotlib as mpl
import matplotlib.pyplot as plt
 
%config InlineBackend.figure_format = 'retina'
 
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager._rebuild()

import time
import re
import os
import io

from konlpy.tag import Mecab
from tqdm import tqdm    # tqdm
import random

## Step 1. 데이터 다운로드

In [2]:
path_to_file = os.getenv('HOME')+'/aiffel/transformer/korean-english-park.train.ko'

with open(path_to_file, "r") as f:
    train_ko_raw = f.read().splitlines()

print("Data Size:", len(train_ko_raw))

print("Example:")
for sen in train_ko_raw[0:100][::20]: print(">>", sen)

Data Size: 94123
Example:
>> 개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"
>> 북한의 핵무기 계획을 포기하도록 하려는 압력이 거세지고 있는 가운데, 일본과 북한의 외교관들이 외교 관계를 정상화하려는 회담을 재개했다.
>> "경호 로보트가 침입자나 화재를 탐지하기 위해서 개인적으로, 그리고 전문적으로 사용되고 있습니다."
>> 수자원부 당국은 논란이 되고 있고, 막대한 비용이 드는 이 사업에 대해 내년에 건설을 시작할 계획이다.
>> 또한 근력 운동은 활발하게 걷는 것이나 최소한 20분 동안 뛰는 것과 같은 유산소 활동에서 얻는 운동 효과를 심장과 폐에 주지 않기 때문에, 연구학자들은 근력 운동이 심장에 큰 영향을 미치는지 여부에 대해 논쟁을 해왔다.


In [3]:
path_to_file = os.getenv('HOME')+'/aiffel/transformer/korean-english-park.train.en'

with open(path_to_file, "r") as f:
    train_en_raw = f.read().splitlines()

print("Data Size:", len(train_en_raw))

print("Example:")
for sen in train_en_raw[0:100][::20]: print(">>", sen)

Data Size: 94123
Example:
>> Much of personal computing is about "can you top this?"
>> Amid mounting pressure on North Korea to abandon its nuclear weapons program Japanese and North Korean diplomats have resumed talks on normalizing diplomatic relations.
>> “Guard robots are used privately and professionally to detect intruders or fire,” Karlsson said.
>> Authorities from the Water Resources Ministry plan to begin construction next year on the controversial and hugely expensive project.
>> Researchers also have debated whether weight-training has a big impact on the heart, since it does not give the heart and lungs the kind of workout they get from aerobic activities such as brisk walking or running for at least 20 minutes.


## Step 2. 데이터 정제 및 토큰화

In [4]:
cleaned_corpus = set(list(zip(train_ko_raw, train_en_raw)))

In [5]:
cleaned_corpus

{('인터폴은 지난해 10월 이 같은 방식을 동원, 수배 10일만에 용의자를 검거한 바 있다.',
  'The first time, last October, led to an arrest ten days later.'),
 ('9장에 달하는 법원 명령서에따르면 스피어스는 아이들과 하룻밤을 보낼 수 있다.',
  'The nine-page ruling said Spears could have overnight visits with her kids something she had been seeking.'),
 ('여러 연구결과 여성의 건강상의 문제 중 일부는 심리적인 측면 때문에 불거진다고 진단했다.',
  "Several studies have shown that women's medical problems are more likely to be interpreted as emotional issues or complaining."),
 ('지난달 실업률이 3.8% 감소했지만 에너지 연료 인플레이션이 가속화되고 있어 이 같은 복합적인 요인으로 인해 일본 은행이 한동안 기준금리를 동결할 것이라는 예측도 나오고 있다.',
  "The nation's jobless rate unexpectedly fell to 3.8 percent in November, but overall the mixed data cements expectations that the Bank of Japan will keep interest rates unchanged for some time, even as energy-fueled inflation accelerates."),
 ('하워드 총리는 “우리의 유일한 관심사는 시간의 경과와 법적 시스템의 근본원리에 관한 것이며 그 사람들을 무기한 공판 없이 구속 할 수 없다"고 말했다.',
  '"Our sole concern is about the passage of time and the bedrock principle of our legal

In [6]:
def preprocess_sentence(sentence, s_token=False, e_token=False):
    sentence = sentence.lower().strip() #모든 입력을 소문자로 변환합니다. + 문장 앞뒤의 불필요한 공백을 제거합니다.
    sentence = re.sub(r"[^ㄱ-ㅎ가-힣a-z?.!,]+", " ", sentence)#알파벳, 문장부호, 한글만 남기고 모두 제거합니다.
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence) #문장부호 양옆에 공백을 추가합니다.
    sentence = re.sub(r'[" "]+', " ", sentence)
    
    return sentence

In [7]:
def preprocessing(cleaned_corpus):
    kor_corpus = []
    eng_corpus = []
    for sentence in cleaned_corpus:
        sentence_ko = preprocess_sentence(sentence[0])
        sentence_en = preprocess_sentence(sentence[1])
        kor_corpus.append(sentence_ko)
        eng_corpus.append(sentence_en)
    
    return kor_corpus, eng_corpus

In [8]:
#데이터 정제

kor_corpus, eng_corpus = preprocessing(cleaned_corpus)

In [9]:
print("kor data count:", len(kor_corpus))
print("eng data count:", len(eng_corpus))

print(kor_corpus[100])
print(eng_corpus[100])

kor data count: 78968
eng data count: 78968
앤드류 쿠오모 뉴욕주 관할 검찰총장은 안호이저부시가 미성년자에게 카페인이 든 알코올성 음료수를 홍보하고 있다 며 미국 개주 관할 검찰 수사에서 음료수의 건강상 이로움을 잘못 광고하고 있다는 사실을 밝혀냈다 고 전했다 . 
an investigation by attorneys general of states found that the brewer was marketing its caffeinated alcoholic beverages to minors and misrepresenting the drinks health benefits , said new york state attorney general andrew cuomo . 


In [10]:
type(kor_corpus)

list

In [11]:
#sentencepiece에 적용하기 위한 txt파일 생성

with open('kor.txt', 'w') as f:
    for sentence in kor_corpus:
        f.write('{}\n'.format(sentence))

In [12]:
with open('eng.txt', 'w') as f:
    for sentence in eng_corpus:
        f.write('{}\n'.format(sentence))

In [13]:
#데이터 토큰화 

def generate_tokenizer(txt):
    templates= '--input={} \
    --pad_id={} \
    --bos_id={} \
    --eos_id={} \
    --unk_id={} \
    --model_prefix={} \
    --vocab_size={} \
    --character_coverage={} \
    --model_type={}'
    
    train_input_file = txt
    vocab_size = 20000 # vocab 사이즈
    prefix = txt.split('.')[0] + '_spm' # 저장될 tokenizer 모델에 붙는 이름
    pad_id=0 #<pad> token을 0으로 설정
    bos_id=1 #<BOS> token을 1으로 설정
    eos_id=2 #<EOS> token을 2으로 설정
    unk_id=3 #<UNK> token(unknown)을 3으로 설정
    character_coverage = 1.0 # to reduce character set
    model_type ='unigram' # Choose from unigram (default), bpe, char, or word

    cmd = templates.format(train_input_file,
                           pad_id,
                           bos_id,
                           eos_id,
                           unk_id,
                           prefix,
                           vocab_size,
                           character_coverage,
                           model_type)
    
    spm.SentencePieceTrainer.Train(cmd)
    sp = spm.SentencePieceProcessor()
    sp.Load(prefix + '.model')
    
    return sp

In [14]:
ko_tokenizer = generate_tokenizer('kor.txt')
en_tokenizer = generate_tokenizer('eng.txt')

In [15]:
en_tokenizer.SetEncodeExtraOptions('bos:eos')

True

In [16]:
src_train = []
tgt_train = []

for i in range(0, len(kor_corpus)):
    src_tokens = ko_tokenizer.EncodeAsIds(kor_corpus[i])
    tgt_tokens = en_tokenizer.EncodeAsIds(eng_corpus[i])

    if (len(src_tokens) > 50): continue
    if (len(tgt_tokens) > 50): continue
    
    src_train.append(src_tokens)
    tgt_train.append(tgt_tokens)    

In [17]:
enc_train = tf.keras.preprocessing.sequence.pad_sequences(src_train, padding='post')
dec_train = tf.keras.preprocessing.sequence.pad_sequences(tgt_train, padding='post')

print(len(enc_train), len(dec_train))

print(enc_train.shape)
print(dec_train.shape)

67793 67793
(67793, 50)
(67793, 50)


In [18]:
print(enc_train[0])
print(dec_train[0])

[5738    9  236   50   23  123 2563    6 2652    4   14 4755   22 2580
  713   10 3239   16  147   25    4    5    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
[   1    6   92   78    4    8   86  944  781    4    8  492   10   33
  312 2541   90    5  359    7    2    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [19]:
with open('./kor_spm.vocab', encoding='utf-8') as f:
    Vo_k = [doc.strip().split("\t") for doc in f]

# w[0]: token name    
# w[1]: token score
kor_word2idx = {w[0]: i for i, w in enumerate(Vo_k)}

In [20]:
with open('./eng_spm.vocab', encoding='utf-8') as f:
    Vo_e = [doc.strip().split("\t") for doc in f]

# w[0]: token name    
# w[1]: token score
eng_word2idx = {w[0]: i for i, w in enumerate(Vo_e)}

In [21]:
VOCAB_SIZE = len(kor_word2idx)

In [22]:
len(eng_word2idx)

20000

In [23]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# 디코더의 실제값 시퀀스에서는 시작 토큰을 제거해야 한다.
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': enc_train,
        'dec_inputs': dec_train[:, :-1] # 디코더의 입력. 마지막 패딩 토큰이 제거된다.
    },
    {
        'outputs': dec_train[:, 1:]  # 맨 처음 토큰이 제거된다. 다시 말해 시작 토큰이 제거된다.
    },
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [24]:
dataset

<PrefetchDataset shapes: ({inputs: (None, 50), dec_inputs: (None, 49)}, {outputs: (None, 49)}), types: ({inputs: tf.int32, dec_inputs: tf.int32}, {outputs: tf.int32})>

## Step3. transformer(GD12)

In [25]:
#GD12

def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, int(i) / d_model)

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])

    return sinusoid_table

In [26]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_mask = generate_padding_mask(tgt)

    dec_causality_mask = generate_causality_mask(tgt.shape[1], tgt.shape[1])
    dec_mask = tf.maximum(dec_mask, dec_causality_mask)

    dec_enc_causality_mask = generate_causality_mask(tgt.shape[1], src.shape[1])
    dec_enc_mask = tf.maximum(enc_mask, dec_enc_causality_mask)

    return enc_mask, dec_enc_mask, dec_mask

In [27]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        self.depth = d_model // self.num_heads

        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)

        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions


    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x


    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)

        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)

        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)

        out = self.combine_heads(out)
        out = self.linear(out)

        return out, attention_weights

In [28]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.fc1(x)
        out = self.fc2(out)

        return out

In [29]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, mask):

        """
        Multi-Head Attention
        """
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual

        """
        Position-Wise Feed Forward Network
        """
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, enc_attn

In [30]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, enc_out, causality_mask, padding_mask):

        """
        Masked Multi-Head Attention
        """
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual

        """
        Multi-Head Attention
        """
        residual = out
        out = self.norm_2(out)
        out, dec_enc_attn = self.dec_self_attn(out, enc_out, enc_out, causality_mask)
        out = self.do(out)
        out += residual

        """
        Position-Wise Feed Forward Network
        """
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn

In [31]:
class Encoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, mask):
        out = x

        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)

        return out, enc_attns

In [32]:
class Decoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]


    def call(self, x, enc_out, causality_mask, padding_mask):
        out = x

        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, causality_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

In [33]:
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared_fc=True,
                    shared_emb=False):
        super(Transformer, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        out = emb(x)

        if self.shared_fc: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out


    def call(self, enc_in, dec_in, enc_mask, causality_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)

        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, causality_mask, dec_mask)

        logits = self.fc(dec_out)

        return logits, enc_attns, dec_attns, dec_enc_attns

In [34]:
transformer = Transformer(
    n_layers=2,
    d_model=512,
    n_heads=8,
    d_ff=2048,
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    pos_len=200,
    dropout=0.3,
    shared_fc=True,
    shared_emb=True)

d_model = 512

In [35]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()

        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)

        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

In [36]:
learning_rate = LearningRateScheduler(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                        beta_1=0.9,
                                        beta_2=0.98, 
                                        epsilon=1e-9)

In [37]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [38]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1]
    gold = tgt[:, 1:]

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

In [39]:
def visualize_attention(src, tgt, enc_attns, dec_attns, dec_enc_attns):
    def draw(data, ax, x="auto", y="auto"):
        import seaborn
        seaborn.heatmap(data, 
                        square=True,
                        vmin=0.0, vmax=1.0, 
                        cbar=False, ax=ax,
                        xticklabels=x,
                        yticklabels=y)

    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Encoder Layer", layer + 1)
        for h in range(4):
            draw(enc_attns[layer][0, h, :len(src), :len(src)], axs[h], src, src)
        plt.show()

    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Decoder Self Layer", layer+1)
        for h in range(4):
            draw(dec_attns[layer][0, h, :len(tgt), :len(tgt)], axs[h], tgt, tgt)
        plt.show()

        print("Decoder Src Layer", layer+1)
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        for h in range(4):
            draw(dec_enc_attns[layer][0, h, :len(tgt), :len(src)], axs[h], src, tgt)
        plt.show()

In [40]:
def evaluate(sentence, model, src_tokenizer, tgt_tokenizer):
    sentence = preprocess_sentence(sentence)

    pieces = src_tokenizer.encode_as_pieces(sentence)
    tokens = src_tokenizer.encode_as_ids(sentence)

    _input = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                           maxlen=enc_train.shape[-1],
                                                           padding='post')

    ids = []
    output = tf.expand_dims([tgt_tokenizer.bos_id()], 0)
    for i in range(dec_train.shape[-1]):
        enc_padding_mask, combined_mask, dec_padding_mask = \
        generate_masks(_input, output)

        predictions, enc_attns, dec_attns, dec_enc_attns =\
        model(_input, 
              output,
              enc_padding_mask,
              combined_mask,
              dec_padding_mask)

        predicted_id = \
        tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()

        if tgt_tokenizer.eos_id() == predicted_id:
            result = tgt_tokenizer.decode_ids(ids)
            return pieces, result, enc_attns, dec_attns, dec_enc_attns

        ids.append(predicted_id)
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)

    result = tgt_tokenizer.decode_ids(ids)

    return pieces, result, enc_attns, dec_attns, dec_enc_attns

In [41]:
def translate(sentence, model, src_tokenizer, tgt_tokenizer, plot_attention=False):
    pieces, result, enc_attns, dec_attns, dec_enc_attns = \
    evaluate(sentence, model, src_tokenizer, tgt_tokenizer)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    if plot_attention:
        visualize_attention(pieces, result.split(), enc_attns, dec_attns, dec_enc_attns)

In [42]:
sentences = ['오바마는 대통령이다.', '시민들은 도시 속에 산다.', '커피는 필요 없다.', '일곱 명의 사망자가 발생했다.']

training 되지 않았을 때 번역

In [43]:
for sen in sentences:
    translate(sen, transformer, ko_tokenizer, en_tokenizer)

Input: 오바마는 대통령이다.
Predicted translation: thomas thomasameleo isabeldrop join commercializ installment thomas thomas thomasupe joinevo join abidindrop join burjevo join abidin hamidevo thomas thomas thomas thomasdrop abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidindrop abidindrop abidin jett burjevo abidin abidin
Input: 시민들은 도시 속에 산다.
Predicted translation: balanc join join join join join isabel pot abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin abidin
Input: 커피는 필요 없다.
Predicted translation: balancevo barn lear lear lear snu criticism snuevo abidin abidin source source pillow peanut abidin abidin abidindrop abidin bulgarian bulgarian bulgarian bulgarian bulgarian abidin abidin abidin abidin blondyou una pillow pean

In [44]:
from tqdm import tqdm_notebook 

BATCH_SIZE = 64
EPOCHS = 20

for epoch in range(EPOCHS):
    total_loss = 0

    idx_list = list(range(0, enc_train.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm_notebook(idx_list)

    for (batch, idx) in enumerate(t):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = \
        train_step(enc_train[idx:idx+BATCH_SIZE],
                    dec_train[idx:idx+BATCH_SIZE],
                    transformer,
                    optimizer)

        total_loss += batch_loss

        t.set_description_str('Epoch %2d' % (epoch + 1))
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))
    
    if epoch < EPOCHS:
        for sen in sentences:
            translate(sen, transformer, ko_tokenizer, en_tokenizer)
    
    else:
        for sen in sentences:
            translate(sen, transformer, ko_tokenizer, en_tokenizer, plot_attention = True)

        
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama has , obama said .
Input: 시민들은 도시 속에 산다.
Predicted translation: they were also about they .
Input: 커피는 필요 없다.
Predicted translation: i want to see they are not because of they are not because of they .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the death toll , the death toll , the death toll , the death toll .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is a president elect barack obama .
Input: 시민들은 도시 속에 산다.
Predicted translation: they are expected to be around .
Input: 커피는 필요 없다.
Predicted translation: it is not a little bit of the same .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the death toll was dead in the death toll .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is the second president .
Input: 시민들은 도시 속에 산다.
Predicted translation: they were in the city of city .
Input: 커피는 필요 없다.
Predicted translation: it s not necessary .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the casualty toll was killed .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is a big setback .
Input: 시민들은 도시 속에 산다.
Predicted translation: they were then in the city .
Input: 커피는 필요 없다.
Predicted translation: it s not necessarily need to need need need
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the death toll killed people were killed in the early s .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is president
Input: 시민들은 도시 속에 산다.
Predicted translation: they re a lot of the city .
Input: 커피는 필요 없다.
Predicted translation: no need need to need coffee
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the death toll was among seven people .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama s president says he is the president .
Input: 시민들은 도시 속에 산다.
Predicted translation: citizens in cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit cit citing citizens .
Input: 커피는 필요 없다.
Predicted translation: no need to need for a need to be done .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people dead , most of the dead were killed .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is president .
Input: 시민들은 도시 속에 산다.
Predicted translation: the city is a small city of city .
Input: 커피는 필요 없다.
Predicted translation: need is necessary .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven deaths were killed .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is part of his campaign .
Input: 시민들은 도시 속에 산다.
Predicted translation: citizens are the city of cited city .
Input: 커피는 필요 없다.
Predicted translation: for coffee does not necessarily need to stress .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: deaths were killed , the death toll reported .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is the world president .
Input: 시민들은 도시 속에 산다.
Predicted translation: city is city in city .
Input: 커피는 필요 없다.
Predicted translation: there is no need for coffee .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the dead were injured , the seat struck sunday .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is being the white house for office .
Input: 시민들은 도시 속에 산다.
Predicted translation: citizens are cited in the city .
Input: 커피는 필요 없다.
Predicted translation: for coffee now , but it needs for a needed coffee space .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were killed , but officials said tuesday .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is the third republican president .
Input: 시민들은 도시 속에 산다.
Predicted translation: the city s streets in the city .
Input: 커피는 필요 없다.
Predicted translation: need for necessarily calls hat .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven deaths were blamed on thursday .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is responding .
Input: 시민들은 도시 속에 산다.
Predicted translation: citizens are the very small city of citizens .
Input: 커피는 필요 없다.
Predicted translation: it needs necessarily needed to speak .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seventeen others were abducted tuesday .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is president
Input: 시민들은 도시 속에 산다.
Predicted translation: citizens are cited the city of cit cit cit cited historic cities .
Input: 커피는 필요 없다.
Predicted translation: for coffee needed to be necessary to be needed .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven deaths were killed .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is president .
Input: 시민들은 도시 속에 산다.
Predicted translation: citizens are cited in the city .
Input: 커피는 필요 없다.
Predicted translation: need for coffee is necessarily need
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were killed , and dozens were wounded .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is osama bin laden .
Input: 시민들은 도시 속에 산다.
Predicted translation: city is quiet for the city .
Input: 커피는 필요 없다.
Predicted translation: need to go coffee .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were killed , with seven people were killed .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama s president is obama .
Input: 시민들은 도시 속에 산다.
Predicted translation: citizens city along the city of citizens .
Input: 커피는 필요 없다.
Predicted translation: there s need to be needed for coffee .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were killed , seven civilians officials said .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama is president .
Input: 시민들은 도시 속에 산다.
Predicted translation: citya has a local city in cit cit cities .
Input: 커피는 필요 없다.
Predicted translation: de coffee is not good for coffee .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven deaths were killed .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama s president is
Input: 시민들은 도시 속에 산다.
Predicted translation: city is a city near the city .
Input: 커피는 필요 없다.
Predicted translation: for coffee now , coffee should not necessarily take it needs clean .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: deaths were killed , seven people were killed .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: president obama is the president .
Input: 시민들은 도시 속에 산다.
Predicted translation: city is cited in the city .
Input: 커피는 필요 없다.
Predicted translation: no need need to necessar coffee
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: death toll of seven people were neighboring the seventh person .


HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))


Input: 오바마는 대통령이다.
Predicted translation: obama s life is ahead .
Input: 시민들은 도시 속에 산다.
Predicted translation: city is a city in the city .
Input: 커피는 필요 없다.
Predicted translation: for need to need coffee
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were killed , including seven civilians were killed .


## RESULT

Translations

>Input: 오바마는 대통령이다.
>Predicted translation: obama is president .
>Input: 시민들은 도시 속에 산다.
>Predicted translation: citizens are cited in the city .
>Input: 커피는 필요 없다.
>Predicted translation: need for coffee is necessarily need
>Input: 일곱 명의 사망자가 발생했다.
>Predicted translation: seven people were killed , and dozens were wounded .


Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At: 14

## Step4. Transformer(딥 러닝을 이용한 자연어 처리 입문)
https://wikidocs.net/31379

In [45]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# 디코더의 실제값 시퀀스에서는 시작 토큰을 제거해야 한다.
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': enc_train,
        'dec_inputs': dec_train[:, :-1] # 디코더의 입력. 마지막 패딩 토큰이 제거된다.
    },
    {
        'outputs': dec_train[:, 1:]  # 맨 처음 토큰이 제거된다. 다시 말해 시작 토큰이 제거된다.
    },
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [46]:
dataset

<PrefetchDataset shapes: ({inputs: (None, 50), dec_inputs: (None, 49)}, {outputs: (None, 49)}), types: ({inputs: tf.int32, dec_inputs: tf.int32}, {outputs: tf.int32})>

In [47]:
class PositionalEncoding(tf.keras.layers.Layer):
  def __init__(self, position, d_model):
    super(PositionalEncoding, self).__init__()
    self.pos_encoding = self.positional_encoding(position, d_model)

  def get_angles(self, position, i, d_model):
    angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
    return position * angles

  def positional_encoding(self, position, d_model):
    angle_rads = self.get_angles(
        position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
        i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
        d_model=d_model)

    # 배열의 짝수 인덱스(2i)에는 사인 함수 적용
    sines = tf.math.sin(angle_rads[:, 0::2])

    # 배열의 홀수 인덱스(2i+1)에는 코사인 함수 적용
    cosines = tf.math.cos(angle_rads[:, 1::2])

    pos_encoding = tf.concat([sines, cosines], axis=-1)
    pos_encoding = pos_encoding[tf.newaxis, ...]
    print(pos_encoding.shape)
    return tf.cast(pos_encoding, tf.float32)

  def call(self, inputs):
    return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

In [48]:
def scaled_dot_product_attention(query, key, value, mask):
  # query 크기 : (batch_size, num_heads, query의 문장 길이, d_model/num_heads)
  # key 크기 : (batch_size, num_heads, key의 문장 길이, d_model/num_heads)
  # value 크기 : (batch_size, num_heads, value의 문장 길이, d_model/num_heads)
  # padding_mask : (batch_size, 1, 1, key의 문장 길이)

  # Q와 K의 곱. 어텐션 스코어 행렬.
  matmul_qk = tf.matmul(query, key, transpose_b=True)

  # 스케일링
  # dk의 루트값으로 나눠준다.
  depth = tf.cast(tf.shape(key)[-1], tf.float32)
  logits = matmul_qk / tf.math.sqrt(depth)

  # 마스킹. 어텐션 스코어 행렬의 마스킹 할 위치에 매우 작은 음수값을 넣는다.
  # 매우 작은 값이므로 소프트맥스 함수를 지나면 행렬의 해당 위치의 값은 0이 된다.
  if mask is not None:
    logits += (mask * -1e9)

  # 소프트맥스 함수는 마지막 차원인 key의 문장 길이 방향으로 수행된다.
  # attention weight : (batch_size, num_heads, query의 문장 길이, key의 문장 길이)
  attention_weights = tf.nn.softmax(logits, axis=-1)

  # output : (batch_size, num_heads, query의 문장 길이, d_model/num_heads)
  output = tf.matmul(attention_weights, value)

  return output, attention_weights

In [49]:
class MultiHeadAttention(tf.keras.layers.Layer):

  def __init__(self, d_model, num_heads, name="multi_head_attention"):
    super(MultiHeadAttention, self).__init__(name=name)
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    # d_model을 num_heads로 나눈 값.
    # 논문 기준 : 64
    self.depth = d_model // self.num_heads

    # WQ, WK, WV에 해당하는 밀집층 정의
    self.query_dense = tf.keras.layers.Dense(units=d_model)
    self.key_dense = tf.keras.layers.Dense(units=d_model)
    self.value_dense = tf.keras.layers.Dense(units=d_model)

    # WO에 해당하는 밀집층 정의
    self.dense = tf.keras.layers.Dense(units=d_model)

  # num_heads 개수만큼 q, k, v를 split하는 함수
  def split_heads(self, inputs, batch_size):
    inputs = tf.reshape(
        inputs, shape=(batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(inputs, perm=[0, 2, 1, 3])

  def call(self, inputs):
    query, key, value, mask = inputs['query'], inputs['key'], inputs[
        'value'], inputs['mask']
    batch_size = tf.shape(query)[0]

    # 1. WQ, WK, WV에 해당하는 밀집층 지나기
    # q : (batch_size, query의 문장 길이, d_model)
    # k : (batch_size, key의 문장 길이, d_model)
    # v : (batch_size, value의 문장 길이, d_model)
    # 참고) 인코더(k, v)-디코더(q) 어텐션에서는 query 길이와 key, value의 길이는 다를 수 있다.
    query = self.query_dense(query)
    key = self.key_dense(key)
    value = self.value_dense(value)

    # 2. 헤드 나누기
    # q : (batch_size, num_heads, query의 문장 길이, d_model/num_heads)
    # k : (batch_size, num_heads, key의 문장 길이, d_model/num_heads)
    # v : (batch_size, num_heads, value의 문장 길이, d_model/num_heads)
    query = self.split_heads(query, batch_size)
    key = self.split_heads(key, batch_size)
    value = self.split_heads(value, batch_size)

    # 3. 스케일드 닷 프로덕트 어텐션. 앞서 구현한 함수 사용.
    # (batch_size, num_heads, query의 문장 길이, d_model/num_heads)
    scaled_attention, _ = scaled_dot_product_attention(query, key, value, mask)
    # (batch_size, query의 문장 길이, num_heads, d_model/num_heads)
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

    # 4. 헤드 연결(concatenate)하기
    # (batch_size, query의 문장 길이, d_model)
    concat_attention = tf.reshape(scaled_attention,
                                  (batch_size, -1, self.d_model))

    # 5. WO에 해당하는 밀집층 지나기
    # (batch_size, query의 문장 길이, d_model)
    outputs = self.dense(concat_attention)

    return outputs

In [50]:
def create_padding_mask(x):
    mask = tf.cast(tf.math.equal(x, 0), tf.float32)
    # (batch_size, 1, 1, key의 문장 길이)
    return mask[:, tf.newaxis, tf.newaxis, :]

In [51]:
def encoder_layer(dff, d_model, num_heads, dropout, name="encoder_layer"):
  inputs = tf.keras.Input(shape=(None, d_model), name="inputs")

  # 인코더는 패딩 마스크 사용
  padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

  # 멀티-헤드 어텐션 (첫번째 서브층 / 셀프 어텐션)
  attention = MultiHeadAttention(
      d_model, num_heads, name="attention")({
          'query': inputs, 'key': inputs, 'value': inputs, # Q = K = V
          'mask': padding_mask # 패딩 마스크 사용
      })

  # 드롭아웃 + 잔차 연결과 층 정규화
  attention = tf.keras.layers.Dropout(rate=dropout)(attention)
  attention = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(inputs + attention)

  # 포지션 와이즈 피드 포워드 신경망 (두번째 서브층)
  outputs = tf.keras.layers.Dense(units=dff, activation='relu')(attention)
  outputs = tf.keras.layers.Dense(units=d_model)(outputs)

  # 드롭아웃 + 잔차 연결과 층 정규화
  outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
  outputs = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(attention + outputs)

  return tf.keras.Model(
      inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [52]:
def encoder(vocab_size, num_layers, dff,
            d_model, num_heads, dropout,
            name="encoder"):
  inputs = tf.keras.Input(shape=(None,), name="inputs")

  # 인코더는 패딩 마스크 사용
  padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

  # 포지셔널 인코딩 + 드롭아웃
  embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
  embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
  embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
  outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

  # 인코더를 num_layers개 쌓기
  for i in range(num_layers):
    outputs = encoder_layer(dff=dff, d_model=d_model, num_heads=num_heads,
        dropout=dropout, name="encoder_layer_{}".format(i),
    )([outputs, padding_mask])

  return tf.keras.Model(
      inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [53]:
def create_look_ahead_mask(x):
  seq_len = tf.shape(x)[1]
  look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
  padding_mask = create_padding_mask(x) # 패딩 마스크도 포함
  return tf.maximum(look_ahead_mask, padding_mask)

In [54]:
def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

In [55]:
def decoder_layer(dff, d_model, num_heads, dropout, name="decoder_layer"):
  inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
  enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")

  # 디코더는 룩어헤드 마스크(첫번째 서브층)와 패딩 마스크(두번째 서브층) 둘 다 사용.
  look_ahead_mask = tf.keras.Input(
      shape=(1, None, None), name="look_ahead_mask")
  padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

  # 멀티-헤드 어텐션 (첫번째 서브층 / 마스크드 셀프 어텐션)
  attention1 = MultiHeadAttention(
      d_model, num_heads, name="attention_1")(inputs={
          'query': inputs, 'key': inputs, 'value': inputs, # Q = K = V
          'mask': look_ahead_mask # 룩어헤드 마스크
      })

  # 잔차 연결과 층 정규화
  attention1 = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(attention1 + inputs)

  # 멀티-헤드 어텐션 (두번째 서브층 / 디코더-인코더 어텐션)
  attention2 = MultiHeadAttention(
      d_model, num_heads, name="attention_2")(inputs={
          'query': attention1, 'key': enc_outputs, 'value': enc_outputs, # Q != K = V
          'mask': padding_mask # 패딩 마스크
      })

  # 드롭아웃 + 잔차 연결과 층 정규화
  attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
  attention2 = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(attention2 + attention1)

  # 포지션 와이즈 피드 포워드 신경망 (세번째 서브층)
  outputs = tf.keras.layers.Dense(units=dff, activation='relu')(attention2)
  outputs = tf.keras.layers.Dense(units=d_model)(outputs)

  # 드롭아웃 + 잔차 연결과 층 정규화
  outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
  outputs = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(outputs + attention2)

  return tf.keras.Model(
      inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs=outputs,
      name=name)

In [56]:
def decoder(vocab_size, num_layers, dff,
            d_model, num_heads, dropout,
            name='decoder'):
  inputs = tf.keras.Input(shape=(None,), name='inputs')
  enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')

  # 디코더는 룩어헤드 마스크(첫번째 서브층)와 패딩 마스크(두번째 서브층) 둘 다 사용.
  look_ahead_mask = tf.keras.Input(
      shape=(1, None, None), name='look_ahead_mask')
  padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

  # 포지셔널 인코딩 + 드롭아웃
  embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
  embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
  embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
  outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

  # 디코더를 num_layers개 쌓기
  for i in range(num_layers):
    outputs = decoder_layer(dff=dff, d_model=d_model, num_heads=num_heads,
        dropout=dropout, name='decoder_layer_{}'.format(i),
    )(inputs=[outputs, enc_outputs, look_ahead_mask, padding_mask])

  return tf.keras.Model(
      inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs=outputs,
      name=name)

In [57]:
def transformer(vocab_size, num_layers, dff,
                d_model, num_heads, dropout,
                name="transformer"):

  # 인코더의 입력
  inputs = tf.keras.Input(shape=(None,), name="inputs")

  # 디코더의 입력
  dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")

  # 인코더의 패딩 마스크
  enc_padding_mask = tf.keras.layers.Lambda(
      create_padding_mask, output_shape=(1, 1, None),
      name='enc_padding_mask')(inputs)

  # 디코더의 룩어헤드 마스크(첫번째 서브층)
  look_ahead_mask = tf.keras.layers.Lambda(
      create_look_ahead_mask, output_shape=(1, None, None),
      name='look_ahead_mask')(dec_inputs)

  # 디코더의 패딩 마스크(두번째 서브층)
  dec_padding_mask = tf.keras.layers.Lambda(
      create_padding_mask, output_shape=(1, 1, None),
      name='dec_padding_mask')(inputs)

  # 인코더의 출력은 enc_outputs. 디코더로 전달된다.
  enc_outputs = encoder(vocab_size=vocab_size, num_layers=num_layers, dff=dff,
      d_model=d_model, num_heads=num_heads, dropout=dropout,
  )(inputs=[inputs, enc_padding_mask]) # 인코더의 입력은 입력 문장과 패딩 마스크

  # 디코더의 출력은 dec_outputs. 출력층으로 전달된다.
  dec_outputs = decoder(vocab_size=vocab_size, num_layers=num_layers, dff=dff,
      d_model=d_model, num_heads=num_heads, dropout=dropout,
  )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

  # 다음 단어 예측을 위한 출력층
  outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

  return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)

In [58]:
def loss_function(y_true, y_pred):
  y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))

  loss = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true, y_pred)

  mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
  loss = tf.multiply(loss, mask)

  return tf.reduce_mean(loss)


In [59]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps**-1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


In [66]:
class CustomCallback(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print("EPOCH: ", epoch)
        for sen in sentences:
            translate(sen, model, ko_tokenizer, en_tokenizer)

In [68]:
tf.keras.backend.clear_session()

# Hyper-parameters
D_MODEL = 256
NUM_LAYERS = 2
NUM_HEADS = 8
DFF = 512
DROPOUT = 0.1
VOCAB_SIZE = 20000
MAX_LENGTH = 50

model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    dff=DFF,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)

(1, 20000, 256)
(1, 20000, 256)


In [69]:
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def accuracy(y_true, y_pred):
  # 레이블의 크기는 (batch_size, MAX_LENGTH - 1)
  y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
  return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [70]:
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## 총평

### 발생된 문제점

decoder layer를 형성하는 과정에서 GD10에서 제공한 decoder의 multi-head attention의 경우 masked multi-head attention은 causality_mask를 사용하고 multi-head attention은 padding_mask를 사용했으나 GD12의 경우 이를 바꾸어 사용하였다. 하지만 아직 세부적인 계산 내용까지 완전하게 이해하고 있지 않기 때문에 좀 더 공부가 필요하다.

### 모델 평가
seq2seq 번역 모델과 비교하였을때 1 epoch만에 키워드를 뽑아낼만큼 성능이 매우 뛰어났다.
