In [30]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'

import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truethype/nanum/NanumBarunGothic.ttf'

font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic')
mpl.font_manager._rebuild()

In [31]:
import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split

import matplotlib.ticker as ticker
import matplotlib.pyplot as plt

import time
import re
import os
import io

In [32]:
data_dir = os.getenv('HOME')+'/aiffel/transformer/data'
kor_path = data_dir+"/korean-english-park.train.ko"
eng_path = data_dir+"/korean-english-park.train.en"

# 데이터 정제 및 토큰화
def clean_corpus(kor_path, eng_path):
    with open(kor_path, "r") as f: 
        kor = f.read().splitlines()
    with open(eng_path, "r") as f: 
        eng = f.read().splitlines()
    assert len(kor) == len(eng)
    
    return kor, eng


ko_corpus, en_corpus = clean_corpus(kor_path, eng_path)

In [33]:
ko_corpus[:5]

['개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"',
 '모든 광마우스와 마찬가지 로 이 광마우스도 책상 위에 놓는 마우스 패드를 필요로 하지 않는다.',
 '그러나 이것은 또한 책상도 필요로 하지 않는다.',
 '79.95달러하는 이 최첨단 무선 광마우스는 허공에서 팔목, 팔, 그외에 어떤 부분이든 그 움직임에따라 커서의 움직임을 조절하는 회전 운동 센서를 사용하고 있다.',
 '정보 관리들은 동남 아시아에서의 선박들에 대한 많은 (테러) 계획들이 실패로 돌아갔음을 밝혔으며, 세계 해상 교역량의 거의 3분의 1을 운송하는 좁은 해로인 말라카 해협이 테러 공격을 당하기 쉽다고 경고하고 있다.']

In [34]:
en_corpus[:5]

['Much of personal computing is about "can you top this?"',
 'so a mention a few weeks ago about a rechargeable wireless optical mouse brought in another rechargeable, wireless mouse.',
 "Like all optical mice, But it also doesn't need a desk.",
 'uses gyroscopic sensors to control the cursor movement as you move your wrist, arm, whatever through the air.',

In [35]:
cleaned_ko_corpus = list(set(ko_corpus))
cleaned_en_corpus = list(set(en_corpus))

print(cleaned_ko_corpus == cleaned_en_corpus)

False


In [36]:
corpus = []

for i in range(len(ko_corpus)):
    st = ko_corpus[i] + '\t' + en_corpus[i]
    corpus.append(st)

corpus[:5]

cleaned_corpus = list(set(corpus))
cleaned_corpus[10]

'지난해 6월 체코에서는 TV 날씨방송 중 웹캠을 통해 나온 정지 화면으로 핵무기 폭파 장면이 나타났다.\tA freeze-frame from the Web cam shows an image of a nuclear explosion.'

In [37]:
from konlpy.tag import Mecab

def preprocess_sentence(sentence):
    mecab = Mecab()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^ㄱ-하-ㅣ가-힣a-zA-Z 0-9 ]", " ", sentence)
    sentence = mecab.morphs(sentence)

    return sentence

In [52]:
import sentencepiece as spm

def generate_tokenizer(corpus,
                       vocab_size,
                       lang="kor",
                       pad_id=0,
                       bos_id=1,
                       eos_id=2,
                       unk_id=3):
    file = os.getenv('HOME') + '/aiffel/transformer/data/token.tmp'
    
    with open(file, 'w') as f:
        for sen in corpus:
            if sen == corpus[0]:
                print(sen)
            f.write(str(sen) + '\n')
            
    import sentencepiece as spm
    spm.SentencePieceTrainer.Train(
        '--input={} --model_prefix={} --vocab_size={} --pad_id={} --bos_id={} --eos_id={} --unk_id={}'\
        .format(file, lang, vocab_size, pad_id, bos_id, eos_id, unk_id)
#         + \
#         '--pad_id==%d --bos_id=%d --eos_id=%d --unk_id=%d'\
#         % (pad_id, bos_id, eos_id, unk_id)
    )
    
    s_path = os.getenv('HOME') + '/aiffel/transformer/data/' + lang + '_spm.model'

    s = spm.SentencePieceProcessor()
    s.Load(s_path)
    tokenizer = s

    return tokenizer


In [40]:
VOCAB_SIZE = 8000

cleaned_ko_corpus = []
cleaned_en_corpus = []

for pair in cleaned_corpus:
    k, e = pair.split("\t")

    cleaned_ko_corpus.append(preprocess_sentence(k))
    cleaned_en_corpus.append(preprocess_sentence(e))

cleaned_ko_corpus[10]

['지난해',
 '6',
 '월',
 '체코',
 '에서',
 '는',
 'TV',
 '날씨',
 '방송',
 '중',
 '웹캠',
 '을',
 '통해',
 '나온',
 '정지',
 '화면',
 '으로',
 '핵무기',
 '폭파',
 '장면',
 '이',
 '나타났',
 '다']

In [53]:
ko_tokenizer = generate_tokenizer(cleaned_ko_corpus, VOCAB_SIZE, 'kor')
en_tokenizer = generate_tokenizer(cleaned_en_corpus, VOCAB_SIZE, 'eng')
en_tokenizer.set_encode_extra_options("bos:eos")

['레바논', '소식통', '은', '국경선', '의', '상황', '이', '매우', '위태', '롭', '다고', '전했', '다']
['The', 'situation', 'on', 'the', 'border', 'is', 'very', 'tense', 'the', 'Lebanese', 'sources', 'said']


True

In [81]:
src_corpus = []
tgt_corpus = []

assert len(kor_corpus) == len(eng_corpus)

# 토큰의 길이가 50 이하인 문장만 남김
for idx in tqdm(range(len(kor_corpus))):
    src = ko_tokenizer.EncodeAsIds(kor_corpus[idx])
    tgt = en_tokenizer.EncodeAsIds(eng_corpus[idx])
    
    if len(src) <= 50 and len(tgt) <= 50:
        src_corpus.append(src)
        tgt_corpus.append(tgt)

# 패딩처리를 완료하여 학습용 데이터를 완성
enc_train = tf.keras.preprocessing.sequence.pad_sequences(src_corpus, padding='post')
dec_train = tf.keras.preprocessing.sequence.pad_sequences(tgt_corpus, padding='post')

100%|██████████| 94123/94123 [00:08<00:00, 11189.93it/s]


In [82]:
SRC_VOCAB_SIZE = TGT_VOCAB_SIZE = 20000

In [83]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, int(i) / d_model)
    
    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]
    
    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for \
                              pos_i in range(pos)])
    
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
    
    return sinusoid_table

In [102]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        self.depth = d_model // self.num_heads
        
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        
        self.linear = tf.keras.layers.Dense(d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)
        
        
        scaled_qk = QK / tf.math.sqrt(d_k)
        
        if mask is not None:
            scaled_qk += (mask * -1e9)
            
        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)
            
        return out, attentions
    
    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])
        
        return split_x
    
    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_z = tf.reshape(combined_x, (bsz, -1, self.d_model))
        
        return combined_x
    
    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)
        
        out = self.combine_heads(out)
        out = self.linear(out)
        
        return out, attention_weights

In [103]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        
        self.w_1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.w_2 = tf.keras.layers.Dense(d_model)
        
    def call(self, x):
        out = self.w_1(x)
        out = self.w_2(out)
        
        return out

In [104]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        
        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)
        
        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        # Multi-Head Attention
        
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual
        
        # Position-Wise Feed Forward Network
        
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual
        
        return out, enc_attn

In [105]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        
        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)
        
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)
        
        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, enc_out, causality_mask, padding_mask):
        # Masked Multi-Head Attention
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, causality_mask)
        out = self.do(out)
        out += residual
        
        # Multi-Head Attention
        residual = out
        out = self.norm_2(out)
        out, dec_enc_attn = self.enc_dec_attn(out, enc_out,
                                    enc_out, padding_mask)
        out = self.do(out)
        out += residual
        
        # Position-Wise Feed Forward Network
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual
        
        return out, dec_attn, dec_enc_attn

In [106]:
class Encoder(tf.keras.Model):
    def __init__(self,
                n_layers,
                d_model,
                n_heads,
                d_ff,
                dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads,
                    d_ff, dropout) for _ in range(n_layers)]
        
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        out = x
        
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            end_attns.append(enc_attn)
            
        return out, enc_attns

In [107]:
class Decoder(tf.keras.Model):
    def __init__(self,
                n_layers,
                d_model,
                n_heads,
                d_ff,
                dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads,
                        d_ff, dropout) for _ in range(n_layers)]
        
    def call(self, x, enc_out, causality_mask, padding_mask):
        out = x
        
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
                self.dec_layers[i](out, enc_out, causality_mask,
                                  padding_mask)
            
            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)
            
        return out, dec_attns, dec_enc_attns

In [108]:
class Transformer(tf.keras.Model):
    def __init__(self,
                n_layers,
                d_model,
                n_heads,
                d_ff,
                src_vocab_size,
                tgt_vocab_size,
                pos_len,
                dropout=0.2,
                shared=True):
        super(Transformer, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        
        self.enc_emb = tf.keras.layers.Embedding(
            src_vocab_size, d_model)
        self.dec_emb = tf.keras.layers.Embedding(
            tgt_vocab_size, d_model)
        
        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)
        
        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)
        
        self.fc = tf.keras.layers.Dense(tgt_vocab_size)
        
        self.shared = shared
        
        if shared:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))
            
    def embedding(self, emb, x):
        # 입력된 정수 배열을 Embedding+Pos Encoding+Shared : Scaling 작업포함
        # x: [bath x length]
        # return: [batch x length x emb]
        
        seq_len = x.shape[1]
        out = emb(x)
        
        if self.shared:
            out *= tf.math.sqrt(self.d_model)
            
        out += self.pos_encoding[np.newaxis, ...][:,:seq_len, :]
        out = self.do(out)

        return out
    
    def call(self, enc_in, dec_in, enc_mask, causality_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)
        
        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        
        dec_out, dec_attns, dec_enc_attns = \
            self.decoder(dec_in, enc_out, causality_mask, dec_mask)
        
        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns

In [109]:
import numpy as np
import tensorflow as tf

def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_mask = generate_padding_mask(tgt)
    
    dec_enc_causality_mask = generate_causality_mask(
        tgt.shape[1], src.shape[1])
    dec_enc_mask = tf.maximum(enc_mask, dec_enc_causality_mask)
    
    dec_causality_mask = generate_causality_mask(
        tgt.shape[1], tgt.shape[1])
    dec_mask = tf.maximum(dec_mask, dec_causality_mask)
    
    return enc_mask, dec_enc_mask, dec_mask

In [110]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        
    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)
    

In [111]:
transformer = Transformer(2, 256, 8, 1024, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, 50,) ### jjang

In [112]:
learning_rate = LearningRateScheduler(512)
optimizer = tf.keras.optimizers.Adam(learning_rate,
                                    beta_1=0.9,
                                    beta_2=0.98,
                                    epsilon=1e-9)

In [113]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    
    # Masking 되지 않은 입력의 개수로 scaling하는 과정
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [114]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    gold = tgt[:, 1:]
    
    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt)
    
    # 계산된 loss에 tf.GradientTape()를 적용해 학습을 진행
    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
            model(src, tgt, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions[:, :-1])
        
    # 최종적으로 optimizer.apply_gradients()가 사용
    ### jjang
    loss = (loss / int(tgt.shape[1]))
    variabels = enc_attns + dec_attns
    gradients = tape.gradient(loss, variables)
    
    optimizer.apply_graditnes(zip(gradients, variable))
    
    return loss, enc_attns, dec_attns, dec_enc_attns

In [115]:
def translate(sentence, encoder, decoder):
    result, sentence, attention = evaluate(sentence, encoder, decoder)
    
    print('Input: %s' %(sentence))
    print('Predicted translatio: {}'.format(result))
    
    attention = \
        attention[:len(result.split()), :len(sentence.split())]
    plot_attention(attention, sentence.split(), result.split(' '))

In [116]:
# Training Process

from tqdm import tqdm
import random

BATCH_SIZE = 64
EPOCHS = 10

items = ["오바마는 대통령이다.",
        "시민들은 도시 속에 산다.",
        "커피는 필요 없다.",
        "일곱 명의 사망자가 발생했다."]

for epoch in range(EPOCHS):
    total_loss = 0

    idx_list = list(range(0, enc_train.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm(idx_list)

    for (batch, idx) in enumerate(t):
        batch_loss = train_step(enc_train[idx:idx+BATCH_SIZE],
                                dec_train[idx:idx+BATCH_SIZE],
                                transformer,
                                optimizer)

        total_loss += batch_loss
    
        t.set_description_str('Epoch %2d' % (epoch + 1))
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))

    test_loss = 0

    idx_list = list(range(0, enc_val.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm(idx_list)

    for (test_batch, idx) in enumerate(t):
        test_batch_loss = eval_step(enc_val[idx:idx+BATCH_SIZE],
                                    dec_val[idx:idx+BATCH_SIZE],
                                    encoder,
                                    decoder,
                                    dec_tokenizer)

        test_loss += test_batch_loss

        t.set_description_str('Test Epoch %2d' % (epoch + 1))
        t.set_postfix_str('Test Loss %.4f' % (test_loss.numpy() / (test_batch + 1)))
        
    for item in items:
        translate(item, encoder, decoder)


  0%|          | 0/747 [00:00<?, ?it/s]


ValueError: in user code:

    <ipython-input-96-2a058133c06f>:9 train_step  *
        predictions, enc_attns, dec_attns, dec_enc_attns =             model(src, tgt, enc_mask, dec_enc_mask, dec_mask)
    <ipython-input-90-ae3e300e88ee>:53 call  *
        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
    <ipython-input-88-b6b6da7420fd>:20 call  *
        out, enc_attn = self.enc_layers[i](out, mask)
    <ipython-input-86-44e492af5fc2>:20 call  *
        out += residual
    /home/aiffel/anaconda3/envs/aiffel/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:984 binary_op_wrapper
        return func(x, y, name=name)
    /home/aiffel/anaconda3/envs/aiffel/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:1276 _add_dispatch
        return gen_math_ops.add_v2(x, y, name=name)
    /home/aiffel/anaconda3/envs/aiffel/lib/python3.7/site-packages/tensorflow/python/ops/gen_math_ops.py:483 add_v2
        "AddV2", x=x, y=y, name=name)
    /home/aiffel/anaconda3/envs/aiffel/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py:744 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    /home/aiffel/anaconda3/envs/aiffel/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py:595 _create_op_internal
        compute_device)
    /home/aiffel/anaconda3/envs/aiffel/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:3327 _create_op_internal
        op_def=op_def)
    /home/aiffel/anaconda3/envs/aiffel/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:1817 __init__
        control_input_ops, op_def)
    /home/aiffel/anaconda3/envs/aiffel/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:1657 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimensions must be equal, but are 50 and 64 for '{{node transformer_3/encoder_3/encoder_layer_6/add}} = AddV2[T=DT_FLOAT](transformer_3/encoder_3/encoder_layer_6/dropout_17/Identity, transformer_3/dropout_16/Identity)' with input shapes: [64,50,8,256], [64,50,256].
