In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install sentencepiece

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sentencepiece as spm

import re
import os
import random
import math

from tqdm import tqdm
import matplotlib.pyplot as plt



In [None]:
! tar -zxvf /content/drive/MyDrive/korean-english-park.dev.tar.gz
! tar -zxvf /content/drive/MyDrive/korean-english-park.test.tar.gz
! tar -zxvf /content/drive/MyDrive/korean-english-park.train.tar.gz

In [None]:
# Paths of the extracted training split of the Korean-English parallel corpus.
k_file_path = '/content/korean-english-park.train.ko'
e_file_path = '/content/korean-english-park.train.en'

# Read one sentence per line. The encoding is given explicitly so the result
# does not depend on the locale of the runtime, and lines are split on '\n'
# only: str.splitlines() would also split on characters such as U+2028 or
# \x85, which could shift one side of the parallel corpus against the other.
with open(k_file_path, 'r', encoding='utf-8') as f:
    ko_raw = [line.rstrip('\n') for line in f]

with open(e_file_path, 'r', encoding='utf-8') as f:
    en_raw = [line.rstrip('\n') for line in f]

# Quick visual check of the first few sentence pairs.
print(ko_raw[:3])
print(en_raw[:3])

In [None]:
import re
def preprocess_sentence(sentence):
    """Normalise one raw sentence before tokenisation.

    Steps, in order:
      1. lower-case the text,
      2. surround the punctuation marks ``? ! , . "`` with spaces so each
         becomes its own whitespace-separated token,
      3. replace every character that is not an ASCII letter, a Hangul
         jamo/syllable, a digit or one of the punctuation marks above with
         a space,
      4. collapse runs of spaces and strip the ends.

    Args:
        sentence: raw sentence (str).

    Returns:
        The cleaned sentence (str); may be empty.
    """
    sentence = sentence.lower()
    sentence = re.sub(r'([?!,."])', r' \1 ', sentence)
    # 'a-zA-Z' instead of 'A-z': the latter also covers [ \ ] ^ _ ` which
    # sit between 'Z' and 'a' in ASCII and were silently kept.
    sentence = re.sub(r'[^a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣0-9?!,."]', ' ', sentence)
    # Collapse spaces only. The previous class '[" "]' also matched the
    # double quote and therefore removed the quotes kept by the two steps above.
    sentence = re.sub(r' +', ' ', sentence)
    return sentence.strip()

In [None]:
def clean_corpus(kor,eng):
    """Preprocess a parallel corpus and drop duplicated sentence pairs.

    Each Korean/English sentence is passed through ``preprocess_sentence``;
    pairs that are identical after preprocessing are kept once. The first
    occurrence wins and the original order is preserved, so the result is
    the same on every run (iterating a ``set`` of strings is not, because
    string hashing is randomised per interpreter start).

    Args:
        kor: sequence of Korean sentences.
        eng: sequence of English sentences, aligned with ``kor``.

    Returns:
        list of ``(korean, english)`` tuples of cleaned sentences.

    Raises:
        AssertionError: if the two sides have different lengths.
    """
    assert len(kor) == len(eng)
    print(' 데이터 수  :', len(kor))

    # A dict keeps insertion order, so it works as an ordered set here.
    unique_pairs = {}
    for ko_sentence, en_sentence in tqdm(zip(kor, eng), total=len(kor)):
        pair = (preprocess_sentence(ko_sentence), preprocess_sentence(en_sentence))
        unique_pairs.setdefault(pair, None)
    print(len(unique_pairs))
    return list(unique_pairs)
# 데이터불러 오고 ->정규표현식 -> 중복 데이터
# 좋은데이터  = 1 . 많고, 2 . 카테고리 혼동 X ,3.다양하게
#

In [None]:
cleaned_corpus = clean_corpus(ko_raw, en_raw)

In [None]:
def generate_tokenizer(corpus, vocab_size, lang='ko',
                       pad_id=0,
                       bos_id=1,  # start of sentence
                       eos_id=2,  # end of sentence
                       unk_id=3,  # unknown token
                       model_type='bpe'):
    """Train a SentencePiece model on ``corpus`` and return it loaded.

    The sentences are written to ``./<lang>_corpus.txt`` (one per line) and
    the trained model is stored under the prefix ``./<lang>_spm``. Calling
    this function twice with the same ``lang`` therefore overwrites the
    files of the first call.

    Args:
        corpus: iterable of sentences; each item is written via ``str()``.
        vocab_size: vocabulary size passed to the SentencePiece trainer.
        lang: tag used to name the corpus and model files.
        pad_id, bos_id, eos_id, unk_id: ids reserved for the special tokens.
        model_type: SentencePiece model type.

    Returns:
        A ``spm.SentencePieceProcessor`` with the trained model loaded.
    """
    corpus_path = './%s_corpus.txt' % lang
    model_prefix = './%s_spm' % lang

    # Explicit UTF-8 so Korean text is written correctly whatever the
    # locale of the runtime is.
    with open(corpus_path, 'w', encoding='utf-8') as f:
        for row in corpus:
            f.write(str(row) + '\n')

    spm.SentencePieceTrainer.train(
        input=corpus_path, model_prefix=model_prefix, vocab_size=vocab_size,
        pad_id=pad_id, bos_id=bos_id, eos_id=eos_id, unk_id=unk_id,
        model_type=model_type
    )
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load('%s.model' % model_prefix)
    return tokenizer

In [None]:
# Split the cleaned (korean, english) pairs back into two aligned tuples.
kor, eng = zip(*cleaned_corpus)
print(kor[0])
vocab_size = 10000
# Pass `lang` explicitly: with the default ('ko') for both calls, the English
# run overwrote ./ko_corpus.txt and ./ko_spm.model written by the Korean run.
ko_tokenizer = generate_tokenizer(kor, vocab_size, lang='ko')
# NOTE: the variable name `en_tokenzier` (sic) is kept because later cells use it.
en_tokenzier = generate_tokenizer(eng, vocab_size, lang='en')

In [None]:
# 번역전 문장
# 번역후 문장
# <Start>번역전문장<end><start>번역후문장<end>
# 오늘 점심 뭐 먹을까?
#<start>오늘 점심 뭐 먹을까?<end><start>오늘 점심은 식당에서 알아서 드세요 <end>

In [None]:
# Korean is the side we want to predict, so make the Korean tokenizer add the
# BOS and EOS tokens to every sentence it encodes.

ko_tokenizer.SetEncodeExtraOptions("bos:eos")

In [None]:
def make_corpus(sentences, tokenizer):
    """Encode every sentence into a list of token ids.

    Args:
        sentences: iterable of sentences.
        tokenizer: object exposing ``encode_as_ids(sentence)``.

    Returns:
        list with one id sequence per input sentence, in input order.
        Progress is reported through ``tqdm``.
    """
    return [tokenizer.encode_as_ids(text) for text in tqdm(sentences)]


In [None]:
ko_corpus = make_corpus(kor, ko_tokenizer)

In [None]:
en_corpus = make_corpus(eng, en_tokenzier)
# 전처리 데이터불러 오고 ->정규표현식 -> 중복 데이터 -> 토큰화 -> 길이
#14:02

In [None]:
print(kor[1])
print(ko_corpus[1])
print(eng[1])
print(en_corpus[1])

In [None]:
def num_of_word(corpus):
    """Return the length of every sequence in ``corpus``, in order.

    Args:
        corpus: sequence of sized items (e.g. lists of token ids).

    Returns:
        list of ints, one length per item.
    """
    return [len(tokens) for tokens in corpus]

In [None]:
num_of_word(ko_corpus)

In [None]:
from collections import Counter
def make_graph(length_sen ,title=None):
    """Plot a bar chart of how often each length occurs.

    Args:
        length_sen: iterable of non-negative integer lengths.
        title: optional figure title.

    The x axis covers every length from 0 up to the largest value seen.
    (Using ``len(Counter)`` -- the number of *distinct* lengths -- as the
    range, as before, dropped every length at or above that count.)
    """
    length_counts = Counter(length_sen)
    # default=-1 gives an empty range (and an empty plot) for empty input.
    longest = max(length_counts, default=-1)
    lengths = range(longest + 1)

    plt.figure(figsize=(16,10))
    if title:
        plt.title(title)
    # Counter returns 0 for lengths that never occur.
    plt.bar(lengths, [length_counts[length] for length in lengths])
    plt.xlabel('length')
    plt.ylabel('count')
    plt.show()

In [None]:
make_graph(num_of_word(ko_corpus) , 'korean')
make_graph(num_of_word(en_corpus) , 'english')

In [None]:
# Bring every training sequence to exactly MAX_LENGTH ids: shorter ones are
# padded at the end with 0 (the pad_sequences default value), longer ones lose
# their tail. NOTE(review): for Korean sequences longer than MAX_LENGTH the
# tail that is cut off includes the EOS id appended by ko_tokenizer -- confirm
# this is acceptable.
MAX_LENGTH = 80
en_ndarray = tf.keras.preprocessing.sequence.pad_sequences(en_corpus, maxlen=MAX_LENGTH,
                                                  truncating='post',
                                                  padding='post')
ko_ndarray = tf.keras.preprocessing.sequence.pad_sequences(ko_corpus, maxlen=MAX_LENGTH,
                                                  truncating='post',
                                                  padding='post')

In [None]:
en_ndarray[:5]
ko_ndarray[:5]

In [None]:
# Paths of the extracted dev split, used as validation data.
val_k_file_path = '/content/korean-english-park.dev.ko'
val_e_file_path = '/content/korean-english-park.dev.en'

# Same reading strategy as for the training split: explicit UTF-8 and
# splitting on '\n' only, so both sides of the corpus stay aligned.
with open(val_k_file_path, 'r', encoding='utf-8') as f:
    val_ko_raw = [line.rstrip('\n') for line in f]
with open(val_e_file_path, 'r', encoding='utf-8') as f:
    val_en_raw = [line.rstrip('\n') for line in f]

val_cleaned_corpus  = clean_corpus(val_ko_raw, val_en_raw)

In [None]:
# Tokenise the validation pairs with the tokenizers trained on the training
# split, then pad/truncate them to MAX_LENGTH exactly like the training data.
val_kor, val_eng = zip(*val_cleaned_corpus)
val_ko_corpus = make_corpus(val_kor, ko_tokenizer)
val_en_corpus = make_corpus(val_eng, en_tokenzier)

val_en_ndarray = tf.keras.preprocessing.sequence.pad_sequences(val_en_corpus, maxlen=MAX_LENGTH,
                                                  truncating='post',
                                                  padding='post')
val_ko_ndarray = tf.keras.preprocessing.sequence.pad_sequences(val_ko_corpus, maxlen=MAX_LENGTH,
                                                  truncating='post',
                                                  padding='post')


In [None]:
val_ko_ndarray.shape , val_en_ndarray.shape

In [None]:
BATCH_SIZE = 256

# Teacher forcing: the decoder is fed the target without its last token and is
# trained to predict the target shifted one step to the left. Feeding the same
# unshifted array as both decoder input and label lets the model simply copy
# the current input token instead of learning to translate.
# The training set is shuffled (reshuffled every epoch by default) because
# Keras does not shuffle tf.data datasets itself.
train_dataset = tf.data.Dataset.from_tensor_slices(({'enc_in' : en_ndarray,
                                                     'dec_in' : ko_ndarray[:, :-1]},
                                                    ko_ndarray[:, 1:])
                                                   ).shuffle(len(en_ndarray)).batch(batch_size = BATCH_SIZE)
val_dataset = tf.data.Dataset.from_tensor_slices(({'enc_in' : val_en_ndarray,
                                                   'dec_in' : val_ko_ndarray[:, :-1]},
                                                  val_ko_ndarray[:, 1:])
                                                 ).batch(batch_size = BATCH_SIZE)
train_dataset

In [None]:
# Scratch cell: tries out reshape, zeros and a stepped range. None of the
# results are assigned to a name; the loop prints the even numbers 0..98.
np.arange(500).reshape(1,-1)
np.zeros((100,10))
for i in range(0, 100, 2):
    print(i)

In [None]:
def positional_encoding(pos, d_model):
    """Build the sinusoidal positional-encoding table.

    Entry ``[p, i]`` is computed from the angle
    ``p / 10000 ** (2 * (i // 2) / d_model)``: even columns hold its sine and
    odd columns its cosine, as in "Attention Is All You Need" (the previous
    version had sine and cosine swapped).

    Args:
        pos: number of positions (rows) to generate.
        d_model: embedding size (columns).

    Returns:
        ``np.ndarray`` of shape ``(pos, d_model)`` and dtype ``float32``.
    """
    positions = np.arange(pos, dtype=np.float64).reshape(-1, 1)
    dims = np.arange(d_model).reshape(1, -1)
    # Columns 2k and 2k+1 share the same frequency, hence the `dims // 2`.
    angles = positions / np.power(10000.0, (2 * (dims // 2)) / d_model)

    # Pure NumPy: no need to route the trigonometry through TensorFlow ops.
    sinusoid_table = np.zeros(angles.shape, dtype=np.float32)
    sinusoid_table[:, 0::2] = np.sin(angles[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(angles[:, 1::2])
    return sinusoid_table

In [None]:
print(positional_encoding(4,5))

In [None]:
def generate_padding_mask(seq, pad_id=None):
    """Mark the padding positions of a batch of id sequences.

    Args:
        seq: 2-D batch of token ids, indexed ``[batch, position]``.
        pad_id: id that marks padding; defaults to ``ko_tokenizer.pad_id()``.

    Returns:
        float32 tensor with two singleton axes inserted after the batch axis
        (``[batch, 1, 1, position]``); 1.0 where ``seq`` equals ``pad_id``,
        0.0 elsewhere.
    """
    if pad_id is None:
        pad_id = ko_tokenizer.pad_id()
    seq = tf.convert_to_tensor(seq)
    is_pad = tf.math.equal(seq, tf.cast(pad_id, seq.dtype))
    # tf.cast takes a dtype as its second argument; the previous
    # `tf.cast(..., 1.0, 0)` raised a TypeError.
    mask = tf.cast(is_pad, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]


def generate_causality_mask(size):
    """Return a ``(size, size)`` float32 mask that is 1.0 strictly above the diagonal.

    Row ``i`` therefore marks every position ``j > i`` (the "future") as masked.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size), dtype=tf.float32), -1, 0)
    return 1.0 - lower_triangle


def generate_masks(src, tgt):
    """Build the three attention masks used by the Transformer cells.

    This helper is called by later cells but was not defined anywhere in the
    notebook. In every mask, 1.0 means "do not attend to this position".

    Args:
        src: 2-D batch of encoder-input ids.
        tgt: 2-D batch of decoder-input ids.

    Returns:
        ``(enc_mask, dec_enc_mask, dec_mask)``:
          * ``enc_mask``: padding mask of ``src`` (encoder self-attention),
          * ``dec_enc_mask``: the same padding mask of ``src``, for the
            decoder's attention over the encoder output,
          * ``dec_mask``: padding mask of ``tgt`` combined with the causality
            mask (decoder self-attention).
    """
    enc_mask = generate_padding_mask(src)
    dec_enc_mask = enc_mask

    tgt_len = tf.shape(tgt)[1]
    dec_mask = tf.maximum(generate_padding_mask(tgt), generate_causality_mask(tgt_len))
    return enc_mask, dec_enc_mask, dec_mask

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Fixes over the previous version:
      * the class is named ``MultiHeadAttention``, the name the encoder and
        decoder layers instantiate (``MultiheadAttention`` stays as an alias),
      * head count and head size are plain ints (they are used as reshape
        dimensions, which cannot be float tensors),
      * ``scaled_dot_product_attention`` is a proper method and applies ``mask``,
      * ``split_heads`` keeps the batch dimension,
      * the output projection ``self.linear`` is applied,
      * ``call`` also accepts ``[query, key, value]`` for cross-attention.

    Args:
        d_model: size of the model dimension; must be divisible by ``num_heads``.
        num_heads: number of attention heads.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                'd_model (%d) must be divisible by num_heads (%d)' % (d_model, num_heads))
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        """Return ``(softmax(q k^T / sqrt(depth)) v, attention weights)``.

        Positions where ``mask`` is 1 receive a large negative score, so their
        weight after the softmax is effectively zero. ``mask`` must be
        broadcastable to the score tensor.
        """
        scores = tf.matmul(q, k, transpose_b=True)
        scores = scores / tf.math.sqrt(tf.cast(self.depth, scores.dtype))
        if mask is not None:
            scores += tf.cast(mask, scores.dtype) * -1e9
        attention_weight = tf.nn.softmax(scores, axis=-1)
        out = tf.matmul(attention_weight, v)
        return out, attention_weight

    def split_heads(self, x):
        """Reshape ``[batch, seq, d_model]`` to ``[batch, heads, seq, depth]``."""
        batch_size = tf.shape(x)[0]
        split_x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(split_x, [0, 2, 1, 3])

    def combine_head(self, x):
        """Inverse of ``split_heads``: back to ``[batch, seq, d_model]``."""
        batch_size = tf.shape(x)[0]
        combined_x = tf.transpose(x, [0, 2, 1, 3])
        return tf.reshape(combined_x, (batch_size, -1, self.d_model))

    def call(self, x, mask):
        """Run attention.

        Args:
            x: a single tensor (self-attention: query = key = value = x) or a
               list/tuple ``[query, key, value]`` (cross-attention).
            mask: tensor broadcastable to ``[batch, heads, q_len, k_len]`` with
               1 at positions to ignore, or ``None``.

        Returns:
            ``(output, attention_weight)``; ``output`` has the query's
            ``[batch, seq, d_model]`` layout.
        """
        if isinstance(x, (list, tuple)):
            query, key, value = x
        else:
            query = key = value = x

        split_q = self.split_heads(self.W_q(query))
        split_k = self.split_heads(self.W_k(key))
        split_v = self.split_heads(self.W_v(value))

        out, attention_weight = self.scaled_dot_product_attention(
            split_q, split_k, split_v, mask)

        out = self.linear(self.combine_head(out))
        return out, attention_weight


# Backward-compatible alias for the original spelling of the class name.
MultiheadAttention = MultiHeadAttention


In [None]:
# Position-wise Feed Forward Network 구현
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    """Feed-forward sub-layer: Dense(d_ff, ReLU) followed by Dense(d_model).

    Args:
        d_model: output size of the second dense layer.
        d_ff: size of the hidden (first) dense layer.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.d_model, self.d_ff = d_model, d_ff

        self.fc1 = tf.keras.layers.Dense(units=d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(units=d_model)

    def call(self, x):
        """Apply both dense layers to ``x`` and return the result."""
        hidden = self.fc1(x)      # expand to d_ff
        return self.fc2(hidden)   # project back to d_model

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention, then a feed-forward network.

    Each sub-layer output goes through dropout, is added to the sub-layer
    input (residual connection) and is then layer-normalised.

    Args:
        d_model: model dimension.
        n_heads: number of attention heads.
        d_ff: hidden size of the feed-forward network.
        dropout: dropout rate shared by both sub-layers.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout):
        super().__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)
        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        # Dropout has no trainable weights, so one instance serves both sub-layers.
        self.drop = tf.keras.layers.Dropout(dropout)

    def call(self, x, mask):
        """Return ``(block output, self-attention weights)``."""
        # Sub-layer 1: multi-head self-attention + residual + layer norm.
        attn_out, enc_attn = self.enc_self_attn(x, mask)
        x = self.norm_1(self.drop(attn_out) + x)

        # Sub-layer 2: position-wise feed-forward + residual + layer norm.
        ffn_out = self.drop(self.ffn(x))
        return self.norm_2(ffn_out + x), enc_attn


In [None]:
# Smoke test: run one EncoderLayer on an all-ones batch of shape (16, 50, 512)
# and display the first example of the output and of the attention weights.
encoderlayer = EncoderLayer(512,8,2048,0.4)

input_tensor_test = np.ones((16,50,512),dtype=np.float32)

# Padding mask built from all-ones ids.
mask = generate_padding_mask(np.ones((16,50),dtype=np.float32))

# `enc_out` is reused by the decoder-layer smoke test in a later cell.
enc_out, attn = encoderlayer(input_tensor_test, mask)
enc_out[0], attn[0]

In [None]:
# Decoder 레이어 구현
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    Each sub-layer output goes through dropout, is added to the sub-layer
    input (residual connection) and is then layer-normalised.

    Args:
        d_model: model dimension.
        num_heads: number of attention heads.
        d_ff: hidden size of the feed-forward network.
        dropout: dropout rate shared by all three sub-layers.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)
        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, enc_out, dec_enc_mask, dec_mask):
        """Return ``(block output, self-attention weights, enc-dec attention weights)``."""
        # Sub-layer 1: (masked) self-attention over the decoder input.
        self_attn_out, dec_attn = self.dec_self_attn(x, dec_mask)
        x = self.norm_1(self.do(self_attn_out) + x)

        # Sub-layer 2: attention with x as query and the encoder output as
        # key and value, passed to the attention layer as one list.
        cross_inputs = [x, enc_out, enc_out]
        cross_attn_out, dec_enc_attn = self.enc_dec_attn(cross_inputs, dec_enc_mask)
        x = self.norm_2(self.do(cross_attn_out) + x)

        # Sub-layer 3: position-wise feed-forward network.
        ffn_out = self.do(self.ffn(x))
        return self.norm_3(ffn_out + x), dec_attn, dec_enc_attn


In [None]:
# Smoke test: run one DecoderLayer on an all-ones batch of shape (16, 50, 512),
# attending over `enc_out` produced by the encoder-layer smoke test.
decoderlayer = DecoderLayer(512,8,2048,0.4)

tgt_tensor_test = np.ones((16,50,512),dtype=np.float32)

# NOTE(review): `generate_masks` must already be defined in the kernel when
# this cell runs.
enc_mask, dec_enc_mask, dec_mask = generate_masks(np.ones((16,50),dtype=np.float32),np.ones((16,50),dtype=np.float32))

dec_out, dec_attn, dec_enc_attn = decoderlayer(tgt_tensor_test, enc_out, dec_enc_mask, dec_mask)
dec_out[0]


In [None]:
# Encoder 구현
class Encoder(tf.keras.Model):
    """Stack of ``n_layers`` EncoderLayer blocks applied one after another.

    Args:
        n_layers: number of encoder blocks.
        d_model, n_heads, d_ff, dropout: forwarded to every EncoderLayer.
    """

    def __init__(self, n_layers, d_model, n_heads, d_ff, dropout):
        super().__init__()
        self.n_layers = n_layers

        blocks = []
        for _ in range(n_layers):
            blocks.append(EncoderLayer(d_model, n_heads, d_ff, dropout))
        self.enc_layers = blocks

        # Created here but not used by call().
        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, mask):
        """Return ``(output of the last block, attention weights of the last block)``.

        The attention variable is overwritten on every iteration, so only the
        final block's weights are returned.
        """
        hidden = x
        for block in self.enc_layers:
            hidden, enc_attns = block(hidden, mask)
        return hidden, enc_attns


In [None]:
# Decoder 구현
class Decoder(tf.keras.Model):
    """Stack of ``n_layers`` DecoderLayer blocks applied one after another.

    Args:
        n_layers: number of decoder blocks.
        d_model, n_heads, d_ff, dropout: forwarded to every DecoderLayer.
    """

    def __init__(self, n_layers, d_model, n_heads, d_ff, dropout):
        super().__init__()
        self.n_layers = n_layers

        blocks = []
        for _ in range(n_layers):
            blocks.append(DecoderLayer(d_model, n_heads, d_ff, dropout))
        self.dec_layers = blocks

    def call(self, x, enc_out, dec_enc_mask, dec_mask):
        """Return ``(output, self-attention weights, enc-dec attention weights)``.

        Both attention variables are overwritten on every iteration, so only
        the final block's weights are returned.
        """
        hidden = x
        for block in self.dec_layers:
            hidden, dec_attns, dec_enc_attns = block(hidden, enc_out, dec_enc_mask, dec_mask)
        return hidden, dec_attns, dec_enc_attns

In [None]:
# Smoke test of the stacked Encoder and Decoder on all-ones inputs of shape
# (16, 50, 512).
n_layers = 6
n_heads = 8
d_model = 512
d_ff = 2048
dropout= 0.4

encoder = Encoder(n_layers=n_layers,
                  n_heads = n_heads,
                  d_model=d_model,
                  d_ff=d_ff,
                  dropout=dropout)
decoder = Decoder(n_layers=n_layers,
                  n_heads = n_heads,
                  d_model=d_model,
                  d_ff=d_ff,
                  dropout=dropout)

input_test = np.ones((16,50,512),dtype=np.float32)

tgt_test = np.ones((16,50,512),dtype=np.float32)

# NOTE(review): `generate_masks` must already be defined in the kernel when
# this cell runs.
enc_mask, dec_enc_mask, dec_mask = generate_masks(np.ones((16,50),dtype=np.float32),np.ones((16,50),dtype=np.float32))

enc_out, enc_attns = encoder(input_test,enc_mask)

dec_out, dec_attns, dec_enc_attns = decoder(tgt_test,enc_out,dec_enc_mask, dec_mask)


In [None]:
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Args:
        n_layers: number of encoder blocks and of decoder blocks.
        d_model: model (embedding) dimension.
        n_heads: number of attention heads.
        d_ff: hidden size of the feed-forward sub-layers.
        src_vocab_size: vocabulary size of the encoder input.
        tgt_vocab_size: vocabulary size of the decoder input/output.
        pos_len: number of positions in the positional-encoding table; inputs
            must not be longer than this.
        dropout: dropout rate.
        shared_fc: if True, the output projection reuses the decoder embedding
            matrix (weight tying) instead of the separate ``self.fc`` layer.
        shared_emb: if True, encoder and decoder share one embedding layer
            (sized by ``src_vocab_size``).

    Input to ``call`` is a dict with the keys ``'enc_in'`` and ``'dec_in'``,
    each a 2-D batch of token ids.

    Fixes over the previous version:
      * the output-mode flag is stored in ``self.return_attention``; it used to
        be stored in ``self.train``, which replaced the ``train()`` method on
        the instance and made ``model.train()`` fail,
      * ``shared_fc`` really ties the weights; the former
        ``self.fc.set_weights(...)`` ran before either layer was built and
        therefore copied nothing,
      * ``call`` is no longer wrapped in ``tf.function``: ``fit`` already
        compiles the training step, and a traced ``call`` would freeze the
        Python-level output-mode flag at its value during the first trace,
      * embeddings are scaled by ``sqrt(d_model)`` before the positional
        encoding is added (``self.d_model`` was computed but never used),
      * the positional table is sliced with the dynamic sequence length.
    """

    def __init__(self,
                 n_layers,
                 d_model,
                 n_heads,
                 d_ff,
                 src_vocab_size,
                 tgt_vocab_size,
                 pos_len,
                 dropout=0.2,
                 shared_fc=True,
                 shared_emb=False):
        super(Transformer, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        # Constant (pos_len, d_model) table, stored as float32 so it can be
        # added to the embedding output without a dtype mismatch.
        self.pos_encoding = tf.constant(positional_encoding(pos_len, d_model),
                                        dtype=tf.float32)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        # Only used when shared_fc is False.
        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        # False: call() returns logits only (training mode).
        # True:  call() additionally returns the attention weights.
        self.return_attention = False

    def embedding(self, emb, x):
        """Embed ids ``x`` with layer ``emb`` and add the positional encoding."""
        out = emb(x) * tf.math.sqrt(self.d_model)
        seq_len = tf.shape(x)[1]
        out += self.pos_encoding[:seq_len]
        return out

    def call(self, x):
        """Compute logits for a batch.

        Args:
            x: dict with ``'enc_in'`` and ``'dec_in'`` id batches.

        Returns:
            ``logits`` after ``train()`` (the default), or
            ``(logits, enc_attns, dec_attns, dec_enc_attns)`` after ``eval()``.
        """
        enc_in, dec_in = x['enc_in'], x['dec_in']
        enc_mask, dec_enc_mask, dec_mask = generate_masks(enc_in, dec_in)

        inp_tensor = self.do(self.embedding(self.enc_emb, enc_in))
        tgt_tensor = self.do(self.embedding(self.dec_emb, dec_in))

        enc_out, enc_attns = self.encoder(inp_tensor, enc_mask)
        dec_out, dec_attns, dec_enc_attns = self.decoder(
            tgt_tensor, enc_out, dec_enc_mask, dec_mask)

        if self.shared_fc:
            # Weight tying: project with the transposed decoder embedding
            # matrix, (vocab, d_model) -> logits over the vocabulary.
            logits = tf.einsum('bld,vd->blv', dec_out, self.dec_emb.embeddings)
        else:
            logits = self.fc(dec_out)

        if self.return_attention:
            return logits, enc_attns, dec_attns, dec_enc_attns
        return logits

    def train(self):
        """Make ``call`` return logits only. Does not change dropout behaviour."""
        self.return_attention = False

    def eval(self):
        """Make ``call`` also return the attention weights. Does not change dropout behaviour."""
        self.return_attention = True

In [None]:
# 주어진 하이퍼파라미터로 Transformer 인스턴스 생성
# Hyperparameters of the Transformer instance that is trained further down.
n_layers = 2
n_heads = 8
d_model = 512
d_ff = 2048
dropout= 0.4
src_vocab_size = 10000
tgt_vocab_size = 10000
pos_len = 100
shared_fc = True
shared_emb = False

model = Transformer(n_layers=n_layers,
                    n_heads=n_heads,
                    d_model=d_model,
                    d_ff=d_ff,
                    dropout=dropout,
                    src_vocab_size=src_vocab_size,
                    tgt_vocab_size=tgt_vocab_size,
                    pos_len=pos_len,
                    shared_fc=shared_fc,
                    shared_emb=shared_emb)
# Call the model once on a dummy all-zero batch before asking for the summary.
test_data = {'enc_in':np.zeros((16,40),dtype=np.float32), 'dec_in':np.zeros((16,40),dtype=np.float32)}
model(test_data)
model.summary()

In [None]:
# Learning Rate Scheduler 구현
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up learning-rate schedule.

    ``lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``

    The rate grows linearly for the first ``warmup_steps`` steps and then
    decays with the inverse square root of the step.

    Args:
        d_model: model dimension used to scale the rate.
        warmup_steps: number of warm-up steps.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()

        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for ``step`` (scalar or tensor of steps)."""
        step = tf.cast(step, tf.float32)
        # At step 0 this term is inf, but the warm-up term is 0, so the
        # minimum (and the learning rate) is 0.
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialised.

        Without this override the base class raises ``NotImplementedError``
        when an optimizer or model using the schedule is saved.
        """
        return {'d_model': self.d_model, 'warmup_steps': self.warmup_steps}


In [None]:
# Visualise the schedule for d_model=128 (default warm-up) over 200000 steps.
sample_learning_rate = LearningRateScheduler(d_model=128)

plt.plot(sample_learning_rate(tf.range(200000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

In [None]:
# Create the learning-rate schedule for the model's d_model and hand it to Adam.

learning_rate_scheduler = LearningRateScheduler(d_model=d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_scheduler)


In [None]:
# Loss Function 정의
# Per-token cross-entropy on raw logits; no reduction, so padding can be
# masked out before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy averaged over the non-padding target tokens.

    Args:
        real: integer target ids; id 0 is treated as padding.
            (presumably shaped [batch, seq] -- TODO confirm against the dataset)
        pred: logits with one extra trailing vocabulary axis.

    Returns:
        Scalar loss: the sum of the per-token losses at non-padding positions
        divided by the number of such positions. (Averaging over every
        position, as before, made the loss shrink with the amount of padding
        in a batch.)
    """
    # Reuse the module-level loss object instead of building one per call.
    per_token_loss = loss_object(real, pred)

    mask = tf.cast(tf.not_equal(real, 0), per_token_loss.dtype)
    per_token_loss = tf.multiply(per_token_loss, mask)

    # Guard against a batch that contains only padding.
    n_tokens = tf.maximum(tf.reduce_sum(mask), 1.0)
    return tf.reduce_sum(per_token_loss) / n_tokens

In [None]:
model.compile(optimizer=optimizer,loss=loss_function)

In [None]:
# Train for `epochs` passes over train_dataset, evaluating on val_dataset
# after every epoch.
epochs = 40
model.fit(train_dataset,validation_data=val_dataset,epochs=epochs)