In [None]:
# Colab only: mount Google Drive at /content/drive so the dataset archives
# stored under MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install sentencepiece

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sentencepiece as spm

import re
import os
import random
import math

from tqdm import tqdm
import matplotlib.pyplot as plt



In [None]:
# Unpack the dev / test / train archives from Drive into the current working
# directory (-z: gunzip, -x: extract, -v: list extracted files, -f: archive path).
! tar -zxvf /content/drive/MyDrive/korean-english-park.dev.tar.gz
! tar -zxvf /content/drive/MyDrive/korean-english-park.test.tar.gz
! tar -zxvf /content/drive/MyDrive/korean-english-park.train.tar.gz

In [None]:
k_file_path = '/content/korean-english-park.train.ko'
e_file_path = '/content/korean-english-park.train.en'

# Read both sides of the training corpus, one sentence per line.
# - The encoding is pinned to UTF-8 so the Korean text decodes the same way on
#   every host instead of depending on the platform default.
# - Lines are split on newlines only. str.splitlines() additionally splits on
#   characters such as U+2028 or form feed; a single such character inside a
#   sentence would shift one file against the other and break the alignment.
with open(k_file_path, 'r', encoding='utf-8') as f:
    ko_raw = [line.rstrip('\n') for line in f]

with open(e_file_path, 'r', encoding='utf-8') as f:
    en_raw = [line.rstrip('\n') for line in f]

# Peek at the first three sentences of each side.
print(ko_raw[:3])
print(en_raw[:3])

In [None]:
import re
def preprocess_sentence(sentence):
    """Normalize one raw sentence (Korean or English).

    Steps, in order: lowercase; surround each of ? ! , . " with spaces; replace
    every character that is not an ASCII letter, Hangul, a digit or one of
    those punctuation marks with a space; collapse runs of spaces; strip.

    Args:
        sentence: Raw text of a single sentence.

    Returns:
        The cleaned sentence as a str (may be empty).
    """
    sentence = sentence.lower()
    # Pad the punctuation we keep so that each mark stands as its own token.
    sentence = re.sub(r'([?!,."])', r' \1 ', sentence)
    # Spell the letter range as a-zA-Z: the range A-z also covers the ASCII
    # symbols [ \ ] ^ _ ` that sit between 'Z' and 'a', which let them through.
    sentence = re.sub(r'[^a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣0-9?!,."]', ' ', sentence)
    # Collapse spaces only. The previous character class [" "] also matched the
    # double quote and therefore erased the quotes preserved by the steps above.
    sentence = re.sub(r' +', ' ', sentence)
    sentence = sentence.strip()
    return sentence

In [None]:
def clean_corpus(kor,eng):
    """Preprocess a parallel corpus and drop duplicate sentence pairs.

    Args:
        kor: Sequence of raw Korean sentences.
        eng: Sequence of raw English sentences, aligned index-by-index with
            ``kor``.

    Returns:
        list of unique ``(korean, english)`` tuples of preprocessed sentences,
        in order of first appearance.

    Raises:
        AssertionError: if the two sides have different lengths.
    """
    assert len(kor) == len(eng)
    print(' 데이터 수  :', len(kor))

    # De-duplicate through dict keys rather than a set: a dict keeps insertion
    # order, so the result is identical on every run. Iterating a set of
    # strings is not, because string hashing is randomized per process.
    unique_pairs = {}
    # zip is consumed lazily; `total` lets tqdm still show a full progress bar.
    for ko_sentence, en_sentence in tqdm(zip(kor, eng), total=len(kor)):
        pair = (preprocess_sentence(ko_sentence), preprocess_sentence(en_sentence))
        unique_pairs[pair] = None
    print(len(unique_pairs))
    cleaned_corpus = list(unique_pairs)
    return cleaned_corpus
# Pipeline so far: load the data -> regex cleanup -> remove duplicates.
# Good data = 1. plentiful, 2. no mixing-up of categories, 3. diverse.
#

In [None]:
# Preprocess and de-duplicate the training sentence pairs read from disk.
cleaned_corpus = clean_corpus(ko_raw, en_raw)

In [None]:
def generate_tokenizer(corpus, vocab_size, lang='ko',
                       pad_id =0,
                       bos_id  = 1, # start of sentence
                       eos_id = 2, # end of sentence
                       unk_id = 3, # unknown token
                       model_type='bpe'):
    """Train a SentencePiece model on ``corpus`` and return it loaded.

    The sentences are written to ``./<lang>_corpus.txt`` (one per line), the
    trainer is run with model prefix ``./<lang>_spm``, and the resulting
    ``./<lang>_spm.model`` is loaded.

    Both file names depend only on ``lang``: calling this twice with the same
    ``lang`` overwrites the earlier corpus and model files, so pass a distinct
    ``lang`` for every language.

    Args:
        corpus: Iterable of sentences; each item is written via ``str(item)``.
        vocab_size: Vocabulary size passed to the trainer.
        lang: Tag used to build the corpus and model file names.
        pad_id, bos_id, eos_id, unk_id: Ids for the special tokens.
        model_type: SentencePiece model type passed to the trainer.

    Returns:
        A ``spm.SentencePieceProcessor`` with the trained model loaded.
    """
    corpus_path = './%s_corpus.txt' %  lang
    model_prefix = "./%s_spm" % lang

    # Write UTF-8 explicitly so the training file does not depend on the
    # platform's default encoding (the corpus contains Korean text).
    with open(corpus_path, 'w', encoding='utf-8') as f:
        f.writelines(str(row) + '\n' for row in corpus)

    spm.SentencePieceTrainer.train(
        input=corpus_path, model_prefix=model_prefix, vocab_size=vocab_size,
        pad_id=pad_id, bos_id=bos_id, eos_id=eos_id, unk_id=unk_id,
        model_type=model_type
    )
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load('%s.model' % model_prefix)
    return tokenizer

In [None]:
# Split the cleaned pairs back into two aligned tuples of sentences.
kor, eng = zip(*cleaned_corpus)
print(kor[0])
vocab_size = 10000
ko_tokenizer = generate_tokenizer(kor, vocab_size, lang='ko')
# Pass lang='en' so the English corpus and model get their own files. With the
# default lang ('ko') this call overwrote ./ko_corpus.txt and ./ko_spm.model
# with English data.
# NOTE: the misspelled name `en_tokenzier` is kept because later cells use it.
en_tokenzier = generate_tokenizer(eng, vocab_size, lang='en')

In [None]:
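# Source sentence (before translation)
# Target sentence (after translation)
# Layout: <start>source sentence<end><start>target sentence<end>
# Example source: 오늘 점심 뭐 먹을까? ("What should we have for lunch today?")
# <start>오늘 점심 뭐 먹을까?<end><start>오늘 점심은 식당에서 알아서 드세요<end>  (target: "Please sort out your own lunch at the restaurant today")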

In [None]:
# Korean is the language we want to predict, so turn on the option that makes
# the Korean tokenizer add the BOS and EOS tokens when encoding.

ko_tokenizer.SetEncodeExtraOptions("bos:eos")

In [None]:
def make_corpus(sentences, tokenizer):
    """Encode every sentence into token ids.

    Args:
        sentences: Iterable of sentence strings.
        tokenizer: Object exposing ``encode_as_ids(sentence)``.

    Returns:
        list with one id sequence per input sentence, in input order.
        Progress is reported through tqdm.
    """
    return [tokenizer.encode_as_ids(text) for text in tqdm(sentences)]


In [None]:
# Token-id sequences for the Korean side of the training pairs.
ko_corpus = make_corpus(kor, ko_tokenizer)

In [None]:
# Token-id sequences for the English side of the training pairs.
en_corpus = make_corpus(eng, en_tokenzier)
# Preprocessing pipeline: load data -> regex cleanup -> remove duplicates -> tokenize -> length
#14:02

In [None]:
# Show one aligned example: each sentence followed by its token ids,
# Korean first, then English (one item per output line).
print(kor[1], ko_corpus[1], eng[1], en_corpus[1], sep='\n')

In [None]:
def num_of_word(corpus):
    """Return the length of every item of ``corpus``, in order.

    Args:
        corpus: Sequence of sized items (e.g. token-id lists).

    Returns:
        list of ints, ``len(item)`` for each item.
    """
    return [len(tokens) for tokens in corpus]

In [None]:
# Preview only the first few lengths; displaying the whole list prints one
# number per training sentence and swamps the notebook output.
num_of_word(ko_corpus)[:10]

In [None]:
from collections import Counter
def make_graph(length_sen ,title=None):
    """Draw a bar chart of how often each sentence length occurs.

    Args:
        length_sen: Iterable of sentence lengths (ints).
        title: Optional figure title; omitted when falsy.
    """
    length_counts = Counter(length_sen)
    # Use the observed lengths themselves as x positions. Indexing with
    # range(len(counter)) only covers 0 .. (#distinct lengths - 1), so any gap
    # in the lengths silently dropped the bars for the longest sentences.
    lengths = sorted(length_counts)
    plt.figure(figsize=(16,10))
    if title:
        plt.title(title)
    plt.bar(lengths, [length_counts[length] for length in lengths])
    plt.show()

In [None]:
# Token-length distributions of both sides of the training corpus.
make_graph(num_of_word(ko_corpus) , 'korean')
make_graph(num_of_word(en_corpus) , 'english')

In [None]:
MAX_LENGTH = 80
# Pad (with 0) or cut every sequence at the end so both arrays have exactly
# MAX_LENGTH columns. The English corpus is processed first, then the Korean.
# NOTE(review): truncating='post' removes tokens from the end, so a Korean
# sequence longer than MAX_LENGTH loses its trailing EOS id — confirm that
# this is acceptable.
en_ndarray, ko_ndarray = (
    tf.keras.preprocessing.sequence.pad_sequences(
        token_ids,
        maxlen=MAX_LENGTH,
        truncating='post',
        padding='post',
    )
    for token_ids in (en_corpus, ko_corpus)
)

In [None]:
# A cell only displays its last expression, so the English preview on its own
# line was silently discarded. Show both as one tuple.
en_ndarray[:5], ko_ndarray[:5]

In [None]:
val_k_file_path= '/content/korean-english-park.dev.ko'
val_e_file_path = '/content/korean-english-park.dev.en'

# Read the validation split one sentence per line. UTF-8 is set explicitly for
# the Korean text, and lines are split on newlines only: str.splitlines() also
# splits on characters such as U+2028 or form feed, which could misalign the
# two files.
with open(val_k_file_path, 'r', encoding='utf-8') as f:
    val_ko_raw = [line.rstrip('\n') for line in f]
with open(val_e_file_path, 'r', encoding='utf-8') as f:
    val_en_raw = [line.rstrip('\n') for line in f]

# Same cleaning and de-duplication as for the training split.
val_cleaned_corpus  = clean_corpus(val_ko_raw, val_en_raw)

In [None]:
# Tokenize the validation pairs with the tokenizers trained on the training
# split, then pad or cut each side to MAX_LENGTH exactly as for training.
val_kor, val_eng = zip(*val_cleaned_corpus)
val_ko_corpus = make_corpus(val_kor, ko_tokenizer)
val_en_corpus = make_corpus(val_eng, en_tokenzier)

val_en_ndarray, val_ko_ndarray = (
    tf.keras.preprocessing.sequence.pad_sequences(
        token_ids,
        maxlen=MAX_LENGTH,
        truncating='post',
        padding='post',
    )
    for token_ids in (val_en_corpus, val_ko_corpus)
)


In [None]:
# Check the padded validation arrays; both were padded with maxlen=MAX_LENGTH.
val_ko_ndarray.shape , val_en_ndarray.shape

In [None]:
BATCH_SIZE = 256


def _make_dataset(enc_in, dec_in, shuffle=False):
    """Wrap an (encoder input, decoder input) pair into a batched tf.data.Dataset.

    Each element is ``({'enc_in': ..., 'dec_in': ...}, label)``, where the label
    is the same array that is passed as ``dec_in``.

    Args:
        enc_in: Array of encoder inputs, one row per example.
        dec_in: Array of decoder inputs / labels, aligned with ``enc_in``.
        shuffle: When True, shuffle the examples before batching.

    Returns:
        A ``tf.data.Dataset`` batched with ``BATCH_SIZE``.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        ({'enc_in': enc_in, 'dec_in': dec_in}, dec_in))
    if shuffle:
        # Shuffle over the whole set and before batching, so the batches are
        # composed differently in every epoch.
        dataset = dataset.shuffle(buffer_size=len(enc_in))
    return dataset.batch(batch_size=BATCH_SIZE)


# NOTE(review): 'dec_in' and the label are the same, unshifted sequence. With
# teacher forcing the decoder input is normally the target shifted by one
# position — confirm that the model or training step performs that shift.
train_dataset = _make_dataset(en_ndarray, ko_ndarray, shuffle=True)
val_dataset = _make_dataset(val_en_ndarray, val_ko_ndarray)
train_dataset

In [None]:
# Scratch cell: tries out the building blocks that positional_encoding uses
# (arange + reshape, a zeros table, stepping through even indices).
# The first two expressions are evaluated and discarded; only the loop prints.
np.arange(500).reshape(1,-1)
np.zeros((100,10))
for i in range(0, 100, 2):
    print(i)

In [None]:
def positional_encoding(pos, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        pos: Number of positions (rows) to generate, i.e. positions 0..pos-1.
        d_model: Width of the encoding (columns).

    Returns:
        np.ndarray of shape (pos, d_model) where, for position p,
            table[p, 2k]     = sin(p / 10000 ** (2k / d_model))
            table[p, 2k + 1] = cos(p / 10000 ** (2k / d_model))
    """
    def get_angles(pos, i, d_model):
        # i // 2 gives each (even, odd) column pair the same frequency.
        return pos / 10000**(2*(i//2)/d_model)

    positions = np.arange(pos).reshape(-1,1)        # column vector of positions
    dims = np.arange(d_model).reshape(1,-1)         # row vector of column indices

    # Broadcasting the column against the row yields the full (pos, d_model) grid.
    angles = get_angles(positions, dims, d_model)

    sinusoid_table = np.zeros(angles.shape)

    # Standard layout: sine on even columns, cosine on odd columns. The two were
    # swapped before. NumPy is used directly because the table is a NumPy array.
    sinusoid_table[:,0::2] = np.sin(angles[:, 0::2])
    sinusoid_table[:,1::2] = np.cos(angles[:, 1::2])
    return sinusoid_table

In [None]:
# Quick look at a small table: 4 positions, width 5.
print(positional_encoding(4,5))

In [None]:
def generate_padding_mask(seq, pad_id=None):
    """Return a float mask that is 1.0 wherever ``seq`` holds the padding id.

    Args:
        seq: Tensor/array of integer token ids — assumed to be
            (batch, seq_len); TODO confirm against the callers.
        pad_id: Id treated as padding. Defaults to ``ko_tokenizer.pad_id()``.

    Returns:
        float32 tensor equal to the 0/1 comparison result with two size-1 axes
        inserted after the first axis (for a 2-D input: (batch, 1, 1, seq_len)).
    """
    if pad_id is None:
        pad_id = ko_tokenizer.pad_id()
    # tf.cast takes (x, dtype). The previous call passed 1.0 as the dtype and 0
    # as the name, which raises instead of producing a mask.
    mask = tf.cast(tf.math.equal(seq, pad_id), tf.float32)
    return mask[: , tf.newaxis, tf.newaxis]

In [None]:
class MultiheadAttention(tf.keras.layers.Layer):
    self.depth  = 500

    def scaled_dot_product_attention(q,k,v,mask):
        matmul_qk = tf.matmul(q,k,transpose_b=True)
        matmul_qk = matmul_qk / tf.math.sqrt(self.depth)
        activation_score = tf.keras.activations.softmax(matmul_qk,axis=-1)
        out = tf.matmul(activation_score ,v)