### Translation for kor → eng

In [1]:
import sys
sys.path.insert(0, '../attention-is-all-you-need/')
sys.path.insert(0, '../')

In [2]:
import sentencepiece as spm
import random
import pandas as pd
import os
import numpy as np
from setting import *
from dataset import load_dataset

In [3]:
def load_dataset_aihub(path='../dataset/aihub'):
    sent_pairs = []
    for f in os.listdir(path):
        one_df = pd.read_excel(os.path.join(path, f))
        one_df = one_df.rename(columns={
            '영어':'eng',
            '한국어':'kor',
            '원문':'kor',
            '영어 검수':'label',
            '영어검수':'label',
            '번역문':'label',
            'Review':'label',
            'REVIEW':'label',
        })
        sent_pairs.extend(one_df[['kor','label']].values.tolist())
    return sent_pairs

In [4]:
sent_pairs = load_dataset_aihub()

1_con_1.xlsx
6_off.xlsx
2_conv3.xlsx
5_off.xlsx
3_off_2.xlsx
4_off.xlsx
3_off_3.xlsx
3_off_4.xlsx
1_con_2.xlsx
3_off_1.xlsx


In [5]:
with open('./spm_src.txt', 'w', encoding='utf-8') as f:
    for sent in sent_pairs:
        f.write('{}\n'.format(sent[0]))

In [6]:
with open('./spm_trg.txt', 'w', encoding='utf-8') as f:
    for sent in sent_pairs:
        f.write('{}\n'.format(sent[1]))

In [7]:
! wc -l ./spm_*.txt

  1602418 ./spm_src.txt
  1602418 ./spm_trg.txt
  3204836 total


In [8]:
! head -n 3 ./spm_src.txt

'Bible Coloring'은 성경의 아름다운 이야기를 체험 할 수 있는 컬러링 앱입니다.
씨티은행에서 일하세요?
푸리토의 베스트셀러는 해외에서 입소문만으로 4차 완판을 기록하였다.


In [9]:
! head -n 3 ./spm_trg.txt

Bible Coloring' is a coloring application that allows you to experience beautiful stories in the Bible.
Do you work at a City bank?
PURITO's bestseller, which recorded 4th rough -cuts by words of mouth from abroad.


In [10]:
random.shuffle(sent_pairs)

In [11]:
sent_pairs[0]

['이씨가 스타트업 투자에 관심을 갖게 된 것은 은둔의 투자 고수로 잘 알려진 장덕수 DS자산운용의 회장의 영향인 것으로 전해졌다.',
 "Lee's interest in start-up investment is reportedly due to the influence of Jang Duk-soo, chairman of DS Asset Management Co., who is well known as a reclusive investment master."]

In [12]:
src_vocab_size = 50000  # 50000
trg_vocab_size = 32000  # 32000
src_prefix = 'spm-src-{}'.format(src_vocab_size)
trg_prefix = 'spm-trg-{}'.format(trg_vocab_size)
src_prefix, trg_prefix

('spm-src-50000', 'spm-trg-32000')

In [13]:
src_cmd = templates.format(
    src_input_file,
    pad_id,
    bos_id,
    eos_id,
    unk_id,
    src_prefix,
    src_vocab_size,
    character_coverage,
    model_type)
src_cmd

'--input=./spm_src.txt --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3 --model_prefix=spm-src-50000 --vocab_size=50000 --character_coverage=1.0 --model_type=unigram'

In [14]:
trg_cmd = templates.format(
    trg_input_file,
    pad_id,
    bos_id,
    eos_id,
    unk_id,
    trg_prefix,
    trg_vocab_size,
    character_coverage,
    model_type)
trg_cmd

'--input=./spm_trg.txt --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3 --model_prefix=spm-trg-32000 --vocab_size=32000 --character_coverage=1.0 --model_type=unigram'

In [15]:
spm.SentencePieceTrainer.Train(src_cmd)
spm.SentencePieceTrainer.Train(trg_cmd)

True

In [16]:
! ls -alh *.model

-rw-r--r-- 1 jkfirst deep-learners 1.2M  3월 26 10:40 spm-src-50000.model
-rw-r--r-- 1 jkfirst deep-learners 790K  3월 26 10:43 spm-trg-32000.model


In [17]:
src_spm = spm.SentencePieceProcessor()
trg_spm = spm.SentencePieceProcessor()
src_spm.Load('{}.model'.format(src_prefix)) 
trg_spm.Load('{}.model'.format(trg_prefix))

True

In [18]:
with open('{}.vocab'.format(src_prefix), encoding='utf-8') as f:
    src_vocab = [doc.strip() for doc in f]
src_vocab[:10]

['<pad>\t0',
 '<s>\t0',
 '</s>\t0',
 '<unk>\t0',
 '.\t-3.13177',
 '을\t-4.00631',
 '의\t-4.18175',
 ',\t-4.19567',
 '를\t-4.43072',
 '가\t-4.50283']

In [19]:
with open('{}.vocab'.format(trg_prefix), encoding='utf-8') as f:
    trg_vocab = [doc.strip() for doc in f]
trg_vocab[:10]

['<pad>\t0',
 '<s>\t0',
 '</s>\t0',
 '<unk>\t0',
 '▁the\t-2.83348',
 ',\t-3.18371',
 '.\t-3.36705',
 '▁of\t-3.55772',
 '▁and\t-3.77369',
 '▁to\t-3.80794']

In [20]:
idx = np.random.randint(0, len(sent_pairs))
idx
src_spm.EncodeAsPieces(sent_pairs[idx][0])

['▁레드벨벳',
 '▁조',
 '이',
 '가',
 '▁조세',
 '호를',
 '▁돕기',
 '▁위한',
 '▁특급',
 '▁게스트로',
 '▁나선다',
 '.']

In [21]:
idx = np.random.randint(0, len(sent_pairs))
trg_spm.EncodeAsPieces(sent_pairs[idx][1])

['▁President',
 '▁Trump',
 '▁also',
 '▁made',
 '▁remarks',
 '▁to',
 '▁Howard',
 '▁St',
 'ern',
 '▁in',
 '▁1997',
 '▁disparaging',
 '▁the',
 '▁entire',
 '▁Vietnam',
 '▁War',
 ',',
 '▁saying',
 '▁avoiding',
 '▁v',
 'ene',
 'real',
 '▁diseases',
 '▁was',
 '▁"',
 'my',
 '▁personal',
 '▁Vietnam',
 '▁War',
 '."']

In [22]:
idx = np.random.randint(0, len(sent_pairs))
src_sent = trg_spm.EncodeAsPieces(sent_pairs[idx][0])
trg_sent = trg_spm.EncodeAsPieces(sent_pairs[idx][1])
src_sent, trg_sent

(['▁',
  '특히',
  '▁',
  '현재',
  '▁',
  '교착상태에서',
  '▁',
  '벗어나',
  '▁',
  '다시',
  '▁',
  '비핵화',
  '▁',
  '프로세스가',
  '▁',
  '촉진되는',
  '▁',
  '데',
  '▁',
  '중국이',
  '▁',
  '기여하게',
  '▁',
  '된다면',
  '▁',
  '이후',
  '▁',
  '한반도',
  '▁',
  '평화와',
  '▁',
  '관련된',
  '▁',
  '논의에',
  '▁',
  '중국이',
  '▁',
  '당사자로',
  '▁',
  '참여할',
  '▁',
  '길이',
  '▁',
  '넓어진다',
  '.'],
 ['▁In',
  '▁particular',
  ',',
  '▁if',
  '▁China',
  '▁contribute',
  's',
  '▁to',
  '▁promote',
  '▁the',
  '▁',
  'denuclearization',
  '▁process',
  '▁again',
  '▁by',
  '▁coming',
  '▁out',
  '▁of',
  '▁the',
  '▁current',
  '▁deadlock',
  ',',
  '▁China',
  '▁will',
  '▁have',
  '▁more',
  '▁ways',
  '▁to',
  '▁participate',
  '▁as',
  '▁a',
  '▁party',
  '▁in',
  '▁future',
  '▁discussions',
  '▁on',
  '▁peace',
  '▁on',
  '▁the',
  '▁Korean',
  '▁peninsula',
  '.'])

In [23]:
len(sent_pairs)

1602418