In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; the data
# directory configured in a later cell points into this mount.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Base folder (with trailing slash) that every corpus/pickle path in this
# notebook is appended to.
# NOTE(review): this is a relative path, so it only resolves to the mounted
# Drive when the working directory is /content -- confirm for your runtime.
data_dir = "./drive/My Drive/Transformer/"

In [0]:
# Read the Korean side of the training corpus, one list entry per line
# (trailing newlines are kept). readlines() stops at EOF, so the list does
# not end with a spurious '' entry the way an append-then-check loop does.
with open(data_dir + 'korean-english-park.train.ko', mode='r', encoding='utf-8') as f:
    ko_lines = f.readlines()

In [0]:
# Preview the first five Korean lines.
ko_lines[:5]

['개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"\n',
 '모든 광마우스와 마찬가지 로 이 광마우스도 책상 위에 놓는 마우스 패드를 필요로 하지 않는다.\n',
 '그러나 이것은 또한 책상도 필요로 하지 않는다.\n',
 '79.95달러하는 이 최첨단 무선 광마우스는 허공에서 팔목, 팔, 그외에 어떤 부분이든 그 움직임에따라 커서의 움직임을 조절하는 회전 운동 센서를 사용하고 있다.\n',
 '정보 관리들은 동남 아시아에서의 선박들에 대한 많은 (테러) 계획들이 실패로 돌아갔음을 밝혔으며, 세계 해상 교역량의 거의 3분의 1을 운송하는 좁은 해로인 말라카 해협이 테러 공격을 당하기 쉽다고 경고하고 있다.\n']

In [0]:
# Read the English side of the training corpus, one list entry per line
# (trailing newlines are kept). readlines() stops at EOF, so the list does
# not end with a spurious '' entry the way an append-then-check loop does.
with open(data_dir + "korean-english-park.train.en", mode='r', encoding='utf-8') as f:
    en_lines = f.readlines()

# Preview the first five English lines.
en_lines[:5]

['Much of personal computing is about "can you top this?"\n',
 'so a mention a few weeks ago about a rechargeable wireless optical mouse brought in another rechargeable, wireless mouse.\n',
 "Like all optical mice, But it also doesn't need a desk.\n",
 'uses gyroscopic sensors to control the cursor movement as you move your wrist, arm, whatever through the air.\n',

In [0]:
import os
import pickle
import argparse

import pandas as pd
from pathlib import Path

from torchtext import data as ttd
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

from torchtext import data as ttd
from torchtext.data import Example, Dataset

In [0]:
import sys

In [0]:
 # Replace the process argv with a single empty entry, then drop the `sys`
 # name from the notebook namespace.
 # NOTE(review): presumably done so the argparse.parse_args() call in the
 # __main__ cell does not see the notebook kernel's own arguments -- confirm.
 sys.argv=['']; del sys

In [0]:
def build_tokenizer(train_data_path = 'pickles/train_data.pickle'):
    """
    Train soynlp tokenizer which will be used to tokenize Korean input sentence

    The pickled training data is loaded, a soynlp WordExtractor is trained on
    its 'Kor' column, and the forward cohesion score of every extracted word
    is written to 'pickles/tokenizer.pickle' under data_dir.

    Args:
        train_data_path: path of the pickled training data, relative to
            data_dir. The loaded object must expose a 'Kor' column with
            .tolist() (presumably a pandas DataFrame -- confirm).
    """
    print(f'Now building soy-nlp tokenizer . . .')

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(data_dir+train_data_path, 'rb') as f:
        train_data = pickle.load(f)

    # Train on every Korean sentence. Taking .head() here would keep only the
    # first five rows, which is far too little for min_frequency=5 below.
    kor_sentences = train_data['Kor'].tolist()

    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(kor_sentences)

    word_scores = word_extractor.extract()
    # Keep only the forward cohesion score per word; this mapping is what is
    # pickled and later handed to LTokenizer(scores=...).
    cohesion_scores = {word: score.cohesion_forward
                       for word, score in word_scores.items()}

    with open(data_dir+'pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)


In [0]:
#build_tokenizer()

Now building soy-nlp tokenizer . . .
training was done. used memory 1.370 Gb
all cohesion probabilities was computed. # words = 100264
all branching entropies was computed # words = 192037
all accessor variety was computed # words = 192037


In [None]:
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

def en_tokenize(text):
    """Tokenize English text into a flat list of word tokens using NLTK.

    Backslashes are replaced with spaces first; the text is then split into
    sentences, each sentence into words, and all words are returned in order.
    """
    without_backslashes = text.replace('\\', ' ')
    words = []
    for sentence in nltk.sent_tokenize(without_backslashes):
        words.extend(nltk.word_tokenize(sentence))
    return words

In [0]:
import re
def clean_text(text):
    """Strip a fixed set of punctuation/symbol characters from ``text``.

    Removed characters: - = + , # / ? : ^ $ . @ * " ※ ~ & % ㆍ ! 』 ‘ | ( ) [ ]
    < > ` … 》. Everything else (letters, digits, whitespace, apostrophes,
    other quotes/brackets) is left untouched.

    Args:
        text: input string.

    Returns:
        The input with every listed character deleted.
    """
    # Raw string: the pattern value is the same as before, but escapes such as
    # \? \( \[ \< are no longer invalid *string* escapes (SyntaxWarning on
    # recent Python versions).
    # Note that the backslash before ‘ only escapes that quote inside the
    # character class, so a literal backslash in the text is NOT removed.
    return re.sub(r'[-=+,#/\?:^$.@*"※~&%ㆍ!』\‘|\(\)\[\]\<\>`…》]', '', text)

In [0]:
def convert_to_dataset(data, kor, eng):
    """Turn a two-column table of sentence pairs into a torchtext Dataset.

    Every value of every row is passed through clean_text, and the cleaned
    row is wrapped in a torchtext Example whose values are bound, in column
    order, to the 'kor' and 'eng' fields.

    Args:
        data: object providing .iterrows() (used as a pandas DataFrame);
            columns are taken positionally as Korean, then English.
        kor: torchtext Field for the Korean column.
        eng: torchtext Field for the English column.

    Returns:
        A torchtext Dataset holding one Example per input row.
    """
    field_spec = [('kor', kor), ('eng', eng)]

    examples = []
    for _, row in data.iterrows():
        cleaned_values = row.apply(clean_text).tolist()
        examples.append(Example.fromlist(cleaned_values, fields=field_spec))

    return Dataset(examples=examples, fields=field_spec)


In [None]:
# Load the cohesion scores saved by build_tokenizer and build the Korean
# tokenizer from them. The `with` block closes the pickle file once it has
# been read instead of leaking the handle.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(data_dir+'pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
    cohesion_scores = pickle.load(pickle_tokenizer)
tokenizer = LTokenizer(scores=cohesion_scores)


with open(data_dir+'pickles/train_data.pickle', 'rb') as f:
    train_data = pickle.load(f)

# Korean side: soynlp LTokenizer, lower-cased, batch-first tensors.
kor = ttd.Field(tokenize=tokenizer.tokenize,
                    lower=True,
                    batch_first=True)

# English side: NLTK-based en_tokenize, wrapped in <sos>/<eos> markers.
eng = ttd.Field(tokenize=en_tokenize,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True,
                batch_first=True)
# Rebinds train_data from the unpickled table to a torchtext Dataset.
train_data = convert_to_dataset(train_data, kor, eng)


#print(train_data[-1].__dict__) 

In [0]:
def build_vocab(config):
    """
    Build vocab used to convert input sentence into word indices using the
    soynlp tokenizer (Korean) and the NLTK-based en_tokenize (English)

    The fitted Korean and English torchtext Fields are pickled to
    'pickles/kor.pickle' and 'pickles/eng.pickle' under data_dir.

    Args:
        config: configuration containing various options; the attributes
            `kor_vocab` and `eng_vocab` are read and passed as `max_size`
            when building the respective vocabularies.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The `with` block closes the tokenizer pickle instead of leaking it.
    with open(data_dir+'pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    kor = ttd.Field(tokenize=tokenizer.tokenize,
                    lower=True,
                    batch_first=True)

    eng = ttd.Field(tokenize=en_tokenize,
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True,
                    batch_first=True)

    with open(data_dir+'pickles/train_data.pickle', 'rb') as f:
        train_data = pickle.load(f)
    train_data = convert_to_dataset(train_data, kor, eng)

    print(f'Build vocabulary using torchtext . . .')

    # Both fields are fitted on the same dataset; each one only reads the
    # examples' attribute it is bound to.
    kor.build_vocab(train_data, max_size=config.kor_vocab)
    eng.build_vocab(train_data, max_size=config.eng_vocab)

    print(f'Unique tokens in Korean vocabulary: {len(kor.vocab)}')
    print(f'Unique tokens in English vocabulary: {len(eng.vocab)}')

    print(f'Most commonly used Korean words are as follows:')
    print(kor.vocab.freqs.most_common(20))

    print(f'Most commonly used English words are as follows:')
    print(eng.vocab.freqs.most_common(20))

    with open(data_dir+'pickles/kor.pickle', 'wb') as kor_file:
        pickle.dump(kor, kor_file)

    with open(data_dir+'pickles/eng.pickle', 'wb') as eng_file:
        pickle.dump(eng, eng_file)

In [0]:
#build_vocab()

Build vocabulary using torchtext . . .
Unique tokens in Korean vocabulary: 55002
Unique tokens in English vocabulary: 30004
Most commonly used Korean words are as follows:
[('의', 23305), ('이', 21928), ('는', 19702), ('에', 15945), ('을', 13559), ('가', 13317), ('를', 13315), ('있다', 12952), ('은', 12847), ('고', 11904), ('한', 9071), ('밝혔다', 7939), ('미국', 7718), ('말했다', 7135), ('있는', 6583), ('수', 6356), ('과', 6156), ('로', 6018), ('에서', 5909), ('와', 5712)]
Most commonly used English words are as follows:
[('the', 127922), ('to', 54251), ('of', 51374), ('a', 49154), ('in', 47100), ('and', 43481), ("'s", 21330), ('said', 19622), ('for', 17999), ('that', 17835), ('on', 17830), ('is', 14780), ('was', 13466), ('with', 12430), ('it', 11917), ('as', 10582), ('at', 10573), ('he', 10344), ('by', 9679), ('from', 9502)]


In [None]:
if __name__ == '__main__':
    # Entry point: parse the vocabulary-size options, then build the Korean
    # tokenizer pickle followed by the two vocabulary pickles.
    parser = argparse.ArgumentParser(description='Pickle Builder')
    #parser.add_argument('--train_data_path', type=str, default='pickles/train_data.pickle')
    for option, default_size in (('--kor_vocab', 55000), ('--eng_vocab', 30000)):
        parser.add_argument(option, type=int, default=default_size)

    config = parser.parse_args()

    build_tokenizer()
    build_vocab(config)

In [0]:
# with open(data_dir+'pickles/kor.pickle', 'rb') as kor_file:
#       kor = pickle.load(kor_file)

In [0]:
# for i in range(100):
#   print(kor.vocab.itos[i])

<unk>
<pad>
의
이
는
에
을
가
를
있다
은
고
한
밝혔다
미국
말했다
있는
수
과
로
에서
와
대통령
할
지난
그는
200
위해
대한
인
정부
이번
전했다
것이
대해
며
것이라고
하는
된
전
다
이라크
해
했다
것으로
더
도
것을
현지시간
중국
북한
”고
한국
중
하고
될
다른
그러나
으로
미
영국
문제
오바마
10
경찰
총리
또
세계
한편
에게
영화
위한
후보
관련
동안
여성
많은
주
통해
이후
일본
가장
같은
지
부시
그의
후
그
서
우리
않았다
국가
대변인은
한다
됐다
테러
20
였다
”며
계획
