# 사전 훈련된 Word2Vec을 이용한 영어-한국어 번역 모형

In [1]:
import os
os.chdir('C:/Users/HK/Desktop/GitHub/Deep-Learning/NLP')

import gensim
import nltk
import konlpy
import keras
import numpy as np
import pandas as pd

Using TensorFlow backend.


> ### 사전 훈련된 워드벡터 준비

- 언어별 Word2Vec 불러오기

In [4]:
from gensim.models import Word2Vec
word2vec_kor = Word2Vec.load('Data/ko.bin')

  "C extension not loaded, training will be slow. "


In [5]:
from gensim.models import KeyedVectors
word2vec_eng = KeyedVectors.load_word2vec_format('Data/GoogleNews-vectors-negative300.bin', binary = True)

> ### 데이터 준비

- 출처: Manythings.org http://www.manythings.org/anki/kor-eng.zip

In [9]:
lines = pd.read_table('Data/kor.txt', names = ['source', 'target'])
print(len(lines), 'observations')
lines.sample(10)

909 observations


Unnamed: 0,source,target
156,Tom isn't skinny.,톰은 마르지 않았다.
839,Tom lives in a small apartment on Park Street.,톰은 파크 스트리트의 작은 아파트에서 살고 있다.
617,I don't want to throw that away.,나는 그것을 버리고 싶지 않다.
205,We've been worried.,계속 걱정했어.
907,It's not always possible to eat well when you'...,"당신이 세계를 여행하는 동안, 항상 잘먹는 것이 가능하지는 않습니다."
770,There are a lot of sheep in the pasture.,목초지에 양이 많이 있다.
683,Tom has something to do right now.,톰은 당장 할일이 있어.
163,You'll regret it.,너 후회할거야.
45,Can I go now?,이제 가도 되나요?
178,Tom vomited blood.,톰은 피를 토했다.


In [10]:
lines.target = lines.target.apply(lambda x : '<sos> ' + x + ' <eos>')
lines.sample(10)

Unnamed: 0,source,target
203,Tie your shoelaces.,<sos> 신발끈을 묶으세요. <eos>
751,Everybody here except me has done that.,<sos> 나 빼고 여기에 있는 사람 모두 그것을 했다. (한 적이 있다.) <eos>
221,I don't know either.,<sos> 나도 몰라. <eos>
333,I dislike cold weather.,<sos> 나는 추운 날씨를 싫어한다. <eos>
121,"Sorry, I'm late.",<sos> 늦어서 미안합니다. <eos>
34,I don't lie.,<sos> 나는 거짓말 하지 않습니다. <eos>
470,It is already nine o'clock.,<sos> 벌써 아홉시다. <eos>
385,We'll take care of that.,<sos> 우리가 그것을 맡겠다. (처리하겠다.) <eos>
853,Tom told Mary he wouldn't let her go by herself.,<sos> 톰은 메리에게 그는 그녀를 혼자 가게 하지는 않겠다고 말했다. <eos>
692,His behavior aroused my suspicions.,<sos> 그의 행동은 나에게 의심을 불러일으켰다. <eos>


In [16]:
from keras.preprocessing.text import Tokenizer

In [19]:
source_t = Tokenizer()
source_t.fit_on_texts(lines.source)
source_vocab_size = len(source_t.word_index) + 1

target_t = Tokenizer()
target_t.fit_on_texts(lines.target)
target_vocab_size = len(target_t.word_index) + 1

In [20]:
print(source_vocab_size)
print(target_vocab_size)

1170
2049


In [26]:
encoder_input_data = source_t.texts_to_sequences(lines.source)
source_max_length = max(len(l) for l in encoder_input_data)

decoder_input_data = target_t.texts_to_sequences(lines.target)
decoder_target_data = [l[1:] for l in decoder_input_data]
target_max_length = max(len(l) for l in decoder_input_data)

In [27]:
print(source_max_length)
print(target_max_length)

19
17


In [28]:
print(encoder_input_data[0])
print(decoder_input_data[0])
print(decoder_target_data[0])

[92]
[1, 264, 2]
[264, 2]


In [25]:
from keras.preprocessing.sequence import pad_sequences

In [29]:
encoder_input_data = pad_sequences(
    encoder_input_data, maxlen = source_max_length, padding = 'post')
decoder_input_data = pad_sequences(
    decoder_input_data, maxlen = target_max_length, padding = 'post')
decoder_target_data = pad_sequences(
    decoder_target_data, maxlen = target_max_length, padding = 'post')

In [63]:
encoder_embedding_dim = word2vec_eng.vector_size
encoder_pretrained_embedding = np.zeros((source_vocab_size, encoder_embedding_dim))

for word, i in source_t.word_index.items():
    if word2vec_eng.vocab.get(word) is not None:
        encoder_pretrained_embedding[i] = word2vec_eng.get_vector(word)
    else:
        print('no pretrained vector')

no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector
no pretrained vector


In [62]:
decoder_embedding_dim = word2vec_kor.vector_size
decoder_pretrained_embedding = np.zeros((target_vocab_size, decoder_embedding_dim))

for word, i in target_t.word_index.items():
    if word2vec_kor.vocabulary.get(word) is not None:
        encoder_pretrained_embedding[i] = word2vec_eng.get_vector(word)
    else:
        print('no pretrained vector')

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.22558594, -0.01953125,  0.09082031, ...,  0.02819824,
        -0.17773438, -0.00604248],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.171875  ,  0.00543213, -0.10400391, ...,  0.05004883,
         0.25      ,  0.18066406],
       [ 0.04858398, -0.05444336,  0.08349609, ...,  0.03442383,
         0.0291748 ,  0.07421875],
       [ 0.35351562,  0.07080078,  0.03588867, ..., -0.04394531,
         0.00787354,  0.02648926]])