In [3]:
import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

In [4]:
data = pd.read_csv('kor-eng/kor.txt', delimiter='\t')
data.columns = ['en', 'kor', 'cc']
data

Unnamed: 0,en,kor,cc
0,Hi.,안녕.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Run!,뛰어!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
2,Run.,뛰어.,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
3,Who?,누구?,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Wow!,우와!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
...,...,...,...
5864,I started a new blog. I'll do my best not to b...,난 블로그를 시작했어. 블로그를 초반에만 반짝 많이 하다가 관두는 사람처럼은 되지 ...,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
5865,I think it's a shame that some foreign languag...,몇몇 외국어 선생님이 한 번도 원어민과 공부해본 적도 없으면서 대학을 나올 수 있었...,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
5866,And the good news is that today the economy is...,"다음으로 좋은 소식은 오늘 경제가 재성장한다는 것입니다. 임금, 소득, 집값, 퇴직...",CC-BY 2.0 (France) Attribution: tatoeba.org #5...
5867,If someone who doesn't know your background sa...,만일 네 사정도 잘 모르는 사람이 원어민 같다고 말한다면 그건 그 사람이 네가 원어...,CC-BY 2.0 (France) Attribution: tatoeba.org #9...


In [5]:
model = SentenceTransformer('sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens')

In [6]:
data['embedding'] = data.apply(lambda row : model.encode(row.en), axis=1)

In [7]:
data.to_csv('kor-eng/kor_with_embedding.csv', index=False)

In [6]:
def cos_sim(A, B):
    return dot(A, B) / (norm(A)*norm(B))

In [7]:
def translate(sentence):
    embedding = model.encode(sentence)
    data['score'] = data.apply(lambda x : cos_sim(x['embedding'], embedding), axis=1)
    return data.loc[data['score'].idxmax()]['kor']

In [9]:
translate('What are you going to do?')

'뭘 할 계획이니?'

In [11]:
translate('What are you doing?')

'뭐하고 있어?'

In [12]:
translate('Please take me to the market.')

'나는 이것을 사고 싶다.'

In [13]:
from keras.models import load_model

In [16]:
model.save('Translate_BERT.h5')

In [18]:
trans_model = SentenceTransformer.load('Translate_BERT.h5')

In [19]:
def translate1(sentence):
    embedding = trans_model.encode(sentence)
    data['score'] = data.apply(lambda x : cos_sim(x['embedding'], embedding), axis=1)
    return data.loc[data['score'].idxmax()]['kor']

In [20]:
translate1('What are you going to do?')

'뭘 할 계획이니?'