In [8]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

import os, re
import glob
from time import time
from scipy.io.arff import loadarff
from scipy import stats
from statsmodels.formula.api import ols, glm
from functools import reduce
import nltk                         
from nltk.tokenize import word_tokenize   
from nltk.corpus import stopwords         
from nltk.stem import WordNetLemmatizer    
from collections import Counter 
from wordcloud import STOPWORDS, WordCloud
import glob           
import json
import gensim
import shutil
import unicodedata
import urllib3
import konlpy
from konlpy.tag import Okt, Mecab
from collections import Counter
from wordcloud import WordCloud

import sklearn
import statsmodels
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, LSTM, GRU, Embedding, Dense, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

In [9]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. Project_Translator

### (1) 데이터 불러오기

In [11]:
lines = pd.read_csv('/content/drive/MyDrive/translator/fra.txt', names=['eng', 'fra', 'cc'], sep='\t')
print('전체 샘플의 수 :',len(lines))
lines.sample(5)

전체 샘플의 수 : 197463


Unnamed: 0,eng,fra,cc
113685,Two wrongs don't make a right.,Les erreurs ne se compensent pas.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
85420,Can I stay a little longer?,Puis-je rester un peu plus longtemps ?,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
28160,You're very brave.,Vous êtes très braves.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
84605,You aren't ready for this.,Tu n'es pas prêt pour ça.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
155295,I'd like to speak with Tom in private.,J'aimerais parler avec Tom en privé.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...


In [12]:
lines = lines[['eng', 'fra']][:33000]
lines.sample(5)

Unnamed: 0,eng,fra
24937,Is Tom successful?,Tom a-t-il du succès ?
217,Come on!,Secouez-vous !
19204,I was threatened.,J'ai été menacée.
21881,What's this room?,C'est quoi cette pièce ?
19637,Is your mom home?,Ta mère est-elle chez elle ?


In [13]:
num_samples = 33000

### (2) 디코더의 문장에 시작 토큰과 종료 토큰을 넣기

In [14]:
def to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def preprocess_sentence(sent):
  sent = to_ascii(sent.lower())
  sent = re.sub(r"([?.!,¿])", r" \1", sent)  
  sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent) 
  sent = re.sub(r"\s+", " ", sent)  
  return sent

In [15]:
def load_preprocessed_data():
  encoder_input, decoder_input, decoder_target = [], [], []

  with open('/content/drive/MyDrive/translator/fra.txt', "r") as lines:
    for i, line in enumerate(lines):
      src_line, tar_line, _ = line.strip().split('\t')
      src_line = [w for w in preprocess_sentence(src_line).split()]
      tar_line = preprocess_sentence(tar_line)
      tar_line_in = [w for w in ("<sos> " + tar_line).split()]
      tar_line_out = [w for w in (tar_line + " <eos>").split()]

      encoder_input.append(src_line)
      decoder_input.append(tar_line_in)
      decoder_target.append(tar_line_out)

      if i == num_samples - 1:
        break

  return encoder_input, decoder_input, decoder_target

sents_en_in, sents_fra_in, sents_fra_out = load_preprocessed_data()

### (3) 케라스의 토크나이저로 텍스트를 숫자로 바꾸기

In [16]:
tokenizer_en = Tokenizer(filters="", lower=False)
tokenizer_en.fit_on_texts(sents_en_in)
encoder_input = tokenizer_en.texts_to_sequences(sents_en_in)
encoder_input = pad_sequences(encoder_input, padding="post")

tokenizer_fra = Tokenizer(filters="", lower=False)
tokenizer_fra.fit_on_texts(sents_fra_in)
tokenizer_fra.fit_on_texts(sents_fra_out)

decoder_input = tokenizer_fra.texts_to_sequences(sents_fra_in)
decoder_input = pad_sequences(decoder_input, padding="post")

decoder_target = tokenizer_fra.texts_to_sequences(sents_fra_out)
decoder_target = pad_sequences(decoder_target, padding="post")

In [17]:
src_vocab_size = len(tokenizer_en.word_index) + 1
tar_vocab_size = len(tokenizer_fra.word_index) + 1
print("영어 단어 : {:d}, 프랑스어 단어 : {:d}".format(src_vocab_size, tar_vocab_size))

영어 단어 : 4672, 프랑스어 단어 : 8137


In [18]:
src_to_index = tokenizer_en.word_index
index_to_src = tokenizer_en.index_word
tar_to_index = tokenizer_fra.word_index
index_to_tar = tokenizer_fra.index_word

In [19]:
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print('랜덤 시퀀스 :',indices)

랜덤 시퀀스 : [ 8270 15480 22136 ...  1997 16968  7672]


In [20]:
encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]

n_of_val = int(33000*0.1)

encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

### (4) 임베딩 층 사용하기

In [21]:
embedding_dim = 64
hidden_units = 64

In [22]:
# 인코더
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(src_vocab_size, embedding_dim)(encoder_inputs) 
enc_masking = Masking(mask_value=0.0)(enc_emb)                    
encoder_lstm = LSTM(hidden_units, return_state=True)              
encoder_outputs, state_h, state_c = encoder_lstm(enc_masking)       
encoder_states = [state_h, state_c]                        

In [23]:
# 디코더
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(tar_vocab_size, hidden_units)  
dec_emb = dec_emb_layer(decoder_inputs)                
dec_masking = Masking(mask_value=0.0)(dec_emb)
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True) 
decoder_outputs, _, _ = decoder_lstm(dec_masking, initial_state=encoder_states)
decoder_dense = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

### (5) 모델 구현하기

In [24]:
model.fit(x=[encoder_input_train, decoder_input_train], y=decoder_target_train, \
          validation_data=([encoder_input_test, decoder_input_test], decoder_target_test), batch_size=128, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f2680463340>

In [25]:
# 인코더
encoder_model = Model(encoder_inputs, encoder_states)

# 디코더
decoder_state_input_h = Input(shape=(hidden_units,))
decoder_state_input_c = Input(shape=(hidden_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# 수정된 디코더
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

In [26]:
def decode_sequence(input_seq):
  states_value = encoder_model.predict(input_seq)

  target_seq = np.zeros((1,1))
  target_seq[0, 0] = tar_to_index['<sos>']

  stop_condition = False
  decoded_sentence = ''

  while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_char = index_to_tar[sampled_token_index]
    decoded_sentence += ' '+sampled_char

    if (sampled_char == '<eos>' or
        len(decoded_sentence) > 50):
        stop_condition = True

    target_seq = np.zeros((1,1))
    target_seq[0, 0] = sampled_token_index
    
    states_value = [h, c]

  return decoded_sentence

In [27]:
def seq_to_src(input_seq):
  sentence = ''
  for encoded_word in input_seq:
    if(encoded_word != 0):
      sentence = sentence + index_to_src[encoded_word] + ' '
  return sentence

def seq_to_tar(input_seq):
  sentence = ''
  for encoded_word in input_seq:
    if(encoded_word != 0 and encoded_word != tar_to_index['<sos>'] and encoded_word != tar_to_index['<eos>']):
      sentence = sentence + index_to_tar[encoded_word] + ' '
  return sentence

### (6) 모델 평가하기

In [31]:
# 훈련 데이터에 대해서 임의로 선택한 인덱스의 샘플 결과 출력
for seq_index in [30, 60, 90, 300, 3000]:
  input_seq = encoder_input_train[seq_index: seq_index + 1]
  decoded_sentence = decode_sequence(input_seq)
  print("입력 :",seq_to_src(encoder_input_train[seq_index]))
  print("정답 :",seq_to_tar(decoder_input_train[seq_index]))
  print("번역 :",decoded_sentence[1:-5])
  print("-"*55)

입력 : boil some water . 
정답 : faites bouillir de l eau . 
번역 : prends le dehors de l eau . 
-------------------------------------------------------
입력 : it wasn t enough . 
정답 : ce n etait pas assez . 
번역 : ce n etait pas assez . 
-------------------------------------------------------
입력 : you re too naive . 
정답 : tu es trop naive . 
번역 : tu es trop vieux . 
-------------------------------------------------------
입력 : i felt wonderful . 
정답 : j ai eu un sentiment merveilleux . 
번역 : je me suis senti fort . 
-------------------------------------------------------
입력 : don t touch my car . 
정답 : touche pas a ma bagnole ! 
번역 : ne le touche pas a faire de reproche . 
-------------------------------------------------------


In [32]:
# 테스트 데이터에 대해서 임의로 선택한 인덱스의 샘플 결과 출력
for seq_index in [30, 60, 90, 300, 3000]:
  input_seq = encoder_input_test[seq_index: seq_index + 1]
  decoded_sentence = decode_sequence(input_seq)
  print("입력 :",seq_to_src(encoder_input_test[seq_index]))
  print("정답 :",seq_to_tar(decoder_input_test[seq_index]))
  print("번역 :",decoded_sentence[1:-5])
  print("-"*55)

입력 : i said maybe . 
정답 : j ai dit peut etre . 
번역 : j ai dit que j ai bien . 
-------------------------------------------------------
입력 : you re very open . 
정답 : vous etes tres ouvertes . 
번역 : vous etes tres ouverte . 
-------------------------------------------------------
입력 : he was outraged . 
정답 : il etait indigne . 
번역 : il etait paralyse . 
-------------------------------------------------------
입력 : i love jokes . 
정답 : j adore les plaisanteries . 
번역 : j adore les echecs . 
-------------------------------------------------------
입력 : tom can t see you . 
정답 : tom ne peut pas te voir . 
번역 : tom ne s en peut etre vu . 
-------------------------------------------------------


# 2. References

[1] https://lms.aiffel.io/course/382/node/510

[2] https://wikidocs.net/21694