<a href="https://colab.research.google.com/github/linkyouhj/seq2seq/blob/main/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [560]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [561]:
def wmt_dataset(directory='./'):
  ret = []
  splits = ['train','val','test']

  for filename in splits:
    examples = []

    en_path = os.path.join(directory, filename + '.en')
    de_path = os.path.join(directory, filename + '.de')
    en_file = [l.strip() for l in open(en_path, 'r', encoding='utf-8')]
    de_file = [l.strip() for l in open(de_path, 'r', encoding='utf-8')]
    assert len(en_file) == len(de_file)
    for i in range(len(en_file)):
      if en_file[i] != '' and de_file[i] != '':
        examples.append({'en': en_file[i], 'de': de_file[i]})

    ret.append(examples)

  if len(ret) == 1:
    return ret[0]
  else:
    return tuple(ret)

In [562]:
dataset = wmt_dataset()

In [563]:
len(dataset)

3

In [564]:
dataset[0][0]['de']

'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'

In [565]:
data_en = []
data_de = []

for i in range(3):
  for sentence in dataset[i]:
    data_en.append(sentence['en'])
    data_de.append(sentence['de'])



In [566]:
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [567]:
# 각 문장에 대해서 구두점을 제거하고, 대문자를 소문자로 변환.
normalized_en = []
for string in data_en:
    string = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》“„]', ' ', string)
    string = re.sub(r"[^a-zA-Z0-9]+", r" ", string)
    tokens = string.lower().strip()
    normalized_en.append(tokens)


# 각 문장에 대해서 구두점을 제거하고, 대문자를 소문자로 변환.
normalized_de = []
for string in data_de:
    string = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》“„]', ' ', string)
    string = re.sub(r"[^a-zA-ZÄäÖöÜüß0-9]+", r" ", string)
    tokens = string.lower().strip()
    normalized_de.append(tokens)

In [568]:
result_de =  [word_tokenize(sentence) for sentence in normalized_de]
result_en =  [word_tokenize(sentence) for sentence in normalized_en]

In [569]:
from gensim.models import Word2Vec
model_en = Word2Vec(sentences=result_en, size = 256, window = 5,min_count=1,workers=4,sg=0)
model_de = Word2Vec(sentences=result_de, size = 256, window = 5,min_count=1,workers=4,sg=0)

In [570]:
from gensim.models import KeyedVectors
model_en.wv.save_word2vec_format('en_w2v') # 모델 저장
loaded_model_en = KeyedVectors.load_word2vec_format("en_w2v") # 모델 로드
model_de.wv.save_word2vec_format('de_w2v') # 모델 저장
loaded_model_de = KeyedVectors.load_word2vec_format("de_w2v") # 모델 로드

In [571]:
weights_en = torch.FloatTensor(loaded_model_en.vectors)
weights_de = torch.FloatTensor(loaded_model_de.vectors)

In [572]:
'''
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
'''
# 소문자, 다듬기, 그리고 문자가 아닌 문자 제거


def normalizeString(data):
    data["en"] = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》“„]', ' ', data["en"])
    data["en"] = re.sub(r"[^a-zA-Z0-9]+", r" ", data["en"])
    data["en"] = data["en"].lower().strip()
    
    data["de"] = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》“„]', ' ', data["de"])
    data["de"] = re.sub(r"[^a-zA-ZÄäÖöÜüß0-9]+", r" ", data["de"])
    data["de"] = data["de"].lower().strip()
    return data

In [573]:
MAX_LENGTH = 20

def filterPair(p,lang1,lang2):
    return len(p[lang1].split(' ')) < MAX_LENGTH and \
        len(p[lang2].split(' ')) < MAX_LENGTH

def filterPairs(pairs,lang1,lang2):
    return [pair for pair in pairs if filterPair(pair,lang1,lang2)]

In [574]:
def readLangs(dataset, reverse=False):
    print("Reading lines...")

    # 모든 줄을 쌍으로 분리하고 정규화
    # for i in range(len(dataset)):
    #   for data in dataset[i]:
    #     data = normalizeString(data)
    data = []
    for i in range(len(dataset)):
      for d in dataset[i]:
        data.append(d)
    for d in data:
      d = normalizeString(d)
    pairs = data

    # 쌍을 뒤집고, Lang 인스턴스 생성
    # input_lang = Lang(lang1)
    # output_lang = Lang(lang2)
    
    return pairs

In [575]:
def prepareData(lang1, lang2, dataset,reverse=False):
    pairs = readLangs(dataset, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs,lang1,lang2)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")

    return pairs


pairs = prepareData('de', 'en',dataset, True)
print(random.choice(pairs))

Reading lines...
Read 31014 sentence pairs
Trimmed to 29400 sentence pairs
Counting words...
{'en': 'a girl sits on a decorated bike with a younger boy while another girl takes a picture', 'de': 'ein mädchen sitzt mit einem kleineren jungen auf einem dekorierten fahrrad während ein weiteres mädchen ein foto macht'}


In [576]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding.from_pretrained(weights_de)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


In [577]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding.from_pretrained(weights_en)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [578]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [579]:
def indexesFromSentence(lang, sentence):
  if lang == 'en':
    return [loaded_model_en.vocab[word].index for word in sentence.lower().split(' ')]
  elif lang == 'de':
    print([word for word in sentence.split(' ')])
    return [loaded_model_de.vocab[word].index for word in sentence.lower().split(' ')]
  else: print("lang key error")

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair,lang1,lang2):
    input_tensor = tensorFromSentence(lang1, pair[lang1])
    target_tensor = tensorFromSentence(lang2, pair[lang2])
    return (input_tensor, target_tensor)

In [580]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing 포함: 목표를 다음 입력으로 전달
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Teacher forcing 미포함: 자신의 예측을 다음 입력으로 사용
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # 입력으로 사용할 부분을 히스토리에서 분리

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [581]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [582]:
def trainIters(encoder, decoder,lang1,lang2, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # print_every 마다 초기화
    plot_loss_total = 0  # plot_every 마다 초기화

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs),lang1,lang2)
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        print(torch.max(input_tensor))
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)


In [583]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # 주기적인 간격에 이 locator가 tick을 설정
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [584]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [585]:
def evaluateRandomly(encoder, decoder,lang1,lang2, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[lang1])
        print('=', pair[lang2])
        output_words, attentions = evaluate(encoder, decoder, pair[lang1])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [586]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1,'de','en', 75000, print_every=5000)

['das', 'kleine', 'mädchen', 'trägt', 'ein', 'violettes', 'kleid']
['ein', 'schwarz', 'brauner', 'hund', 'der', 'in', 'einem', 'feld', 'rennt']
['ein', 'mann', 'auf', 'rollerblades', 'fährt', 'auf', 'einem', 'parkweg']
['ein', 'hund', 'der', 'eine', 'orange', 'jacke', 'trägt', 'liegt', 'im', 'schnee']
['ein', 'bärtiger', 'mann', 'steht', 'in', 'arbeitskleidung', 'auf', 'einem', 'markt', 'für', 'meeresfrüchte']
['ein', 'junge', 'in', 'einem', 'blauen', 'hemd', 'hämmert', 'auf', 'den', 'boden']
['zwei', 'kleine', 'jungen', 'ringen', 'in', 'einem', 'sandkasten']
['ein', 'junge', 'und', 'sein', 'jüngerer', 'bruder', 'spielen', 'gemeinsam', 'auf', 'einem', 'spielplatz']
['fußballspielt', 'mit', 'teams', 'in', 'rot', 'und', 'blau']
['essen', 'und', 'getränke', 'auf', 'einem', 'japanischen', 'markt']
['eine', 'gruppe', 'von', 'mädchen', 'an', 'einem', 'strand']
['ein', 'reh', 'springt', 'über', 'einen', 'zaun']
['männer', 'die', 'weiße', 'tuniken', 'in', 'flammen', 'tragen', 'tragen', 'riesig

KeyError: ignored

In [None]:
evaluateRandomly(encoder1, attn_decoder1,'de','en')


In [None]:
import nltk.translate.bleu_score as bleu
candidate = 'It is a guide to action which ensures that the military always obeys the commands of the party'
references = [
    'It is a guide to action that ensures that the military will forever heed Party commands',
    'It is the guiding principle which guarantees the military forces always being under the command of the Party',
    'It is the practical guide for the army always to heed the directions of the party'
]
print(bleu.sentence_bleu(list(map(lambda ref: ref.split(), references)),candidate.split()))

In [None]:
torchtext.data.metrics.bleu_score()