In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import random
import json
import os

In [None]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [None]:
from torch.utils.data import TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더

In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def plot_graphs(history, string):
  """Plot a metric and its validation counterpart against the epoch index.

  Args:
    history: object whose ``history`` attribute maps metric names to
      per-epoch values (presumably a Keras-style History object — confirm
      against the caller; nothing in this notebook calls the function).
    string: metric name; the validation series is read from ``'val_' + string``.
  """
  val_key = 'val_' + string
  plt.plot(history.history[string])
  plt.plot(history.history[val_key], '')
  plt.xlabel('Epoch')
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()

In [None]:
# Folder (relative to the working directory) holding the .npy arrays and the
# JSON data config; OUTPUT_PATH points at the same folder.
PATH = "drive/MyDrive/Implementation/Attention/"
OUTPUT_PATH = PATH

# File names inside PATH.
TRAIN_INPUT, TRAIN_OUTPUT, TRAIN_TARGET = 'train_input.npy', 'train_output.npy', 'train_target.npy'
DATA_CONFIGS = 'data_configs.json'

In [None]:
# Given a path, np.load opens and closes the .npy file itself, so no file
# handle is left open the way a bare open(...) argument would leave one.
idx_input = np.load(PATH + TRAIN_INPUT)
idx_output = np.load(PATH + TRAIN_OUTPUT)
idx_target = np.load(PATH + TRAIN_TARGET)

# Read the data config (vocabulary mappings and special-symbol entries) and
# close the file deterministically. The encoding is explicit so decoding does
# not depend on the machine's locale.
with open(PATH + DATA_CONFIGS, 'r', encoding='utf-8') as config_file:
  config = json.load(config_file)

In [None]:
SEED_NUM = 777

# Seed every RNG this notebook draws from, not just the CUDA generator:
#   - `random` decides teacher forcing in Seq2seq_attention.forward,
#   - torch's default generator drives DataLoader shuffling and weight init,
#   - NumPy is seeded as well so any later np.random use is repeatable.
random.seed(SEED_NUM)
np.random.seed(SEED_NUM)
torch.manual_seed(SEED_NUM)
torch.cuda.manual_seed_all(SEED_NUM)  # safe to call when CUDA is unavailable

In [None]:
# Sanity check: print the number of examples in each of the three loaded arrays.
print(len(idx_input), len(idx_output), len(idx_target))

11823 11823 11823


In [None]:
class Encoder(nn.Module):
  """Token embedding followed by a (stacked) GRU over the source sequence.

  Args:
    input_dim: number of rows in the embedding table (source vocabulary size).
    hidden_dim: GRU hidden size.
    embed_dim: embedding vector size.
    num_layers: number of stacked GRU layers.
  """

  def __init__ (self, input_dim, hidden_dim, embed_dim, num_layers):
    super().__init__()
    self.input_dim, self.embed_dim = input_dim, embed_dim
    self.hidden_dim, self.num_layers = hidden_dim, num_layers

    self.embedding = nn.Embedding(input_dim, embed_dim)
    # batch_first=True: the GRU consumes and emits (batch, seq, feature) tensors.
    self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)

  def forward(self, src):
    """Embed ``src`` and run it through the GRU.

    Returns:
      The ``(output, hidden)`` pair produced by ``nn.GRU``: the per-step
      outputs of the top layer and the final hidden state of every layer.
    """
    return self.gru(self.embedding(src))

In [None]:
class Decoder(nn.Module):
  """Single-step GRU decoder: embeds one token per example and advances the GRU.

  The module returns raw GRU outputs only; it has no vocabulary projection of
  its own (an earlier Linear + LogSoftmax head was dropped).

  Args:
    output_dim: number of rows in the embedding table (target vocabulary size).
    hidden_dim: GRU hidden size.
    embed_dim: embedding vector size.
    num_layers: number of stacked GRU layers.
  """

  def __init__(self, output_dim, hidden_dim, embed_dim, num_layers):
    super().__init__()

    self.output_dim = output_dim
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers

    self.embedding = nn.Embedding(output_dim, embed_dim)
    self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)

  def forward(self, input, hidden):
    """Advance the decoder by one time step.

    Args:
      input: token ids for the current step, one per example.
      hidden: GRU hidden state carried over from the previous step.

    Returns:
      The ``(output, hidden)`` pair produced by ``nn.GRU``.
    """
    # view(-1, 1) appends a length-1 sequence axis (same as unsqueeze(1) on a
    # 1-D batch of ids), which is what the batch_first GRU expects.
    step_tokens = input.view(-1, 1)
    return self.gru(self.embedding(step_tokens), hidden)

In [None]:
class Attention(nn.Module):
  """Dot-product attention plus the projection to vocabulary logits.

  Args:
    hidden_dim: size of the encoder/decoder hidden vectors.
    output_dim: size of the projected output (vocabulary size).
  """

  def __init__(self, hidden_dim, output_dim):
    super().__init__()

    self.hidden_dim = hidden_dim
    self.output_dim = output_dim

    # Normalise the scores over axis 1 (the encoder time steps).
    self.softmax = nn.Softmax(dim=1)
    # Input is [decoder hidden ; context vector], hence hidden_dim * 2.
    self.attention_out = nn.Linear(hidden_dim * 2, output_dim)

  def forward(self, enc_hidden_all, dec_hidden):
    """Attend over the encoder outputs and project to ``output_dim`` scores.

    Args:
      enc_hidden_all: encoder outputs; used as (batch, src_len, hidden).
      dec_hidden: decoder hidden state; used as (layers, batch, hidden).

    Returns:
      Tensor of shape (batch, 1, output_dim).
    """
    # (layers, batch, hidden) -> (batch, hidden, layers) so matmul scores every
    # encoder step against every decoder layer.
    queries = dec_hidden.permute(1, 2, 0)
    scores = torch.matmul(enc_hidden_all, queries)       # (batch, src_len, layers)
    weights = self.softmax(scores)

    # Only the top decoder layer's attention weights are used.
    top_weights = weights[:, :, -1:]                     # (batch, src_len, 1)
    context = (enc_hidden_all * top_weights).sum(dim=1)  # (batch, hidden)
    top_hidden = dec_hidden[-1]                          # (batch, hidden)

    features = torch.cat([top_hidden, context], dim=1).unsqueeze(1)
    return self.attention_out(features)

In [None]:
class Seq2seq_attention(nn.Module):
  """Encoder-decoder wrapper that decodes step by step with attention.

  Args:
    encoder: module returning ``(outputs, hidden)`` for a source batch.
    decoder: module with an ``output_dim`` attribute that maps
      ``(step_tokens, hidden)`` to ``(output, hidden)``.
    attnetion: module mapping ``(encoder_outputs, decoder_hidden)`` to
      per-step vocabulary scores of shape (batch, 1, vocab).
    device: device on which the output buffer is allocated.
  """

  def __init__(self, encoder, decoder, attnetion, device):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.attnetion = attnetion
    self.device = device

  def forward(self, input_data, output_data, taget_data, teacher_forcing_ratio=0.5):
    """Decode one batch and return the scores for every target position.

    Args:
      input_data: source token ids, (batch, src_len).
      output_data: decoder input ids; only column 0 (the start token) is read.
      taget_data: target token ids, (batch, tgt_len); fed back as the next
        decoder input on teacher-forced steps. May be None when
        ``teacher_forcing_ratio`` is 0.
      teacher_forcing_ratio: probability, drawn independently at each step,
        of feeding the ground-truth token instead of the model's prediction.

    Returns:
      Tensor of shape (batch, steps, vocab) holding the attention module's
      scores at each decoding step.
    """
    batch_size = input_data.size(0)
    # Decode as many steps as there are target positions. The original code
    # used the source length, which only works when both lengths are equal.
    # Fall back to the source length when no target is supplied.
    length_source = taget_data if taget_data is not None else input_data
    target_length = length_source.size(1)
    voca_size = self.decoder.output_dim

    outputs = torch.zeros(batch_size, target_length, voca_size, device=self.device)

    encoder_output, encoder_hidden = self.encoder(input_data)

    # The decoder starts from the encoder's final hidden state and the first
    # decoder-input token of each example.
    decoder_hidden = encoder_hidden
    decoder_input = output_data[:, 0]

    # Writing into a slice of `outputs` is tracked by autograd, so the loss
    # computed on `outputs` back-propagates through every decoding step. The
    # argmax feedback path carries integer ids and therefore no gradient.
    for t in range(target_length):
      _, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
      att_output = self.attnetion(encoder_output, decoder_hidden)
      outputs[:, t, :] = att_output.squeeze(1)

      teacher_force = random.random() < teacher_forcing_ratio
      top_1 = att_output.argmax(2)
      decoder_input = taget_data[:, t] if teacher_force else top_1[:, 0]

    return outputs

In [None]:
def Model(model, input_tensor, output_tensor, target_tensor, model_optimizer, criterion):
  """Run one optimisation step on a single batch and return its loss.

  Args:
    model: callable as ``model(input_tensor, output_tensor, target_tensor)``,
      returning scores whose last axis is the class axis.
    input_tensor: source token ids for the batch.
    output_tensor: decoder input ids for the batch.
    target_tensor: integer class ids, one per scored position.
    model_optimizer: optimizer over the model's parameters.
    criterion: loss called as ``criterion(scores, one_hot_targets)`` with both
      arguments flattened to (positions, classes).

  Returns:
    The batch loss as a Python float.
  """
  model_optimizer.zero_grad()

  output = model(input_tensor, output_tensor, target_tensor)

  # Take the class count from the model output rather than a notebook global,
  # so the function works with whatever model it is handed.
  num_classes = output.shape[-1]
  output_ = output.view(-1, num_classes)

  # Build the one-hot targets directly on the output's device and dtype. The
  # original `.type(torch.FloatTensor).to(device)` copied the tensor to the
  # CPU and back on every step.
  target_ = F.one_hot(target_tensor, num_classes=num_classes).view(-1, num_classes)
  target_ = target_.to(device=output_.device, dtype=output_.dtype)

  loss = criterion(output_, target_)
  loss.backward()
  model_optimizer.step()

  return loss.item()

In [None]:
def TrainModel(model, train_data, epoch=1000):
  """Train ``model`` with Adam and cross-entropy, printing the mean loss per epoch.

  Args:
    model: module to train; it is updated in place.
    train_data: sized iterable yielding ``(input, output, target)`` batches.
    epoch: number of passes over ``train_data``.

  Returns:
    The same ``model`` object, left in training mode.
  """
  adam = optim.Adam(model.parameters(), lr=0.001)
  loss_fn = nn.CrossEntropyLoss()

  model.train()

  for epoch_idx in range(epoch):
    # Accumulate the per-batch losses returned by the training-step function.
    running_loss = sum(
      Model(model, src, dec_in, tgt, adam, loss_fn)
      for src, dec_in, tgt in tqdm(train_data)
    )
    print('iteration :%d\ntrain_loss : %.4f' % (epoch_idx, running_loss/len(train_data)))

  return model

In [None]:
# --- Training hyper-parameters ---
EPOCH = 50           # passes over the training set
BATCH_SIZE = 64
NUM_LAYERS = 2       # stacked GRU layers in the encoder and the decoder
UNITS = 128          # GRU hidden size
EMBEDDING_DIM = 128
# Settings kept for reference but not used in this notebook:
#   MODEL_NAME = 'seq2seq_kr', MAX_SEQUENCE = 20, VALIDATION_SPLIT = 0.2

# --- Fields read from the data config ---
word2idx, idx2word = config['word2idx'], config['idx2word']
std_idx, end_idx = config['std_symbol'], config['end_symbol']
voca_size = config['voca_size']

In [None]:
# The first TEST_SIZE examples are held out for testing and the rest are used
# for training. The split point is named once instead of being repeated as a
# bare number in every slice.
TEST_SIZE = 1823

train_input = torch.LongTensor(idx_input[TEST_SIZE:]).to(device)
train_output = torch.LongTensor(idx_output[TEST_SIZE:]).to(device)
train_target = torch.LongTensor(idx_target[TEST_SIZE:]).to(device)
test_input = torch.LongTensor(idx_input[:TEST_SIZE]).to(device)
test_output = torch.LongTensor(idx_output[:TEST_SIZE]).to(device)
test_target = torch.LongTensor(idx_target[:TEST_SIZE]).to(device)

In [None]:
# Bundle the three training tensors so each item is an (input, output, target)
# triple, and serve them in shuffled mini-batches of BATCH_SIZE.
dataset = TensorDataset(train_input, train_output, train_target)
train = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Build the three sub-modules on the selected `device`. A hard-coded .cuda()
# raises on a CPU-only runtime even though `device` already falls back to CPU.
enc = Encoder(input_dim = voca_size, hidden_dim=UNITS, embed_dim=EMBEDDING_DIM, num_layers=NUM_LAYERS).to(device)
dec = Decoder(output_dim = voca_size, hidden_dim=UNITS, embed_dim=EMBEDDING_DIM, num_layers=NUM_LAYERS).to(device)
att = Attention(hidden_dim=UNITS, output_dim=voca_size).to(device)
print(enc)
print(dec)
print(att)

Encoder(
  (embedding): Embedding(20705, 128)
  (gru): GRU(128, 128, num_layers=2, batch_first=True)
)
Decoder(
  (embedding): Embedding(20705, 128)
  (gru): GRU(128, 128, num_layers=2, batch_first=True)
)
Attention(
  (softmax): Softmax(dim=1)
  (attention_out): Linear(in_features=256, out_features=20705, bias=True)
)


In [None]:
# Assemble the full model and place it on the same `device` it was told about.
# model.cuda() would raise on a CPU-only runtime. As the last expression of
# the cell, .to(device) returns the module, so the cell still shows its repr.
model = Seq2seq_attention(enc, dec, att, device)
model.to(device)

Seq2seq_attention(
  (encoder): Encoder(
    (embedding): Embedding(20705, 128)
    (gru): GRU(128, 128, num_layers=2, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(20705, 128)
    (gru): GRU(128, 128, num_layers=2, batch_first=True)
  )
  (attnetion): Attention(
    (softmax): Softmax(dim=1)
    (attention_out): Linear(in_features=256, out_features=20705, bias=True)
  )
)

In [None]:
# TrainModel trains `model` in place and returns that same object. Binding the
# result to `Model` would overwrite the `Model` training-step function, so
# running this cell a second time would fail.
model = TrainModel(model, train, epoch=EPOCH)

100%|██████████| 313/313 [00:46<00:00,  6.68it/s]


iteration :0
train_loss : 1.9950


100%|██████████| 313/313 [00:43<00:00,  7.20it/s]


iteration :1
train_loss : 1.6505


100%|██████████| 313/313 [00:42<00:00,  7.35it/s]


iteration :2
train_loss : 1.5789


100%|██████████| 313/313 [00:42<00:00,  7.33it/s]


iteration :3
train_loss : 1.5127


100%|██████████| 313/313 [00:42<00:00,  7.38it/s]


iteration :4
train_loss : 1.4194


100%|██████████| 313/313 [00:42<00:00,  7.36it/s]


iteration :5
train_loss : 1.3080


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :6
train_loss : 1.1767


100%|██████████| 313/313 [00:42<00:00,  7.34it/s]


iteration :7
train_loss : 1.0352


100%|██████████| 313/313 [00:42<00:00,  7.29it/s]


iteration :8
train_loss : 0.8884


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :9
train_loss : 0.7494


100%|██████████| 313/313 [00:42<00:00,  7.43it/s]


iteration :10
train_loss : 0.6220


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :11
train_loss : 0.5104


100%|██████████| 313/313 [00:42<00:00,  7.34it/s]


iteration :12
train_loss : 0.4297


100%|██████████| 313/313 [00:42<00:00,  7.38it/s]


iteration :13
train_loss : 0.3556


100%|██████████| 313/313 [00:42<00:00,  7.37it/s]


iteration :14
train_loss : 0.2930


100%|██████████| 313/313 [00:42<00:00,  7.33it/s]


iteration :15
train_loss : 0.2419


100%|██████████| 313/313 [00:42<00:00,  7.31it/s]


iteration :16
train_loss : 0.2009


100%|██████████| 313/313 [00:42<00:00,  7.37it/s]


iteration :17
train_loss : 0.1572


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :18
train_loss : 0.1279


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :19
train_loss : 0.1044


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :20
train_loss : 0.0816


100%|██████████| 313/313 [00:42<00:00,  7.32it/s]


iteration :21
train_loss : 0.0650


100%|██████████| 313/313 [00:42<00:00,  7.38it/s]


iteration :22
train_loss : 0.0540


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :23
train_loss : 0.0414


100%|██████████| 313/313 [00:42<00:00,  7.38it/s]


iteration :24
train_loss : 0.0337


100%|██████████| 313/313 [00:42<00:00,  7.33it/s]


iteration :25
train_loss : 0.0292


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :26
train_loss : 0.0255


100%|██████████| 313/313 [00:42<00:00,  7.42it/s]


iteration :27
train_loss : 0.0263


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :28
train_loss : 0.0262


100%|██████████| 313/313 [00:42<00:00,  7.35it/s]


iteration :29
train_loss : 0.0284


100%|██████████| 313/313 [00:42<00:00,  7.38it/s]


iteration :30
train_loss : 0.0294


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :31
train_loss : 0.0244


100%|██████████| 313/313 [00:42<00:00,  7.37it/s]


iteration :32
train_loss : 0.0205


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :33
train_loss : 0.0175


100%|██████████| 313/313 [00:42<00:00,  7.33it/s]


iteration :34
train_loss : 0.0153


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :35
train_loss : 0.0149


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :36
train_loss : 0.0144


100%|██████████| 313/313 [00:42<00:00,  7.43it/s]


iteration :37
train_loss : 0.0149


100%|██████████| 313/313 [00:42<00:00,  7.35it/s]


iteration :38
train_loss : 0.0166


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :39
train_loss : 0.0179


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :40
train_loss : 0.0222


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :41
train_loss : 0.0303


100%|██████████| 313/313 [00:42<00:00,  7.30it/s]


iteration :42
train_loss : 0.0238


100%|██████████| 313/313 [00:42<00:00,  7.36it/s]


iteration :43
train_loss : 0.0152


100%|██████████| 313/313 [00:42<00:00,  7.36it/s]


iteration :44
train_loss : 0.0122


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :45
train_loss : 0.0105


100%|██████████| 313/313 [00:42<00:00,  7.41it/s]


iteration :46
train_loss : 0.0109


100%|██████████| 313/313 [00:42<00:00,  7.31it/s]


iteration :47
train_loss : 0.0124


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :48
train_loss : 0.0094


100%|██████████| 313/313 [00:42<00:00,  7.41it/s]


iteration :49
train_loss : 0.0132


100%|██████████| 313/313 [00:42<00:00,  7.41it/s]


iteration :50
train_loss : 0.0195


100%|██████████| 313/313 [00:42<00:00,  7.33it/s]


iteration :51
train_loss : 0.0332


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :52
train_loss : 0.0271


100%|██████████| 313/313 [00:42<00:00,  7.38it/s]


iteration :53
train_loss : 0.0163


100%|██████████| 313/313 [00:42<00:00,  7.38it/s]


iteration :54
train_loss : 0.0134


100%|██████████| 313/313 [00:42<00:00,  7.31it/s]


iteration :55
train_loss : 0.0111


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :56
train_loss : 0.0109


100%|██████████| 313/313 [00:42<00:00,  7.36it/s]


iteration :57
train_loss : 0.0099


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :58
train_loss : 0.0101


100%|██████████| 313/313 [00:42<00:00,  7.36it/s]


iteration :59
train_loss : 0.0107


100%|██████████| 313/313 [00:42<00:00,  7.42it/s]


iteration :60
train_loss : 0.0107


100%|██████████| 313/313 [00:41<00:00,  7.47it/s]


iteration :61
train_loss : 0.0118


100%|██████████| 313/313 [00:41<00:00,  7.46it/s]


iteration :62
train_loss : 0.0143


100%|██████████| 313/313 [00:42<00:00,  7.45it/s]


iteration :63
train_loss : 0.0218


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :64
train_loss : 0.0350


100%|██████████| 313/313 [00:42<00:00,  7.45it/s]


iteration :65
train_loss : 0.0218


100%|██████████| 313/313 [00:42<00:00,  7.45it/s]


iteration :66
train_loss : 0.0133


100%|██████████| 313/313 [00:42<00:00,  7.44it/s]


iteration :67
train_loss : 0.0105


100%|██████████| 313/313 [00:42<00:00,  7.37it/s]


iteration :68
train_loss : 0.0106


100%|██████████| 313/313 [00:42<00:00,  7.45it/s]


iteration :69
train_loss : 0.0107


100%|██████████| 313/313 [00:42<00:00,  7.44it/s]


iteration :70
train_loss : 0.0104


100%|██████████| 313/313 [00:41<00:00,  7.47it/s]


iteration :71
train_loss : 0.0093


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :72
train_loss : 0.0092


100%|██████████| 313/313 [00:42<00:00,  7.43it/s]


iteration :73
train_loss : 0.0090


100%|██████████| 313/313 [00:42<00:00,  7.43it/s]


iteration :74
train_loss : 0.0100


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :75
train_loss : 0.0099


100%|██████████| 313/313 [00:42<00:00,  7.41it/s]


iteration :76
train_loss : 0.0115


100%|██████████| 313/313 [00:42<00:00,  7.37it/s]


iteration :77
train_loss : 0.0181


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :78
train_loss : 0.0346


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :79
train_loss : 0.0203


100%|██████████| 313/313 [00:42<00:00,  7.38it/s]


iteration :80
train_loss : 0.0133


100%|██████████| 313/313 [00:42<00:00,  7.34it/s]


iteration :81
train_loss : 0.0103


100%|██████████| 313/313 [00:42<00:00,  7.39it/s]


iteration :82
train_loss : 0.0097


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :83
train_loss : 0.0089


100%|██████████| 313/313 [00:42<00:00,  7.41it/s]


iteration :84
train_loss : 0.0085


100%|██████████| 313/313 [00:42<00:00,  7.34it/s]


iteration :85
train_loss : 0.0092


100%|██████████| 313/313 [00:42<00:00,  7.38it/s]


iteration :86
train_loss : 0.0104


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :87
train_loss : 0.0100


100%|██████████| 313/313 [00:42<00:00,  7.43it/s]


iteration :88
train_loss : 0.0105


100%|██████████| 313/313 [00:42<00:00,  7.33it/s]


iteration :89
train_loss : 0.0134


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :90
train_loss : 0.0215


100%|██████████| 313/313 [00:42<00:00,  7.43it/s]


iteration :91
train_loss : 0.0244


100%|██████████| 313/313 [00:42<00:00,  7.45it/s]


iteration :92
train_loss : 0.0157


100%|██████████| 313/313 [00:42<00:00,  7.37it/s]


iteration :93
train_loss : 0.0108


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :94
train_loss : 0.0090


100%|██████████| 313/313 [00:42<00:00,  7.42it/s]


iteration :95
train_loss : 0.0081


100%|██████████| 313/313 [00:42<00:00,  7.41it/s]


iteration :96
train_loss : 0.0085


100%|██████████| 313/313 [00:42<00:00,  7.34it/s]


iteration :97
train_loss : 0.0107


100%|██████████| 313/313 [00:42<00:00,  7.40it/s]


iteration :98
train_loss : 0.0105


100%|██████████| 313/313 [00:42<00:00,  7.41it/s]

iteration :99
train_loss : 0.0100





In [None]:
def evaluation(model, input_data, output_data, target_data, end_token=2):
  """Greedy-decode a batch and return the predicted token ids as a flat list.

  The model is switched to eval mode and called with
  ``teacher_forcing_ratio=0`` under ``torch.no_grad()``. Each sequence is cut
  just after its first ``end_token``; sequences of a multi-example batch are
  concatenated in batch order.

  Args:
    model: callable as
      ``model(input_data, output_data, target_data, teacher_forcing_ratio=0)``,
      returning scores of shape (batch, seq_len, vocab).
    input_data: source token ids, forwarded to the model.
    output_data: decoder input ids, forwarded to the model.
    target_data: target token ids, forwarded to the model.
    end_token: id that terminates a decoded sequence (default 2, the value the
      original code hard-coded).

  Returns:
    List of int token ids.
  """
  model.eval()
  with torch.no_grad():
    scores = model(input_data, output_data, target_data, teacher_forcing_ratio=0)
    # One argmax over the vocabulary axis and one host transfer, instead of an
    # .item() call per position.
    token_ids = scores.argmax(dim=2).tolist()

  decoder_words = []
  for sequence in token_ids:
    for token in sequence:
      # Read the token from this example's own row. The original indexed the
      # whole batch here, which only worked for batch size 1.
      decoder_words.append(token)
      if token == end_token:
        break

  return decoder_words

In [None]:
def idxtoword(sentence):
  """Turn a sequence of scalar tensors into a space-separated sentence.

  Each element is converted with ``.item()`` and looked up in the module-level
  ``idx2word`` mapping, whose keys are the ids as strings.
  """
  return " ".join(idx2word[str(token.item())] for token in sentence)

In [None]:
def pre_idxtoword(sentence):
  """Turn a sequence of plain ids into a space-separated sentence.

  Unlike ``idxtoword``, the elements are used as they are (no ``.item()``):
  each is passed through ``str`` and looked up in the module-level
  ``idx2word`` mapping.
  """
  return " ".join(idx2word[str(token)] for token in sentence)

In [None]:
def evaluationRandomly(model, input_data, output_data, target_data):
  """Print input, decoder input, target and prediction for every example.

  Despite the name, nothing is sampled: the examples are visited in the order
  given.

  Args:
    model: model handed on to ``evaluation``.
    input_data: batch of source token-id tensors.
    output_data: batch of decoder-input token-id tensors.
    target_data: batch of target token-id tensors.
  """
  for src, dec_in, tgt in zip(input_data, output_data, target_data):
    print('input : {}'.format(idxtoword(src)))
    print('output : {}'.format(idxtoword(dec_in)))
    print('target : {}'.format(idxtoword(tgt)))
    # Decode only the current example. unsqueeze(0) restores the batch axis
    # that iterating over the batch removed. Passing the whole batch here
    # would re-decode every example on each iteration.
    predicted = evaluation(model, src.unsqueeze(0), dec_in.unsqueeze(0), tgt.unsqueeze(0))
    print('predicted : {}'.format(pre_idxtoword(predicted)))

In [None]:
# Training example: the tensors must have shape [1, 20], so slice with
# idx:idx+1 to keep the leading batch axis.
idx = 123
train_input_, train_output_, train_target_ = (
  split[idx:idx+1] for split in (train_input, train_output, train_target)
)

In [None]:
# Test example: the tensors must have shape [1, 20], so slice with
# ind:ind+1 to keep the leading batch axis.
ind = 321
test_input_, test_output_, test_target_ = (
  split[ind:ind+1] for split in (test_input, test_output, test_target)
)

In [None]:
# Check the result on the selected training example.
evaluationRandomly(model, train_input_, train_output_, train_target_)

input : 발목 접질렀어 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
output : <SOS> 꾸준히 치료하세요 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
target : 꾸준히 치료하세요 <END> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
predicted : 꾸준히 치료하세요 <END>


In [None]:
# Check the result on the selected test example.
# The model gives odd answers to questions it has not seen before.
evaluationRandomly(model, test_input_, test_output_, test_target_)

input : 그땐 그랬지 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
output : <SOS> 추억에 잠길 때도 필요해요 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
target : 추억에 잠길 때도 필요해요 <END> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
predicted : 상대방은 당신의 마음을 읽을 수 없어요 <END>
