In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import math
import json
import numpy as np
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
from torch.utils.data import TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
PATH = "drive/MyDrive/Implementation/Attention/"
OUTPUT_PATH = "drive/MyDrive/Implementation/Attention/"
TRAIN_INPUT = 'train_input.npy'
TRAIN_OUTPUT = 'train_output.npy'
TRAIN_TARGET = 'train_target.npy'
DATA_CONFIGS = 'data_configs.json'

In [5]:
idx_input = np.load(open(PATH+TRAIN_INPUT, 'rb'))
idx_output = np.load(open(PATH+TRAIN_OUTPUT, 'rb'))
idx_target = np.load(open(PATH+TRAIN_TARGET, 'rb'))
config = json.load(open(PATH+DATA_CONFIGS, 'r'))

In [6]:
print(len(idx_input), len(idx_output), len(idx_target))

11823 11823 11823


In [7]:
idx_input.shape

(11823, 20)

In [8]:
class ScaleDotProductAttention(nn.Module):
  def __init__(self):
    super(ScaleDotProductAttention, self).__init__()
    self.softmax = nn.Softmax(dim=3)

  def forward(self, query, key, value, mask=None):

    batch_size, head, length, d_tensor = key.size()

    k_t = key.view(batch_size, head, d_tensor, length) # transpose
    score = (query.matmul(k_t)) / math.sqrt(d_tensor)

    if mask is not None:
      score = score.masked_fill(mask == 0, -np.inf)
    
    score = self.softmax(score)
    value = score.matmul(value)

    return value, score

In [9]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_model, n_head):
    super(MultiHeadAttention, self).__init__()
    self.n_head = n_head
    self.attention = ScaleDotProductAttention()

    # self-attention을 위한 vector 변환 : query, key, value vector 만들기
    self.w_q = nn.Linear(d_model, d_model)
    self.w_k = nn.Linear(d_model, d_model)
    self.w_v = nn.Linear(d_model, d_model)

    self.w_concat = nn.Linear(d_model, d_model)

  def forward(self, query, key, value, masked=None):

    q, k, v = self.w_q(query), self.w_k(key), self.w_v(value)

    q, k, v = self.split(q), self.split(k), self.split(v)

    out, attnetion = self.attention(q, k, v, mask=masked)

    out = self.concat(out)
    out = self.w_concat(out)

    return out

  def split(self, tensor):

    batch_size, length, d_model = tensor.size()
    d_tensor = d_model // self.n_head
    tensor = tensor.view(batch_size, self.n_head, length, d_tensor)

    return tensor

  def concat(self, tensor):

    batch_size, head, length, d_tensor = tensor.size()
    d_model = head*d_tensor

    tensor = tensor.view(batch_size, length, d_model)

    return tensor

In [10]:
class LayerNorm(nn.Module):
  def __init__(self, d_model, eps=1e-45):
    super(LayerNorm, self).__init__()
    self.gamma = nn.Parameter(torch.ones(d_model))
    self.beta = nn.Parameter(torch.zeros(d_model))
    self.eps = eps

  def forward(self, x):
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)

    out = (x - mean) / (std + self.eps)
    out = self.gamma * out + self.beta

    return out

In [11]:
class PostionwiseFeedForward(nn.Module):
  def __init__(self, d_model, hidden, drop_prob=0.1):
    super(PostionwiseFeedForward, self).__init__()
    self.linear1 = nn.Linear(d_model, hidden)
    self.linear2 = nn.Linear(hidden, d_model)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(p=drop_prob)

  def forward(self, x):
    x = self.linear1(x)
    x = self.relu(x)
    x = self.dropout(x)
    x = self.linear2(x)

    return x

In [12]:
class EncoderLayer(nn.Module):
  def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
    super(EncoderLayer, self).__init__()
    self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
    self.norm1 = LayerNorm(d_model=d_model)
    self.dropout1 = nn.Dropout(p=drop_prob)

    self.ffn = PostionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
    self.norm2 = self.norm1 = LayerNorm(d_model=d_model)
    self.dropout2 = nn.Dropout(p=drop_prob)

  def forward(self, x):

    x_ = x
    x = self.attention(query=x, key=x, value=x, masked=None)
    
    x = self.norm1(x + x_)
    x = self.dropout1(x)

    x_ = x
    x = self.ffn(x)

    x = self.norm2(x + x_)
    x = self.dropout2(x)

    return x

In [13]:
class Encoder(nn.Module):
  def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
    super().__init__()
    self.emb = TransformerEmbedding(d_model=d_model,
                                    max_len=max_len,
                                    vocab_size=enc_voc_size,
                                    device=device)

    self.layers = nn.ModuleList([EncoderLayer(d_model=d_model,
                                              ffn_hidden=ffn_hidden,
                                              n_head=n_head,
                                              drop_prob=drop_prob)
                                  for _ in range(n_layers)])

  def forward(self, x):
    x = self.emb(x)

    for layer in self.layers:
      x = layer(x)

    return x

In [14]:
class DecoderLayer(nn.Module):
  def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
    super(DecoderLayer, self).__init__()
    self.self_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
    self.norm1 = LayerNorm(d_model=d_model)
    self.dropout1 = nn.Dropout(p=drop_prob)

    self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
    self.norm2 = LayerNorm(d_model=d_model)
    self.dropout2 = nn.Dropout(p=drop_prob)

    self.ffn = PostionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
    self.norm3 = LayerNorm(d_model=d_model)
    self.dropout3 = nn.Dropout(p=drop_prob)

  def forward(self, enc, dec, trg_mask):

    x_ = dec
    x = self.self_attention(query=dec, key=dec, value=dec, masked=trg_mask)

    x = self.norm1(x + x_)
    x = self.dropout1(x)

    # if문 필요하나?
    if enc is not None:
      x_ = x
      x = self.enc_dec_attention(query=x, key=x, value=x, masked=None)

      x = self.norm2(x + x_)
      x = self.dropout2(x)

    x_ = x
    x = self.ffn(x)

    x = self.norm3(x + x_)
    x = self.dropout3(x)

    return x

In [15]:
class Decoder(nn.Module):
  def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
    super().__init__()
    self.emb = TransformerEmbedding(d_model=d_model,
                                    max_len=max_len,
                                    vocab_size=dec_voc_size,
                                    device=device)

    self.layers = nn.ModuleList([DecoderLayer(d_model=d_model,
                                              ffn_hidden=ffn_hidden,
                                              n_head=n_head,
                                              drop_prob=drop_prob)
                                  for _ in range(n_layers)])

    self.linear = nn.Linear(d_model, dec_voc_size)

  def forward(self, enc, dec_input, trg_mask):
    dec_input = self.emb(dec_input)

    for layer in self.layers:
        dec_input = layer(enc, dec_input, trg_mask)

    # pass to LM head
    output = self.linear(dec_input)

    return output

In [16]:
class TransformerEmbedding(nn.Module):
  def __init__(self, vocab_size, d_model, max_len, device):
    super(TransformerEmbedding, self).__init__()

    self.tok_emb = TokenEmbedding(vocab_size, d_model)
    self.pos_emb = PositionalEncoding(d_model, max_len, device)

  def forward(self, x):
    tok_emb = self.tok_emb(x)
    pos_emb = self.pos_emb(x)
    
    return tok_emb + pos_emb

In [17]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_len, device):
    super(PositionalEncoding, self).__init__()

    # pos : word location
    # d_model : vector size

    self.encoding = torch.zeros(max_len, d_model, device=device)
    # Gradient 계산 안해도 됨.
    self.encoding.requries_grad = False

    pos = torch.arange(0, max_len, device=device)
    pos = pos.float().unsqueeze(dim=1)
    _2i = torch.arange(0, d_model, step=2, device=device).float()

    self.encoding[:,0::2] = torch.sin(pos/(10000**(_2i / d_model)))
    self.encoding[:,1::2] = torch.cos(pos/(10000**(_2i / d_model)))
    
  def forward(self, x):

    batch_size, seq_len = x.size()

    return self.encoding[:seq_len, :]

In [18]:
class TokenEmbedding(nn.Embedding):
  def __init__(self, voca_size, d_model):
    super(TokenEmbedding, self).__init__(voca_size, d_model)

In [19]:
class Transformer(nn.Module):

  def __init__(self, enc, dec):
      super().__init__()
      
      self.encoder = enc
      self.decoder = dec

  def forward(self, input, output, target):

      out_mask = self.make_pad_mask(input, output)
      
      enc_src = self.encoder(input)
      output = self.decoder(enc_src, output, out_mask)

      return output

  def make_pad_mask(self, input, output):

    query_seq_len, key_seq_len = input.size(1), output.size(1)

    tril = np.tril(np.ones((query_seq_len, key_seq_len)), k=0)
    mask = torch.tensor(tril, requires_grad=False, device=device)

    return mask

In [20]:
def Model(model, input_tensor, output_tensor, target_tensor, model_optimizer, criterion):

  model_optimizer.zero_grad()

  loss = 0
  epoch_loss = 0

  output = model(input_tensor, output_tensor, target_tensor)

  output_ = output.view(-1,output.shape[-1])
  target_ = F.one_hot(target_tensor, num_classes=VOCA_SIZE)
  target_ = target_.view(-1, target_.shape[-1]).type(torch.FloatTensor).to(device)

  loss = criterion(output_, target_)

  loss.backward()
  model_optimizer.step()
  epoch_loss = loss.item()

  return epoch_loss

In [21]:
def TrainModel(model, train_data, epoch=1000):
  
  optimizer = optim.Adam(model.parameters(), lr=0.001) # SGD
  criterion = nn.CrossEntropyLoss()

  model.train()

  for iter in range(epoch):
    loss = 0
    for input, output, target in tqdm(train):
      loss += Model(model, input, output, target, optimizer, criterion)
    
    if iter%1 == 0:
      print('iteration :%d\ntrain_loss : %.4f' % (iter, loss/len(train)))

  return model

In [22]:
train_input = torch.LongTensor(idx_input[1823:]).to(device)
train_output = torch.LongTensor(idx_output[1823:]).to(device)
train_target = torch.LongTensor(idx_target[1823:]).to(device)
test_input = torch.LongTensor(idx_input[:1823]).to(device)
test_output = torch.LongTensor(idx_output[:1823]).to(device)
test_target = torch.LongTensor(idx_target[:1823]).to(device)

In [23]:
dataset = TensorDataset(train_input, train_output, train_target)
train = DataLoader(dataset, batch_size=128, shuffle=True)

In [24]:
MAX_SEQUENCE = 20
EPOCH = 30
HEAD = 4
UNITS = 128
DROPOUT = 0.2
NUM_LAYERS = 2
BATCH_SIZE = 64
EMBEDDING_DIM = 128
VOCA_SIZE = config['voca_size']

In [25]:
Enc = Encoder(enc_voc_size=VOCA_SIZE, max_len=MAX_SEQUENCE, d_model=EMBEDDING_DIM, ffn_hidden=UNITS, n_head=HEAD, n_layers=NUM_LAYERS, drop_prob=DROPOUT, device=device)
Dec = Decoder(dec_voc_size=VOCA_SIZE, max_len=MAX_SEQUENCE, d_model=EMBEDDING_DIM, ffn_hidden=UNITS, n_head=HEAD, n_layers=NUM_LAYERS, drop_prob=DROPOUT, device=device)

In [26]:
tf = Transformer(Enc, Dec).to(device)

In [27]:
model = TrainModel(tf, train, epoch=EPOCH)

100%|██████████| 79/79 [00:27<00:00,  2.91it/s]


iteration :0
train_loss : 2.8376


100%|██████████| 79/79 [00:23<00:00,  3.41it/s]


iteration :1
train_loss : 1.6092


100%|██████████| 79/79 [00:23<00:00,  3.35it/s]


iteration :2
train_loss : 1.5415


100%|██████████| 79/79 [00:23<00:00,  3.38it/s]


iteration :3
train_loss : 1.4849


100%|██████████| 79/79 [00:22<00:00,  3.44it/s]


iteration :4
train_loss : 1.4344


100%|██████████| 79/79 [00:23<00:00,  3.41it/s]


iteration :5
train_loss : 1.3828


100%|██████████| 79/79 [00:23<00:00,  3.43it/s]


iteration :6
train_loss : 1.3368


100%|██████████| 79/79 [00:23<00:00,  3.42it/s]


iteration :7
train_loss : 1.2839


100%|██████████| 79/79 [00:23<00:00,  3.43it/s]


iteration :8
train_loss : 1.2352


100%|██████████| 79/79 [00:23<00:00,  3.40it/s]


iteration :9
train_loss : 1.1813


100%|██████████| 79/79 [00:23<00:00,  3.41it/s]


iteration :10
train_loss : 1.1188


100%|██████████| 79/79 [00:22<00:00,  3.44it/s]


iteration :11
train_loss : 1.0478


100%|██████████| 79/79 [00:22<00:00,  3.44it/s]


iteration :12
train_loss : 0.9765


100%|██████████| 79/79 [00:22<00:00,  3.45it/s]


iteration :13
train_loss : 0.8995


100%|██████████| 79/79 [00:22<00:00,  3.44it/s]


iteration :14
train_loss : 0.8308


100%|██████████| 79/79 [00:22<00:00,  3.44it/s]


iteration :15
train_loss : 0.7550


100%|██████████| 79/79 [00:23<00:00,  3.38it/s]


iteration :16
train_loss : 0.6834


100%|██████████| 79/79 [00:22<00:00,  3.44it/s]


iteration :17
train_loss : 0.6195


100%|██████████| 79/79 [00:23<00:00,  3.43it/s]


iteration :18
train_loss : 0.5602


100%|██████████| 79/79 [00:22<00:00,  3.44it/s]


iteration :19
train_loss : 0.5064


100%|██████████| 79/79 [00:22<00:00,  3.44it/s]


iteration :20
train_loss : 0.4529


100%|██████████| 79/79 [00:22<00:00,  3.44it/s]


iteration :21
train_loss : 0.4050


100%|██████████| 79/79 [00:23<00:00,  3.39it/s]


iteration :22
train_loss : 0.3603


100%|██████████| 79/79 [00:22<00:00,  3.44it/s]


iteration :23
train_loss : 0.3212


100%|██████████| 79/79 [00:23<00:00,  3.43it/s]


iteration :24
train_loss : 0.2826


100%|██████████| 79/79 [00:22<00:00,  3.45it/s]


iteration :25
train_loss : 0.2547


100%|██████████| 79/79 [00:23<00:00,  3.42it/s]


iteration :26
train_loss : 0.2258


100%|██████████| 79/79 [00:22<00:00,  3.44it/s]


iteration :27
train_loss : 0.2004


100%|██████████| 79/79 [00:23<00:00,  3.39it/s]


iteration :28
train_loss : 0.1807


100%|██████████| 79/79 [00:23<00:00,  3.41it/s]

iteration :29
train_loss : 0.1613



