# In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
'''
N : batch_size
T : sequence_length
h : num_head
H : d_model (embedding dimension of each word)
d : H // h
'''
class ScaledDotProductAttention(nn.Module):
  '''
  Scaled dot-product attention, computed independently for every head:
  softmax(Q @ K^T / sqrt(d)) @ V.
  '''
  def __init__(self):
    super().__init__()

  def forward(self, query, key, value, mask=None):
    '''
    query : what is being compared / (batch_size, query_length, num_head, head_dim) = (N, T_q, h, d)
    key   : what it is compared against / (batch_size, key_length, num_head, head_dim) = (N, T_k, h, d)
    value : values the attention weights are applied to / (N, T_k, h, d)
    mask  : optional bool tensor broadcastable to (N, h, T_q, T_k);
            True marks positions that must NOT be attended to.

    returns : context vectors / (N, h, T_q, d)
    '''
    d = query.size(-1) # head_dim; the scores are scaled by sqrt(d)

    # Move the head axis in front of the sequence axis so matmul works per head.
    q = query.transpose(1, 2) # (N, h, T_q, d)
    k = key.transpose(1, 2)   # (N, h, T_k, d)
    v = value.transpose(1, 2) # (N, h, T_k, d)

    # Similarity of every query position with every key position.
    # Scaling keeps the softmax out of its saturated (vanishing-gradient) region.
    scores = q.matmul(k.transpose(-2, -1)) / (d ** 0.5) # (N, h, T_q, d) @ (N, h, d, T_k) -> (N, h, T_q, T_k)

    if mask is not None:
      # masked_fill is out-of-place, so the result must be re-assigned.
      # The most negative finite value gives the masked positions a softmax weight of 0
      # without producing NaN when a whole row is masked (as -inf would).
      scores = scores.masked_fill(mask, torch.finfo(scores.dtype).min)

    # Normalise over the key axis: weights lie in [0, 1] and sum to 1 per query position.
    weights = torch.softmax(scores, dim=-1)

    # Weighted sum of the values = context vector for each query position.
    return weights.matmul(v) # (N, h, T_q, T_k) @ (N, h, T_k, d) -> (N, h, T_q, d)

class MultiHeadAttention(nn.Module):
  '''
  Multi-head attention: project query/key/value, split d_model into num_head slices,
  run scaled dot-product attention on every slice, merge the heads and project again.
  '''
  def __init__(self, batch_size, sequence_length, d_model, num_head, drop_ratio):
    '''
    batch_size, sequence_length : kept as attributes for callers; forward() reads the
                                  actual sizes from its inputs instead of relying on them.
    d_model    : embedding dimension (H); must be divisible by num_head
    num_head   : number of attention heads (h)
    drop_ratio : dropout probability applied to the sub-layer output
    '''
    super().__init__()
    if d_model % num_head != 0:
      raise ValueError(f"d_model ({d_model}) must be divisible by num_head ({num_head})")

    self.N = batch_size
    self.T = sequence_length
    self.H = d_model
    self.h = num_head
    self.d = d_model // num_head # d = H // h

    # Unlike the paper (d_model -> d_model/h per head), use one Linear(d_model -> d_model)
    # per role and split it with view, so all heads are computed at once without a loop.
    self.qLinear = nn.Linear(d_model, d_model) # query linear
    self.kLinear = nn.Linear(d_model, d_model) # key linear
    self.vLinear = nn.Linear(d_model, d_model) # value linear
    self.lLinear = nn.Linear(d_model, d_model) # last linear
    self.dropout = nn.Dropout(drop_ratio)
    self.attention = ScaledDotProductAttention() # parameter-free; instantiated once and reused

  def forward(self, query, key, value, mask=None):
    '''
    query : (N, T_q, H)
    key, value : (N, T_k, H) ; T_k may differ from T_q (e.g. encoder-decoder attention)
    mask  : optional, forwarded unchanged to the scaled dot-product attention

    returns : (N, T_q, H)
    '''
    batch = query.size(0) # read from the input so partial batches work

    # Slice d_model into h heads of size d; -1 lets the sequence length vary per input.
    multi_query = self.qLinear(query).view(batch, -1, self.h, self.d)
    multi_key   = self.kLinear(key).view(batch, -1, self.h, self.d)
    multi_value = self.vLinear(value).view(batch, -1, self.h, self.d)

    c = self.attention(multi_query, multi_key, multi_value, mask) # (N, h, T_q, d)

    # Concatenate the heads: (N, h, T_q, d) -> (N, T_q, h, d) -> (N, T_q, H).
    # reshape (not view) because the tensor is non-contiguous after transpose.
    c = c.transpose(1, 2).reshape(batch, -1, self.H)

    output = self.lLinear(c) # (N, T_q, H)

    return self.dropout(output) # apply dropout to the output of each sub-layer, before it is added to the sub-layer input and normalized

class FeedForward(nn.Module):
  '''
  Position-wise feed-forward sub-layer: linear2(relu(linear1(x))), followed by dropout.
  '''
  def __init__(self, d_model, inner_layer_dim, drop_ratio):
    '''
    d_model         : input / output dimension
    inner_layer_dim : dimension of the hidden layer in between
    drop_ratio      : dropout probability applied to the sub-layer output
    '''
    super().__init__()
    self.linear1 = nn.Linear(d_model, inner_layer_dim)
    self.linear2 = nn.Linear(inner_layer_dim, d_model)
    self.dropout = nn.Dropout(drop_ratio)

  def forward(self, x):
    '''
    x : tensor whose last dimension is d_model
    returns : tensor of the same shape as x
    '''
    # Without a non-linearity the two linear layers collapse into a single affine map;
    # the paper defines FFN(x) = max(0, x W1 + b1) W2 + b2.
    hidden = torch.relu(self.linear1(x))
    output = self.linear2(hidden)
    return self.dropout(output) # apply dropout to the output of each sub-layer, before it is added to the sub-layer input and normalized
    
class EncoderLayer(nn.Module):
  '''
  A single encoder block: a self-attention sub-layer and a feed-forward sub-layer,
  each wrapped in a residual connection followed by LayerNorm.
  '''
  def __init__(self, batch_size, sequence_length, d_model, num_head, inner_layer_dim, drop_ratio):
    super().__init__()
    self.layerNorm1 = nn.LayerNorm(d_model)
    self.layerNorm2 = nn.LayerNorm(d_model)
    self.multiHeadAttention = MultiHeadAttention(
      batch_size, sequence_length, d_model, num_head, drop_ratio
    )
    self.feedForward = FeedForward(d_model, inner_layer_dim, drop_ratio)

  def forward(self, src):
    '''
    src : layer input; it serves as query, key and value of the self-attention.
    returns : the output of the second LayerNorm.
    '''
    # Sub-layer 1: self-attention, residual add, LayerNorm.
    attended = self.multiHeadAttention(src, src, src)
    normed = self.layerNorm1(src + attended)

    # Sub-layer 2: feed-forward, residual add, LayerNorm.
    transformed = self.feedForward(normed)
    return self.layerNorm2(normed + transformed)

class Encoder(nn.Module):
  '''
  A stack of encoder layers applied one after another.
  '''
  def __init__(self, batch_size, sequence_length, d_model, num_head, inner_layer_dim, drop_ratio, iter):
    '''
    iter : number of encoder layers in the stack
    '''
    super().__init__()
    stack = nn.ModuleList()
    for _ in range(iter): # build `iter` independent encoder layers
      stack.append(
        EncoderLayer(batch_size, sequence_length, d_model, num_head, inner_layer_dim, drop_ratio)
      )
    self.encoderLayers = stack

  def forward(self, src):
    '''
    Feed src through every encoder layer in order and return the final result.
    '''
    hidden = src
    for layer in self.encoderLayers:
      hidden = layer(hidden)

    # Output of the encoder; it is also what every decoder layer receives as its second input.
    return hidden

class DecoderLayer(nn.Module):
  '''
  A single decoder block: self-attention, encoder-decoder attention and feed-forward
  sub-layers, each wrapped in a residual connection followed by LayerNorm.
  '''
  def __init__(self, batch_size, sequence_length, d_model, num_head, inner_layer_dim, drop_ratio):
    super().__init__()
    self.layerNorm1 = nn.LayerNorm(d_model)
    self.layerNorm2 = nn.LayerNorm(d_model)
    self.layerNorm3 = nn.LayerNorm(d_model)
    self.multiHeadAttention1 = MultiHeadAttention(batch_size, sequence_length, d_model, num_head, drop_ratio) # self-attention
    self.multiHeadAttention2 = MultiHeadAttention(batch_size, sequence_length, d_model, num_head, drop_ratio) # encoder-decoder attention
    self.feedForward = FeedForward(d_model, inner_layer_dim, drop_ratio)

  def forward(self, src, enc_des):
    '''
    src     : decoder input (src = query = key = value for the self-attention) / (N, T, H)
    enc_des : output of the encoder / (N, T, H)
    '''
    # TODO: add masking (the decoder self-attention is not yet prevented from looking ahead).
    query = self.layerNorm1(src + self.multiHeadAttention1(src, src, src))

    # the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder
    # This sub-layer has its own weights (multiHeadAttention2); it must not reuse the self-attention module.
    x = self.layerNorm2(query + self.multiHeadAttention2(query, enc_des, enc_des))

    output = self.layerNorm3(x + self.feedForward(x))

    return output
    
class Decoder(nn.Module):
  '''
  A stack of decoder layers applied one after another; every layer receives the same encoder output.
  '''
  def __init__(self, batch_size, sequence_length, d_model, num_head, inner_layer_dim, drop_ratio, iter):
    '''
    iter : number of decoder layers in the stack
    '''
    super().__init__()
    stack = nn.ModuleList()
    for _ in range(iter): # build `iter` independent decoder layers
      stack.append(
        DecoderLayer(batch_size, sequence_length, d_model, num_head, inner_layer_dim, drop_ratio)
      )
    self.decoderLayers = stack

  def forward(self, src, enc_des):
    '''
    Feed src through every decoder layer in order, passing enc_des to each one unchanged.
    '''
    hidden = src
    for layer in self.decoderLayers:
      hidden = layer(hidden, enc_des)

    return hidden

class PositionalEncoding(nn.Module):
  '''
  Sinusoidal positional encoding:
    PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
  The encoding has no learnable parameters and is computed for the length of the given input.
  '''
  def __init__(self, vocab_size, d_model):
    '''
    vocab_size : accepted for interface compatibility; the encoding does not depend on it
    d_model    : size of the encoding vector produced for every position
    '''
    super().__init__()
    self.d_model = d_model

  def forward(self, src):
    '''
    src : tensor whose dimension 1 is the sequence length (assumed to be the (N, T)
          token-index tensor that is also fed to the word embedding)

    returns : (1, T, d_model) float tensor on src's device, broadcastable over the batch
              when added to a (N, T, d_model) embedding
    '''
    seq_len = src.size(1)
    device = src.device

    positions = torch.arange(seq_len, dtype=torch.float32, device=device).unsqueeze(1) # (T, 1)
    even_dims = torch.arange(0, self.d_model, 2, dtype=torch.float32, device=device)   # the "2i" values
    inv_freq = torch.pow(10000.0, -even_dims / self.d_model)                             # 1 / 10000^(2i / d_model)
    angles = positions * inv_freq                                                        # (T, ceil(d_model / 2))

    encoding = torch.zeros(seq_len, self.d_model, device=device)
    encoding[:, 0::2] = torch.sin(angles)
    # For an odd d_model there is one fewer odd column than even columns.
    encoding[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])

    return encoding.unsqueeze(0)
    
class Transformer(nn.Module):
  '''
  Encoder-decoder Transformer: embeds both token sequences, adds positional encodings,
  runs the encoder and decoder stacks and projects the decoder output onto the output vocabulary.
  '''
  def __init__(self, batch_size, sequence_length, d_model, num_head, inner_layer_dim, drop_ratio, iter, inputs_vocab_size, outputs_vocab_size):
    '''
    inputs_vocab_size  : vocab size of the dataset fed to the encoder
    outputs_vocab_size : vocab size of the dataset fed to the decoder
    iter               : number of layers in both the encoder and the decoder stack
    '''
    super().__init__()
    # d_model is a plain Python number, so torch.sqrt (tensor-only) cannot be used here.
    self.scale = d_model ** 0.5

    # word embedding (the embeddings are multiplied by sqrt(d_model) in forward via self.scale)
    self.inputEmbedding = nn.Embedding(inputs_vocab_size, d_model)
    self.outputEmbedding = nn.Embedding(outputs_vocab_size, d_model)

    # positional embedding
    self.encPositionalEncoding = PositionalEncoding(inputs_vocab_size, d_model)
    self.decPositionalEncoding = PositionalEncoding(outputs_vocab_size, d_model)

    self.encoder = Encoder(batch_size, sequence_length, d_model, num_head, inner_layer_dim, drop_ratio, iter)
    self.decoder = Decoder(batch_size, sequence_length, d_model, num_head, inner_layer_dim, drop_ratio, iter)

    self.linear = nn.Linear(d_model, outputs_vocab_size)

  def forward(self, inputs, outputs):
    '''
    inputs  : token indices for the encoder (passed to inputEmbedding)
    outputs : token indices for the decoder (passed to outputEmbedding)

    returns : probabilities over the output vocabulary (softmax over the last dimension)
    '''
    enc_src = self.inputEmbedding(inputs) * self.scale + self.encPositionalEncoding(inputs)
    enc_des = self.encoder(enc_src)

    # The decoder side uses its own positional encoding module, not the encoder's.
    dec_src = self.outputEmbedding(outputs) * self.scale + self.decPositionalEncoding(outputs)
    output = self.decoder(dec_src, enc_des)

    # Apply softmax over the vocabulary axis (nn.Softmax(x) would only construct a module).
    output_prob = torch.softmax(self.linear(output), dim=-1)

    # NOTE: the paper shares one weight matrix between the two embedding layers and the
    # pre-softmax linear transformation. The two embeddings only have the same shape when
    # both vocabularies have the same size (e.g. generating text in a single language);
    # for translation the vocab sizes usually differ.
    # TODO: share all three weights when the vocab sizes match, otherwise share only the
    # output embedding with the pre-softmax linear layer.
    return output_prob
    