In [1]:
import torch
from torch import nn

class Encoder(nn.Module):
    """Bidirectional, multi-layer LSTM encoder.

    Args:
        word_vec_size: feature size of the input embeddings.
        hidden_size: total hidden size shared by both directions. Each
            direction gets ``hidden_size // 2`` units, so an odd value is
            silently rounded down (e.g. 5 -> 2 per direction -> output width 4).
        n_layers: number of stacked LSTM layers.
        dropout_p: dropout probability applied between stacked layers.
    """

    def __init__(self, word_vec_size, hidden_size, n_layers=4, dropout_p=.2):
        super(Encoder, self).__init__()

        # The LSTM is bidirectional, so each direction only gets half of the
        # requested hidden size; the two directions are concatenated in `y`.
        self.rnn = nn.LSTM(
            word_vec_size,              # input feature size
            int(hidden_size // 2),      # per-direction hidden size (floored)
            num_layers=n_layers,        # stacked LSTM
            dropout=dropout_p,
            bidirectional=True,
            batch_first=True,           # tensors are (batch, length, feature)
        )

    def forward(self, emb):
        """Encode a mini-batch of embedded sequences.

        Args:
            emb: either a padded tensor of shape
                (batch_size, length, word_vec_size), or a tuple
                ``(padded_tensor, lengths)`` where ``lengths`` is a 1-D tensor
                holding the true length of every sample.

        Returns:
            y: output of the last LSTM layer at every time-step,
                (batch_size, length, 2 * (hidden_size // 2)).
            h: tuple ``(h_n, c_n)``; each is
                (n_layers * 2, batch_size, hidden_size // 2), where the factor
                2 comes from the two directions.
        """
        has_lengths = isinstance(emb, tuple)

        if has_lengths:
            x, lengths = emb
            # Pack so that the LSTM skips the padded time-steps.
            # A PackedSequence stores the data time-step-major together with
            # the number of still-active samples per time-step, e.g.
            #   padded  = [[1, 2, 3],
            #              [3, 4, 0]],  lengths = [3, 2]
            #   packed.data        = [1, 3, 2, 4, 3]
            #   packed.batch_sizes = [2, 2, 1]
            # `pack`/`unpack` were never imported in this notebook, so the
            # fully qualified functions are used through the `nn` import.
            # enforce_sorted=False lets callers pass batches that are not
            # sorted by decreasing length.
            x = nn.utils.rnn.pack_padded_sequence(
                x,
                lengths.tolist(),
                batch_first=True,
                enforce_sorted=False,
            )
        else:
            x = emb

        y, h = self.rnn(x)

        if has_lengths:
            # Undo the packing; padded positions come back as zeros.
            y, _ = nn.utils.rnn.pad_packed_sequence(y, batch_first=True)

        # y : [b, n, h]
        # h : ([l*2, b, h/2], [l*2, b, h/2])
        return y, h


# Random stand-in for an embedded source batch. Given Encoder(10, ...) below,
# the axes are presumably (batch_size=64, length=30, word_vec_size=10).
data = torch.randn(64,30,10)

In [2]:
# hidden_size=5 is odd: Encoder uses int(5 / 2) == 2 units per direction, so
# the bidirectional output width is 4, not 5 (see the LSTM repr printed below).
encoder = Encoder(10, 5, 4, 0.2) 

In [3]:
data.shape

torch.Size([64, 30, 10])

In [4]:
encoder

Encoder(
  (rnn): LSTM(10, 2, num_layers=4, batch_first=True, dropout=0.2, bidirectional=True)
)

In [5]:
# y: encoder outputs for every time-step; h: the LSTM's final (h_n, c_n) tuple.
# `data` is a plain tensor, so the pack/unpack branch of Encoder.forward is skipped.
y,h = encoder(data)

class Decoder(nn.Module):
    """Unidirectional, multi-layer LSTM decoder with input feeding.

    Every call consumes a single time-step: the current word embedding is
    concatenated with the previous attentional hidden state (h-tilde) before
    being fed to the LSTM.
    """

    def __init__(self, word_vec_size, hidden_size, n_layers=4, dropout_p=.2):
        super().__init__()

        # Input feeding widens the LSTM input by `hidden_size`, because the
        # previous h-tilde is concatenated to the embedding.
        self.rnn = nn.LSTM(
            input_size=word_vec_size + hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            dropout=dropout_p,
            bidirectional=False,
            batch_first=True,
        )

    def forward(self, emb_t, h_t_1_tilde, h_t_1):
        """Run one decoding step.

        Args:
            emb_t: embedding of the current token, (batch_size, 1, word_vec_size).
            h_t_1_tilde: previous step's h-tilde (the pre-softmax attentional
                state), (batch_size, 1, hidden_size), or None on the first step.
            h_t_1: tuple (h_{t-1}, c_{t-1}) of LSTM states, each
                (n_layers, batch_size, hidden_size).

        Returns:
            The LSTM output (batch_size, 1, hidden_size) and the updated
            (h_t, c_t) state tuple.
        """
        n_samples = emb_t.size(0)
        state_width = h_t_1[0].size(-1)

        if h_t_1_tilde is None:
            # First time-step: there is no previous h-tilde, so feed zeros.
            # new_zeros allocates on the same device/dtype as emb_t.
            h_t_1_tilde = emb_t.new_zeros(n_samples, 1, state_width)

        # Input feeding trick: [b, 1, w] ++ [b, 1, h] -> [b, 1, w + h]
        step_input = torch.cat((emb_t, h_t_1_tilde), dim=-1)

        # Unlike the encoder, the decoder is driven one step at a time.
        step_output, next_state = self.rnn(step_input, h_t_1)
        return step_output, next_state

In [6]:
def fast_merge_encoder_hiddens(encoder_hiddens):
    """Merge bidirectional encoder states into unidirectional decoder states.

    For every layer, the forward and backward states are concatenated along
    the feature axis, using only tensor re-layouts (no Python loop over
    layers).

    Args:
        encoder_hiddens: tuple ``(h, c)``; each tensor is
            (n_layers * 2, batch_size, hidden_size / 2).

    Returns:
        Tuple ``(h_0_tgt, c_0_tgt)``; each tensor is contiguous with shape
        (n_layers, batch_size, hidden_size).
    """
    h_0_tgt, c_0_tgt = encoder_hiddens
    batch_size = h_0_tgt.size(1)
    # Derive the merged width from the input instead of hard-coding it, so the
    # function works for any encoder hidden size.
    merged_size = h_0_tgt.size(-1) * 2

    def _merge(state):
        # A plain `view` cannot do this conversion: the two directions of a
        # layer are adjacent on axis 0, not on the feature axis.
        #   transpose -> [b, l*2, h/2]   (contiguous() so that view is legal)
        #   view      -> [b, l, h]       (pairs direction 0/1 of each layer)
        #   transpose -> [l, b, h]
        return (
            state.transpose(0, 1).contiguous()
            .view(batch_size, -1, merged_size)
            .transpose(0, 1).contiguous()
        )

    # ([l, b, h], [l, b, h])
    return _merge(h_0_tgt), _merge(c_0_tgt)
    
    

# Random stand-in for the embedded target batch; no embedding layer is used in
# this notebook, so `tgt` is consumed directly as embeddings below.
tgt = torch.randn(64,30,10)
input_size = 30
word_vec_size = 10

h_src, h_0_tgt = encoder(data) # `data` is a plain tensor, so Encoder.forward skips its pack/unpack branch.
    # |h_src| = (b, n, h) : encoder hidden states at every time-step
    # |h_0_tgt| = ([l*2, b, h/2], [l*2, b, h/2]) : final hidden/cell state of every encoder layer (the context)
        # -> These become the decoder's initial state, but their feature size is h/2,
        #    so they have to be converted to size h first.

h_0_tgt = fast_merge_encoder_hiddens(h_0_tgt)
    # After merging the two directions:
    # ([l, b, h], [l, b, h])

# Teacher forcing: the whole ground-truth target sequence is available up front.
emb_tgt = tgt
    # |emb_tgt| = (b, l, emb)
h_tilde = [] # collects the h-tilde of every decoding step

h_t_tilde = None # there is no previous h-tilde at the first time-step
decoder_hidden = h_0_tgt # ([layer, bs, hs], [layer, bs, hs])


# Decoder(word_vec_size=10, hidden_size=4, n_layers=4, dropout_p=0.2)
decoder = Decoder(10,4,4,0.2)



In [7]:
emb_t.shape

NameError: name 'emb_t' is not defined

In [8]:
decoder

Decoder(
  (rnn): LSTM(14, 4, num_layers=4, batch_first=True, dropout=0.2)
)

In [9]:
class Attention(nn.Module):
    """Dot-product attention with a learned linear projection of the query."""

    def __init__(self, hidden_size):
        super(Attention, self).__init__()

        # Projects the decoder state into the space of the encoder states
        # before the dot product is taken.
        self.linear = nn.Linear(hidden_size, hidden_size, bias=False)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, h_src, h_t_tgt, mask=None):
        """Compute the context vector for one decoder time-step.

        Args:
            h_src: all encoder hidden states, (batch_size, length, hidden_size).
            h_t_tgt: current decoder hidden state, (batch_size, 1, hidden_size).
            mask: optional (batch_size, length) tensor; truthy entries mark
                source positions (padding) that must receive zero attention.
                Any dtype is accepted; it is converted to bool.

        Returns:
            context_vector: attention-weighted sum of the encoder states,
                (batch_size, 1, hidden_size).
        """
        query = self.linear(h_t_tgt)
        # |query| = (batch_size, 1, hidden_size)

        # Batched matrix product: [B, 1, H] x [B, H, L] -> [B, 1, L]
        weight = torch.bmm(query, h_src.transpose(1, 2))
        # |weight| = (batch_size, 1, length)

        if mask is not None:
            # Masked scores are set to -inf so that softmax maps them to 0;
            # samples shorter than the longest one in the mini-batch therefore
            # put no weight on their padded time-steps.
            # masked_fill only accepts boolean masks, hence the explicit cast;
            # the out-of-place variant avoids mutating the bmm result in place.
            weight = weight.masked_fill(mask.unsqueeze(1).bool(), -float('inf'))
        weight = self.softmax(weight)

        # [B, 1, L] x [B, L, H] -> [B, 1, H]
        context_vector = torch.bmm(weight, h_src)
        # |context_vector| = (batch_size, 1, hidden_size)

        return context_vector

def generate_mask(self, x, length):
    """Build a boolean padding mask from per-sample sequence lengths.

    Positions at or beyond a sample's length are True (to be masked out of
    the attention weights); valid positions are False.

    Example for ``length = [4, 3, 1]``::

        0 0 0 0
        0 0 0 1
        0 1 1 1

    Args:
        self: unused; kept only so the existing signature does not change.
        x: any tensor; only its device is used for the result.
        length: per-sample lengths, shape [bs,], e.g. ``[4, 3, 1]``
            (list or 1-D tensor).

    Returns:
        Bool tensor of shape (bs, max(length)).
    """
    max_length = int(max(length))

    # One broadcast comparison replaces the per-sample loop:
    #   positions: (1, max_length)   lengths: (bs, 1)   ->   (bs, max_length)
    positions = torch.arange(max_length, device=x.device).unsqueeze(0)
    lengths = torch.as_tensor(length, device=x.device).unsqueeze(1)

    mask = positions >= lengths
    return mask

    
    
hidden_size = 4
attn = Attention(hidden_size)
concat = nn.Linear(hidden_size * 2, hidden_size)  # maps [decoder_output ; context] (2h) -> h
tanh = nn.Tanh()

# Padding mask over source positions, (batch_size, length). It must be boolean
# for masked_fill; all False means "no position is padding".
mask = torch.zeros(h_src.size(0), h_src.size(1), dtype=torch.bool)

# Reset the decoding state so that re-running this cell starts from the
# encoder's final state instead of continuing from the previous run.
h_tilde = []
h_t_tilde = None
decoder_hidden = h_0_tgt  # ([l, b, h], [l, b, h])

for t in range(tgt.size(1)):  # one iteration per target time-step
    # Teacher forcing: take each input from the training set, not from the
    # last time-step's output. Because of teacher forcing, the training and
    # inference procedures differ, and because the decoder runs sequentially,
    # this loop is a severe bottleneck.
    #
    # Plain indexing would drop the time axis ([b, l, emb] -> [b, emb]), so it
    # is restored explicitly with unsqueeze.
    emb_t = emb_tgt[:, t, :].unsqueeze(1)
    # |emb_t| = (batch_size, 1, word_vec_size)
    # |h_t_tilde| = (batch_size, 1, hidden_size)

    decoder_output, decoder_hidden = decoder(
        emb_t,           # embedding of the current token
        h_t_tilde,       # h-tilde of the previous time-step (input feeding)
        decoder_hidden,  # ([l, b, h], [l, b, h])
    )
    # |decoder_output| = (batch_size, 1, hidden_size)
    # |decoder_hidden| = (n_layers, batch_size, hidden_size)

    # Attend over `h_src`, i.e. the encoder outputs of the same forward pass
    # that produced the decoder's initial state `h_0_tgt`.
    context_vector = attn(h_src, decoder_output, mask)
    h_t_tilde = tanh(concat(torch.cat([decoder_output, context_vector], dim=-1)))
    # |h_t_tilde| = (batch_size, 1, hidden_size)

    h_tilde.append(h_t_tilde)

# Every element is (b, 1, h); concatenating on dim 1 gives (b, length, h).
h_tilde = torch.cat(h_tilde, dim=1)
# |h_tilde| = (b, length, h)

# y_hat = self.generator(h_tilde)
# |y_hat| = (b, length, output_size:vocab_size)



In [10]:
h_tilde.shape

torch.Size([64, 30, 4])

In [11]:
h_t_tilde.shape

torch.Size([64, 1, 4])

In [12]:
decoder_hidden[0].shape

torch.Size([4, 64, 4])

In [13]:
emb_t.shape

torch.Size([64, 1, 10])

In [14]:
h_t_tilde.shape

torch.Size([64, 1, 4])

In [15]:
decoder_hidden[0].size(-1)

4

In [79]:
# Stand-alone LSTM built with the same arguments as the one inside
# Decoder(10, 4, 4, 0.2), used to inspect a single decoding step by hand.
rnn = nn.LSTM(
            10 + 4, # input feeding concatenates h-tilde to the embedding, so the input grows by hidden_size
            4,
            num_layers=4,
            dropout=.2,
            bidirectional=False,
            batch_first=True,
        )

# Uses `emb_t` and `h_t_tilde` left over from the last iteration of the decoding loop.
x = torch.cat([emb_t, h_t_tilde], dim=-1) # [b, 1, w + h]

In [82]:
decoder_hidden[0].shape

torch.Size([4, 64, 4])

In [83]:
# z: LSTM output for the single step; zz: the updated (h, c) state tuple.
z,zz = rnn(x, decoder_hidden)
z.shape

torch.Size([64, 1, 4])

In [84]:
zz[0].shape

torch.Size([4, 64, 4])

In [85]:
# nn.LSTM(input_size=10, hidden_size=20, num_layers=2). batch_first is not
# passed here, so tensors are presumably laid out (seq_len, batch, feature).
# This rebinds `rnn`, replacing the LSTM created in the earlier cell.
rnn = nn.LSTM(10, 20, 2)
# NOTE(review): `input` shadows the Python builtin of the same name.
input = torch.randn(5, 3, 10) # (seq_len=5, batch=3, input_size=10) under the seq-first layout assumed above
h0 = torch.randn(2, 3, 20)  # (num_layers, batch, hidden_size)
c0 = torch.randn(2, 3, 20)  # (num_layers, batch, hidden_size)
output, (hn, cn) = rnn(input, (h0, c0))

In [87]:
output.shape

torch.Size([5, 3, 20])