해당 페이지는 Zichao Yang1, Diyi Yang1, Chris Dyer1, Xiaodong He2, Alex Smola1, Eduard Hovy1 (2016), "Hierarchical Attention Networks for Document Classification" 논문에 관한 구현입니다.
http://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf

**해당 논문의 구현은, 계층적으로 $word \rightarrow sentence \rightarrow document$로 되어 있어서, 데이터를 새로 manipulation 시켜주어야 했기 때문에, 최적화 과정까지는 하지 않고, 아키텍처를 쌓아가면서 이해를 돕는 정도로 진행하도록 하겠습니다. 미천한 실력 죄송합니다ㅠㅠ 가능하다면 추후에 보강하도록 하겠습니다!**

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F

from torchtext import datasets
from torchtext.data import Field, BucketIterator

import string
import random
import re
import os
import time

SEED = 1

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

해당 논문에서 분석한 데이터 셋 중 하나인 IMDB REVIEW에 대해서 분류 작업을 진행하도록 하겠습니다.

In [19]:
# set up fields
TEXT = Field(lower=True,  batch_first=True)
LABEL = Field(sequential=False)

train, test = datasets.IMDB.splits(TEXT, LABEL)
device = 'cpu'

# build the vocabulary
TEXT.build_vocab(train ) #get rid of vectors=GloVe(name='6B', dim=300)
LABEL.build_vocab(train)

batch_size = 1000
train_iter, test_iter = BucketIterator.splits(
    (train, test), batch_sizes=(batch_size,batch_size),
    shuffle=False,device=-1)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [20]:
word_to_idx_dict = TEXT.vocab.stoi
idx_to_word_dict = {val:idx for idx,val in word_to_idx_dict.items()}

In [21]:
batch = next(iter(train_iter))
batch.text.size()
# batch_size : 1000
# max_length : 1196

torch.Size([1000, 1196])

In [22]:
testing_doc = ','.join([idx_to_word_dict[i.item()] for i in batch.text[0] if i.item() != 1]).replace(",",' ')
testing_doc
# padding 을 제외하고, 문장을 뽑아낸 결과입니다.

'the film exposes the blatant exploitation of the chinese worker - generally female - garnering footage from the chinese business owner who shares his unashamed and delusional viewpoint  his american counterpart also as unashamed and delusional  the oppressed workers who are given a voice and  of course  the drunken americans who wear the beaded necklaces mindlessly celebrating in new orleans. <br /><br />the glimmer of hope comes when some americans are actually outraged that people making their beaded necklaces were getting paid like $0.10 per hour to do so. you also have a feeling that the workers may have a chance to escape working in the bead factory  but will probably do so when they get fed up with the punishment treatment popular with the factory owner and/or they just get too exhausted to work up to 20 hours a day of hard labor.<br /><br />i have wondered where those necklaces came from  not realizing how completely grueling and arduous it would be to make them. i just truly a

###  문장과 단어의 길이에 대한 논의
- 논문에서는 단어의 길이를 $T$라고 가정하고 수식을 전개하였고, 문장의 길이는 $L$로 가정하고 했습니다.
- 단순화(simplify)을 위해서, $T = 5$, $L = 2$이라고 하고, 패딩을 전개합니다.

In [23]:
def yield_num_to_str(batch) :  
    for num_sent in batch.text : 
        yield ','.join([idx_to_word_dict[i.item()] for i in num_sent if i.item() != 1]).replace(",",' ')
# <pad>가 word length 단위로 존재하는데 저희는 sentence length단위의 패딩과 
# word length 단위의 패딩이 모두 존재하고 있어야 하기 때문에 없애줍니다.
def return_str_ls(iterator) :
    str_ls = []
    for idx,batch in enumerate(iterator) : 
        batch_ls = list(yield_num_to_str(batch))
        str_ls.append(batch_ls)
    for idx1,i in enumerate(str_ls) : 
        for idx2,j in enumerate(i) : 
            str_ls[idx1][idx2] = j.replace('<br /><br />',' ') # br 토큰이 있는 것도 있고 없는 것도 있으니 없애줍니다.
    return str_ls

In [24]:
%%time
train_str_ls = return_str_ls(train_iter)
test_str_ls = return_str_ls(test_iter)

CPU times: user 1min 48s, sys: 21.2 ms, total: 1min 48s
Wall time: 1min 48s


In [25]:
def split_under_punctuation(str_ls) : 
    ls = str_ls.copy()
    for idx1 in range(len(ls)) : 
        for idx2 in range(len(ls[idx1])) : 
            process_ls = [i for i in re.split('[?!.]', ls[idx1][idx2]) if i]
            if len(process_ls) < 2 : 
                final_ls = [i for i in re.split('[?!.]', ls[idx1][idx2]) if i] + ["<pad>"] * (2-len(process_ls))
            else : 
                final_ls = [i for i in re.split('[?!.]', ls[idx1][idx2]) if i][:2] # L이 2가 됩니다.
            ls[idx1][idx2] = final_ls
    return ls

In [26]:
train_ls = split_under_punctuation(train_str_ls)
test_ls = split_under_punctuation(test_str_ls)

In [27]:
def slicing_the_ls(str_ls) : 
    ls = str_ls.copy()
    for idx1,i in enumerate(ls) : 
        for idx2,j in enumerate(i) : 
            for idx3,sent in enumerate(j) : 
                process_ls = [i for i in sent.split(" ") if i]
                if len(process_ls) < 10 :
                    final_ls = process_ls + ["<pad>"] * (10-len(process_ls))
                else : final_ls = process_ls[:10]
                ls[idx1][idx2][idx3] = final_ls
    return ls

In [28]:
new_train_ls = slicing_the_ls(train_ls)
new_test_ls = slicing_the_ls(test_ls)

In [29]:
new_train_ls[0][0]

[['the',
  'film',
  'exposes',
  'the',
  'blatant',
  'exploitation',
  'of',
  'the',
  'chinese',
  'worker'],
 ['the',
  'glimmer',
  'of',
  'hope',
  'comes',
  'when',
  'some',
  'americans',
  'are',
  'actually']]

In [30]:
def convert_word_to_idx(batch_ls) : 
    ls = batch_ls.copy()
    for idx1,i in enumerate(ls) : 
        for idx2,batch in enumerate(i) :
            for idx3,sent in enumerate(batch) :
                for idx4,word in enumerate(sent) : 
                    ls[idx1][idx2][idx3][idx4] = word_to_idx_dict[word]
    return ls

In [31]:
train_ls = convert_word_to_idx(new_train_ls)
test_ls = convert_word_to_idx(new_test_ls)

In [32]:
train_ls[0][0][0]

[2, 24, 11451, 2, 4466, 2659, 5, 2, 1683, 6363]

In [34]:
long_train_ls = torch.tensor(train_ls,dtype=torch.long,device='cpu')
long_test_ls = torch.tensor(test_ls,dtype=torch.long,device='cpu')

In [39]:
long_train_ls.size(),long_test_ls.size()

(torch.Size([25, 1000, 2, 10]), torch.Size([25, 1000, 2, 10]))

In [53]:
long_train_ls[0][0]
# 아래를 보시면 알 수 있듯, 각 문장은 10개의 단어로 구성되어 있고, 하나의 문서는 2개의 문장으로 이루어져있음을 알 수 있습니다.

tensor([[    2,    24, 11451,     2,  4466,  2659,     5,     2,  1683,  6363],
        [    2, 16928,     5,   418,   239,    50,    45,  1928,    22,   157]])

## Hierarchical Attentional Networks 에 대한 논의
- 문서를 문장 -> 단어 로 계층적으로 표현하는 네트워크를 사용합니다.
- 단어를 임베딩하여, Bi_LSTM 에 학습시켜, hidden states를 반환, 이를 attention layer에 넣습니다.
- attention layer에서 이전에 학습시킨 Bi_LSTM이 반환한 context vector와 각 step의 hidden state와의 alignment를 구하게 됩니다.
- aligment model을 softmax 취해주게 되고, 이 값은 문장을 구성하는 단어들의 집합을 가장 잘 요약하는 단어에 가중치가 차등적으로 할당되는 값을 의미하게 됩니다. 최종적으로는 각 step의 hidden states에 softmax를 곱해 weighted sum을 취합니다.
- 이렇게 나온 context vector 하나는 문장 하나를 대변하게 되고, 이러한 context vector로 다시 Bi_LSTM을 돌리면 됩니다.

## Word Encoder

In [122]:
class Word_Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, dropout):
        
        super().__init__()     
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim,padding_idx=1)
        self.num_layers = num_layers
        self.bi_rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, 
                            dropout=dropout,bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.hidden_dim = hidden_dim    

    def forward(self, text):
        # text 는 tuple 입니다.
        embedded = self.dropout(self.embedding(text)) #[max_length, batch_size, embedding_dim]
        # packed_embedded 또한 tuple형태입니다. 하지만 기존의 RNN based 모델에 인자로 넣어주면 됩니다!
        outputs, (hidden,cell) = self.bi_rnn(embedded)
        concated_hidden = torch.cat((hidden,cell),dim=2)
        return outputs, concated_hidden 
    # [max_length , batch_size , hidden_layer_dim*2], [2 , batch_size , hidden_layer_dim]*2 

In [123]:
input_dim = len(TEXT.vocab.itos)
embedding_dim = 256
hidden_dim = 128
num_layer = 1
dropout = 0.5
batch_size = 1000
enc = Word_Encoder(input_dim,embedding_dim,hidden_dim,num_layer,dropout)

  "num_layers={}".format(dropout, num_layers))


In [124]:
long_train_ls[0][:,0,:].permute(1,0).size()

torch.Size([10, 1000])

In [144]:
first_outputs,first_hidden = enc(long_train_ls[0][:,0,:].permute(1,0))
print(first_outputs.size(), first_hidden.size())

second_outputs,second_hidden = enc(long_train_ls[0][:,1,:].permute(1,0))
print(second_outputs.size(), second_hidden.size())

torch.Size([10, 1000, 256]) torch.Size([2, 1000, 256])
torch.Size([10, 1000, 256]) torch.Size([2, 1000, 256])


## Word Attention

In [142]:
class Word_Attention(nn.Module) : 
    def __init__(self, outputs, hidden_dim,context,batch_size):
        
        super().__init__()        
        self.outputs = outputs # Encoder가 출력한 outputs 입니다.
        self.hidden_dim = hidden_dim # hidden_dim의 dimension 입니다.
        self.score_fc1 = nn.Linear(hidden_dim*2*2,hidden_dim)
        self.score_fc2 = nn.Linear(hidden_dim*2,hidden_dim)
        self.softmax_fc = nn.Linear(hidden_dim,1)
        self.batch_size = batch_size
        self.context = context
    def forward(self) : 
        context = self.context.view(1,self.batch_size,-1) # hidden,cell을 옆으로 connect시켜줍니다.
        outputs = self.outputs.permute(1,0,2) # [batch_size, max_length, hidden_dim]
        context = context.permute(1,0,2) # [batch_size,1, hidden_dim]
        
        attention_score = torch.tanh(self.score_fc2(outputs) + self.score_fc1(context))
        # [batch_size, max_length, hidden_dim]
        attention_weights = torch.softmax(self.softmax_fc(attention_score),dim=1) # [batch_size, max_length, 1]
        
        context_vector = attention_weights * outputs#[batch_size, max_length, hidden_dim]
        new_context_vector = torch.sum(context_vector,dim=1)#[batch_size, hidden_dim]
        
        return new_context_vector.unsqueeze(1) #[batch_size, 1, hidden_dim*2]

In [145]:
first_word_attn = Word_Attention(first_outputs,hidden_dim,first_hidden,1000)
first_context_vector = first_word_attn()
print(first_context_vector.size())

second_word_attn = Word_Attention(second_outputs,hidden_dim,second_hidden,1000)
second_context_vector = second_word_attn()
print(second_context_vector.size())

torch.Size([1000, 1, 256])
torch.Size([1000, 1, 256])


## Sentence vector에 대한 논의
- 각 문장 내의 단어들이 word encoder와 attention layer를 통해 나와 각각의 context vector를 형성하였습니다.
- 이제 문장들을 aggregate해서 문서를 대표하는 값을 만들어야 합니다.
- 아래와 같이 concatenate를 통해서, 하나의 문장 벡터를 생성해 다시 처음부터 해줍니다!
- **다만 이때는 임베딩을 하지 않고 아래의 sentence vector를 바로 Bi_LSTM에 넣어줍니다!**

In [146]:
sentence_vector = torch.cat((first_context_vector,second_context_vector),dim=1)
sentence_vector.size()

torch.Size([1000, 2, 256])

In [157]:
# 아래 아키텍처에 들어갈 때에 [max_length, batch_size, hidden_dim]으로 들어가야 하기 때문에, permute시켜줍니다.
sentence_vector = sentence_vector.permute(1,0,2)
sentence_vector.size()

torch.Size([2, 1000, 256])

## Sentence Encoder

In [158]:
class Sentence_Encoder(nn.Module):
    def __init__(self, sentence_vec_dim, hidden_dim, num_layers):
        
        super().__init__()     
        self.sentence_vec_dim = sentence_vec_dim
        self.num_layers = num_layers
        self.bi_rnn = nn.LSTM(sentence_vec_dim, hidden_dim, num_layers=num_layers, 
                            dropout=dropout,bidirectional=True)
        self.hidden_dim = hidden_dim    
        
    def forward(self, sentence_vector):
        outputs, (hidden,cell) = self.bi_rnn(sentence_vector)
        concated_hidden = torch.cat((hidden,cell),dim=2)
        return outputs, concated_hidden 
    # [max_length , batch_size , hidden_layer_dim*2], [2 , batch_size , hidden_layer_dim]*2 

In [159]:
sentence_vec_dim = sentence_vector.size()[2] #256
hidden_dim = 128
num_layers = 1 # 

enc = Sentence_Encoder(sentence_vec_dim,hidden_dim,num_layers)

  "num_layers={}".format(dropout, num_layers))


In [160]:
outputs, hidden = enc(sentence_vector)
outputs.size(), hidden.size()

(torch.Size([2, 1000, 256]), torch.Size([2, 1000, 256]))

## Sentence Attention

In [161]:
class Sentence_Attention(nn.Module) : 
    def __init__(self, outputs, hidden_dim,context,batch_size):
        
        super().__init__()        
        self.outputs = outputs # Encoder가 출력한 outputs 입니다.
        self.hidden_dim = hidden_dim # hidden_dim의 dimension 입니다.
        self.score_fc1 = nn.Linear(hidden_dim*2*2,hidden_dim)
        self.score_fc2 = nn.Linear(hidden_dim*2,hidden_dim)
        self.softmax_fc = nn.Linear(hidden_dim,1)
        self.batch_size = batch_size
        self.context = context
    def forward(self) : 
        context = self.context.view(1,self.batch_size,-1)
        outputs = self.outputs.permute(1,0,2) # [batch_size, max_length, hidden_dim]
        context = context.permute(1,0,2) # [batch_size,1, hidden_dim]
        
        attention_score = torch.tanh(self.score_fc2(outputs) + self.score_fc1(context))
        # [batch_size, max_length, hidden_dim]
        attention_weights = torch.softmax(self.softmax_fc(attention_score),dim=1) # [batch_size, max_length, 1]
        
        context_vector = attention_weights * outputs#[batch_size, max_length, hidden_dim]
        new_context_vector = torch.sum(context_vector,dim=1)#[batch_size, hidden_dim]
        
        return new_context_vector.unsqueeze(1) #[batch_size, 1, hidden_dim*2]

In [162]:
Sentence_attn = Sentence_Attention(outputs,hidden_dim,hidden,1000)
context_vector = Sentence_attn()
print(context_vector.size())

torch.Size([1000, 1, 256])


## Fully connected

In [168]:
fc = nn.Linear(hidden_dim*2,2)
softmax = torch.softmax(fc(context_vector),dim=2)
softmax.size()

torch.Size([1000, 1, 2])

In [169]:
softmax[0]

tensor([[0.5109, 0.4891]], grad_fn=<SelectBackward>)