해당 페이지는 Zichao Yang1, Diyi Yang1, Chris Dyer1, Xiaodong He2, Alex Smola1, Eduard Hovy1 (2016), "Hierarchical Attention Networks for Document Classification" 논문에 관한 구현입니다.
http://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf

__________________________________

references : 
- https://github.com/pandeykartikey/Hierarchical-Attention-Network/blob/master/HAN%20yelp.ipynb
- https://github.com/vietnguyen91/Hierarchical-attention-networks-pytorch/blob/master/src/utils.py
- https://github.com/EdGENetworks/attention-networks-for-classification/blob/master/attention_model_validation_experiments.ipynb

In [88]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data

from nltk.tokenize import sent_tokenize,word_tokenize
from sklearn.model_selection import train_test_split

import string
import random
import re
import pandas as pd
from bs4 import BeautifulSoup
import os
from collections import defaultdict,Counter
import pickle

from termcolor import colored, cprint
SEED = 1

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Loading

In [2]:
os.chmod("yelp_academic_dataset_review.json", 0o777)

In [3]:
df_iter = pd.read_json('yelp_academic_dataset_review.json',lines=True,chunksize=100000)

In [4]:
df = pd.concat(df_iter,axis=0)
df.shape

(6685900, 9)

In [5]:
df = df.head(1000000)
df.shape
# 데이터가 너무 많아서 100만개만 사용합니다.

(1000000, 9)

# Preprocessing

In [6]:
def clean_str(string, max_seq_len):
    """
    adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = BeautifulSoup(string, "lxml").text
    string = re.sub(r"[^A-Za-z0-9(),!?\"\`]", " ", string)
    string = re.sub(r"\"s", " \"s", string)
    string = re.sub(r"\"ve", " \"ve", string)
    string = re.sub(r"n\"t", " n\"t", string)
    string = re.sub(r"\"re", " \"re", string)
    string = re.sub(r"\"d", " \"d", string)
    string = re.sub(r"\"ll", " \"ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    s =string.strip().lower().split(" ")
    if len(s) > max_seq_len:
        return s[0:max_seq_len] 
    return s

In [7]:
X = df['text']
Y = (df['stars'] - 1)
X_train, X_test, y_train, y_test = \
train_test_split(X,Y, test_size=0.33, random_state=123)

In [8]:
X_train.head()

501213    Quite disturbed by no team work, seems to be s...
805861    Based on flavour of the food and quality of th...
882726    My sister was visiting and we were looking for...
371658    Some of the best food in the area. Great selec...
409957    the food is always great!, lots of vegetarian ...
Name: text, dtype: object

In [None]:
def create3DList(data, max_sent_len,max_seq_len):
    x = []; x1 = []
    for seq in sent_tokenize(data) :
        x1.append(clean_str(seq,max_sent_len))
    x.append(x1[:max_seq_len])
    return x

# max_sent_len : 한 문장에 들어가는 단어의 갯수
# max_seq_len : 한 문서에 들어가는 문장의 갯수
max_sent_len = 12; max_seq_len = 25

x_train = X_train.apply(lambda x : create3DList(x,max_sent_len,max_seq_len))
x_test = X_test.apply(lambda x : create3DList(x,max_sent_len,max_seq_len))

print("x_train: {}".format(len(x_train)))
print("x_test: {}".format(len(x_test)))

In [10]:
x_train = x_train.tolist()
x_test = x_test.tolist()

In [11]:
len(x_train), len(x_test)

(670000, 330000)

In [None]:
word_to_idx_dict = {'<unk>':0,'<pad>':1}

for idx,doc in enumerate(x_train) : 
    if idx % 1000 == 0 : print("{}번째 문서 처리 중이며 word_to_idx_dict의 길이는 {}입니다."\
                               .format(idx,len(word_to_idx_dict)))
    for sent in doc[0] : 
        for word in sent : 
            if word not in word_to_idx_dict.keys() :                 
                word_to_idx_dict[word] = len(word_to_idx_dict)

In [None]:
word_to_freq_dict = defaultdict(int)

for idx,doc in enumerate(x_train) : 
    if idx % 1000 == 0 : print("{}번째 문서 처리 중이며 word_to_freq_dict의 길이는 {}입니다."\
                               .format(idx,len(word_to_freq_dict)))
    for sent in doc[0] : 
        for word in sent : 
            word_to_freq_dict[word] += 1

In [98]:
idx_to_word_dict = {idx:val for val,idx in word_to_idx_dict.items()}

In [17]:
import pickle

In [4]:
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(obj,name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [None]:
save_obj(word_to_idx_dict,'word_to_idx_dict')
save_obj(word_to_freq_dict,'word_to_freq_dict')

In [20]:
def word_to_idx(doc,min_freq=5) : 
    """
    doc : train or validation or test datasets which are composed with list within list
    """

    min_freq_ls = [[word for word in sent if word_to_freq_dict[word] > min_freq] for sent in doc]
    idx_dict = \
    [[word_to_idx_dict[word] if word in word_to_idx_dict.keys() else 0 for word in sent]\
     for sent in min_freq_ls] #if there is no tokens which match with  test datasets vocab then, that token is changed into UNK token
    return idx_dict

In [21]:
train_X = [word_to_idx(batch[0]) for batch in x_train]
test_X = [word_to_idx(batch[0]) for batch in x_test]

- max_sent_len : 한 문서가 가지는 최대 문장 갯수이자, 최소 문장 갯수입니다.(패딩 적용)
- max_seq_len : 한 문장이 가지는 최대 단어 갯수이자, 최소 단어 갯수입니다. (패딩 적용)

In [22]:
## Padding the number of sentence
train_X = [doc + [[1]]*(max_sent_len - len(doc)) if len(doc) <= max_sent_len else doc[:max_sent_len]\
           for doc in train_X]
test_X = [doc + [[1]]*(max_sent_len - len(doc)) if len(doc) <= max_sent_len else doc[:max_sent_len]\
           for doc in test_X]

## Padding the number of word
train_X = [[sent + [1] * (max_seq_len - len(sent)) for sent in doc] for doc in train_X]
test_X = [[sent + [1] * (max_seq_len - len(sent)) for sent in doc] for doc in test_X]

## Make Datasets with iterators 

In [23]:
print("한 문장 안에 있는 단어의 길이 : ",set([len(sent) for doc in train_X for sent in doc]))
print("한 문서 안에 있는 문장의 길이 : ",set([len(doc) for doc in train_X]))

한 문장 안에 있는 단어의 길이 :  {25}
한 문서 안에 있는 문장의 길이 :  {12}


In [24]:
# 뒤에 나오는 DataLoader는 Cuda Tensor를 지원하지 않습니다.
train_X = torch.LongTensor(train_X)
train_y = torch.LongTensor(y_train.tolist())
test_X = torch.LongTensor(test_X)
test_y = torch.LongTensor(y_test.tolist())

In [25]:
train_X.shape, train_y.shape, test_X.shape, test_y.shape

(torch.Size([670000, 12, 25]),
 torch.Size([670000]),
 torch.Size([330000, 12, 25]),
 torch.Size([330000]))

In [None]:
torch.save(train_X,'train_X.pt')
torch.save(train_y,'train_y.pt')

torch.save(test_X,'test_X.pt')
torch.save(test_y,'test_y.pt')

## 만약 커널을 끄고 다시 시작하실 경우, 여기서부터 진행하면 됩니다!!

In [2]:
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [3]:
word_to_idx_dict = load_obj('word_to_idx_dict')
word_to_freq_dict = load_obj('word_to_freq_dict')

In [4]:
train_X = torch.load('train_X.pt')
train_y = torch.load('train_y.pt')
test_X = torch.load('test_X.pt')
test_y = torch.load('test_y.pt')

In [5]:
class Dataset(data.Dataset):
    def __init__(self, X, y):
        'Initialization'
        self.y = y
        self.X = X

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.X)

    def __getitem__(self, index):
        # Load data and get label
        'Generates one sample of data'
        # Select sample
        X = self.X[index]
        y = self.y[index]

        return X, y


In [6]:
# Parameters
params = {'batch_size': 100,
          'shuffle': True,
          'num_workers': 10}

# Generators
training_set = Dataset(train_X,train_y)
train_iter = data.DataLoader(training_set, **params)

testing_set = Dataset(test_X,test_y)
test_iter = data.DataLoader(testing_set, **params)

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

for local_batch, local_labels in train_iter:
    local_batch, local_labels = local_batch.to(device), local_labels.to(device)
    break

In [8]:
local_batch.size(),local_labels.size()
#[batch_size, sent_len, word_len]
# 25개의 단어를 가지고 12개의 문장을 가진 64개의 문서가 있는 것입니다.

(torch.Size([100, 12, 25]), torch.Size([100]))

# Pre-trained GloVe Embedding Vectors

In [9]:
import numpy as np

In [10]:
# Extract word embeddings from the Glove
embeddings_index = dict()
f = open('glove.twitter.27B.200d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [11]:
matrix_len = len(word_to_idx_dict)
weights_matrix = np.zeros((matrix_len, 200))
words_found = 0

for i, word in enumerate(word_to_idx_dict.keys()):
    try: 
        weights_matrix[i] = embeddings_index[word]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(200, ))
        
print("전체 단어의 길이 {} 개에서 GloVe 벡터로 초기화된 단어의 갯수는 {} 개입니다.".format(len(word_to_idx_dict),words_found))

전체 단어의 길이 170007 개에서 GloVe 벡터로 초기화된 단어의 갯수는 83087 개입니다.


In [12]:
weights_matrix.shape

(170007, 200)

# Modeling

In [13]:
class WordAttention(nn.Module) : 
    
    def __init__(self,batch_size,hidden_size) : 
        
        super(WordAttention,self).__init__() 
        self.batch_size = batch_size
        self.linear = nn.Linear(hidden_size*2,hidden_size*2).to(device)
        self.word_proj_params = nn.Parameter(torch.Tensor(hidden_size*2,1)).to(device)
        self.initialize_weight()
        
    def initialize_weight(self) : 
        torch.nn.init.xavier_uniform_(self.linear.weight)
        torch.nn.init.xavier_uniform_(self.word_proj_params)
        
    def forward(self,outputs) : 
        
        outputs = outputs.permute(1,0,2) #[batch_size, sent_len, hidden_dim*2]

        u = torch.tanh(self.linear(outputs)) #[batch_size, sent_len, hidden_dim*2]  
        word_proj_params = self.word_proj_params.expand(self.batch_size,-1,-1) #[batch_size,hidden_dim*2,1]
    
        atten = torch.bmm(u,word_proj_params) #[batch_size,sent_len,1]
        a = torch.softmax(atten,dim=1) #[batch_size,sent_len,1]
        s = torch.sum(torch.mul(a,outputs),dim=1) #[batch_size,hidden_dim*2]
        
        return s,a

In [41]:
def create_emb_layer(weights_matrix):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim,padding_idx = 1) # <pad>
    emb_layer.weight = nn.Parameter(torch.tensor(weights_matrix,dtype=torch.float32))
    
    return emb_layer
    
class WordRNN(nn.Module) : 
    
    def __init__(self,batch_size,vocab_size,embed_size,hidden_size,num_layer,max_sent_len,weights_matrix) : 
        
        super(WordRNN,self).__init__()
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.gru_hidden_size = hidden_size
        self.num_layer = num_layer
        self.max_sent_len = max_sent_len
#         self.embeddings = nn.Embedding(vocab_size,embed_size,padding_idx = 1).to(device)
        self.embeddings = create_emb_layer(weights_matrix).to(device)
        # GloVe 로 Initialize만 시키고, Training이 가능하게 해줍니다.
        self.gru = nn.GRU(embed_size,hidden_size,num_layer,bidirectional=True).to(device)
        
        self.word_atten = WordAttention(batch_size,hidden_size).to(device)
        self.initialize_weight()
        
    def initialize_weight(self) : 
        for layer_p in gru._all_weights:
            for p in layer_p:
                if 'weight' in p:
                    nn.init.xavier_normal_(gru.__getattr__(p),)

    def forward(self,input_,hidden) : 
        
        sent_vec_ls = []; word_attention_ls = []
        
        for i in range(self.max_sent_len) : 
            x = input_[:,i,:]  # x : [batch_size, T :(word length per sentence)]
            embeds = self.embeddings(x).permute(1,0,2) # [T, batch_size, embed_dim] 

            outputs, hidden = self.gru(embeds,hidden)
            
            sent_vec,word_attention = self.word_atten(outputs)
        
            sent_vec_ls.append(sent_vec.unsqueeze(1))
            word_attention_ls.append(word_attention.permute(0,2,1))
        
        sent_vec = torch.cat(sent_vec_ls,dim=1)
        word_attention = torch.cat(word_attention_ls,dim=1)
                
        return sent_vec,word_attention,hidden
    # [batch_size,sent_len,hidden_size]
    # [batch_size,sent_len,word_len]
    # [num_layer*bidirectional(2), batch_size, hidden_size]

In [42]:
batch_size = 100
vocab_size = len(word_to_idx_dict)
embed_size = 200
hidden_size = 50
num_layer = 1
max_sent_len = 12

word_model = WordRNN(batch_size,vocab_size,embed_size,hidden_size,num_layer,max_sent_len,weights_matrix)
hidden = \
        Variable(torch.randn(num_layer*2, batch_size, hidden_size, device=device))

In [43]:
sent_vec,word_attention,hidden = word_model(local_batch,hidden)

In [45]:
class SentAttention(nn.Module) : 
    
    def __init__(self,batch_size,hidden_size) : 
        
        super(SentAttention,self).__init__() 
        self.batch_size = batch_size
        self.linear = nn.Linear(hidden_size*2,hidden_size*2).to(device)
        self.sent_proj_params = nn.Parameter(torch.Tensor(hidden_size*2,1)).to(device)
        self.initialize_weight()
        
    def initialize_weight(self) : 
        torch.nn.init.xavier_uniform_(self.linear.weight)
        torch.nn.init.xavier_uniform_(self.sent_proj_params)
        
    def forward(self,outputs) : 
        
        outputs = outputs.permute(1,0,2) #[batch_size, doc_len, hidden_dim*2]
        u = torch.tanh(self.linear(outputs)) #[batch_size, doc_len, hidden_dim*2]
        sent_proj_params = self.sent_proj_params.expand(self.batch_size,-1,-1) #[batch_size,hidden_dim*2,1]
        atten = torch.bmm(u,sent_proj_params) #[batch_size,doc_len,1]
        a = torch.softmax(atten,dim=1) #[batch_size,doc_len,1]
        v = torch.sum(a * outputs,dim=1) #[batch_size,hidden_dim*2]
        return v,a

In [46]:
class SentRNN(nn.Module) : 
    
    def __init__(self,batch_size,vocab_size,embed_size,hidden_size,num_layer) : 
        
        super(SentRNN,self).__init__()
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.gru_hidden_size = hidden_size
        self.num_layer = num_layer
        
        self.gru = nn.GRU(hidden_size*2,hidden_size,num_layer,bidirectional=True).to(device)
        
        self.sent_atten = SentAttention(batch_size,hidden_size)
        self.initialize_weight()
        
    def initialize_weight(self) : 
        for layer_p in gru._all_weights:
            for p in layer_p:
                if 'weight' in p:
                    nn.init.xavier_normal_(gru.__getattr__(p),)

    def forward(self,x,hidden) : 
        
        x = x.permute(1,0,2) #x : [doc_len,batch_size, hidden*2]

        outputs, hidden = self.gru(x,hidden)
    
        doc_vec,sent_attention = self.sent_atten(outputs)
        
        return doc_vec,sent_attention,hidden
    
    #[batch_size,hidden_dim*2]
    #[batch_size,doc_len,1]
    #[num_layer*2,batch_size,hidden_dim]

In [47]:
sent_model = SentRNN(batch_size,vocab_size,embed_size,hidden_size,num_layer)

In [48]:
doc_vec,sent_attention,hidden = sent_model(sent_vec,hidden)

In [61]:
class HAN(nn.Module) : 
    
    def __init__(self,batch_size,vocab_size,embed_size,hidden_size,num_layer,max_sent_len,num_class,weights_matrix) : 
        
        super(HAN,self).__init__()
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layer
        self.max_sent_len = max_sent_len
        self.num_class = num_class
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
        self.word_encoder =\
        WordRNN(batch_size,vocab_size,embed_size,hidden_size,num_layer,max_sent_len,weights_matrix).to(self.device)
        
        self.sent_encoder =\
        SentRNN(batch_size,vocab_size,embed_size,hidden_size,num_layer).to(self.device)
        
        self.proj_layer = nn.Linear(hidden_size*2,num_class).to(self.device)
        self.initialize_weight()
        
    def initialize_weight(self) : 
        torch.nn.init.xavier_uniform_(self.proj_layer.weight)
        
    def init_hidden(self,batch_size):
        hidden = \
        Variable(torch.zeros(self.num_layers*2, batch_size, self.hidden_size, device=self.device))
            
        return hidden
    
    def forward(self,input_) : 
        
        (batch_size,sent_len,doc_len) = input_.size()
        
        word_encoder_hidden = self.init_hidden(batch_size)
        sent_vec,word_attention,hidden = self.word_encoder(input_,word_encoder_hidden)
        sent_vec = nn.LayerNorm(self.hidden_size*2).to(device)(sent_vec)
        
        sent_encoder_hidden = self.init_hidden(batch_size)
        doc_vec,sent_attention,hidden = self.sent_encoder(sent_vec,sent_encoder_hidden)
        doc_vec = nn.LayerNorm(self.hidden_size*2).to(device)(doc_vec)
        
        logit = self.proj_layer(doc_vec)
        log_softmax = torch.log_softmax(logit,dim=1)
        
        return log_softmax, word_attention, sent_attention

In [62]:
params = {'batch_size' : 100,
'vocab_size' : len(word_to_idx_dict),
'embed_size' : 200,
'hidden_size' : 50,
'num_layer' : 1,
'max_sent_len' : 12,       
'num_class' : 5,
'weights_matrix' : weights_matrix,
}

model = HAN(**params).to(device)
model

HAN(
  (word_encoder): WordRNN(
    (embeddings): Embedding(170007, 200, padding_idx=1)
    (gru): GRU(200, 50, bidirectional=True)
    (word_atten): WordAttention(
      (linear): Linear(in_features=100, out_features=100, bias=True)
    )
  )
  (sent_encoder): SentRNN(
    (gru): GRU(100, 50, bidirectional=True)
    (sent_atten): SentAttention(
      (linear): Linear(in_features=100, out_features=100, bias=True)
    )
  )
  (proj_layer): Linear(in_features=100, out_features=5, bias=True)
)

In [63]:
log_softmax, word_attention, sent_attention = model(local_batch)

# Trainig and Testing

In [67]:
def adjust_learning_rate(optimizer, epoch, init_lr=0.1, decay = 0.1 ,per_epoch=10):
    """Decay learning rate by a factor of 0.1 every lr_decay_epoch epochs."""
    for param_group in optimizer.param_groups:
        param_group['lr'] *= 1/(1 + decay)

    return optimizer , float(param_group['lr'])

In [68]:
def train(model,train_loader , test_loader , epochs = 10, lr = 0.01, batch_size = 100) :
    
    optimizer = torch.optim.Adam(model.parameters(),lr)
    criterion = nn.NLLLoss().to(device)

    for epoch in range(1,epochs+1) :
        optimizer , lr_int = \
        adjust_learning_rate(optimizer, epoch, init_lr=lr, decay = 0.1 ,per_epoch=10)
        model.train()        
        n_correct = 0
        batch_count = 0
        for local_batch, local_labels in train_loader:
            
            batch_count += 1 
            if batch_count % 1000 == 0 : 
                print("{}번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.".format(batch_count))
                
            local_batch,local_labels = local_batch.to(device),local_labels.to(device)
        
            train_softmax, word_attention, sent_attention = model(local_batch)
            train_predict = train_softmax.argmax(dim=1)

            n_correct += (train_predict == local_labels).sum().item()            
            loss = criterion(train_softmax,local_labels)
            
            if loss.item() == 'nan' : 
                return local_batch,local_labels,train_softmax,word_attention,sent_attention
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        acc = n_correct / (len(train_loader) * batch_size)  
        with open('log.txt', 'a') as f:
            f.write('Train epoch : %s,  loss : %s,  accuracy :%.3f, learning rate :%.3f\n\n'%(epoch, loss.item(), acc,lr_int))
            
        print('Train epoch : %s,  loss : %s,  accuracy :%.3f, learning rate :%.3f'%(epoch, loss.item(), acc,lr_int))
        print('=================================================================================================')
        
        if (epoch) % 2 == 0:
            model.eval()
            n_correct = 0  # accuracy 계산을 위해 맞은 갯수 카운트
            val_loss = 0

            for local_batch, local_labels in test_loader:
                local_batch,local_labels = local_batch.to(device),local_labels.to(device)
                
                test_softmax, word_attention, sent_attention = model(local_batch)
                test_predict = test_softmax.argmax(dim = 1)

                val_loss = criterion(test_softmax, local_labels)
                
                n_correct += (test_predict == local_labels).sum().item() #맞은 갯수                

            val_acc = n_correct / (len(test_loader) * batch_size)
            with open('log.txt','a') as f : 
                f.write('Val Epoch : %s, Val Loss : %.03f , Val Accuracy : %.03f\n\n'%(epoch, val_loss, val_acc))
                
            print('*************************************************************************************************')
            print('*************************************************************************************************')
            print('Val Epoch : %s, Val Loss : %.03f , Val Accuracy : %.03f'%(epoch, val_loss, val_acc))
            print('*************************************************************************************************')
            print('*************************************************************************************************')



In [69]:
local_batch,local_labels,train_softmax,word_attention,sent_attention = \
train(model, train_iter, test_iter, epochs=30)

1000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
2000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
3000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
4000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
5000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
6000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
Train epoch : 1,  loss : 0.7867679595947266,  accuracy :0.658, learning rate :0.009
1000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
2000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
3000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
4000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
5000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
6000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
Train epoch : 2,  loss : 0.6004376411437988,  accuracy :0.665, learning rate :0.008
*************************************************************************************************
*************************************************************************************************
Val Epoch : 2, Val Loss : 0.686 , Val Accuracy : 0.659
*************************************************************************************************
***************************************************

*************************************************************************************************
*************************************************************************************************
Val Epoch : 14, Val Loss : 0.747 , Val Accuracy : 0.670
*************************************************************************************************
*************************************************************************************************
1000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.
2000번째 배치가 돌고 있습니다. 한 에포크는 6700입니다.


KeyboardInterrupt: 

In [70]:
torch.save(model,'HAN.pt')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


` params = {'batch_size' : 100,
'vocab_size' : len(word_to_idx_dict),
 'embed_size' : 128,
 'hidden_size' : 100,
 'num_layer' : 1,
 'max_sent_len' : 12,       
 'num_class' : 5}`

**Highest Validation Accuracy at Epoch 15 without pretrained embedding and Layer Normalization BUT there was no weight initialization such as xavier so, word attention wasn't changed and NaN values are appeared sometimes: 0.59**
____________________
`params = {'batch_size' : 100,
 'vocab_size' : len(word_to_idx_dict),
 'embed_size' : 200,
 'hidden_size' : 50,
 'num_layer' : 1,
 'max_sent_len' : 12,       
 'num_class' : 5
 }`

 **Highest Validation Accuracy at Epoch 15 : 0.40**
 ____________________
 ` params = {'batch_size' : 100,
'vocab_size' : len(word_to_idx_dict),
 'embed_size' : 128,
 'hidden_size' : 100,
 'num_layer' : 1,
 'max_sent_len' : 12,       
 'num_class' : 5}`

**Highest Validation Accuracy at Epoch 15 without pretrained embedding and Layer Normalization with xavier weight normalization: 0.67**

# Implication
- 최종 test accuracy는 약 60퍼센트이다.

In [109]:
batch = next(iter(test_iter))
X = batch[0].cuda(); y = batch[1].cuda()

In [110]:
log_softmax, word_attention, sent_attention = model(X.cuda())

In [111]:
correct_ratio = (log_softmax.argmax(dim=1) == y).sum().item() / batch_size

print("한 배치 사이즈에 대한 Accuracy는 {} % 입니다.".format(correct_ratio*100))

한 배치 사이즈에 대한 Accuracy는 70.0 % 입니다.


In [112]:
doc_ls = [[idx_to_word_dict[word.item()] for word in sent if word != 1]for sent in X[0]]

In [113]:
pd.DataFrame.from_dict(\
    dict(zip([','.join(sent).replace(",",' ') for sent in doc_ls],\
         [i.item() for i in sent_attention[0]])),orient='index',columns=['sent_attn_score'])\
            .sort_values('sent_attn_score',ascending=False)

Unnamed: 0,sent_attn_score
this place was a disappointment and way overpriced !,0.125161
unfortunately it was horrible ! ! !,0.123758
the pork came with beans and they were way too salty,0.121092
the pork tasted like it was a day old and it was,0.09867
we also ordered the caesar salad and it was doused with dressing,0.098664
had to ask for a replacement with dressing on the side,0.098221
my wife got the kailua pig as it was also recommended,0.071795
the service was the best part of the meal,0.071777
so we were told the calamari is very fresh and has,0.06957
it did taste very good and we were excited for our dinner,0.06957


# Coloring to word and sentence under Attention score

- **Text colors** :
    - grey : relative influencial word in positive sentence (first quatile)
    - red : relative influencial word in negative sentence (first quatile)
    - green : relative non-influencial word in sentence 
    
- **Text highlights** :
    - on_grey : relative influencial word in positive doc (first quatile)
    - on_red : relative influencial word in negative doc (first quatile)
    - on_green : relative non-influencial word in doc 

In [301]:
def coloring(model,X,y,index,sent_attention,word_attention,first_threshold=80,second_treshold=60) : 
    
    sent_ls = np.array([idx_to_word_dict[word.item()] for sent in X[index] for word in sent])
    sent_attention_ls = np.array([i[0].item() for i in sent_attention[index] for _ in range(25)])
    word_attention_ls = np.array([j.item() for i in word_attention[index] for j in i])

    first_sent_attn_threshold = np.percentile(sent_attention_ls,first_threshold)
    first_word_attn_threshold = np.percentile(word_attention_ls,first_threshold)

    second_sent_attn_threshold = np.percentile(sent_attention_ls,second_treshold)

    colored_doc = []

    for word,sent_attn,word_attn in zip(sent_ls,sent_attention_ls,word_attention_ls) : 
        color_ls = ''
        if word == '<pad>' : continue
        if sent_attn > first_sent_attn_threshold : 
            color_ls = 'green'
            if word_attn > first_word_attn_threshold : 
                colored_doc.append(colored(word,color_ls,attrs=['underline']))
            else : colored_doc.append(colored(word,color_ls))
        elif sent_attn > second_sent_attn_threshold : 
            color_ls = 'yellow'
            if word_attn > first_word_attn_threshold : 
                colored_doc.append(colored(word,color_ls,attrs=['underline']))
            else : colored_doc.append(colored(word,color_ls))
        else : 
            color_ls = 'grey'
            if word_attn > first_word_attn_threshold : 
                colored_doc.append(colored(word,color_ls,attrs=['underline']))
            else : colored_doc.append(colored(word,color_ls))
                
    for i in colored_doc : 
        print(i,end=' ')
    
    print("\n")
    display(Markdown('### The true label is {} and our model predicts the label {}.\
    If the model misclassifies the label, then the annotation might be also wrong.'\
         .format(y[index].item(),(model(X)[0].argmax(1) == y)[index].item())))

In [302]:
coloring(model,X,y,0,sent_attention,word_attention,first_threshold=90)

[30mso[0m [30m,[0m [30mwe[0m [30mwere[0m [4m[30mtold[0m [30mthe[0m [4m[30mcalamari[0m [30mis[0m [4m[30mvery[0m [30mfresh[0m [30mand[0m [30mhas[0m [30mit[0m [30mdid[0m [30mtaste[0m [4m[30mvery[0m [4m[30mgood[0m [30mand[0m [30mwe[0m [30mwere[0m [4m[30mexcited[0m [30mfor[0m [30mour[0m [30mdinner[0m [30mmy[0m [4m[30mwife[0m [30mgot[0m [30mthe[0m [4m[30mkailua[0m [30mpig[0m [4m[30mas[0m [30mit[0m [30mwas[0m [30malso[0m [4m[30mrecommended[0m [4m[33munfortunately[0m [33m,[0m [33mit[0m [4m[33mwas[0m [4m[33mhorrible[0m [33m![0m [33m![0m [33m![0m [33mthe[0m [33mpork[0m [4m[33mtasted[0m [33mlike[0m [33mit[0m [33mwas[0m [33ma[0m [33mday[0m [33mold[0m [33mand[0m [33mit[0m [33mwas[0m [33mthe[0m [33mpork[0m [33mcame[0m [33mwith[0m [33mbeans[0m [33mand[0m [33mthey[0m [33mwere[0m [33mway[0m [4m[33mtoo[0m [4m[33msalty[0m [4m[30mwe[0m [4m[30malso[0m [4

### The true label is 0 and our model predicts the label 0.    If the model misclassifies the label, then the annotation might be also wrong.