In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data

from nltk.tokenize import sent_tokenize,word_tokenize
from sklearn.model_selection import train_test_split

import string
import random
import re
import pandas as pd
from bs4 import BeautifulSoup
import os
from collections import defaultdict,Counter
import pickle
import numpy as np
import random
import warnings
import re
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

# Global run configuration: one fixed seed and the compute device.
SEED = 1

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed every RNG this notebook draws from. NumPy is seeded as well because the
# embedding rows of out-of-vocabulary words are sampled with np.random.normal.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [2]:
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten. The highest available pickle protocol is used.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from the file ``name + '.pkl'``.

    NOTE: unpickling can execute arbitrary code; only use this on files that
    come from a trusted source.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

In [3]:
# Vocabulary: the word -> index mapping read from disk, and its inverse index -> word lookup.
word_to_idx_dict = load_obj('../data/yelp_word_to_idx_dict')
idx_to_word_dict = dict((idx, word) for word, idx in word_to_idx_dict.items())

# Saved train / test tensors (features X, labels y).
train_X, train_y = torch.load('../data/yelp_train_X.pt'), torch.load('../data/yelp_train_y.pt')
test_X, test_y = torch.load('../data/yelp_test_X.pt'), torch.load('../data/yelp_test_y.pt')

# Show both feature-tensor shapes as the cell output.
train_X.shape, test_X.shape

(torch.Size([670000, 12, 25]), torch.Size([330000, 12, 25]))

In [4]:
# Re-split the data: the first 1000 training rows stay as the training set, the
# remaining training rows are placed in front of the test rows, and only the
# first 90000 rows of that combined pool are kept.
# The pool has to be built BEFORE train_X / train_y are truncated.
# NOTE: this cell rebinds its own inputs, so running it twice changes the result.
test_X = torch.cat([train_X[1000:], test_X], dim=0)[:90000]
test_y = torch.cat([train_y[1000:], test_y], dim=0)[:90000]
train_X = train_X[:1000]
train_y = train_y[:1000]

train_X.shape, test_X.shape

(torch.Size([1000, 12, 25]), torch.Size([90000, 12, 25]))

In [5]:
class Dataset(data.Dataset):
    """Map-style dataset pairing each entry of ``X`` with the same-index entry of ``y``."""

    def __init__(self, X, y):
        """Keep references to the feature container ``X`` and the label container ``y``."""
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from ``len(X)``."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(X[index], y[index])`` pair for one sample."""
        return self.X[index], self.y[index]


In [6]:
# DataLoader settings shared by the initial training and test loaders.
# shuffle is off, so both loaders yield samples in dataset order.
params = dict(batch_size=100, shuffle=False, num_workers=10)

# Wrap the tensors in Dataset objects, then build one loader per split.
training_set = Dataset(train_X, train_y)
testing_set = Dataset(test_X, test_y)

train_iter = data.DataLoader(training_set, **params)
test_iter = data.DataLoader(testing_set, **params)

# Pre-trained GloVe Embedding Vectors

In [7]:
# Build a {word: vector} lookup from the pre-trained 200-d GloVe text file.
embeddings_index = dict()
# Open as UTF-8 explicitly (instead of the platform default encoding) and let the
# context manager close the file even if parsing raises.
with open('../data/glove.twitter.27B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if len(values) != 201:
            # Not "<token> followed by 200 numbers" (blank line, or a token that was
            # itself split on whitespace): skip it rather than store a bad vector.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# One embedding row per vocabulary entry.
matrix_len = len(word_to_idx_dict)
weights_matrix = np.zeros((matrix_len, 200))
words_found = 0

# Row `idx` must hold the vector of the word whose vocabulary index is `idx`, so
# the row is addressed by the stored index, not by the dict's iteration position.
# An index outside [0, matrix_len) raises IndexError instead of silently
# misaligning the table.
for word, idx in word_to_idx_dict.items():
    try:
        weights_matrix[idx] = embeddings_index[word]
        words_found += 1
    except KeyError:
        # Word has no GloVe vector: initialise its row randomly.
        weights_matrix[idx] = np.random.normal(scale=0.6, size=(200, ))

print("전체 단어의 길이 {} 개에서 GloVe 벡터로 초기화된 단어의 갯수는 {} 개입니다.".format(len(word_to_idx_dict),words_found))

weights_matrix.shape

전체 단어의 길이 170007 개에서 GloVe 벡터로 초기화된 단어의 갯수는 83087 개입니다.


(170007, 200)

# Model Architecture

In [8]:
class WordAttention(nn.Module) :
    """Additive attention over the word-level GRU outputs of one sentence.

    Args:
        batch_size: stored on the instance for backward compatibility only;
            ``forward`` works for any batch size.
        hidden_size: GRU hidden size; inputs have ``hidden_size*2`` features.

    NOTE(review): ``word_proj_params`` is now always a registered parameter, so it
    appears in ``state_dict()`` on every device. A checkpoint written on CUDA by
    the earlier implementation may lack this key - confirm before a strict load.
    """

    def __init__(self,batch_size,hidden_size) :

        super(WordAttention,self).__init__()
        self.batch_size = batch_size
        self.linear = nn.Linear(hidden_size*2,hidden_size*2)
        # Assign the nn.Parameter itself. Calling ``.to(device)`` on it first returns
        # a plain tensor whenever a copy is made (i.e. on CUDA), which nn.Module does
        # not register: the vector is then absent from ``parameters()`` and never
        # reaches the optimizer.
        self.word_proj_params = nn.Parameter(torch.empty(hidden_size*2,1))
        self.initialize_weight()
        # Move the whole module (registered parameter included) to the target device.
        self.to(device)

    def initialize_weight(self) :
        """Xavier-uniform initialise the projection weight and the context vector."""
        torch.nn.init.xavier_uniform_(self.linear.weight)
        torch.nn.init.xavier_uniform_(self.word_proj_params)

    def forward(self,outputs) :
        """Pool sequence-first GRU outputs into one vector per sentence.

        Args:
            outputs: tensor of shape [sent_len, batch_size, hidden_dim*2].

        Returns:
            ``(s, a)``: ``s`` is the attention-weighted sum, [batch_size, hidden_dim*2];
            ``a`` are the attention weights, [batch_size, sent_len, 1].
        """
        outputs = outputs.permute(1,0,2) #[batch_size, sent_len, hidden_dim*2]

        u = torch.tanh(self.linear(outputs)) #[batch_size, sent_len, hidden_dim*2]

        # matmul broadcasts the [hidden_dim*2, 1] vector over the batch dimension, so
        # the result does not depend on the batch size given to the constructor.
        atten = torch.matmul(u,self.word_proj_params) #[batch_size,sent_len,1]
        a = torch.softmax(atten,dim=1) #[batch_size,sent_len,1]
        s = torch.sum(torch.mul(a,outputs),dim=1) #[batch_size,hidden_dim*2]

        return s,a

def create_emb_layer(weights_matrix, padding_idx=1):
    """Build a trainable ``nn.Embedding`` whose weights start from ``weights_matrix``.

    Args:
        weights_matrix: 2-D array-like of shape (num_embeddings, embedding_dim).
        padding_idx: index passed to ``nn.Embedding`` as the padding entry.
            Defaults to 1, the value this notebook uses for ``<pad>``.

    Returns:
        An ``nn.Embedding`` whose ``weight`` is a float32 ``nn.Parameter`` copy of
        ``weights_matrix``.
    """
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    # Replace the randomly initialised table with the supplied vectors. The row at
    # padding_idx is taken from weights_matrix as-is; it is not reset to zero here.
    emb_layer.weight = nn.Parameter(torch.tensor(weights_matrix,dtype=torch.float32))

    return emb_layer
    
class WordRNN(nn.Module) :
    """Word-level encoder: embedding -> bidirectional GRU -> word attention.

    ``forward`` processes the document one sentence at a time and stacks the
    per-sentence results.
    """

    def __init__(self,batch_size,vocab_size,embed_size,hidden_size,num_layer,max_sent_len,weights_matrix) :
        super(WordRNN,self).__init__()

        # Plain bookkeeping of the constructor arguments.
        self.batch_size, self.vocab_size, self.embed_size = batch_size, vocab_size, embed_size
        self.gru_hidden_size = hidden_size
        self.num_layer = num_layer
        self.max_sent_len = max_sent_len

        # The embedding table is only initialised from weights_matrix (GloVe);
        # it stays trainable afterwards.
        self.embeddings = create_emb_layer(weights_matrix).to(device)
        self.gru = nn.GRU(embed_size,hidden_size,num_layer,bidirectional=True).to(device)
        self.word_atten = WordAttention(batch_size,hidden_size).to(device)

        self.initialize_weight()

    def initialize_weight(self) :
        """Xavier-normal initialise every GRU tensor whose name contains 'weight'."""
        weight_names = [name for layer_names in self.gru._all_weights
                        for name in layer_names if 'weight' in name]
        for name in weight_names :
            nn.init.xavier_normal_(getattr(self.gru, name))

    def forward(self,input_,hidden) :
        """Encode each of the first ``max_sent_len`` sentences of ``input_``.

        Args:
            input_: index tensor sliced as ``input_[:, i, :]`` per sentence, i.e.
                [batch_size, n_sentences, words_per_sentence].
            hidden: initial GRU hidden state. The state returned for one sentence
                is fed in as the initial state of the next sentence.

        Returns:
            ``(sent_vec, word_attention, hidden)``:
            sentence vectors [batch_size, max_sent_len, hidden_size*2],
            word attention weights [batch_size, max_sent_len, words_per_sentence],
            and the GRU hidden state after the last sentence.
        """
        sentence_vectors = []
        attention_maps = []

        for sent_idx in range(self.max_sent_len) :
            tokens = input_[:, sent_idx, :]                        # [batch_size, T]
            embedded = self.embeddings(tokens).permute(1,0,2)      # [T, batch_size, embed_dim]

            outputs, hidden = self.gru(embedded, hidden)
            pooled, weights = self.word_atten(outputs)

            sentence_vectors.append(pooled.unsqueeze(1))
            attention_maps.append(weights.permute(0,2,1))

        stacked_vectors = torch.cat(sentence_vectors, dim=1)
        stacked_attention = torch.cat(attention_maps, dim=1)
        return stacked_vectors, stacked_attention, hidden

class SentAttention(nn.Module) :
    """Additive attention over the sentence-level GRU outputs of one document.

    Args:
        batch_size: stored on the instance for backward compatibility only;
            ``forward`` works for any batch size.
        hidden_size: GRU hidden size; inputs have ``hidden_size*2`` features.

    NOTE(review): ``sent_proj_params`` is now always a registered parameter, so it
    appears in ``state_dict()`` on every device. A checkpoint written on CUDA by
    the earlier implementation may lack this key - confirm before a strict load.
    """

    def __init__(self,batch_size,hidden_size) :

        super(SentAttention,self).__init__()
        self.batch_size = batch_size
        self.linear = nn.Linear(hidden_size*2,hidden_size*2)
        # Assign the nn.Parameter itself. Calling ``.to(device)`` on it first returns
        # a plain tensor whenever a copy is made (i.e. on CUDA), which nn.Module does
        # not register: the vector is then absent from ``parameters()`` and never
        # reaches the optimizer.
        self.sent_proj_params = nn.Parameter(torch.empty(hidden_size*2,1))
        self.initialize_weight()
        # Move the whole module (registered parameter included) to the target device.
        self.to(device)

    def initialize_weight(self) :
        """Xavier-uniform initialise the projection weight and the context vector."""
        torch.nn.init.xavier_uniform_(self.linear.weight)
        torch.nn.init.xavier_uniform_(self.sent_proj_params)

    def forward(self,outputs) :
        """Pool sequence-first GRU outputs into one vector per document.

        Args:
            outputs: tensor of shape [doc_len, batch_size, hidden_dim*2].

        Returns:
            ``(v, a)``: ``v`` is the attention-weighted sum, [batch_size, hidden_dim*2];
            ``a`` are the attention weights, [batch_size, doc_len, 1].
        """
        outputs = outputs.permute(1,0,2) #[batch_size, doc_len, hidden_dim*2]
        u = torch.tanh(self.linear(outputs)) #[batch_size, doc_len, hidden_dim*2]
        # matmul broadcasts the [hidden_dim*2, 1] vector over the batch dimension, so
        # the result does not depend on the batch size given to the constructor.
        atten = torch.matmul(u,self.sent_proj_params) #[batch_size,doc_len,1]
        a = torch.softmax(atten,dim=1) #[batch_size,doc_len,1]
        v = torch.sum(a * outputs,dim=1) #[batch_size,hidden_dim*2]
        return v,a

class SentRNN(nn.Module) :
    """Sentence-level encoder: bidirectional GRU over sentence vectors, then sentence attention."""

    def __init__(self,batch_size,vocab_size,embed_size,hidden_size,num_layer) :
        super(SentRNN,self).__init__()

        # Plain bookkeeping of the constructor arguments.
        self.batch_size, self.vocab_size, self.embed_size = batch_size, vocab_size, embed_size
        self.gru_hidden_size = hidden_size
        self.num_layer = num_layer

        # Input features are hidden_size*2 wide.
        self.gru = nn.GRU(hidden_size*2,hidden_size,num_layer,bidirectional=True).to(device)
        self.sent_atten = SentAttention(batch_size,hidden_size)

        self.initialize_weight()

    def initialize_weight(self) :
        """Xavier-normal initialise every GRU tensor whose name contains 'weight'."""
        weight_names = [name for layer_names in self.gru._all_weights
                        for name in layer_names if 'weight' in name]
        for name in weight_names :
            nn.init.xavier_normal_(getattr(self.gru, name))

    def forward(self,x,hidden) :
        """Encode a batch of sentence-vector sequences into document vectors.

        Args:
            x: tensor of shape [batch_size, doc_len, hidden_dim*2].
            hidden: initial GRU hidden state.

        Returns:
            ``(doc_vec, sent_attention, hidden)``:
            document vectors [batch_size, hidden_dim*2],
            sentence attention weights [batch_size, doc_len, 1],
            and the final GRU hidden state.
        """
        seq_first = x.permute(1,0,2)   # [doc_len, batch_size, hidden_dim*2]
        gru_out, hidden = self.gru(seq_first, hidden)
        pooled, weights = self.sent_atten(gru_out)
        return pooled, weights, hidden

class HAN(nn.Module) :
    """Hierarchical attention network: word encoder -> sentence encoder -> linear classifier.

    Args:
        batch_size, vocab_size, embed_size, hidden_size, num_layer: forwarded to
            the word and sentence encoders.
        max_sent_len: number of sentences the word encoder iterates over.
        num_class: number of output classes of the projection layer.
        weights_matrix: initial embedding table handed to the word encoder.
    """

    def __init__(self,batch_size,vocab_size,embed_size,hidden_size,num_layer,max_sent_len,num_class,weights_matrix) :

        super(HAN,self).__init__()
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layer
        self.max_sent_len = max_sent_len
        self.num_class = num_class
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.word_encoder =\
        WordRNN(batch_size,vocab_size,embed_size,hidden_size,num_layer,max_sent_len,weights_matrix).to(self.device)

        self.sent_encoder =\
        SentRNN(batch_size,vocab_size,embed_size,hidden_size,num_layer).to(self.device)

        self.proj_layer = nn.Linear(hidden_size*2,num_class).to(self.device)
        self.initialize_weight()

    def initialize_weight(self) :
        """Xavier-uniform initialise the classifier weight."""
        torch.nn.init.xavier_uniform_(self.proj_layer.weight)

    def init_hidden(self,batch_size):
        """Return an all-zero GRU state of shape [num_layers*2, batch_size, hidden_size]."""
        # A plain tensor is enough; the deprecated Variable wrapper is not needed.
        return torch.zeros(self.num_layers*2, batch_size, self.hidden_size, device=self.device)

    def forward(self,input_) :
        """Classify a batch of documents.

        Args:
            input_: 3-D index tensor whose first dimension is the batch.

        Returns:
            ``(log_softmax, word_attention, sent_attention)``: per-class
            log-probabilities plus the attention weights of both encoders.
        """
        batch_size = input_.size(0)
        normalized_shape = (self.hidden_size*2,)

        word_encoder_hidden = self.init_hidden(batch_size)
        sent_vec,word_attention,_ = self.word_encoder(input_,word_encoder_hidden)
        # Parameter-free layer normalisation. This gives the same values as the fresh
        # nn.LayerNorm (weight 1, bias 0) that used to be created on every call,
        # without allocating a module per forward pass and without adding any
        # state_dict entry.
        sent_vec = F.layer_norm(sent_vec, normalized_shape)

        sent_encoder_hidden = self.init_hidden(batch_size)
        doc_vec,sent_attention,_ = self.sent_encoder(sent_vec,sent_encoder_hidden)
        doc_vec = F.layer_norm(doc_vec, normalized_shape)

        logit = self.proj_layer(doc_vec)
        log_softmax = torch.log_softmax(logit,dim=1)

        return log_softmax, word_attention, sent_attention

# Hyper-parameters of the HAN model (this rebinds the earlier DataLoader `params`).
params = {'batch_size' : 100,
'vocab_size' : len(word_to_idx_dict),
'embed_size' : 200,
'hidden_size' : 50,
'num_layer' : 1,
'max_sent_len' : 12,
'num_class' : 5,
'weights_matrix' : weights_matrix,
}

model = HAN(**params).to(device)
# map_location remaps the stored tensors onto the active device, so a checkpoint
# saved on a GPU can also be restored when the notebook falls back to the CPU.
model.load_state_dict(torch.load('../data/yelp_HAN.pt', map_location=device))

# Training and Testing with Active Learning (AL)

In [9]:
class QueryStrategy() :
    """Uncertainty-based query strategies for active learning.

    Every method takes ``x``, a 2-D tensor with one row per sample and one column
    per class holding the model's log-softmax output. With ``score=True`` the raw
    per-sample score is returned; otherwise the indices of the ``k`` most
    uncertain samples are returned as a list (most uncertain last).
    """

    @staticmethod
    def _top_k(uncertainty, k) :
        """Return the indices of the ``k`` largest entries of ``uncertainty``."""
        order = np.argsort(uncertainty)
        # Explicit start index: a plain ``order[-k:]`` returns everything when k == 0.
        return list(order[max(len(order) - k, 0):])

    def least_confidence(self,x,k=10,score=False) :
        """Rank samples by the log-probability of their most likely class.

        Returns the per-sample maximum as a NumPy array when ``score`` is true
        (a lower value means a less confident prediction).
        """
        max_log_prob = x.max(1)[0].cpu().data.numpy()

        if score :
            return max_log_prob
        return self._top_k(1 - max_log_prob, k)

    def margin_sampling(self,x,k=10,score=False) :
        """Rank samples by the gap between their two highest class scores.

        Returns the per-sample margin tensor when ``score`` is true. A SMALL margin
        means the two best classes are hard to tell apart, so the ``k`` samples
        with the smallest margins are the ones selected.
        """
        sorted_x = torch.sort(x,dim=1)[0]

        margin = sorted_x[:,-1] - sorted_x[:,-2]

        if score :
            return margin

        # Negate so that the smallest margin becomes the largest uncertainty.
        return self._top_k(-margin.cpu().data.numpy(), k)

    def entropy(self,x,k=10,score=False) :
        """Rank samples by the Shannon entropy (in bits) of their class distribution.

        Returns the per-sample entropies as a list when ``score`` is true.
        """
        log_p = x.cpu().data.numpy()
        # The model output is a log softmax, so exponentiate to recover probabilities.
        p = np.exp(log_p)

        # log2(p) = ln(p) / ln(2). A class with p == 0 contributes 0 to the entropy
        # (instead of the NaN produced by 0 * -inf).
        with np.errstate(invalid='ignore') :
            terms = np.where(p > 0, p * log_p / float(np.log(2.0)), 0.0)
        entropy_ls = list(-terms.sum(axis=1))

        if score :
            return entropy_ls

        return self._top_k(np.array(entropy_ls), k)

In [10]:
def add_to_loader(train_loader,test_loader,idx_ls) :
    """Return a new shuffled training loader extended with queried pool samples.

    Args:
        train_loader: loader whose ``dataset.X`` / ``dataset.y`` hold the current training data.
        test_loader: loader whose ``dataset.X`` / ``dataset.y`` hold the pool queried from.
        idx_ls: indices into the pool dataset of the samples to append.

    Returns:
        A ``DataLoader`` (batch size 100, shuffled, 10 workers) over the current
        training data followed by the selected pool samples.
    """
    current = train_loader.dataset
    pool = test_loader.dataset

    merged_X = torch.cat([current.X, pool.X[idx_ls]], dim=0)
    merged_y = torch.cat([current.y, pool.y[idx_ls]], dim=0)

    return data.DataLoader(Dataset(merged_X, merged_y),
                           batch_size=100, shuffle=True, num_workers=10)

def pop_to_loader(loader,query_ls) :
    """Return a new, unshuffled loader over ``loader``'s data without the queried samples.

    Args:
        loader: loader whose ``dataset.X`` / ``dataset.y`` hold the current pool.
        query_ls: iterable of integer indices (list, NumPy array, ...) to remove.

    Returns:
        A ``DataLoader`` (batch size 100, not shuffled, 10 workers) over the
        remaining samples in their original order.
    """
    # Hash-set membership keeps the filtering linear in the pool size; testing each
    # index against an array/list re-scans every queried index per pool sample.
    removed = {int(q) for q in query_ls}
    idx_ls = [i for i in range(loader.dataset.X.shape[0]) if i not in removed]

    X = loader.dataset.X[idx_ls]
    y = loader.dataset.y[idx_ls]

    params = {'batch_size': 100,
          'shuffle': False,
          'num_workers': 10}

    # The pool loader must NOT be shuffled: the softmax outputs for the validation
    # data are collected in loader order, and k-means / informative-instance
    # querying is applied to that sequence, so the order has to be preserved.
    # This data is not used for training either, so skipping the shuffle is harmless.

    testing_set = Dataset(X,y)
    iter_ = data.DataLoader(testing_set, **params)

    return iter_

# Train and Validation

In [11]:
def _append_log(log_path, *lines) :
    """Append the given text lines to the log file at ``log_path``."""
    with open(log_path,'a') as f :
        f.writelines(lines)


def train(model, train_loader, test_loader, lr = 0.01, batch_size = 100, epoch = 10,
          n_iterations = 15, n_query = 1000, log_path = "han_uncertainty_sampling.txt") :
    """Run the active-learning loop with margin-based uncertainty sampling.

    Each iteration trains ``model`` for ``epoch`` epochs on the current training
    set, scores every pool sample, then moves the ``n_query`` most uncertain pool
    samples (smallest margin between the two best classes) into the training set.

    Args:
        model: network returning ``(log_softmax, word_attention, sent_attention)``.
        train_loader: loader over the labelled training set.
        test_loader: unshuffled loader over the pool, also used for validation.
        lr: Adam learning rate.
        batch_size: kept for backward compatibility; accuracies are now divided by
            the real dataset sizes, so this value is no longer used.
        epoch: number of training epochs per iteration.
        n_iterations: maximum number of active-learning iterations.
        n_query: number of pool samples moved to the training set per iteration.
        log_path: text file that progress messages are appended to.

    Returns:
        None. The loop ends after ``n_iterations``, when the training set exceeds
        60% of all data, or when training accuracy has dropped more than five times.
    """
    query = QueryStrategy()
    optimizer = torch.optim.Adam(model.parameters(),lr)
    criterion = nn.NLLLoss().to(device)
    criterion_cpu = nn.NLLLoss()
    separator = "###################################################################\n"

    early_bird_ls = [0.0]
    early_bird_count = 0

    for iteration in range(1, n_iterations + 1) :
        val_batch_count = 0; batch_count = 0
        n_correct = 0; val_n_correct = 0
        score_ls = []

        train_length = train_loader.dataset.X.shape[0]
        test_length = test_loader.dataset.X.shape[0]
        _append_log(log_path,
                    "training 데이터셋의 크기 : {} , validation 데이터셋의 크기 : {}\n".format(train_length,test_length),
                    separator)

        if train_length > (train_length + test_length) * 0.6 :
            _append_log(log_path,
                        "training 데이터셋의 크기가 전체의 60%가 넘었기 때문에, iteration 을 종료합니다.\n")
            return

        # ---- training phase ----
        model.train()

        for epoch_idx in range(epoch) :
            is_last_epoch = epoch_idx == epoch - 1
            for train_batch, train_labels in train_loader :

                batch_count += 1
                if batch_count % 100 == 0 :
                    print("{}번째 Training 배치가 돌고 있습니다.".format(batch_count),end='\r')

                train_labels = train_labels.to(device)
                train_softmax, _, _  = model(train_batch.to(device))

                loss = criterion(train_softmax,train_labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Training accuracy is measured over the last epoch only.
                if is_last_epoch :
                    n_correct += (train_softmax.argmax(dim=1) == train_labels).sum().item()

        # Divide by the true number of samples so a partial last batch cannot skew it.
        acc = n_correct / train_length
        early_bird_ls.append(acc)
        _append_log(log_path,
                    "{}번째 iteration 의 Training loss 는 {}입니다.\n".format(iteration,loss),
                    "{}번째 iteration 의 Training accuracy 는 {}입니다.\n".format(iteration,acc),
                    separator)

        # ---- validation / scoring phase ----
        model.eval()
        val_loss = loss
        # No gradients are needed here; without no_grad every pool batch would
        # build an autograd graph.
        with torch.no_grad() :
            for test_batch, test_labels in test_loader :

                val_batch_count += 1
                if val_batch_count % 100 == 0 :
                    print("{}번째 Validation 배치가 돌고 있습니다.".format(val_batch_count),end='\r')

                test_softmax, _, _  = model(test_batch.to(device))
                test_softmax = test_softmax.cpu()

                val_n_correct += (test_softmax.argmax(dim=1) == test_labels).sum().item()
                val_loss = criterion_cpu(test_softmax, test_labels)

                # The loader is unshuffled, so position i in score_ls is pool sample i.
                margins = query.margin_sampling(test_softmax,k=0,score=True)
                score_ls += list(margins.cpu().data.numpy())

        val_acc = val_n_correct / test_length

        # Margin sampling: the SMALLEST margins mark the most uncertain samples, so
        # take the head of the ascending sort (the tail holds the most confident ones).
        query_idx_ls = np.argsort(score_ls)[:n_query]
        train_loader = add_to_loader(train_loader,test_loader,query_idx_ls)
        test_loader = pop_to_loader(test_loader,query_idx_ls)

        _append_log(log_path,
                    '{}번째 iteration 의 Validation loss 는 {}입니다.\n'.format(iteration,val_loss),
                    '{}번째 iteration 의 Validation accuracy 는 {}입니다.\n'.format(iteration,val_acc),
                    separator)

        # Count the iterations in which training accuracy dropped; stop after more than five.
        if early_bird_ls[-1] < early_bird_ls[-2] :
            early_bird_count += 1

            if early_bird_count > 5 :
                # Only announce an early stop when one actually happened.
                print("EARLY BIRD BREAK!!")
                return

In [12]:
# Start the active-learning run with 3 training epochs per iteration.
train(model=model, train_loader=train_iter, test_loader=test_iter, epoch=3)

EARLY BIRD BREAK!!치가 돌고 있습니다.
