In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data

import pickle
import numpy as np
import random
import warnings
import re
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

# Single seed shared by every random number generator used in this notebook.
SEED = 1

# Run on the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed *all* generators. NumPy's global generator was previously left unseeded
# even though the active-learning loop samples the query indices with
# np.random.choice, so runs were not reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and disable the auto-tuner, which may
# otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
def save_obj(obj, name):
    """Serialize ``obj`` with pickle into the file ``name + '.pkl'``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the highest pickle protocol available is used.
    """
    target_path = name + '.pkl'
    with open(target_path, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Read back and return the object pickled in the file ``name + '.pkl'``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    source_path = name + '.pkl'
    with open(source_path, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [3]:
# Mapping loaded from 'english_mr_word_to_idx_dict.pkl'
# (by its name: word -> index; confirm against the preprocessing step).
word_to_idx_dict = load_obj('english_mr_word_to_idx_dict')

# Inverse mapping (value -> key) built by pairing the values with the keys.
idx_to_word_dict = dict(zip(word_to_idx_dict.values(), word_to_idx_dict.keys()))

In [4]:
# Load the serialized feature / label tensors of both splits from disk.
train_X, train_y = torch.load('english_mv_train_X.pt'), torch.load('english_mv_train_y.pt')
test_X, test_y = torch.load('english_mv_test_X.pt'), torch.load('english_mv_test_y.pt')

# Display both feature shapes as the cell output.
(train_X.shape, test_X.shape)

(torch.Size([39720, 100]), torch.Size([9986, 100]))

In [5]:
# Keep only the first 1000 training rows as the training set; every remaining
# training row is prepended to the test tensors.
# NOTE: this cell overwrites its own inputs, so re-running it without
# re-loading the tensors above shifts the split again.
test_X = torch.cat([train_X[1000:], test_X])
test_y = torch.cat([train_y[1000:], test_y])
train_X = train_X[:1000]
train_y = train_y[:1000]

# Display both feature shapes as the cell output.
(train_X.shape, test_X.shape)

(torch.Size([1000, 100]), torch.Size([48706, 100]))

In [6]:
class Dataset(data.Dataset):
    """Map-style dataset pairing each row of ``X`` with the matching entry of ``y``.

    Attributes:
        X: indexable container of samples.
        y: indexable container of labels, aligned with ``X`` by position.
    """

    def __init__(self, X, y):
        """Store the samples and labels.

        Raises:
            ValueError: if ``X`` and ``y`` do not contain the same number of
                entries. Without this check a mismatch only shows up later as
                an IndexError while a DataLoader is iterating.
        """
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same length, got {} and {}".format(len(X), len(y)))
        self.y = y
        self.X = X

    def __len__(self):
        """Return the total number of samples."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        return self.X[index], self.y[index]


In [7]:
# DataLoader settings shared by both loaders: fixed-size batches, original
# row order preserved (no shuffling), ten worker processes.
params = dict(batch_size=100, shuffle=False, num_workers=10)

# Wrap the tensors in Dataset objects ...
training_set = Dataset(train_X, train_y)
testing_set = Dataset(test_X, test_y)

# ... and build one loader per split.
train_iter = data.DataLoader(training_set, **params)
test_iter = data.DataLoader(testing_set, **params)

# Model Architecture

In [8]:
class CNN(nn.Module):
    """Convolutional text classifier with several kernel heights.

    Token ids are embedded, convolved by one ``Conv2d`` per kernel height (each
    kernel spans the full embedding width), max-pooled over the sequence axis,
    concatenated and passed through a two-layer fully connected head.
    ``forward`` returns log-probabilities (``log_softmax`` over the classes).

    Args:
        VOCAB_SIZE: number of rows of the embedding table.
        EMBED_SIZE: embedding dimension.
        HID_SIZE: width of the hidden fully connected layer.
        DROPOUT: dropout probability, used after pooling and inside the head.
        BATCH_SIZE: stored as ``self.batch_size``; not used by the model itself.
        KERNEL_SIZE: a single kernel height (int) or an iterable of heights.
        NUM_FILTER: number of output channels of every convolution.
        N_CLASS: number of output classes.
    """

    def __init__(self, VOCAB_SIZE, EMBED_SIZE, HID_SIZE, DROPOUT, BATCH_SIZE, KERNEL_SIZE, NUM_FILTER, N_CLASS):
        super(CNN, self).__init__()
        self.vocab_size = VOCAB_SIZE
        self.embed_size = EMBED_SIZE
        self.hid_size = HID_SIZE
        self.dropout = DROPOUT
        self.batch_size = BATCH_SIZE
        # Accept a bare int as well as any iterable of ints. The previous
        # `list(KERNEL_SIZE)` raised TypeError when a single int was passed.
        if isinstance(KERNEL_SIZE, int):
            self.kernel_size = [KERNEL_SIZE]
        else:
            self.kernel_size = list(KERNEL_SIZE)
        self.num_filter = NUM_FILTER
        self.num_class = N_CLASS
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        # NOTE(review): padding_idx=1 assumes index 1 is the padding token --
        # confirm against the vocabulary used to build the inputs.
        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embed_size,
            padding_idx=1)

        # One convolution per kernel height; the kernel width equals the
        # embedding size, so every filter sees `kernel` whole tokens at a time.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=self.num_filter,
                      kernel_size=(kernel, self.embed_size))
            for kernel in self.kernel_size])

        self.fully_connect = nn.Sequential(
            nn.Linear(self.num_filter * len(self.kernel_size), self.hid_size),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hid_size, self.num_class),
        )

        self.initialize_weight()

    def initialize_weight(self):
        """Apply Xavier-uniform initialization to every convolution weight."""
        for conv in self.convs:
            torch.nn.init.xavier_uniform_(conv.weight)

    def forward(self, x):
        """Return log-probabilities of shape [batch, num_class].

        ``x`` is a tensor of token ids of shape [batch, seq_len]; ``seq_len``
        must be at least as large as the largest kernel height.
        """
        # [batch, seq_len, embed] -> [batch, 1, seq_len, embed]
        embed = self.embedding(x).unsqueeze(1)

        # Each convolution yields [batch, num_filter, seq_len - kernel + 1].
        convolution = [conv(embed).squeeze(3) for conv in self.convs]

        # Max over the whole remaining sequence axis -> [batch, num_filter].
        pooled = [F.max_pool1d(conv, conv.size(2)).squeeze(2) for conv in convolution]

        # F.dropout defaults to training=True; without forwarding the module's
        # mode, dropout stayed active after model.eval() and made evaluation
        # outputs random.
        dropped = [F.dropout(pool, self.dropout, training=self.training) for pool in pooled]

        # [batch, num_filter * num_kernel]
        concatenate = torch.cat(dropped, dim=1)

        logit = self.fully_connect(concatenate)

        return torch.log_softmax(logit, dim=1)

In [9]:
# Hyper-parameters of the CNN classifier.
VOCAB_SIZE = len(word_to_idx_dict)  # one embedding row per dictionary entry
EMBED_SIZE, HID_SIZE = 256, 128
DROPOUT = 0.5
BATCH_SIZE = 100
KERNEL_SIZE = [2, 3, 4, 5]
NUM_FILTER = 4
N_CLASS = 2

# Build the network and move it to the selected device.
model = CNN(
    VOCAB_SIZE=VOCAB_SIZE,
    EMBED_SIZE=EMBED_SIZE,
    HID_SIZE=HID_SIZE,
    DROPOUT=DROPOUT,
    BATCH_SIZE=BATCH_SIZE,
    KERNEL_SIZE=KERNEL_SIZE,
    NUM_FILTER=NUM_FILTER,
    N_CLASS=N_CLASS,
).to(device)

In [10]:
# Smoke test: push the first training batch through the model.
# The batch is moved with `.to(device)` rather than `.cuda()` so the cell also
# runs on CPU-only machines, matching where the model itself was placed.
log_softmax = model(next(iter(train_iter))[0].to(device))

# Training and Testing with AL

In [11]:
class QueryStrategy():
    """Uncertainty-based query strategies for pool-based active learning.

    Every method receives ``x``, a 2-D tensor with one row of class scores per
    sample. With ``score=False`` (default) it returns a list with the indices
    of the ``k`` most informative rows, the most informative one last. With
    ``score=True`` it returns the raw per-row score instead.
    """

    @staticmethod
    def _take_last(order, k):
        """Return the last ``k`` entries of ``order`` as a list.

        Unlike ``order[-k:]`` this returns an empty list for ``k == 0``
        (``order[-0:]`` is the whole array).
        """
        return list(order[max(len(order) - k, 0):])

    def least_confidence(self, x, k=10, score=False):
        """Select the rows whose highest class score is lowest.

        With ``score=True`` the per-row maximum is returned as a NumPy array
        (a low value means an uncertain sample).
        """
        top_score = x.max(1)[0].cpu().data.numpy()

        if score:
            return top_score
        return self._take_last(np.argsort(1 - top_score), k)

    def margin_sampling(self, x, k=10, score=False):
        """Select the rows with the smallest gap between the two best classes.

        With ``score=True`` the margins are returned as a tensor (a small
        margin means an uncertain sample).
        """
        sorted_x = torch.sort(x, dim=1)[0]

        margin = sorted_x[:, -1] - sorted_x[:, -2]

        if score:
            return margin

        # A SMALL margin marks an ambiguous sample. The previous code took the
        # k largest margins, i.e. the samples the model was most sure about.
        # Sorting the negated margins keeps the most informative index last.
        return self._take_last(np.argsort(-margin.cpu().data.numpy()), k)

    def entropy(self, x, k=10, score=False):
        """Select the rows with the highest Shannon entropy (base 2).

        ``x`` must contain probabilities; log-probabilities (for example the
        output of ``log_softmax``) have to be exponentiated first. With
        ``score=True`` the per-row entropies are returned as a list.
        """
        probs = x.detach().cpu().numpy()

        # Whole-array computation instead of one Python iteration (and one
        # tensor-to-NumPy conversion) per element. Terms with p == 0 are
        # defined as 0, which avoids the NaN produced by 0 * log2(0).
        with np.errstate(divide='ignore', invalid='ignore'):
            terms = np.where(probs > 0, probs * np.log2(probs), 0.0)
        entropy_ls = list(-terms.sum(axis=1))

        if score:
            return entropy_ls

        return self._take_last(np.argsort(np.array(entropy_ls)), k)
    

In [12]:
def add_to_loader(train_loader,test_loader,idx_ls) : 
    """Return a new shuffled training DataLoader extended with queried rows.

    The rows of ``test_loader``'s dataset selected by ``idx_ls`` are appended
    to the samples and labels already held by ``train_loader``'s dataset.
    Neither input loader is modified.
    """
    labelled = train_loader.dataset
    pool = test_loader.dataset

    merged_X = torch.cat([labelled.X, pool.X[idx_ls]], dim=0)
    merged_y = torch.cat([labelled.y, pool.y[idx_ls]], dim=0)

    # The training loader is shuffled, with the same batch size and worker
    # count as the other loaders in this notebook.
    return data.DataLoader(
        Dataset(merged_X, merged_y),
        batch_size=100,
        shuffle=True,
        num_workers=10,
    )

def pop_to_loader(loader,query_ls) : 
    """Return a new unshuffled DataLoader without the rows listed in ``query_ls``.

    Args:
        loader: DataLoader whose dataset exposes ``X`` and ``y``.
        query_ls: iterable of integer row indices to remove.

    Returns:
        A DataLoader over the remaining rows, in their original order.
    """
    dataset = loader.dataset

    # Set membership is O(1). The previous version scanned ``query_ls`` once
    # for every row of the pool, which is quadratic for large pools.
    removed = {int(i) for i in query_ls}
    idx_ls = [i for i in range(dataset.X.shape[0]) if i not in removed]

    X = dataset.X[idx_ls]
    y = dataset.y[idx_ls]

    params = {'batch_size': 100,
          'shuffle': False,
          'num_workers': 10}

    # The test iterator must not be shuffled: the softmax values of the
    # validation set are collected in order, and k-means / the query of
    # informative instances is applied to them, so the row order has to be
    # preserved. This data is not used for training either, so leaving it
    # unshuffled does no harm.

    testing_set = Dataset(X,y)
    iter_ = data.DataLoader(testing_set, **params)

    return iter_

# Diverse Mini-Batch Active Learning

#### 알고리즘 정의
- 전체 데이터 셋에 대한 softmax 값을 저장
- pre-filter 데이터셋 확보 $\beta k > k$
- weighted-kmeans 를 계산 (sklearn Kmeans 사용)
- centroid 와 각 instance 간의 distance 를 Euclidean distance를 통해 계산 및 sorting
- 상위 $k$ 개의 인스턴스 index를 반환
_________________________
- $\beta = 50$
- $k = 100$
- 이에 따라 $\beta k$ 즉, 1000개를 pre-filter 한다.
- 논문에서 $\beta$ 가 10일 경우, robust 하다고 했지만, Experiment results에서 MNIST 의 경우, CNN 아키텍처를 사용하고, 데이터 갯수도 많아졌을 경우, $\beta = 50$이 10일때보다 성능이 좋다고 하여, 50을 사용한다.

# Train and Validation

In [13]:
def _train_epochs(model, train_loader, optimizer, criterion, epoch):
    """Run ``epoch`` full passes over ``train_loader``, updating ``model`` in place."""
    model.train()
    batch_count = 0

    for _ in range(epoch):
        for train_batch, train_labels in train_loader:
            batch_count += 1
            if batch_count % 100 == 0:
                print("{}번째 Training 배치가 돌고 있습니다.".format(batch_count), end='\r')

            train_softmax = model(train_batch.to(device))
            loss = criterion(train_softmax, train_labels.to(device))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


def _evaluate(model, loader, criterion):
    """Return ``(mean loss, accuracy)`` of ``model`` over every sample of ``loader``."""
    model.eval()
    total_loss = 0.0
    n_correct = 0
    n_seen = 0

    # Gradients are not needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for batch_idx, (batch, labels) in enumerate(loader, start=1):
            if batch_idx % 100 == 0:
                print("{}번째 Validation 배치가 돌고 있습니다.".format(batch_idx), end='\r')

            labels = labels.to(device)
            log_probs = model(batch.to(device))

            n_in_batch = labels.size(0)
            n_correct += (log_probs.argmax(dim=1) == labels).sum().item()
            # The criterion averages over the batch; weight by the batch size
            # so a partial final batch contributes proportionally.
            total_loss += criterion(log_probs, labels).item() * n_in_batch
            n_seen += n_in_batch

    if n_seen == 0:
        return float('nan'), float('nan')
    return total_loss / n_seen, n_correct / n_seen


def train(model, train_loader, test_loader, lr = 0.01, batch_size = 100, epoch = 10,
          n_rounds = 20, n_query = 1000, log_path = "english_random_sampling.txt") :
    """Active-learning loop with a random-sampling query strategy.

    Each round trains ``model`` for ``epoch`` passes over the current training
    loader, evaluates it on the remaining pool, appends the results to
    ``log_path``, and then moves ``n_query`` randomly chosen pool rows into
    the training set.

    Args:
        model: network returning log-probabilities (trained with NLLLoss).
        train_loader: DataLoader over the labelled set (dataset exposes X / y).
        test_loader: DataLoader over the pool / validation set.
        lr: Adam learning rate.
        batch_size: kept for backward compatibility; metrics are now computed
            from the number of samples actually seen, so it is unused.
        epoch: training passes per round.
        n_rounds: maximum number of active-learning rounds.
        n_query: number of pool rows moved to the training set per round.
        log_path: text file the progress lines are appended to.

    Returns:
        None. The loop ends early, without printing, once the training set
        holds more than 60% of all samples.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr)
    criterion = nn.NLLLoss().to(device)
    separator = "###################################################################\n"

    for iteration in range(1, n_rounds + 1) :
        train_length = train_loader.dataset.X.shape[0]
        test_length = test_loader.dataset.X.shape[0]

        # An explicit encoding keeps the non-ASCII log lines writable
        # regardless of the platform's default locale.
        with open(log_path, 'a', encoding='utf-8') as f :
            f.write("training 데이터셋의 크기 : {} , validation 데이터셋의 크기 : {}\n".format(train_length,test_length))
            f.write(separator)

            if train_length > (train_length + test_length) * 0.6 :
                f.write("training 데이터셋의 크기가 전체의 60%가 넘었기 때문에, iteration 을 종료합니다.\n")
                return

        _train_epochs(model, train_loader, optimizer, criterion, epoch)

        # Accuracy and loss cover every validation sample. Previously the
        # accuracy was divided by len(loader) * batch_size (wrong whenever the
        # last batch is partial) and only the last batch's loss was logged.
        val_loss, val_acc = _evaluate(model, test_loader, criterion)

        with open(log_path, 'a', encoding='utf-8') as f :
            f.write('{}번째 iteration 의 Validation loss 는 {}입니다.\n'.format(iteration,val_loss))
            f.write('{}번째 iteration 의 Validation accuracy 는 {}입니다.\n'.format(iteration,val_acc))
            f.write(separator)

        # random sampling -- never request more rows than the pool still holds
        query_idx_ls = np.random.choice(test_length, min(n_query, test_length), replace=False)

        train_loader = add_to_loader(train_loader,test_loader,query_idx_ls)
        test_loader = pop_to_loader(test_loader,query_idx_ls)

    # NOTE: the message is kept from an earlier early-stopping variant; it is
    # printed whenever all rounds have completed.
    print("EARLY BIRD BREAK!!")

In [14]:
# Run the active-learning loop with three training epochs per round.
train(model=model, train_loader=train_iter, test_loader=test_iter, epoch=3)

EARLY BIRD BREAK!!치가 돌고 있습니다.
