In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data

import pickle
import numpy as np
import random
import warnings

from sklearn.cluster import KMeans

# Single seed shared by every random number generator used in the notebook.
SEED = 1

# Run on the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed Python, NumPy and PyTorch so that "Restart Kernel -> Run All" is repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the highest available pickle protocol is used.
    """
    pkl_path = name + '.pkl'
    with open(pkl_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Read back the object pickled in the file ``name + '.pkl'`` and return it."""
    # NOTE(review): unpickling can execute arbitrary code; only use this on
    # files that come from a trusted source.
    pkl_path = name + '.pkl'
    with open(pkl_path, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [3]:
# Pickled lookup tables (presumably written by a preprocessing step that is
# not part of this notebook): word -> index and word -> frequency.
word_to_idx_dict = load_obj('word_to_idx_dict')
word_to_freq_dict = load_obj('word_to_freq_dict')

# Reverse lookup: index -> word.
idx_to_word_dict = dict((index, token) for token, index in word_to_idx_dict.items())

In [4]:
# Serialized inputs (X) and labels (y) for both splits, read from the working directory.
train_X, train_y = torch.load('train_X.pt'), torch.load('train_y.pt')
test_X, test_y = torch.load('test_X.pt'), torch.load('test_y.pt')

In [5]:
class Dataset(data.Dataset):
    """Map-style dataset that pairs ``X[index]`` with ``y[index]``.

    Both containers are exposed as the public attributes ``X`` and ``y``.
    """

    def __init__(self, X, y):
        """Keep references to the inputs ``X`` and the labels ``y`` (no copy is made)."""
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the length of ``X``."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        return self.X[index], self.y[index]


전체 데이터셋은 원래 67%가 training 데이터셋, 33%가 validation 데이터셋이었습니다. Active Learning은 labeled data의 개수가 적다는 것을 가정하기 때문에, 두 데이터셋의 역할을 서로 바꿔서 DataLoader를 할당해줍니다 (작은 33% 쪽이 `train_iter`, 큰 67% 쪽이 `test_iter`가 됩니다).

In [6]:
# DataLoader settings shared by both splits.
params = dict(batch_size=50, shuffle=True, num_workers=10)

# Wrap each split in a Dataset.
training_set = Dataset(train_X, train_y)
testing_set = Dataset(test_X, test_y)

# The roles are swapped on purpose: the split loaded as "train" feeds
# `test_iter`, and the split loaded as "test" feeds `train_iter`.
test_iter = data.DataLoader(training_set, **params)
train_iter = data.DataLoader(testing_set, **params)

# Model Architecture

In [7]:
class CNN(nn.Module):
    """Convolutional text classifier.

    Pipeline: token embedding -> one ``Conv2d`` per kernel height (each kernel
    spans the full embedding width) -> max pooling over the sequence axis ->
    dropout -> concatenation -> two-layer fully connected head -> softmax.

    Args:
        VOCAB_SIZE: number of rows in the embedding table.
        EMBED_SIZE: embedding dimension.
        HID_SIZE: width of the hidden fully connected layer.
        DROPOUT: dropout probability (pooled features and hidden layer).
        BATCH_SIZE: stored as ``self.batch_size`` for backward compatibility;
            ``forward`` infers the batch size from its input instead.
        KERNEL_SIZE: a single int or an iterable of ints, the kernel heights.
        NUM_FILTER: number of output channels of every convolution.
        N_CLASS: number of output classes.
    """

    def __init__(self, VOCAB_SIZE, EMBED_SIZE, HID_SIZE, DROPOUT, BATCH_SIZE, KERNEL_SIZE, NUM_FILTER, N_CLASS):
        super(CNN, self).__init__()
        self.vocab_size = VOCAB_SIZE
        self.embed_size = EMBED_SIZE
        self.hid_size = HID_SIZE
        self.dropout = DROPOUT
        self.batch_size = BATCH_SIZE
        # Accept a bare int as well as any iterable of ints
        # (``list(3)`` would raise a TypeError).
        if isinstance(KERNEL_SIZE, int):
            self.kernel_size = [KERNEL_SIZE]
        else:
            self.kernel_size = list(KERNEL_SIZE)
        self.num_filter = NUM_FILTER
        self.num_class = N_CLASS
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        # NOTE(review): padding_idx=1 assumes index 1 is the padding token of
        # the vocabulary -- confirm against the preprocessing step.
        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embed_size,
            padding_idx=1)

        # One convolution per kernel height over the [seq_len, embed_size] "image".
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=self.num_filter,
                      kernel_size=(kernel, self.embed_size))
            for kernel in self.kernel_size
        ])

        self.fully_connect = nn.Sequential(
            nn.Linear(self.num_filter * len(self.kernel_size), self.hid_size),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hid_size, self.num_class),
        )

    def forward(self, x):
        """Return class probabilities of shape [batch, N_CLASS].

        ``x`` holds token indices with the batch on the first axis; any
        remaining axes are flattened into the sequence axis.

        NOTE: the output is already softmax-normalized. ``nn.CrossEntropyLoss``
        expects unnormalized logits, so a loss computed on this output should
        be a negative log-likelihood on ``log`` of the returned probabilities.
        """
        # Infer the batch size from the input so that a smaller final batch
        # (or single-sample inference) is reshaped correctly.
        x = x.view(x.size(0), -1)  # [batch, seq_len]

        embed = self.embedding(x)   # [batch, seq_len, embed_size]
        embed = embed.unsqueeze(1)  # [batch, 1, seq_len, embed_size]

        # Each conv collapses the embedding axis -> [batch, num_filter, seq_len - kernel + 1]
        convolution = [conv(embed).squeeze(3) for conv in self.convs]

        # Max over the whole sequence axis -> [batch, num_filter]
        pooled = [F.max_pool1d(conv, conv.size(2)).squeeze(2) for conv in convolution]

        # F.dropout defaults to training=True; tie it to the module mode so
        # that model.eval() really disables it.
        dropout = [F.dropout(pool, self.dropout, training=self.training) for pool in pooled]

        concatenate = torch.cat(dropout, dim=1)  # [batch, num_filter * num_kernel]

        logit = self.fully_connect(concatenate)

        return torch.softmax(logit, dim=1)

In [8]:
# Model hyperparameters.
VOCAB_SIZE = len(word_to_idx_dict)  # one embedding row per vocabulary entry
EMBED_SIZE, HID_SIZE = 256, 128
DROPOUT = 0.5
BATCH_SIZE = 50
KERNEL_SIZE = [2, 3, 4, 5]
NUM_FILTER = 4
N_CLASS = 5

# Build the classifier and move its parameters to the selected device.
model = CNN(
    VOCAB_SIZE=VOCAB_SIZE,
    EMBED_SIZE=EMBED_SIZE,
    HID_SIZE=HID_SIZE,
    DROPOUT=DROPOUT,
    BATCH_SIZE=BATCH_SIZE,
    KERNEL_SIZE=KERNEL_SIZE,
    NUM_FILTER=NUM_FILTER,
    N_CLASS=N_CLASS,
).to(device)

# Training and Testing with AL

In [9]:
class QueryStrategy():
    """Uncertainty-based query strategies for active learning.

    Every method takes ``x``, a 2-D tensor ``[n_samples, n_classes]`` that is
    expected to hold class probabilities (e.g. softmax output), and either

    * returns the row indices of the ``k`` most uncertain samples
      (``score=False``, the default), or
    * returns the raw per-sample statistic (``score=True``).
    """

    def least_confidence(self, x, k=10, score=False):
        """Select the samples whose top-class probability is lowest.

        With ``score=True`` the top-class probability of every sample is
        returned as a NumPy array (note: high value = confident).
        """
        top_prob = x.max(1)[0].detach().cpu().numpy()

        if score:
            return top_prob

        # Ascending by uncertainty (1 - p_max); the tail holds the k most uncertain.
        order = np.argsort(1 - top_prob)
        # Explicit start index so that k == 0 yields an empty list, not everything.
        return list(order[max(len(order) - k, 0):])

    def margin_sampling(self, x, k=10, score=False):
        """Select the samples with the smallest top-1 / top-2 probability gap.

        With ``score=True`` the margin of every sample is returned as a
        tensor (note: small value = uncertain).
        """
        sorted_x = torch.sort(x, dim=1)[0]
        margin = sorted_x[:, -1] - sorted_x[:, -2]

        if score:
            return margin

        # A small margin means the model hesitates between two classes, so
        # the k smallest margins (head of the ascending order) are queried.
        order = np.argsort(margin.detach().cpu().numpy())
        return list(order[:k])

    def entropy(self, x, k=10, score=False):
        """Select the samples with the highest predictive entropy (base 2).

        With ``score=True`` the entropy of every sample is returned as a list.
        """
        probs = x.detach().cpu().numpy()

        # 0 * log(0) is taken as 0 (its limit) rather than NaN.
        log_probs = np.log2(probs, out=np.zeros_like(probs), where=probs > 0)
        entropy_ls = list(-(probs * log_probs).sum(axis=1))

        if score:
            return entropy_ls

        order = np.argsort(np.array(entropy_ls))
        return list(order[max(len(order) - k, 0):])

In [10]:
def add_to_loader(train_loader,test_loader,idx_ls) : 
    """Build a new training loader that also contains the queried samples.

    The rows of ``test_loader.dataset`` selected by ``idx_ls`` are appended
    to the data of ``train_loader.dataset``. Neither input loader is modified;
    a fresh shuffling DataLoader (batch size 50, 10 workers) is returned.
    """
    labeled, pool = train_loader.dataset, test_loader.dataset

    # Append the queried rows after the existing training rows.
    merged_X = torch.cat((labeled.X, pool.X[idx_ls]), dim=0)
    merged_y = torch.cat((labeled.y, pool.y[idx_ls]), dim=0)

    loader_kwargs = dict(batch_size=50, shuffle=True, num_workers=10)
    return data.DataLoader(Dataset(merged_X, merged_y), **loader_kwargs)

def pop_to_loader(loader,query_ls) : 
    """Build a new loader over ``loader.dataset`` without the queried rows.

    Args:
        loader: DataLoader whose dataset exposes tensors ``X`` and ``y``.
        query_ls: row indices (into ``loader.dataset.X``) to drop.

    Returns:
        A fresh shuffling DataLoader (batch size 50, 10 workers) over the
        remaining rows, in their original relative order. ``loader`` itself
        is left untouched.
    """
    n_samples = loader.dataset.X.shape[0]

    # True for every row that was NOT queried.
    keep_mask = np.isin(np.arange(n_samples), query_ls, invert=True)
    # Turn the mask into the actual row indices to keep; filtering the mask
    # itself would only give a list of ``True`` values, not indices.
    keep_idx = torch.from_numpy(np.flatnonzero(keep_mask))

    X = loader.dataset.X[keep_idx]
    y = loader.dataset.y[keep_idx]

    params = {'batch_size': 50,
          'shuffle': True,
          'num_workers': 10}

    # Generators
    testing_set = Dataset(X,y)
    iter_ = data.DataLoader(testing_set, **params)

    return iter_

# Diverse Mini-Batch Active Learning

#### 알고리즘 정의
- 전체 데이터 셋에 대한 softmax 값을 저장
- pre-filter 데이터셋 확보 $\beta k > k$
- weighted-kmeans 를 계산 (sklearn Kmeans 사용)
- centroid 와 각 instance 간의 distance 를 Euclidean distance를 통해 계산 및 sorting
- 상위 $k$ 개의 인스턴스 index를 반환
_________________________
- $\beta = 10$
- $k = 100$
- 이에 따라 $\beta k$ 즉, 1000개를 pre-filter 한다.

In [12]:
class Diverse_AL():
    """Diverse mini-batch active learning query selector.

    Steps (``__init__`` runs 1-2, ``query`` runs 3-4):
      1. pre-filter the ``beta * k`` highest-scoring instances,
      2. cluster them into ``k`` groups with K-means, weighting every
         instance by its score,
      3. per cluster, pick the member closest (Euclidean) to the centroid,
      4. report the picks as row indices into ``loader.dataset.X``.

    Scores follow the convention "higher = more informative"; they are also
    used as K-means sample weights, so they are expected to be non-negative.
    """

    def __init__(self, loader, score_ls, beta, k, rnd_state=123):
        """
        loader : DataLoader whose ``dataset.X`` tensor holds the candidate pool
        score_ls : list of 1-D tensors that, concatenated, give one score per
                   row of ``loader.dataset.X`` (same order as the dataset)
        beta : pre-filter hyperparameter (``beta * k`` candidates are kept)
        k : number of clusters, i.e. number of instances to query
        rnd_state : random_state forwarded to KMeans
        """
        self.X_data = loader.dataset.X
        # One flat feature row per instance.
        self.batch = self.X_data.view(self.X_data.shape[0], -1).cpu().data.numpy()
        self.score = torch.cat(score_ls).cpu().data.numpy()
        if self.score.shape[0] != self.batch.shape[0]:
            raise ValueError(
                "expected one score per instance: got {} scores for {} instances"
                .format(self.score.shape[0], self.batch.shape[0]))
        self.k = k
        self.beta = beta
        self.rnd_state = rnd_state
        self.prefilter()

    def prefilter(self):
        """Keep the top-scoring candidates and fit the weighted K-means.

        Sets ``prefiltered_idx`` (row indices into the full pool),
        ``prefiltered_batch``, ``prefiltered_score``, ``clusters`` and
        ``centroids``. If KMeans reports fewer distinct points than clusters,
        the number of clusters is halved and the fit is retried.
        """
        # Local import: sklearn is already a dependency of this notebook.
        from sklearn.exceptions import ConvergenceWarning

        n_total = self.score.shape[0]
        # Never ask for more candidates than the pool contains.
        n_keep = min(self.k * self.beta, n_total)

        # Row indices of the n_keep largest scores (unordered among themselves).
        self.prefiltered_idx = np.argpartition(self.score, n_total - n_keep)[n_total - n_keep:]
        self.prefiltered_batch = self.batch[self.prefiltered_idx]
        self.prefiltered_score = self.score[self.prefiltered_idx]

        n_clusters = min(self.k, n_keep)

        with warnings.catch_warnings():
            # Escalate only the "fewer distinct points than clusters" warning;
            # any other warning or error keeps its normal behaviour.
            warnings.simplefilter("error", category=ConvergenceWarning)
            while True:
                try:
                    # n_jobs is no longer accepted by KMeans; n_init is set
                    # explicitly to the historical default of 10.
                    kmeans = KMeans(n_clusters=n_clusters,
                                    random_state=self.rnd_state,
                                    n_init=10)\
                        .fit(self.prefiltered_batch, sample_weight=self.prefiltered_score)
                    break
                except ConvergenceWarning:
                    if n_clusters <= 1:
                        raise
                    n_clusters = max(n_clusters // 2, 1)

        # Reuse the labels of the weighted fit; calling fit_predict again
        # would refit without the sample weights.
        self.clusters = kmeans.labels_
        self.centroids = kmeans.cluster_centers_

    def _nearest_positions(self):
        """Per non-empty cluster, the row position (in ``prefiltered_batch``)
        of the member closest to the cluster centroid."""
        positions = []
        for label in np.unique(self.clusters):
            members = np.flatnonzero(self.clusters == label)
            dists = np.sqrt(
                ((self.prefiltered_batch[members] - self.centroids[label]) ** 2).sum(axis=1))
            positions.append(members[np.argmin(dists)])
        return positions

    def return_instance(self):
        """Return the selected instances themselves (one feature row per cluster)."""
        return [self.prefiltered_batch[pos].copy() for pos in self._nearest_positions()]

    def query(self):
        """Return the selected instances as row indices into ``loader.dataset.X``.

        Positions inside the pre-filtered subset are mapped back through
        ``prefiltered_idx`` so that callers can index the full dataset.
        """
        return [int(self.prefiltered_idx[pos]) for pos in self._nearest_positions()]

# Train and Validation

In [13]:
def train(model, train_loader, test_loader, lr = 0.01, batch_size = 50,
          max_train_size = 700000, beta = 10, k = 100) :
    """Active-learning loop: train, score the pool, move queried samples.

    Every iteration
      1. trains ``model`` for one pass over ``train_loader``,
      2. evaluates it on the pool behind ``test_loader`` and records one
         informativeness score per pool instance (1 - margin),
      3. lets ``Diverse_AL`` choose instances and moves them from the pool
         to the training set.

    The loop stops when the training set holds more than ``max_train_size``
    samples, when the pool is empty, or when nothing is queried.

    Args:
        model: network returning class probabilities (softmax output).
        train_loader / test_loader: DataLoaders whose datasets expose ``X`` and ``y``.
        lr: Adam learning rate.
        batch_size: batch size of the ordered evaluation pass over the pool.
        max_train_size: training-set size above which the loop stops.
        beta, k: pre-filter factor and number of queries for ``Diverse_AL``.

    Returns:
        (training_loss_ls, validation_loss_ls): per-batch losses as floats.
    """
    query = QueryStrategy()
    optimizer = torch.optim.Adam(model.parameters(),lr)
    # The model already outputs probabilities, so the loss is the negative
    # log-likelihood of their log. CrossEntropyLoss would apply a second
    # softmax on top of the model's own.
    criterion = nn.NLLLoss()
    eps = 1e-12  # keeps log() finite when a probability underflows to 0

    training_loss_ls = []; validation_loss_ls = []

    iteration = 0

    while True : 
        iteration += 1
        batch_count = 0; val_batch_count = 0
        score_ls = []
        last_train_loss = None; last_val_loss = None

        train_length = train_loader.dataset.X.shape[0]
        test_length = test_loader.dataset.X.shape[0]
        print("training 데이터셋의 크기 : {} , validation 데이터셋의 크기 : {}".format(train_length,test_length))
        print("###################################################################")

        if train_length > max_train_size : 
            print("training 데이터셋의 크기가 전체의 70%가 넘었기 때문에, iteration 을 종료합니다.")
            if validation_loss_ls : 
                print("최종 Validation Loss : {}".format(validation_loss_ls[-1]))
            break

        if test_length == 0 : 
            # Nothing left in the pool to score or to query.
            break

        model.train()

        for train_batch, train_labels in train_loader:  

            batch_count += 1 
            if batch_count % 100 == 0 : 
                print("{}번째 Training 배치가 돌고 있습니다."\
                      .format(batch_count),end='\r')

            train_softmax = model(train_batch.to(device))

            loss = criterion(torch.log(train_softmax.clamp(min=eps)),train_labels.to(device))
            # Store a plain float so the autograd graph of every batch can be freed.
            last_train_loss = loss.item()
            training_loss_ls.append(last_train_loss)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("{}번째 iteration 의 Training loss 는 {}입니다.".format(iteration,last_train_loss))
        print("###################################################################")

        model.eval()

        # Score the pool in dataset order: Diverse_AL pairs score i with
        # dataset row i, which a shuffling loader would break.
        eval_loader = data.DataLoader(test_loader.dataset, batch_size=batch_size, shuffle=False)

        with torch.no_grad() : 
            for test_batch, test_labels in eval_loader: 

                val_batch_count += 1

                if val_batch_count % 100 == 0 : 
                    print("{}번째 Validation 배치가 돌고 있습니다.".format(val_batch_count),end='\r')
                test_softmax = model(test_batch.to(device)).cpu()

                loss = criterion(torch.log(test_softmax.clamp(min=eps)), test_labels)
                last_val_loss = loss.item()
                validation_loss_ls.append(last_val_loss)

                # A small margin means an uncertain prediction; flip it so that
                # a higher score means a more informative instance.
                margin = query.margin_sampling(test_softmax,score=True)
                score_ls.append(1 - margin)

        mini_batch_al = Diverse_AL(test_loader,score_ls,beta,k) # defaults: beta = 10, k = 100
        query_idx_ls = mini_batch_al.query()

        if len(query_idx_ls) == 0 : 
            # No instance selected: the datasets would never change again.
            break

        train_loader = add_to_loader(train_loader,test_loader,query_idx_ls)
        test_loader = pop_to_loader(test_loader,query_idx_ls)

        print('{}번째 interation 의 Validation loss 는 {}입니다.'.format(iteration,last_val_loss))
        print("###################################################################")
    return training_loss_ls, validation_loss_ls