## NLP implementation of the paper **Do We Really Need to Access the Source Data? Source Hypothesis Transfer for Unsupervised Domain Adaptation (SHOT)**

Paper: https://arxiv.org/pdf/2002.08546v4.pdf

Code: https://github.com/tim-learn/SHOT

### Relevant Information about Pretrained Model

*   The pretrained model takes `input_ids` and `attention_masks` (one row of each per sample) and is called as `model(input_ids, attention_masks)`
*   Each `input_ids` row and each attention mask has length 128 (the maximum sequence length)
*   The output of the model is a tuple of length 2
*   **out[0]**: classification output, with 2 values per sample (binary classification)
*   **out[1]**: the hidden-layer outputs of RoBERTa, a tuple of length 13 in the order: **word embeddings + 12 hidden layers of the encoder**
*   **out[1][0]**: output of the word-embedding layer; **out[1][12]**: output of the last encoder layer (before pooling)



In [None]:
# Mount Google Drive into the Colab runtime so the project directory under
# 'My Drive' can be reached from the file system.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Optional installs, kept for reference (disabled):
# !pip3 install -r 'drive/My Drive/source-free-domain-adaptation/baselines/negation/requirements.txt'
# !pip3 install torchviz
# Pin transformers to the version this notebook was written against.
!pip3 install transformers==3.0.2
import torch
# Report the PyTorch build, whether a GPU is visible, and the CUDA version.
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)
# Relative path: this only resolves from the initial working directory. Re-running
# the cell from inside the project directory prints "[Errno 2] No such file or
# directory" (as in the recorded output) and leaves the working directory unchanged.
%cd 'drive/My Drive/source-free-domain-adaptation'

1.7.0+cu101
True
10.1
[Errno 2] No such file or directory: 'drive/My Drive/source-free-domain-adaptation'
/content/drive/My Drive/source-free-domain-adaptation


In [None]:
import logging, os, argparse
import numpy as np
from torch.utils.data.dataset import Dataset
from transformers.data.processors.utils import InputExample, InputFeatures, DataProcessor
from transformers.data.processors.glue import glue_convert_examples_to_features
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
import argparse
import os, sys, time
import os.path as osp
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import random, pdb, math, copy
from tqdm import tqdm
from scipy.spatial.distance import cdist
from sklearn.metrics import confusion_matrix
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score,precision_score,recall_score
from tabulate import tabulate

# Expose only the first GPU to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Make CUDA kernel launches synchronous so that an error is reported at the call
# that caused it (a debugging aid; it slows execution down).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Label strings of the negation task. The position in this list is the class index
# (evaluation maps a predicted index back through this list).
labels = ["-1", "1"]
# Maximum sequence length handed to the feature converter.
max_length = 128
logger = logging.getLogger(__name__)

In [None]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and PyTorch CUDA) with the same value for reproducibility.
SEED = 2020
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)

In [None]:
# Checkpoint of the source-trained RoBERTa classifier. output_hidden_states=True
# makes the model return the hidden states as a second output element in addition
# to the classification output.
model_name = "tmills/roberta_sfda_sharpseed"
config = AutoConfig.from_pretrained(model_name,output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, config=config)

### Copy of the pretrained source model that is kept fixed (used as a reference only)
'''
inputs of pretrained model is input_ids and attention_masks as model(input_ids,attention_masks)
output of the model is tuple of shape 2
out[0]: output for classification with shape: 2 for binary classification
out[1]: outputs for hidden layers of Roberta with shape 13 in the order: word embeddings + 12 hidden layers of encoder
'''
fixed_source_net = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

### Trainable target network, initialised from the same pretrained checkpoint
target_net = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

Some weights of the model checkpoint at tmills/roberta_sfda_sharpseed were not used when initializing RobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at tmills/roberta_sfda_sharpseed were not used when initializing RobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architectu

In [None]:
class NegationDataset(Dataset):
    """Dataset holding the tokenized features of negation instances.

    Each item is one feature object produced by
    ``glue_convert_examples_to_features``. No gold labels are stored: every
    example is created with ``label=None``.
    """

    def __init__(self, features):
        self.features = features
        self.label_list = ["-1", "1"]

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i) -> "InputFeatures":
        return self.features[i]

    def get_labels(self):
        """Return the label strings of the task."""
        return self.label_list

    @classmethod
    def from_tsv(cls, tsv_file, tokenizer):
        """Build a dataset from a tab-separated file (train or dev split).

        Every row of the file becomes exactly one instance, so instance ``i``
        always corresponds to row ``i``. If the first column of a row is one of
        the module-level ``labels`` it is treated as a label column and dropped
        from the text; the remaining columns are re-joined with tabs.

        Args:
            tsv_file: path of the TSV file to read.
            tokenizer: tokenizer passed to the feature converter.

        Returns:
            A ``NegationDataset`` wrapping the converted features.
        """
        lines = DataProcessor._read_tsv(tsv_file)

        examples = []
        for (i, line) in enumerate(lines):
            guid = 'instance-%d' % i
            # A blank line in the file is returned as an empty list; check for it
            # before indexing so it becomes an empty-text instance instead of
            # raising IndexError (keeping the row/instance alignment intact).
            if line and line[0] in labels:
                text_a = '\t'.join(line[1:])
            else:
                text_a = '\t'.join(line)
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=None))

        features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, label_list=labels,
            output_mode='classification')
        return cls(features)

# Build the training dataset for the negation task. from_tsv attaches no labels
# (label=None), so these instances are used as unlabelled data.
train_dataset = NegationDataset.from_tsv('practice_text/negation/train.tsv', tokenizer)

In [None]:
### The pretrained model is called as model(input_ids, attention_masks):
### stack the per-sample token ids and attention masks into two tensors.
input_ids = torch.tensor([feature.input_ids for feature in train_dataset.features])
attention_masks = torch.tensor([feature.attention_mask for feature in train_dataset.features])
print(input_ids.shape,attention_masks.shape)

torch.Size([2886, 128]) torch.Size([2886, 128])


In [None]:
batch_size = 32

# Every item is (input_ids, attention_mask, sample_index). The index travels with
# the sample so that a shuffled batch can be matched back to its dataset row.
sample_indices = torch.arange(input_ids.size(0))
dataset = TensorDataset(input_ids, attention_masks, sample_indices)

# Sequential (unshuffled) loader: used when computing pseudo-labels, so the
# results come out in dataset order.
train_pseudolabels_dataloader = DataLoader(dataset, batch_size=batch_size)

# Shuffled loader: used for the training updates.
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# just for testing
# target_net = target_net.cuda()
# for i,d in enumerate(train_pseudolabels_dataloader):
#     print(d[0].size())
#     out = target_net(d[0].cuda(),d[1].cuda())
#     # print(np.shape(out[1][-1]))
#     # feat = target_net.roberta.pooler(out[1][-1])
#     # print(np.shape(feat))
#     print('done')
#     break

In [None]:
def op_copy(optimizer):
    """Store each parameter group's current learning rate under the key 'lr0'.

    The stored value is the base rate that ``lr_scheduler`` later scales.
    The optimizer is modified in place and returned.
    """
    groups = optimizer.param_groups
    for position in range(len(groups)):
        groups[position]['lr0'] = groups[position]['lr']
    return optimizer

def lr_scheduler(optimizer, iter_num, max_iter, gamma=10, power=0.75):
    """Apply polynomial learning-rate decay to every parameter group.

    The rate becomes ``lr0 * (1 + gamma * iter_num / max_iter) ** (-power)``,
    where ``lr0`` is the base rate stored in the group. On every call the groups'
    weight decay, momentum and Nesterov flag are also (re)set to fixed values.
    The optimizer is modified in place and returned.
    """
    decay = (1 + gamma * iter_num / max_iter) ** (-power)
    fixed_settings = {'weight_decay': 1e-3, 'momentum': 0.9, 'nesterov': True}
    for group in optimizer.param_groups:
        group.update(lr=group['lr0'] * decay, **fixed_settings)
    return optimizer

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import math
import torch.nn.functional as F
import pdb

def Entropy(input_, epsilon=1e-5):
    """Return the entropy of each row of ``input_``.

    Args:
        input_: 2-D tensor whose rows are summed over ``dim=1``; the caller in
            this notebook passes softmax probabilities.
        epsilon: small constant added inside the logarithm so that a zero
            probability does not produce ``log(0)``.

    Returns:
        1-D tensor with one value per row: ``-sum(p * log(p + epsilon))``.
    """
    entropy = -input_ * torch.log(input_ + epsilon)
    return torch.sum(entropy, dim=1)

In [None]:
def obtain_pseudolabels(loader, target_net):
    """Assign a pseudo-label to every sample by clustering the model's features.

    Steps:
      1. Run ``target_net`` over ``loader`` without gradients and collect, per
         sample, the classification output and the last hidden layer's vector at
         the first token position.
      2. Append a constant 1 to each feature vector and L2-normalise it.
      3. Compute one centroid per class as the softmax-weighted mean of the
         features, and label each sample with its nearest centroid (cosine
         distance).
      4. Recompute the centroids once from those hard assignments and re-assign.

    Args:
        loader: iterable of batches where ``batch[0]`` holds the input ids and
            ``batch[1]`` the attention masks. It must iterate in dataset order,
            because entry ``i`` of the result belongs to the ``i``-th sample seen.
        target_net: model called as ``target_net(input_ids, attention_mask)`` and
            returning ``(classification_output, hidden_states)``. Inputs are
            moved to the device the model's parameters live on, so put the model
            on the GPU beforehand for speed.

    Returns:
        1-D integer NumPy array with one class index per sample.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    # Follow the model's device instead of assuming CUDA.
    parameters = getattr(target_net, 'parameters', None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Collect per-batch results and concatenate once at the end.
    feature_chunks, output_chunks = [], []
    with torch.no_grad():
        for data in loader:
            input_id = data[0].to(device)
            attention_mask = data[1].to(device)
            out = target_net(input_id, attention_mask)
            # Last hidden layer, first token position -> one vector per sample.
            feature_chunks.append(out[1][-1][:, 0, :].float().cpu())
            output_chunks.append(out[0].float().cpu())

    if not feature_chunks:
        raise ValueError("obtain_pseudolabels: the loader produced no batches")

    all_fea = torch.cat(feature_chunks, 0)
    all_output = nn.Softmax(dim=1)(torch.cat(output_chunks, 0))

    # Append a constant feature, then scale every row to unit L2 norm.
    all_fea = torch.cat((all_fea, torch.ones(all_fea.size(0), 1)), 1)
    all_fea = (all_fea.t() / torch.norm(all_fea, p=2, dim=1)).t()
    all_fea = all_fea.float().cpu().detach().numpy()

    # Initial centroids: feature means weighted by the softmax probabilities.
    K = all_output.size(1)
    aff = all_output.float().cpu().detach().numpy()
    initc = aff.transpose().dot(all_fea)
    initc = initc / (1e-8 + aff.sum(axis=0)[:, None])
    dd = cdist(all_fea, initc, 'cosine')
    pred_label = dd.argmin(axis=1)

    # One refinement round using the hard assignments as one-hot weights.
    for _ in range(1):
        aff = np.eye(K)[pred_label]
        initc = aff.transpose().dot(all_fea)
        initc = initc / (1e-8 + aff.sum(axis=0)[:, None])
        dd = cdist(all_fea, initc, 'cosine')
        pred_label = dd.argmin(axis=1)

    return pred_label.astype('int')

In [None]:
# iter_test = iter(train_pseudolabels_dataloader)
# for i in range(len(train_pseudolabels_dataloader)):
#     data = iter_test.next()
#     x = data[0].cuda()
#     y = x.cuda()
#     print(type(y))
#     break

In [None]:
### test obtain_pseudolabels
'''
takes about 19 mins for obtaining pseudo labels in CPU
Main reason - feed forward is slow
- much Faster on GPU
'''
# import time
# s = time.time()
# pseudo_labels= obtain_pseudolabels(train_pseudolabels_dataloader,target_net.cuda())
# e = time.time()
# print(np.shape(pseudo_labels))
# print((e-s)/60,'mins')
# print(np.shape(pseudo_labels),np.shape(predict))
# print(np.sum(pseudo_labels!=predict))

'\ntakes about 19 mins for obtaining pseudo labels in CPU\nMain reason - feed forward is slow\n- much Faster on GPU\n'

In [None]:
def train_target(loader, target_net, cls_par=0.1, epochs=10, lr=0.01, ent=True, gent=True, ent_par=1.0,
                 save=True, save_path='tmp/task1_app2', pseudolabel_loader=None):
    """Adapt ``target_net`` on unlabelled data with pseudo-label and entropy losses.

    At the start of every epoch (when ``cls_par > 0.001``) pseudo-labels for the
    whole dataset are recomputed with ``obtain_pseudolabels``. Each step then
    minimises

        cls_par * CrossEntropy(output, pseudo_label)          (if cls_par > 0.001)
      + ent_par * (mean entropy - entropy of the mean prediction)   (if ent; the
        second term only if gent)

    with SGD, using the decaying learning rate from ``lr_scheduler``. After every
    epoch the mean loss is printed and ``evaluation_in_train`` is called.

    This function reads the module-level names ``fixed_source_net``,
    ``test_dataloader`` and ``evaluation_in_train`` (and
    ``train_pseudolabels_dataloader`` when ``pseudolabel_loader`` is not given);
    they must be defined before it is called.

    Args:
        loader: training loader yielding ``(input_ids, attention_mask, index)``
            batches, where ``index`` is the sample's position in the dataset.
        target_net: the model to adapt. Batches are moved to the device its
            parameters are on.
        cls_par: weight of the pseudo-label loss; values <= 0.001 disable it.
        epochs: number of passes over ``loader``.
        lr: base learning rate for all trainable parameters.
        ent: enable the entropy loss.
        gent: additionally subtract the entropy of the batch-mean prediction
            (only used when ``ent`` is true).
        ent_par: weight of the entropy-based loss.
        save, save_path: accepted for backward compatibility; currently unused
            (checkpoint saving is not performed).
        pseudolabel_loader: loader used to compute pseudo-labels; it must iterate
            in dataset order so label ``i`` belongs to sample index ``i``.
            Defaults to the module-level ``train_pseudolabels_dataloader``.

    Returns:
        The adapted ``target_net``.

    Raises:
        ValueError: if both loss terms are disabled.
    """
    use_pseudolabels = cls_par > 0.001
    if not use_pseudolabels and not ent:
        # Without any loss term there is nothing to differentiate; fail early
        # instead of crashing inside backward().
        raise ValueError("train_target: both the pseudo-label loss (cls_par <= 0.001) "
                         "and the entropy loss (ent=False) are disabled")
    if pseudolabel_loader is None:
        pseudolabel_loader = train_pseudolabels_dataloader

    device = next(target_net.parameters()).device
    target_net.zero_grad()

    # Parameters are selected by their position in named_parameters(): everything
    # with index > 198 is frozen. Per the original note, indices > 196 are
    # pooler + classifier, so this presumably freezes the classification head
    # while the encoder stays trainable -- verify against a parameter listing.
    last_trainable_index = 198
    for i, (name, param) in enumerate(target_net.named_parameters()):
        param.requires_grad = i <= last_trainable_index

    param_group = [{'params': param, 'lr': lr}
                   for _, param in target_net.named_parameters() if param.requires_grad]

    optimizer = optim.SGD(param_group)
    optimizer = op_copy(optimizer)

    epoch_loss = 0.0
    epoch_samples = 0
    interval_iter = len(loader)  # steps per epoch
    max_iter = epochs * interval_iter
    iter_num = 0
    batch_iter = None  # created on first use, re-created whenever it is exhausted
    mem_label = None

    while iter_num < max_iter:
        print(iter_num // interval_iter,iter_num % interval_iter,'completed')
        if iter_num % interval_iter == 0 and use_pseudolabels:
            # Refresh the pseudo-labels for the whole dataset once per epoch.
            target_net.eval()
            mem_label = obtain_pseudolabels(pseudolabel_loader, target_net)
            mem_label = torch.from_numpy(mem_label).to(device)
            target_net.train()

        iter_num += 1
        lr_scheduler(optimizer, iter_num=iter_num, max_iter=max_iter)

        # Fetch the next batch; start a new pass over the loader when needed.
        batch = next(batch_iter, None) if batch_iter is not None else None
        if batch is None:
            batch_iter = iter(loader)
            batch = next(batch_iter)
        input_id, attention_mask, tar_idx = batch

        input_id = input_id.to(device)
        attention_mask = attention_mask.to(device)
        target_net.train()
        out = target_net(input_id, attention_mask)
        outputs_test = out[0]

        if use_pseudolabels:
            # tar_idx holds dataset positions, so it indexes the label table directly.
            pred = mem_label[tar_idx.to(device)]
            classifier_loss = cls_par * nn.CrossEntropyLoss()(outputs_test, pred)
        else:
            classifier_loss = torch.tensor(0.0, device=device)

        if ent:
            softmax_out = nn.Softmax(dim=1)(outputs_test)
            entropy_loss = torch.mean(Entropy(softmax_out))
            if gent:
                msoftmax = softmax_out.mean(dim=0)
                ### torch.sum(-msoftmax * torch.log(msoftmax + 1e-6)) is -Ldiv (note the -msoftmax),
                ### and IM loss = Lent + Ldiv = Lent - (-Ldiv); hence the subtraction.
                entropy_loss = entropy_loss - torch.sum(-msoftmax * torch.log(msoftmax + 1e-6))

            im_loss = entropy_loss * ent_par
            classifier_loss = classifier_loss + im_loss

        optimizer.zero_grad()
        classifier_loss.backward()
        optimizer.step()

        # Accumulate a sample-weighted loss so the epoch mean is exact even when
        # the last batch is smaller.
        epoch_loss += (outputs_test.size(0)*classifier_loss.item())
        epoch_samples += outputs_test.size(0)

        if iter_num % interval_iter==0:
            print('epoch',iter_num//interval_iter,'loss',epoch_loss/epoch_samples)
            epoch_loss = 0.0
            epoch_samples = 0
            target_net.eval()
            fixed_source_net.eval()
            evaluation_in_train(test_dataloader, target_net, fixed_source_net.to(device))

    return target_net

In [None]:
os.environ["CUDA_VISIBLE_DEVICES"]="0"

# NOTE: train_target calls evaluation_in_train and reads test_dataloader, both of
# which are defined in cells further down in this notebook. On a fresh kernel,
# run those cells before this one.
start=time.time()
# Start from a fresh copy of the pretrained checkpoint, moved to the GPU.
target_net = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
# With ent=False and gent=False only the pseudo-label cross-entropy term
# (weighted by cls_par) is optimised in this run.
target_net = train_target(train_dataloader,target_net.cuda(),
                          lr=1e-3,cls_par=1.0,ent_par=1.0,epochs=20,ent=False,gent=False)
end=time.time()

print('total time:',(end-start)/60,'mins')

Some weights of the model checkpoint at tmills/roberta_sfda_sharpseed were not used when initializing RobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0 0 completed
0 1 completed
0 2 completed
0 3 completed
0 4 completed
0 5 completed
0 6 completed
0 7 completed
0 8 completed
0 9 completed
0 10 completed
0 11 completed
0 12 completed
0 13 completed
0 14 completed
0 15 completed
0 16 completed
0 17 completed
0 18 completed
0 19 completed
0 20 completed
0 21 completed
0 22 completed
0 23 completed
0 24 completed
0 25 completed
0 26 completed
0 27 completed
0 28 completed
0 29 completed
0 30 completed
0 31 completed
0 32 completed
0 33 completed
0 34 completed
0 35 completed
0 36 completed
0 37 completed
0 38 completed
0 39 completed
0 40 completed
0 41 completed
0 42 completed
0 43 completed
0 44 completed
0 45 completed
0 46 completed
0 47 completed
0 48 completed
0 49 completed
0 50 completed
0 51 completed
0 52 completed
0 53 completed
0 54 completed
0 55 completed
0 56 completed
0 57 completed
0 58 completed
0 59 completed
0 60 completed
0 61 completed
0 62 completed
0 63 completed
0 64 completed
0 65 completed
0 66 completed
0 67 

In [None]:
def load_testdata():
    """Build the sequential DataLoader for the dev split.

    Reads 'practice_text/negation/dev.tsv' with the module-level tokenizer,
    stacks the token ids and attention masks into tensors, and returns a loader
    whose items are (input_ids, attention_mask, sample_index). The tensor shapes
    and the module-level batch size are printed along the way.
    """
    dev_features = NegationDataset.from_tsv('practice_text/negation/dev.tsv', tokenizer).features

    input_ids = torch.tensor([feature.input_ids for feature in dev_features])
    attention_masks = torch.tensor([feature.attention_mask for feature in dev_features])
    print(input_ids.shape,attention_masks.shape)

    sample_indices = torch.arange(input_ids.size(0))
    dev_tensors = TensorDataset(input_ids, attention_masks, sample_indices)

    print(batch_size)
    return DataLoader(dev_tensors, batch_size=batch_size)

In [None]:
# Build the dev-set loader once at module level; train_target reads this global
# when it evaluates the model after each epoch.
test_dataloader = load_testdata()
def evaluation_in_train(test_dataloader,target_net,fixed_source_net):
    """Score ``target_net`` on the dev split and print a comparison table.

    The model's predictions over ``test_dataloader`` are mapped from class index
    to label value through the module-level ``labels`` list and compared with the
    gold labels in 'practice_text/negation/dev_labels.txt'. F1, precision and
    recall are printed next to a hard-coded 'pretrained' row (presumably the
    previously measured scores of the unadapted source model -- they are not
    recomputed here).

    Args:
        test_dataloader: loader whose batches hold input ids at position 0 and
            attention masks at position 1, in the same order as the label file.
        target_net: model to evaluate; batches are moved to its device.
        fixed_source_net: accepted for interface compatibility; currently unused.
    """
    target_net.eval()
    device = next(target_net.parameters()).device

    # Inference only: no_grad avoids building autograd graphs for every batch.
    batch_outputs = []
    with torch.no_grad():
        for data in test_dataloader:
            out = target_net(data[0].to(device), data[1].to(device))
            batch_outputs.append(out[0].cpu().numpy())
    predict = np.argmax(np.concatenate(batch_outputs), 1)

    # Map class indices to label values for however many samples were predicted.
    label_values = np.array([int(label) for label in labels])
    pred = label_values[predict]

    test_true = np.loadtxt('practice_text/negation/dev_labels.txt',dtype=np.int32)

    scores = [['pretrained',0.834019,0.850746,0.817937],
              ['trained',f1_score(test_true,pred), precision_score(test_true,pred), recall_score(test_true,pred)]]
    print(tabulate(scores,headers=['model','f1 score','precision','recall']))

torch.Size([5545, 128]) torch.Size([5545, 128])
32


In [None]:
# Evaluate the adapted model on the dev split. The function defined in this
# notebook is evaluation_in_train; the name `evaluation` does not exist.
evaluation_in_train(test_dataloader,target_net,fixed_source_net)

NameError: ignored

In [None]:
# Print every parameter name with its position index; train_target selects the
# parameters to freeze by this index.
for position, (param_name, _) in enumerate(target_net.named_parameters()):
    print(position, param_name)