In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Colab-only dependency setup for this notebook.
# NOTE(review): torchviz is not version-pinned, unlike the other packages.
!pip3 install torchviz
# NOTE(review): under PEP 440 the specifier "3.02" normalizes to 3.2, so pip
# presumably resolves this to transformers 3.2.x rather than 3.0.2 -- confirm
# which release was intended and spell it out in full.
!pip3 install transformers==3.02
!pip3 install anaforatools==1.1.0
!pip3 install spacy==2.3.2



In [None]:
import os
import argparse
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    InputFeatures,
    AutoConfig,
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments
)
from spacy.lang.en import English
import anafora

In [None]:
# Checkpoint used for the frozen source model here and for the trainable
# target model in the training cell further down.
model_name = "clulab/roberta-timex-semeval"

# Build the config once and hand it to both the tokenizer and the model so
# that they are created from the same settings.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, config=config, use_fast=True)

# Source network; it is only used to produce pseudo-labels in this notebook.
fixed_source_net = AutoModelForTokenClassification.from_pretrained(model_name, config=config)

# Move to the GPU. As the last expression of the cell this also displays the
# module structure in the notebook output.
fixed_source_net.cuda()

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm

In [None]:
import torch.nn as nn
import copy
class classification_head(nn.Module):
  """Stand-alone token-classification head cloned from the source network.

  The head owns deep copies of ``fixed_source_net.dropout`` and
  ``fixed_source_net.classifier`` (read from the notebook's global scope at
  construction time), so its parameters are independent of the source model.
  For the checkpoint loaded in this notebook these are ``Dropout(p=0.1)`` and
  ``Linear(768, 65)``.
  """

  def __init__(self):
    super(classification_head, self).__init__()
    # Copy rather than share the modules so that training this head never
    # modifies the source network's own layers.
    self.dropout = copy.deepcopy(fixed_source_net.dropout)
    self.classifier = copy.deepcopy(fixed_source_net.classifier)

  def forward(self, x):
    """Apply dropout followed by the linear classifier to ``x``."""
    return self.classifier(self.dropout(x))
# Build one head from the source model; the bare name on the last line makes
# the notebook display its layers. The training cell creates a fresh head
# under the same name.
classifier = classification_head()
classifier


classification_head(
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=65, bias=True)
)

In [None]:
class TimexInputFeatures(InputFeatures):
    """Model inputs for one tokenized sentence plus its token offsets."""

    def __init__(self, input_ids, attention_mask, offset_mapping):
        super().__init__(input_ids=input_ids, attention_mask=attention_mask)
        # One (start, end) pair per token; see ``from_sentence`` for how the
        # pairs are shifted.
        self.offset_mapping = offset_mapping

    @classmethod
    def from_sentence(cls, input_data, sent_idx, sent_offset):
        """Build the features of sentence ``sent_idx`` of a tokenizer batch.

        Every offset pair whose start differs from its end is shifted by
        ``sent_offset``; pairs with ``start == end`` are left untouched. The
        shift is written in place into ``input_data["offset_mapping"]``.

        Args:
            input_data: mapping with ``"input_ids"``, ``"attention_mask"`` and
                ``"offset_mapping"`` entries indexable by sentence.
            sent_idx: index of the sentence inside ``input_data``.
            sent_offset: amount added to both ends of each non-empty pair.

        Returns:
            A new ``TimexInputFeatures`` for that sentence.
        """
        offset_mapping = input_data["offset_mapping"][sent_idx]
        for token_idx, offset in enumerate(offset_mapping):
            start, end = offset.numpy()
            if start != end:
                offset_mapping[token_idx][0] = start + sent_offset
                offset_mapping[token_idx][1] = end + sent_offset
        return cls(
            input_data["input_ids"][sent_idx],
            input_data["attention_mask"][sent_idx],
            offset_mapping,
        )


class TimexDataset(Dataset):
    """Sentence-level features for a collection of text documents.

    ``features`` is a flat list with one entry per sentence across all
    documents; ``doc_indices`` holds one ``(subdir, start, end)`` triple per
    document so that ``features[start:end]`` are that document's sentences.
    """

    def __init__(self, doc_indices, features):
        self.doc_indices = doc_indices
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i):
        return self.features[i]

    @classmethod
    def from_texts(cls, text_dir, nlp, tokenizer):
        """Read every document under ``text_dir`` and tokenize it by sentence.

        Args:
            text_dir: root directory walked with ``anafora.walk``.
            nlp: callable returning a document whose ``.sents`` yields
                sentences exposing ``text_with_ws``.
            tokenizer: callable tokenizer; it is asked for PyTorch tensors,
                ``max_length`` padding, truncation and offset mappings.

        Returns:
            A ``TimexDataset`` covering all sentences of all documents.

        Raises:
            Exception: if ``text_dir`` does not exist or a document directory
                does not contain exactly one matching text file.
        """
        if not os.path.exists(text_dir):
            raise Exception("The %s directory does not exist." % text_dir)

        all_features = []
        all_doc_indices = []
        walker = anafora.walk(text_dir, xml_name_regex=".*((?<![.].{3})|[.]txt)$")
        for subdir_path, _doc_name, file_names in walker:
            first_feature = len(all_features)
            if len(file_names) != 1:
                raise Exception("Wrong number of text files in %s" % subdir_path)

            with open(os.path.join(text_dir, subdir_path, file_names[0])) as handle:
                raw_text = handle.read()

            # Keep trailing whitespace on each sentence so that the sentence
            # lengths add up to positions in the original document text.
            sentences = [sent.text_with_ws for sent in nlp(raw_text).sents]
            encoded = tokenizer(sentences,
                                return_tensors="pt",
                                padding="max_length",
                                truncation="longest_first",
                                return_offsets_mapping=True)

            running_offset = 0
            for sent_idx in range(len(encoded["input_ids"])):
                all_features.append(
                    TimexInputFeatures.from_sentence(encoded, sent_idx, running_offset))
                running_offset += len(sentences[sent_idx])

            all_doc_indices.append((subdir_path, first_feature, len(all_features)))
        return cls(all_doc_indices, all_features)


def write_anafora(output_dir, dataset, predictions, tokenizer, config):
    """Decode token-level predictions into one Anafora XML file per document.

    Label ids are interpreted as the inline comments of the decoder describe:
    0 is "O", odd ids open an entity ("B-" labels) and the even id that
    follows an odd id continues that entity ("I-" label).

    Args:
        output_dir: root directory for the generated files; each document is
            written to ``<output_dir>/<doc_subdir>/<doc_name>.TimeNorm.system.completed.xml``.
        dataset: object with ``doc_indices`` (``(subdir, start, end)`` triples)
            and ``features`` (per-sentence objects with ``input_ids`` and
            ``offset_mapping``).
        predictions: per-sentence, per-token label scores; the label is the
            argmax over axis 2.
        tokenizer: provides ``get_special_tokens_mask``.
        config: provides ``id2label`` mapping integer ids to label strings.
    """

    def add_entity(data, doc_name, label, offset):
        """Append an entity for integer label id ``label``; ids <= 0 are skipped."""
        if label <= 0:
            return
        entity = anafora.AnaforaEntity()
        num_entities = len(data.xml.findall("annotations/entity"))
        entity.id = "%s@%s" % (num_entities, doc_name)
        entity.spans = ((offset[0], offset[1]),)
        entity.type = config.id2label[label].replace("B-", "")
        data.annotations.append(entity)

    for doc_index in dataset.doc_indices:
        doc_subdir, doc_start, doc_end = doc_index
        doc_name = os.path.basename(doc_subdir)
        doc_features = dataset.features[doc_start:doc_end]
        doc_predictions = np.argmax(predictions[doc_start:doc_end], axis=2)
        data = anafora.AnaforaData()
        for sent_labels, sent_features in zip(doc_predictions, doc_features):
            # Remove padding and <s> </s>: skip the first position and keep as
            # many positions as there are non-special tokens.
            special_mask = tokenizer.get_special_tokens_mask(sent_features.input_ids,
                                                             already_has_special_tokens=True)
            non_specials = np.count_nonzero(np.array(special_mask) == 0)
            sent_labels = sent_labels[1: non_specials + 1]
            sent_offsets = sent_features.offset_mapping[1: non_specials + 1]

            previous_label = 0
            previous_offset = [None, None]  # (start, end)
            for token_label, token_offset in zip(sent_labels, sent_offsets):
                label_diff = token_label - previous_label
                if token_label % 2 != 0:  # If odd number, it is B label
                    add_entity(data, doc_name, previous_label, previous_offset)
                    previous_label = token_label
                    # Copy the pair: extending the span below must not write
                    # back into the dataset's stored offset mapping.
                    previous_offset = [token_offset[0], token_offset[1]]
                elif label_diff == 1:  # If even number and diff with previous is 1, it is I label
                    previous_offset[1] = token_offset[1]
                elif previous_label > 0:  # If current is O label and previous not O we must write it.
                    add_entity(data, doc_name, previous_label, previous_offset)
                    previous_label = 0
                    previous_offset = [None, None]
            if previous_label > 0:  # If remaining previous not O we must write it.
                # add_entity expects the integer id; passing the label string
                # here used to fail on the ``label > 0`` comparison.
                add_entity(data, doc_name, previous_label, previous_offset)
        doc_path = os.path.join(output_dir, doc_subdir)
        os.makedirs(doc_path, exist_ok=True)
        doc_path = os.path.join(doc_path,
                                "%s.TimeNorm.system.completed.xml" % doc_name)
        data.to_file(doc_path)


In [None]:
# Blank English pipeline whose only component is the rule-based sentencizer;
# it is used purely to split documents into sentences.
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))

# Load and tokenize the whole training split. The path is relative, so it
# depends on the notebook's working directory.
dataset = TimexDataset.from_texts("drive/My Drive/SFDA/Time/data/Train", nlp, tokenizer)
cnt = 0

# Stack the per-sentence tensors into two (num_sentences, seq_len) tensors.
input_ids = torch.tensor([f.input_ids.numpy() for f in dataset.features])
attention_mask = torch.tensor([f.attention_mask.numpy() for f in dataset.features])
print(input_ids.shape)
def Entropy(input_):
    """Return ``-sum(p * log(p + 1e-9))`` along dimension 1 of ``input_``.

    Assumes ``input_`` holds probabilities along dim 1 (the callers in this
    notebook pass softmax outputs). The result has one value per row.
    """
    # The small constant keeps the logarithm finite for zero probabilities.
    log_term = torch.log(input_ + 1e-9)
    return torch.sum(-input_ * log_term, dim=1)

torch.Size([1116, 512])


In [None]:


# Per-token prediction entropy of the frozen source model over the dataset.
#   entropyl : entropy of every token position, all sentences concatenated
#   y_truel  : the source model's argmax label for the same positions (these
#              are predictions, not gold labels, despite the name)
#   ids      : boolean mask of positions whose predicted label is not 0
m = torch.nn.Softmax(dim=1)

# Dropout must be off so the entropies are deterministic even if an earlier
# run of the training cell left the model in train mode.
fixed_source_net.eval()

entropy_chunks = []
label_chunks = []
# Inference only: no_grad avoids building an autograd graph per sentence.
with torch.no_grad():
  for feat in dataset.features:
    out = fixed_source_net(feat.input_ids.unsqueeze(0).cuda(),
                           attention_mask=feat.attention_mask.unsqueeze(0).cuda())
    token_logits = out[0][0]  # logits of the single sentence in the batch
    entropy_chunks.append(Entropy(m(token_logits)))
    label_chunks.append(torch.argmax(token_logits, dim=1))
    del out, token_logits

# Concatenate once at the end instead of growing a tensor inside the loop.
if entropy_chunks:
  entropyl = torch.cat(entropy_chunks, 0)
  y_truel = torch.cat(label_chunks, 0)  # integer class ids
else:
  entropyl = torch.tensor([]).cuda()
  y_truel = torch.tensor([]).cuda()
del entropy_chunks, label_chunks

print(entropyl.shape)
ids = y_truel != 0


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 0, 9, 0, 0, 0, 0, 0, 0, 0,
     

In [None]:
# Entropies of the positions the source model labelled as non-zero (``ids``),
# moved to NumPy for the quantile computation.
l = entropyl.cpu().numpy()[ids.cpu().numpy()]

# Median, 60th, 70th and 80th percentiles, and the maximum.
quantiles = np.quantile(l, [0.5, 0.6, 0.7, 0.8, 1])
print(quantiles)
print(ids.cpu().numpy())
print(l)

[0.02398647 0.04843569 0.0937101  0.25172172 3.11537933]
[False False False ... False False False]
[0.00771167 0.00889709 0.02310346 ... 0.05382148 0.01520937 0.00477003]


In [None]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 
# NOTE: this rebinds ``dataset`` from the TimexDataset built earlier to a plain
# TensorDataset, so earlier cells that read ``dataset.features`` cannot be
# re-run after this one without rebuilding the TimexDataset first.
dataset = TensorDataset(input_ids, attention_mask)

# The prototype and training code below index element [0] of every batch, so
# they effectively assume one sentence per batch.
batch_size = 1  # 32

# Shuffled loader used for the training steps.
# Experiment note kept from the original: the 0.8759 run used neither a
# RandomSampler nor shuffle=True.
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Sequential loader used by APM_ours to scan the whole dataset in order.
APM_dataloader = DataLoader(dataset, batch_size=batch_size)


In [None]:
def op_copy(optimizer):
    """Store each parameter group's current ``'lr'`` under the key ``'lr0'``.

    ``lr_scheduler`` reads ``'lr0'`` as the base rate it decays from. The
    optimizer is modified in place and returned.
    """
    for group in optimizer.param_groups:
        group.update(lr0=group['lr'])
    return optimizer

def lr_scheduler(optimizer, iter_num, max_iter, gamma=10, power=0.75):
    """Decay every group's learning rate from its stored ``'lr0'``.

    The new rate is ``lr0 * (1 + gamma * iter_num / max_iter) ** (-power)``.
    Each group's ``weight_decay``, ``momentum`` and ``nesterov`` entries are
    also overwritten with fixed values on every call. The optimizer is
    modified in place and returned; ``'lr0'`` must already be present (see
    ``op_copy``).
    """
    decay = (1 + gamma * iter_num / max_iter) ** (-power)
    for group in optimizer.param_groups:
        group.update({
            'lr': group['lr0'] * decay,
            'weight_decay': 0.0005,
            'momentum': 0.9,
            'nesterov': True,
        })
    return optimizer

def tensor_l2normalization(q):
    """Divide every row of ``q`` by its L2 norm taken along dimension 1.

    The norm is detached, so it is treated as a constant by autograd. A row
    of zeros has norm 0 and is therefore divided by zero.
    """
    row_norms = q.norm(p=2, dim=1, keepdim=True).detach()
    return q / row_norms


In [None]:
def mean_crossentropy(y_logits,y_true):
  """Mean cross-entropy of the first batch element only.

  Only ``y_logits[0]`` and ``y_true[0]`` are used; any further batch
  elements are ignored.
  """
  criterion = nn.CrossEntropyLoss()
  first_logits, first_targets = y_logits[0], y_true[0]
  return criterion(first_logits, first_targets)
  


In [None]:
import time
def APM_ours(target_trainable_net):
  """Build the per-class prototype memory from the model's current predictions.

  The function scans the global ``APM_dataloader`` once. For the first item
  of every batch it takes the last hidden state and the softmax over the
  logits, and groups token positions by their argmax class (65 classes are
  hard-coded). Per class it stores the normalized prediction entropy and the
  hidden feature of each position.

  Prototype selection:
    * ``max_top1_ent`` is the largest, across predicted classes, of each
      class's smallest entropy.
    * A class contributes as many prototypes as it has entries with entropy
      <= ``max_top1_ent``; the common prototype count is the largest such
      number, capped at ``max_prototype_bound`` (12).
    * A predicted class's lowest-entropy features are tiled and cut to that
      common count; a class that was never predicted gets one feature row
      repeated that many times.

  Args:
    target_trainable_net: model called as
      ``model(input_ids, attention_mask=..., output_hidden_states=True)``;
      element 0 of its output is used as logits and element 1 as the tuple
      of hidden states. It is moved to the GPU.

  Returns:
    ``(prototype_memory, num_prototype_, prototype_memory_dict)``: the
    prototypes of all 65 classes concatenated in class order, the number of
    prototypes per class, and a dict mapping class id to its prototype array.

  NOTE(review): the scan is not wrapped in ``torch.no_grad()``, so each
  forward pass builds an autograd graph that is only released by the ``del``
  statements below -- confirm whether gradients are ever needed here.
  """
  target_trainable_net = target_trainable_net.cuda()

  cnt=1
  available_cls = []
  h_dict = {}      # class id -> list of per-batch arrays of normalized entropies
  feat_dict = {}   # class id -> list of per-batch arrays of hidden features
  missing_cls = []
  # Full softmax outputs / features of every scanned position; used only by
  # the fallback branch for classes that were never predicted.
  after_softmax_numpy_for_emergency = []
  feature_numpy_for_emergency = []
  max_prototype_bound = 12
  start_time = time.time()
  m=torch.nn.Softmax(dim=2)

  for cls in range(65): ## number of classes and their cls
    h_dict[cls] = []
    feat_dict[cls] = []
  cnt=0
  for dt in APM_dataloader:
    cnt+=1
    # print(cnt)
    input_id = dt[0].cuda()
    attention_mask = dt[1].cuda()

    out = target_trainable_net(input_id,attention_mask=attention_mask,output_hidden_states = True)
    del input_id,attention_mask
    # Last hidden state; only the first item of the batch is kept.
    fc1 = out[1][-1][:,:,:]
    temp = fc1
    fc1 = fc1[0]
    del temp
    # Class probabilities; again only the first item of the batch is kept.
    pred = m(out[0])
    temp = pred
    pred = pred[0]
    del temp
    del out
    after_softmax_numpy_for_emergency.append(pred.cpu().detach().numpy())
    feature_numpy_for_emergency.append(fc1.cpu().detach().numpy())
    pseudo_label = torch.argmax(pred, dim=1)
    pseudo_label = pseudo_label.cpu()
    # NOTE(review): unlike Entropy() elsewhere in this notebook there is no
    # epsilon inside the log, so a probability of exactly 0 would yield NaN
    # (0 * -inf) -- confirm this cannot occur.
    entropy = torch.sum(- pred * torch.log(pred), dim=1, keepdim=True)
    # Normalize by log(number of classes).
    entropy_norm = entropy / np.log(pred.size(1))
    entropy_norm = entropy_norm.squeeze(1)
    entropy_norm = entropy_norm.cpu()
    # diff = -torch.abs(out[0][:,0] - out[0][:,1])
    for cls in range(65):
      # stack H for each class
      cls_filter = (pseudo_label == cls)
      list_loc = (torch.where(cls_filter == 1))[0]
      num_element = list(list_loc.numpy())
      if len(list_loc) == 0:
          missing_cls.append(cls)
          continue
      available_cls.append(cls)
      filtered_ent = torch.gather(entropy_norm, dim=0, index=list_loc)
      # 768 is the hard-coded hidden size of the features.
      filtered_feat = torch.gather(fc1.cpu(), dim=0, index=list_loc.unsqueeze(1).repeat(1, 768))
      h_dict[cls].append(filtered_ent.detach().numpy())
      feat_dict[cls].append(filtered_feat.cpu().detach().numpy())
    del entropy_norm, filtered_feat,list_loc,num_element,pseudo_label
  print("scan complete")
  # NOTE(review): ``cls`` is the value leaked from the loops above (64), so
  # this replaces class 0's entropy list with the first 30% of class 64's
  # list while ``feat_dict[0]`` is left unchanged; the two no longer
  # correspond. It looks like class 0 itself was meant to be subsampled --
  # confirm the intent.
  h_dict[0] = h_dict[cls][:int(0.3*len(h_dict[cls]))]
  # print(len(h_dict[0]))
  # Bookkeeping below (s, min, min_idx, l) is only read by the commented-out
  # debug prints. Note that ``min`` shadows the builtin inside this function.
  s=0
  min=1580
  min_idx=0
  l=[]
  for i in range(0,65):
    if min>len(h_dict[i]) and len(h_dict[i])!=0:
      min=len(h_dict[i])
      min_idx= i
    if len(h_dict[i])==0:
      l.append(i)
    s += len(h_dict[i])
  # print(s)
  # print(min,min_idx)
  # print(h_dict[min_idx])
  # print(l)
  # print(len(h_dict[1]))
  available_cls = np.unique(available_cls)
  # print(feat_dict[0],h_dict[0])
  # print(feat_dict[1],h_dict[1])
  prototype_memory = []
  prototype_memory_dict = {}
  # Flatten the per-batch lists into one row per scanned token position.
  temp = after_softmax_numpy_for_emergency
  after_softmax_numpy_for_emergency = np.concatenate(after_softmax_numpy_for_emergency, axis=0)
  del temp
  temp = feature_numpy_for_emergency
  feature_numpy_for_emergency = np.concatenate(feature_numpy_for_emergency, axis=0)
  del temp
  # Find the largest of the per-class minimum entropies.
  max_top1_ent = 0
  for cls in available_cls:
    ents_np = np.concatenate(h_dict[cls], axis=0)
    ent_idxs = np.argsort(ents_np)
    top1_ent = ents_np[ent_idxs[0]]
    if max_top1_ent < top1_ent:
      max_top1_ent = top1_ent
      max_top1_class = cls
    del ents_np,ent_idxs,top1_ent

  # Count, per class, the entries whose entropy does not exceed that threshold.
  class_protypeNum_dict = {}
  max_prototype = 0
  # max_prototype = 100
  for cls in available_cls:
    ents_np = np.concatenate(h_dict[cls], axis=0)
    ents_np_filtered = (ents_np <= max_top1_ent)
    class_protypeNum_dict[cls] = ents_np_filtered.sum()

    if max_prototype < ents_np_filtered.sum():
      max_prototype = ents_np_filtered.sum()
    del ents_np,ents_np_filtered
  if max_prototype > max_prototype_bound:
      max_prototype = max_prototype_bound
  # print(max_prototype)
  # print("reached")
  # print(feat_dict)
  for cls in range(65):
    # print(cls)
    if cls in available_cls:
      # print(h_dict[cls])
      ents_np = np.concatenate(h_dict[cls], axis=0)
      # print(ents_np)
      feats_np = np.concatenate(feat_dict[cls],axis=0)
      # print(feat_dict[cls])
      ent_idxs = np.argsort(ents_np)
      del ents_np
      # print(ent_idxs,class_protypeNum_dict[cls])
    
      # truncated_feat = feats_np[ent_idxs[:100]]
      # Keep the lowest-entropy features that passed the threshold ...
      truncated_feat = feats_np[ent_idxs[:class_protypeNum_dict[cls]]]
      # print(truncated_feat)
      del feats_np
      # ... then tile them often enough to reach max_prototype rows and cut
      # the result to exactly that many.
      fit_to_max_prototype = np.concatenate([truncated_feat] * (int(max_prototype / truncated_feat.shape[0]) + 1),
                                            axis=0)
      
      del truncated_feat
      temp = fit_to_max_prototype
      fit_to_max_prototype = fit_to_max_prototype[:max_prototype, :]
      prototype_memory.append(fit_to_max_prototype)
      prototype_memory_dict[cls] = fit_to_max_prototype
      del fit_to_max_prototype,temp
    else:
      # Fallback for classes that were never predicted.
      # NOTE(review): argsort runs over dim=1 (classes), so the value picked
      # below is a class index from the first row's ranking, yet it is then
      # used to index rows of the feature array. Sorting over dim=0
      # (positions) may have been intended -- confirm.
      after_softmax_torch_for_emergency = torch.Tensor(after_softmax_numpy_for_emergency)
      emergency_idx = torch.argsort(after_softmax_torch_for_emergency, descending=True, dim=1)
      cls_emergency_idx = emergency_idx[:, cls]
      cls_emergency_idx = cls_emergency_idx[0]
      cls_emergency_idx_numpy = cls_emergency_idx.data.numpy()

      copied_features_emergency = np.concatenate(
          [np.expand_dims(feature_numpy_for_emergency[cls_emergency_idx_numpy], axis=0)] * max_prototype, axis=0)

      prototype_memory.append(copied_features_emergency)
      prototype_memory_dict[cls] = copied_features_emergency
      del copied_features_emergency,cls_emergency_idx,after_softmax_torch_for_emergency,emergency_idx,cls_emergency_idx_numpy

  print("** APM update... time:", time.time() - start_time)
  
  prototype_memory = np.concatenate(prototype_memory, axis=0) ## check
  num_prototype_ = int(max_prototype)
  # This prints the full prototype dict and array, which is very verbose.
  print(num_prototype_,prototype_memory_dict,prototype_memory)
  return prototype_memory, num_prototype_, prototype_memory_dict

# APM_ours(fixed_source_net)


In [None]:
from torch.autograd import Variable
import copy 
import time
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# def target_train(train_lr,train_update_freq):
# del config1,tokenizer1,target_trainable_net
# ---------------------------------------------------------------------------
# Source-free adaptation of the target model.
#
# Each step processes one sentence (the loss code indexes batch element 0):
#   * the frozen source model supplies pseudo-labels for the separate
#     ``classifier`` head applied to the target features (ce_from_s2t);
#   * the prototype memory from APM_ours supplies nearest-prototype
#     pseudo-labels for the target model's own logits (ce_from_t), masked by
#     a confidence test on prototype distances.
# Losses are accumulated over 5 steps before each optimizer update.
# ---------------------------------------------------------------------------

# --- Models ----------------------------------------------------------------
config1 = AutoConfig.from_pretrained(model_name)
tokenizer1 = AutoTokenizer.from_pretrained(model_name,
                                          config=config1,
                                          use_fast=True)
target_trainable_net = AutoModelForTokenClassification.from_pretrained(model_name,
                                                        config=config1)
target_trainable_net.cuda()
classifier = classification_head()
classifier.cuda()

train_start_time=time.time()

# --- Hyper-parameters ------------------------------------------------------
train_min_step = 1116 ### 1000 originally
train_lr=0.0001
train_weight_decay = 0.0005
train_momentum = 0.9
train_update_freq = 500 # changed from 10

# The target model and the separate head are optimized together.
optimizer = optim.SGD(list(filter(lambda p: p.requires_grad, target_trainable_net.parameters()))+list(classifier.parameters()), 
                      lr=train_lr, weight_decay=train_weight_decay, momentum=train_momentum, nesterov=True)

# Remember the base learning rate so lr_scheduler can decay from it.
optimizer = op_copy(optimizer)
global_step = 0
best_acc = 0
best_epoch = 0
epoch_id = 0
class_num =  2
# lr_scheduler divides by max_epoch * len(train_dataloader); with the current
# values max_epoch is 1. It would be 0 if the dataset had more items than
# train_min_step * batch_size.
max_epoch = (train_min_step*batch_size)//len(dataset)
pt_memory_update_frequncy =  train_update_freq
cnt=0  # steps on which prototype and source pseudo-labels disagree
m=torch.nn.Softmax(dim=1)

fixed_source_net.train()
classifier.train()

fixed_source_net.zero_grad()
classifier.zero_grad()
target_trainable_net.zero_grad()

while global_step < train_min_step:
  epoch_id += 1
  max_iter = max_epoch * len(train_dataloader)
  print(max_iter)
  epoch_start_time=time.time()
  batch_loss = 0
  for i, dt in enumerate(train_dataloader):
    # APM init/update: rebuild the prototype memory every
    # pt_memory_update_frequncy steps, with the target model in eval mode.
    if (global_step) % pt_memory_update_frequncy == 0: ### change
      target_trainable_net.eval() ### change
      prototype_memory, num_prototype_,prototype_memory_dict = APM_ours(target_trainable_net)
    input_id = dt[0].cuda()
    attention_mask = dt[1].cuda()

    ## frozen source model -> pseudo-labels
    # NOTE(review): the source model is kept in train mode, so dropout is
    # active while it produces pseudo-labels -- confirm this is intended.
    fixed_source_net.train()
    # The source model is not optimized and only its argmax is used, so no
    # autograd graph is needed for this forward pass.
    with torch.no_grad():
      out_s = fixed_source_net(input_id,attention_mask=attention_mask)
    pseudo_label_s = torch.argmax(out_s[0], dim=2).cuda()

    ## trainable target model
    target_trainable_net.train()
    out_t = target_trainable_net(input_id,attention_mask=attention_mask,output_hidden_states=True)
    fc_t = out_t[1][-1][:,:,:]  # last hidden state
    feature_embed_tensor = fc_t[0].cpu()

    logit_s2t = classifier(fc_t) ### change
    logit_t = out_t[0]

    # Snapshot of one weight matrix, used below only to report whether the
    # optimizer step changed the model.
    wt1 = target_trainable_net.roberta.encoder.layer[10].attention.output.dense.weight.cpu().detach().numpy()

    # Cosine similarity between every token feature and every prototype,
    # averaged over the num_prototype_ prototypes of each class.
    proto_feat_tensor = torch.Tensor(prototype_memory)
    proto_feat_tensor = tensor_l2normalization(proto_feat_tensor)
    batch_feat_tensor = tensor_l2normalization(feature_embed_tensor)

    sim_mat = torch.mm(batch_feat_tensor, proto_feat_tensor.permute(1,0))
    sim_mat = F.avg_pool1d(sim_mat.unsqueeze(0), kernel_size=num_prototype_, stride=num_prototype_).squeeze(0)# (B, #class)

    pseudo_label_t = torch.argmax(sim_mat, dim=1).cuda()
    if not np.array_equal(pseudo_label_t.cpu().detach().numpy(),pseudo_label_s.cpu().detach().numpy()):
      cnt=cnt+1

    # confidence-based filtering: compare distances to the prototypes of the
    # most similar and the second most similar class.
    arg_idxs = torch.argsort(sim_mat, dim=1, descending=True) # (B, #class)
    first_group_idx = arg_idxs[:, 0]
    second_group_idx = arg_idxs[:, 1]
    first_group_feat = [prototype_memory_dict[int(x.numpy())] for x in first_group_idx]
    first_group_feat_tensor = torch.tensor(np.concatenate(first_group_feat, axis=0)) # (B*P, feature_dim)
    first_group_feat_tensor = tensor_l2normalization(first_group_feat_tensor)

    second_group_feat = [prototype_memory_dict[int(x.numpy())] for x in second_group_idx]
    second_group_feat_tensor = torch.tensor(np.concatenate(second_group_feat, axis=0)) # (B*P, feature_dim)
    second_group_feat_tensor = tensor_l2normalization(second_group_feat_tensor)

    feature_embed_tensor_repeat = torch.Tensor(np.repeat(feature_embed_tensor.cpu().detach().numpy(), repeats=num_prototype_, axis=0))
    feature_embed_tensor_repeat = tensor_l2normalization(feature_embed_tensor_repeat)
    first_dist_mat = 1 - torch.mm(first_group_feat_tensor, feature_embed_tensor_repeat.permute(1,0)) # distance = 1 - similarity
    second_dist_mat = 1 - torch.mm(second_group_feat_tensor, feature_embed_tensor_repeat.permute(1,0))

    # Largest distance to the first group, smallest distance to the second.
    first_dist_mat = F.max_pool2d(first_dist_mat.permute(1,0).unsqueeze(0).unsqueeze(0), kernel_size=num_prototype_, stride=num_prototype_).squeeze(0).squeeze(0)# (B, #class)
    second_dist_mat = -1*F.max_pool2d(-1* second_dist_mat.permute(1,0).unsqueeze(0).unsqueeze(0), kernel_size=num_prototype_, stride=num_prototype_).squeeze(0).squeeze(0)# (B, #class)

    first_dist_vec = torch.diag(first_dist_mat) #(B)
    second_dist_vec = torch.diag(second_dist_mat) # B
    # A token is confident when even its farthest first-group prototype is
    # closer than its nearest second-group prototype.
    confidence_mask = ((first_dist_vec- second_dist_vec) < 0).cuda()

    # optimize target network using two types of pseudo labels
    ce_from_s2t = nn.CrossEntropyLoss()(logit_s2t[0], pseudo_label_s[0])
    ce_from_t = nn.CrossEntropyLoss(reduction='none')(logit_t[0], pseudo_label_t).view(-1, 1).squeeze(1)
    ce_from_t = torch.mean(ce_from_t*confidence_mask, dim=0, keepdim=True)
    # Weight of the prototype loss; it starts at 0 and grows with the step.
    # With the constants used here it only reaches about 0.08 by the last
    # step. The builtin float replaces np.float, an alias of it that NumPy
    # has since removed.
    alpha = float(2.0 / (1.0 + np.exp( -0.08* global_step / float(train_min_step//2))) - 1.0)   ### 0.8 for 0.8759 ## changed from -0.08 to -0.8
    ce_total = (1 - alpha) * ce_from_s2t + alpha * ce_from_t

    # Gradient accumulation: start a new sum every 5 steps, update after the
    # 5th contribution.
    if global_step%5==0:
      batch_loss = ce_total
    else:
      batch_loss+=ce_total
    global_step+=1 
    if global_step%5==0:
      optimizer.zero_grad()
      batch_loss /= 5
      # The accumulated graph is used exactly once, so it need not be retained.
      batch_loss.backward()
      print(global_step, batch_loss)
      optimizer.step()
      lr_scheduler(optimizer, iter_num=global_step, max_iter=max_iter)
      new_wt1 = target_trainable_net.roberta.encoder.layer[10].attention.output.dense.weight.cpu().detach().numpy()
      if not np.array_equal(wt1,new_wt1):
        print("updated")
      else:
        print("not updated")

  epoch_end_time=time.time()
  print("time taken for epoch no. {} is {}".format(epoch_id,epoch_end_time-epoch_start_time))
  target_trainable_net.eval()
  fixed_source_net.eval()
  # Checkpoint the adapted model and tokenizer after every epoch.
  tokenizer1.save_pretrained("/content/drive/My Drive/SFDA/Time/model/task2_model_APM/"+str(epoch_id)+"/")
  target_trainable_net.save_pretrained("/content/drive/My Drive/SFDA/Time/model/task2_model_APM/"+str(epoch_id)+"/")
  # acc = evaluation(test_dataloader,target_trainable_net,fixed_source_net)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
torch.Size([512, 768])
114
torch.Size([6144, 768])
torch.Size([512, 768])
tensor([0.1587], device='cuda:0', grad_fn=<DivBackward0>)
updated
115
torch.Size([6144, 768])
torch.Size([512, 768])
116
torch.Size([6144, 768])
torch.Size([512, 768])
117
torch.Size([6144, 768])
torch.Size([512, 768])
118
torch.Size([6144, 768])
torch.Size([512, 768])
119
torch.Size([6144, 768])
torch.Size([512, 768])
tensor([0.1652], device='cuda:0', grad_fn=<DivBackward0>)
updated
120
torch.Size([6144, 768])
torch.Size([512, 768])
121
torch.Size([6144, 768])
torch.Size([512, 768])
122
torch.Size([6144, 768])
torch.Size([512, 768])
123
torch.Size([6144, 768])
torch.Size([512, 768])
124
torch.Size([6144, 768])
torch.Size([512, 768])
tensor([0.1733], device='cuda:0', grad_fn=<DivBackward0>)
updated
125
torch.Size([6144, 768])
torch.Size([512, 768])
126
torch.Size([6144, 768])
torch.Size([512, 768])
127
torch.Size([6144, 768])
torch.Size([512, 768])


In [None]:
# Export the current target model and its tokenizer to Google Drive.
# NOTE(review): the directory is named "without_APM", but the training cell in
# this notebook builds its pseudo-labels with APM_ours -- confirm the name
# matches the run that is being saved.
tokenizer1.save_pretrained("/content/drive/My Drive/task2_model_without_APM")
target_trainable_net.save_pretrained("/content/drive/My Drive/task2_model_without_APM")

In [None]:
import glob
import os
# if not os.path.exists('my_folder'):
    # os.makedirs('my_folder')
# files = glob.glob('/content/drive/My Drive/source-free-domain-adaptation/practice_text/time/*')
# For every document found under data/Dev/<corpus>/, copy the entry with the
# same name from practice_data/time/<corpus>/ into true_labels/Dev/<corpus>/.
# NOTE: ``l`` reuses the name of the entropy array from an earlier cell, and
# ``dir`` below shadows the Python builtin.
l = ["AQUAINT","TimeBank","te3-platinum"]
for i in l:
  files = glob.glob('/content/drive/My Drive/SFDA/Time/data/Dev/'+str(i)+'/*')
  ln = len(files)  # not used below
  for fl in files:
    # The last path component is the document name.
    s = fl.split("/")
    print(s)
    fl1 = '/content/drive/My Drive/source-free-domain-adaptation/practice_data/time/'+str(i)+"/"+s[-1]
    print(fl1)
    dir = "/content/drive/My Drive/SFDA/Time/true_labels/Dev/"+str(i)+"/"
    # IPython shell escape; the quotes are needed because the paths contain
    # spaces ("My Drive").
    !cp -r "$fl1" "$dir"


  

['', 'content', 'drive', 'My Drive', 'SFDA', 'Time', 'data', 'Dev', 'AQUAINT', 'APW19980820.1428']
/content/drive/My Drive/source-free-domain-adaptation/practice_data/time/AQUAINT/APW19980820.1428
['', 'content', 'drive', 'My Drive', 'SFDA', 'Time', 'data', 'Dev', 'AQUAINT', 'APW19980818.0515']
/content/drive/My Drive/source-free-domain-adaptation/practice_data/time/AQUAINT/APW19980818.0515
['', 'content', 'drive', 'My Drive', 'SFDA', 'Time', 'data', 'Dev', 'TimeBank', 'NYT19980212.0019']
/content/drive/My Drive/source-free-domain-adaptation/practice_data/time/TimeBank/NYT19980212.0019
['', 'content', 'drive', 'My Drive', 'SFDA', 'Time', 'data', 'Dev', 'TimeBank', 'CNN19980227.2130.0067']
/content/drive/My Drive/source-free-domain-adaptation/practice_data/time/TimeBank/CNN19980227.2130.0067
['', 'content', 'drive', 'My Drive', 'SFDA', 'Time', 'data', 'Dev', 'TimeBank', 'VOA19980331.1700.1533']
/content/drive/My Drive/source-free-domain-adaptation/practice_data/time/TimeBank/VOA19980331