In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd drive/MyDrive/hw3/

/content/drive/MyDrive/hw3


In [1]:
!pip install pickle5
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import pickle5 as pickle
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
with open('train_pp.pickle', 'rb') as f:
  train_pp = pickle.load(f)

with open('val_pp.pickle', 'rb') as f:
  val_pp = pickle.load(f)

with open('test_pp.pickle', 'rb') as f:
  test_pp = pickle.load(f)

with open('vocab.pickle', 'rb') as f:
  vocab = pickle.load(f)

In [22]:
def map_targets(vocab): 
  label_idx = {}

  idx = 0 
  for label in ['pad', 'unk']: 
    label_idx[label] = idx
    idx += 1

  for label in vocab: 
    if label not in ['pad', 'bos', 'eos', 'unk']: 
      label_idx[label] = idx
      idx += 1
  
  return label_idx 

In [23]:
rel_pos_idxs = map_targets(vocab['rel_pos_set'])
dep_label_idxs = map_targets(vocab['dep_label_set'])

In [41]:
with open('rel_pos_idxs.pickle', 'wb') as handle:
    pickle.dump(rel_pos_idxs, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('dep_label_idxs.pickle', 'wb') as handle:
    pickle.dump(dep_label_idxs, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
dep_label_idxs

{'pad': 0,
 'unk': 1,
 'csubj': 2,
 'obl:tmod': 3,
 'punct': 4,
 'reparandum': 5,
 'nummod': 6,
 'advcl': 7,
 'iobj': 8,
 'aux': 9,
 'acl:relcl': 10,
 'acl': 11,
 'ccomp': 12,
 'mark': 13,
 'cc:preconj': 14,
 'appos': 15,
 'expl': 16,
 'vocative': 17,
 'root': 18,
 'parataxis': 19,
 'dislocated': 20,
 'conj': 21,
 'nsubj': 22,
 'nmod': 23,
 'discourse': 24,
 'csubj:pass': 25,
 'compound': 26,
 'dep': 27,
 'xcomp': 28,
 'det:predet': 29,
 'list': 30,
 'case': 31,
 'obj': 32,
 'compound:prt': 33,
 'flat': 34,
 'aux:pass': 35,
 'det': 36,
 'fixed': 37,
 'nmod:npmod': 38,
 'amod': 39,
 'cop': 40,
 'orphan': 41,
 'nmod:tmod': 42,
 'obl:npmod': 43,
 'cc': 44,
 'nmod:poss': 45,
 'goeswith': 46,
 'obl': 47,
 'nsubj:pass': 48,
 'advmod': 49}

In [25]:
class distilBERT_FT(nn.Module):

    def __init__(self, pre_model, hidden_dim, n_rel_classes, n_dep_classes):
      super().__init__()
      self.pre_model = pre_model
      self.rel_project = nn.Linear(hidden_dim, n_rel_classes)
      self.dep_project = nn.Linear(hidden_dim, n_dep_classes)

    def forward(self, input):
      #print(f"input size: {input['input_ids'].size()}")

      # hidden_states = self.pre_model(input['input_ids']).last_hidden_state # [batch_size, seq_len, hidden_dim]
      hidden_states = self.pre_model(**input).last_hidden_state
      #print(f'hidden_states : {hidden_states.size()}')

      rel_out = self.rel_project(hidden_states)
      #print(f'rel_out size: {rel_out.size()}')
      #print(f'rel_out: {rel_out}')

      dep_out = self.dep_project(hidden_states)
      #print(f'dep_out size: {dep_out.size()}')
      #print(f'dep_out: {dep_out}')

      return rel_out, dep_out

In [26]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [28]:
from torch.utils.data import DataLoader

def pad_list_of_tensors(list_of_tensors, pad_token):
  # print('list of tensors: ', list_of_tensors)
  max_length = max([t.size(-1) for t in list_of_tensors])
  padded_list = []

  # print('max length:', max_length)
  
  for t in list_of_tensors:
    # print('t:', t.size())
    padded_tensor = torch.cat((t, torch.tensor([pad_token]*(max_length - t.size(-1)), dtype=torch.long)), dim = -1)
    # print('padded_tensor: ', padded_tensor)
    # print('padded_tensor size: ', padded_tensor.size())
    padded_list.append(padded_tensor)
      
  padded_tensor = torch.stack(padded_list)
  #print('padded_tensor size: ', padded_tensor.size())
  return padded_tensor

def pad_collate_fn(batch):
  # print('prepadded batch: ', batch)
  input_sent_list = [s['text'] for s in batch] # not adding bos and eos because tokenizer does that for us 
  #print(input_list)
  input_token_list = [s['tokens'] for s in batch] # not adding bos and eos because I dont end up using this 
  #print(input_list)
  rel_list = [s['rel_pos'] for s in batch]
  #print(rel_list)
  dep_list = [s['dep_label'] for s in batch]
  #print(dep_list)

  rel_mapped_list = []
  dep_mapped_list = []

  for rels in rel_list: 
    temp_rel_list = []

    # temp_rel_list.append(rel_pos_idxs['bos'])
    for rel in rels: 
      temp_rel_list.append(rel_pos_idxs[rel])

    # temp_rel_list.append(rel_pos_idxs['eos'])

    rel_mapped_list.append(torch.tensor(temp_rel_list))

  for deps in dep_list: 
    temp_dep_list = []

    # temp_dep_list.append(dep_label_idxs['bos'])
    for dep in deps: 
      temp_dep_list.append(dep_label_idxs[dep])

    # temp_dep_list.append(dep_label_idxs['eos'])

    dep_mapped_list.append(torch.tensor(temp_dep_list))
    
  # print(rel_mapped_list)
  # print(dep_mapped_list)

  rel_pad_idx = rel_pos_idxs.get('pad')
  dep_pad_idx = dep_label_idxs.get('pad')

  # print(rel_pad_idx)
  # print(dep_pad_idx)
    
  # padding the input is probably not necessary here as we will be using bert tokenizer to pad the input sentences later on 
  # input_tensor = pad_list_of_tensors(input_list, pad_token)

  rel_tensor = pad_list_of_tensors(rel_mapped_list, rel_pad_idx)
  dep_tensor = pad_list_of_tensors(dep_mapped_list, dep_pad_idx)
    
  return input_sent_list, input_token_list, rel_tensor, dep_tensor

In [29]:
BATCH_SIZE = 32
dataloader_dict = {}

# change shuffle back to True later
dataloader_dict['train'] = DataLoader(train_pp, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate_fn)
dataloader_dict['val'] = DataLoader(val_pp, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate_fn)
dataloader_dict['test'] = DataLoader(test_pp, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate_fn)

In [30]:
def filter_scores_and_preds(input_sents, output, target_size, n_classes):
  filtered_scores = torch.zeros((output.size()[0], target_size, n_classes))
  #print('filtered_scores size of zeros: ', filtered_scores.size())

  preds_list = []
  for idx, sentence in enumerate(output): 
    bert_tokens = tokenizer.tokenize(input_sents[idx])
    bert_tokens = ['bos'] + bert_tokens + ['eos']
    #print('bert_tokens: ', bert_tokens)

    hyphen_prev = False 
    preds = []
    filtered_idx = 0 

    for t_idx, token_scores in enumerate(sentence):
      if t_idx == 0: 
        continue
      elif bert_tokens[t_idx] == 'eos':
        #print('breaking at t_idx: ', t_idx)
        break 
      elif bert_tokens[t_idx].startswith('##'): 
        continue 
      elif bert_tokens[t_idx] == '-': 
        hyphen_prev = True 
        continue
      elif hyphen_prev == True: 
        hyphen_prev = False 
        continue 
      elif filtered_idx >= target_size: 
        continue
      else: 
        #print('filtered_idx: ', filtered_idx)
        filtered_scores[idx][filtered_idx] = token_scores
        filtered_idx += 1
        # get the args to reflect the class so they can be comparable to targets which are also classes 
        preds.append(torch.argmax(token_scores).item())
    
    preds = preds + ([0] * (target_size - len(preds)))

    preds_list.append(preds)

  preds_list = torch.tensor(preds_list)
  #print('preds_list: ', preds_list)
    
  #print('filtered_scores size: ', filtered_scores.size())
  return filtered_scores, preds_list

In [31]:
from tqdm import tqdm

In [32]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [38]:
n_rel_classes = len(rel_pos_idxs)
n_dep_classes = len(dep_label_idxs)

classifier = distilBERT_FT(model, 768, n_rel_classes, n_dep_classes).to(device)

criterion = torch.nn.CrossEntropyLoss(ignore_index = rel_pos_idxs['pad']).to(device)
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)

def train(model, num_epochs, criterion, optimizer, lam, device): 
  pbar = tqdm(range(num_epochs))
  val_accs = []

  for epoch in pbar:
    print('EPOCH: ', epoch)
    total_train_loss = 0.0
    train_rel_accs = []
    train_dep_accs = []

    total_val_loss = 0.0
    val_rel_accs = []
    val_dep_accs = []

    model.train()
    for input_sents, input_tokens, rel_target, dep_target in tqdm(dataloader_dict['train']): 
      # encoded_input['input_ids'], encoded_input['attention_mask']
      encoded_input = tokenizer(input_sents, return_tensors='pt', padding=True).to(device)
      rel_output, dep_output = model(encoded_input)

      filtered_rel_scores, filtered_rel_preds = filter_scores_and_preds(input_sents, rel_output, rel_target.size()[1], n_rel_classes)
      filtered_dep_scores, filtered_dep_preds = filter_scores_and_preds(input_sents, dep_output, dep_target.size()[1], n_dep_classes)

      rel_loss = criterion(filtered_rel_scores.permute(0,2,1).to(device), rel_target.to(device))
      dep_loss = criterion(filtered_dep_scores.permute(0,2,1).to(device), dep_target.to(device))

      loss = (lam * rel_loss) + ((1 - lam) * dep_loss)
      #print('loss: ', loss)
      total_train_loss += loss

      # rel_acc = torch.eq(filtered_rel_preds, rel_target).sum() / (rel_target.size()[0] * rel_target.size()[1])
      rel_no_pad_cnt = 0 
      rel_correct = 0
      for s_idx, sent in enumerate(filtered_rel_preds): 
        for t_idx, token in enumerate(sent):
          if token != 0: 
            rel_no_pad_cnt += 1
            if token == rel_target[s_idx][t_idx]:
              rel_correct += 1
          
      train_rel_accs.append(rel_correct / rel_no_pad_cnt)

      # dep_acc = torch.eq(filtered_dep_preds, dep_target).sum() / (dep_target.size()[0] * dep_target.size()[1])
      dep_no_pad_cnt = 0 
      dep_correct = 0
      for s_idx, sent in enumerate(filtered_dep_preds): 
        for t_idx, token in enumerate(sent):
          if token != 0: 
            dep_no_pad_cnt += 1
            if token == dep_target[s_idx][t_idx]:
              dep_correct += 1
    
      train_dep_accs.append(dep_correct / dep_no_pad_cnt)

      optimizer.zero_grad()
      loss.backward()
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=2)
      optimizer.step() 

    model.eval()
    for input_sents, input_tokens, rel_target, dep_target in tqdm(dataloader_dict['val']): 
      encoded_input = tokenizer(input_sents, return_tensors='pt', padding=True).to(device)
      rel_output, dep_output = model(encoded_input)

      filtered_rel_scores, filtered_rel_preds = filter_scores_and_preds(input_sents, rel_output, rel_target.size()[1], n_rel_classes)
      filtered_dep_scores, filtered_dep_preds = filter_scores_and_preds(input_sents, dep_output, dep_target.size()[1], n_dep_classes)

      rel_loss = criterion(filtered_rel_scores.permute(0,2,1).to(device), rel_target.to(device))
      dep_loss = criterion(filtered_dep_scores.permute(0,2,1).to(device), dep_target.to(device))

      val_loss = (lam * rel_loss) + ((1 - lam) * dep_loss)
      #print('val loss: ', val_loss)
      total_val_loss += val_loss

      # rel_acc = torch.eq(filtered_rel_preds, rel_target).sum() / (rel_target.size()[0] * rel_target.size()[1])
      rel_no_pad_cnt = 0 
      rel_correct = 0
      for s_idx, sent in enumerate(filtered_rel_preds): 
        for t_idx, token in enumerate(sent):
          if token != 0: 
            rel_no_pad_cnt += 1
            if token == rel_target[s_idx][t_idx]:
              rel_correct += 1
          
      val_rel_accs.append(rel_correct / rel_no_pad_cnt)

      # dep_acc = torch.eq(filtered_dep_preds, dep_target).sum() / (dep_target.size()[0] * dep_target.size()[1])
      dep_no_pad_cnt = 0 
      dep_correct = 0
      for s_idx, sent in enumerate(filtered_dep_preds): 
        for t_idx, token in enumerate(sent):
          if token != 0: 
            dep_no_pad_cnt += 1
            if token == dep_target[s_idx][t_idx]:
              dep_correct += 1
    
      val_dep_accs.append(dep_correct / dep_no_pad_cnt)

    print('avg train loss: ', total_train_loss / len(dataloader_dict['train']))
    print('avg train rel_pos acc: ', sum(train_rel_accs) / len(train_rel_accs))
    print('avg train dep_label acc: ', sum(train_dep_accs) / len(train_dep_accs))

    print('avg val loss: ', total_val_loss / len(dataloader_dict['val']))
    print('avg val rel_pos acc: ', sum(val_rel_accs) / len(val_rel_accs))
    print('avg val dep_label acc: ', sum(val_dep_accs) / len(val_dep_accs))
    if epoch == num_epochs-1: 
      val_accs.append({'rel_pos': sum(val_rel_accs) / len(val_rel_accs),  'dep_label': sum(val_dep_accs) / len(val_dep_accs)})

  print('SAVING MODEL')
  torch.save(model.state_dict(), f'bert-parser-{lam}.pt')
  return val_accs

In [35]:
train(classifier, 1, criterion, optimizer, 0.25, device)

  0%|          | 0/1 [00:00<?, ?it/s]

EPOCH:  0




loss:  tensor(4.1841, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(4.0369, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(3.8734, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(3.6675, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(3.4427, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(3.3754, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(3.3333, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(3.1223, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(3.0949, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(3.0345, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(3.0256, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(2.9222, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.8530, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.9442, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(2.7586, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.6438, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(2.4854, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.6238, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.4541, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.3657, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.5020, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(2.3511, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.3698, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(2.1567, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.1341, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.1717, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(2.2390, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.0217, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(2.1730, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.9897, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.9784, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.8108, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.0775, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(2.0039, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.9501, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.7329, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.9463, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.7089, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.8687, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.8747, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.7053, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.6189, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.5664, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.6619, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.7248, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.5469, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.6519, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.4661, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.5718, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.5886, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.5295, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.7447, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.3841, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.6604, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.5208, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.7829, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.4634, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.5114, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2040, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1186, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.3126, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2373, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.5160, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.5935, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.2155, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2724, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2924, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.7317, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2349, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.6172, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2314, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.3183, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1762, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.4745, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.6844, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.6042, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.4157, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.4254, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.3112, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.3434, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.1456, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2974, device='cuda:0', grad_fn=<AddBackward0>)





loss:  tensor(1.5584, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.4925, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.3271, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.5040, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(0.9872, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1345, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2889, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1378, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.3970, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2366, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.5264, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1943, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.2527, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1948, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.0675, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.1076, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1527, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1174, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.0213, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(0.8805, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1880, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1816, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1742, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.0779, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2396, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2903, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.0670, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2855, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.2339, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1949, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(0.8878, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(0.8857, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.2412, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(0.9842, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1431, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.1745, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.3999, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1195, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.0295, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.0020, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.0230, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1781, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.3151, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.0501, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(0.7973, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.0132, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.2704, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(1.1948, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.1289, device='cuda:0', grad_fn=<AddBackward0>)




loss:  tensor(0.8747, device='cuda:0', grad_fn=<AddBackward0>)
loss:  tensor(1.0856, device='cuda:0', grad_fn=<AddBackward0>)


100%|██████████| 134/134 [00:34<00:00,  3.91it/s]


loss:  tensor(0.9013, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(1.3055, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(1.3357, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(1.3743, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(1.1938, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(0.9697, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(0.9327, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(1.0390, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(0.9027, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(0.8338, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(0.9662, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(1.1577, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(0.9455, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(0.9265, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(1.2392, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(0.8871, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(0.9089, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(1.1581, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(1.0374, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(0.8628, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(0.9335, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(1.0818, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(0.8726, device='cuda:0', grad_fn=<AddBackward0>)




val loss:  tensor(0.8514, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(0.9828, device='cuda:0', grad_fn=<AddBackward0>)
val loss:  tensor(0.5621, device='cuda:0', grad_fn=<AddBackward0>)


100%|██████████| 25/25 [00:03<00:00,  8.08it/s]
100%|██████████| 1/1 [00:37<00:00, 37.36s/it]


avg train loss:  tensor(1.6790, device='cuda:0', grad_fn=<DivBackward0>)
avg train rel_pos acc:  0.42843728613060955
avg train dep_label acc:  0.6419149971273127
avg val loss:  tensor(1.0104, device='cuda:0', grad_fn=<DivBackward0>)
avg val rel_pos acc:  0.5549146758176737
avg val dep_label acc:  0.7988441375770617
SAVING MODEL


[{'rel_pos': 0.5549146758176737, 'dep_label': 0.7988441375770617}]

In [39]:
lam_val_accs = []
for lam in [0.25, 0.5, 0.75]:
    model = DistilBertModel.from_pretrained("distilbert-base-uncased")
    classifier = distilBERT_FT(model, 768, n_rel_classes, n_dep_classes).to(device)

    criterion = torch.nn.CrossEntropyLoss(ignore_index = rel_pos_idxs['pad']).to(device)
    optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)
    val_accs = train(classifier, 3, criterion, optimizer, lam, device)
    lam_val_accs.append(val_accs)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/3 [00:00<?, ?it/s]

EPOCH:  0


100%|██████████| 134/134 [00:30<00:00,  4.38it/s]
100%|██████████| 25/25 [00:03<00:00,  8.10it/s]
 33%|███▎      | 1/3 [00:33<01:07, 33.68s/it]

avg train loss:  tensor(1.6780, device='cuda:0', grad_fn=<DivBackward0>)
avg train rel_pos acc:  0.4241927727345834
avg train dep_label acc:  0.6421994190668313
avg val loss:  tensor(1.0459, device='cuda:0', grad_fn=<DivBackward0>)
avg val rel_pos acc:  0.5439136290436235
avg val dep_label acc:  0.7866831156069918
EPOCH:  1


100%|██████████| 134/134 [00:29<00:00,  4.49it/s]
100%|██████████| 25/25 [00:03<00:00,  8.13it/s]
 67%|██████▋   | 2/3 [01:06<00:33, 33.38s/it]

avg train loss:  tensor(0.8501, device='cuda:0', grad_fn=<DivBackward0>)
avg train rel_pos acc:  0.5888297180652967
avg train dep_label acc:  0.8285460634952074
avg val loss:  tensor(0.8592, device='cuda:0', grad_fn=<DivBackward0>)
avg val rel_pos acc:  0.6160262098445057
avg val dep_label acc:  0.8273340599695739
EPOCH:  2


100%|██████████| 134/134 [00:30<00:00,  4.42it/s]
100%|██████████| 25/25 [00:03<00:00,  8.20it/s]
100%|██████████| 3/3 [01:40<00:00, 33.48s/it]


avg train loss:  tensor(0.6383, device='cuda:0', grad_fn=<DivBackward0>)
avg train rel_pos acc:  0.6561419952425218
avg train dep_label acc:  0.8705795537967115
avg val loss:  tensor(0.7630, device='cuda:0', grad_fn=<DivBackward0>)
avg val rel_pos acc:  0.66389582946543
avg val dep_label acc:  0.8345714217217187
SAVING MODEL


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/3 [00:00<?, ?it/s]

EPOCH:  0


100%|██████████| 134/134 [00:29<00:00,  4.48it/s]
100%|██████████| 25/25 [00:03<00:00,  8.19it/s]
 33%|███▎      | 1/3 [00:32<01:05, 32.95s/it]

avg train loss:  tensor(1.8754, device='cuda:0', grad_fn=<DivBackward0>)
avg train rel_pos acc:  0.4707008204745844
avg train dep_label acc:  0.5877566241341244
avg val loss:  tensor(1.1407, device='cuda:0', grad_fn=<DivBackward0>)
avg val rel_pos acc:  0.621216255573992
avg val dep_label acc:  0.7822407636717371
EPOCH:  1


100%|██████████| 134/134 [00:29<00:00,  4.50it/s]
100%|██████████| 25/25 [00:03<00:00,  8.13it/s]
 67%|██████▋   | 2/3 [01:06<00:33, 33.01s/it]

avg train loss:  tensor(0.9658, device='cuda:0', grad_fn=<DivBackward0>)
avg train rel_pos acc:  0.6577337462627746
avg train dep_label acc:  0.8167932958414224
avg val loss:  tensor(0.9198, device='cuda:0', grad_fn=<DivBackward0>)
avg val rel_pos acc:  0.6751392538234966
avg val dep_label acc:  0.822276276671075
EPOCH:  2


100%|██████████| 134/134 [00:29<00:00,  4.49it/s]
100%|██████████| 25/25 [00:03<00:00,  8.12it/s]
100%|██████████| 3/3 [01:39<00:00, 33.07s/it]


avg train loss:  tensor(0.7404, device='cuda:0', grad_fn=<DivBackward0>)
avg train rel_pos acc:  0.7153630259388839
avg train dep_label acc:  0.8594240662698368
avg val loss:  tensor(0.8424, device='cuda:0', grad_fn=<DivBackward0>)
avg val rel_pos acc:  0.706399152137012
avg val dep_label acc:  0.83666102417964
SAVING MODEL


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/3 [00:00<?, ?it/s]

EPOCH:  0


100%|██████████| 134/134 [00:29<00:00,  4.48it/s]
100%|██████████| 25/25 [00:03<00:00,  8.19it/s]
 33%|███▎      | 1/3 [00:32<01:05, 32.97s/it]

avg train loss:  tensor(2.0098, device='cuda:0', grad_fn=<DivBackward0>)
avg train rel_pos acc:  0.49480344136593823
avg train dep_label acc:  0.4785377302243596
avg val loss:  tensor(1.3007, device='cuda:0', grad_fn=<DivBackward0>)
avg val rel_pos acc:  0.6490743061059027
avg val dep_label acc:  0.6974944783414159
EPOCH:  1


100%|██████████| 134/134 [00:30<00:00,  4.44it/s]
100%|██████████| 25/25 [00:03<00:00,  8.12it/s]
 67%|██████▋   | 2/3 [01:06<00:33, 33.29s/it]

avg train loss:  tensor(1.1115, device='cuda:0', grad_fn=<DivBackward0>)
avg train rel_pos acc:  0.6810296754622306
avg train dep_label acc:  0.7423418436356254
avg val loss:  tensor(1.0259, device='cuda:0', grad_fn=<DivBackward0>)
avg val rel_pos acc:  0.7056174590790745
avg val dep_label acc:  0.769180724575546
EPOCH:  2


100%|██████████| 134/134 [00:29<00:00,  4.48it/s]
100%|██████████| 25/25 [00:03<00:00,  8.13it/s]
100%|██████████| 3/3 [01:39<00:00, 33.23s/it]


avg train loss:  tensor(0.8406, device='cuda:0', grad_fn=<DivBackward0>)
avg train rel_pos acc:  0.742553238936682
avg train dep_label acc:  0.8040866460139139
avg val loss:  tensor(0.9435, device='cuda:0', grad_fn=<DivBackward0>)
avg val rel_pos acc:  0.7199073611090989
avg val dep_label acc:  0.8009836938271249
SAVING MODEL


In [40]:
print(lam_val_accs)

[[{'rel_pos': 0.66389582946543, 'dep_label': 0.8345714217217187}], [{'rel_pos': 0.706399152137012, 'dep_label': 0.83666102417964}], [{'rel_pos': 0.7199073611090989, 'dep_label': 0.8009836938271249}]]
