<a href="https://colab.research.google.com/github/jsvan/MaskTests4LanguageModels/blob/main/Extension_of_Summerwinograndeattempt_to_more_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this extension I will try out more models, first tuning generic language models on the debiased dataset. 

In [1]:
%%capture
!pip install transformers
!pip install datasets 

In [2]:
# Textizer
"""
    Each sentence was split on "_" placeholder symbol.
    Each option was concatenated with the second part of the split, thus transforming each example into two text segment pairs.
    Text segment pairs corresponding to correct and incorrect options were marked with True and False labels accordingly.
    Text segment pairs were shuffled thereafter.

"""

from datasets import Dataset

def prepare_data(dataset):

  # internal function
  def prep_ds(dataset):
    sentences, answers, o1, o2 = [], [], [], []
    for p in dataset:
      s1 = p['option1'].join(p['sentence'].split('_'))
      s2 = p['option2'].join(p['sentence'].split('_'))
      a1 = int(p['answer'] == '1')
      a2 = int(p['answer'] == '2')

      
      sentences.append(s1)
      answers.append(a1)
      sentences.append(s2)
      answers.append(a2)
      o1.append(p['option1'])
      o2.append(p['option2'])
      o1.append(p['option1'])
      o2.append(p['option2'])

    return {'sentence':sentences, 'labels':answers, 'option1':o1, 'option2':o2}
    # end internal function

  train = prep_ds(dataset["train"])
  test = prep_ds(dataset["validation"])
  trainds = Dataset.from_dict( train ).shuffle()
  testds = Dataset.from_dict( test ).shuffle()

  return {"train":trainds, "test":testds, "name":"Standard Dataset"}


In [3]:

def mask_datasets(dataset):

  # internal method BEGIN
  def mask_copy(dataset):
    sentences = []
    toprint = 1

    for p in dataset:
      if toprint > 0:
        print(f"[{p['option1']}], [{p['option2']}], [{p['sentence']}]")

      sentences.append(p['sentence'].replace(p['option1'], 'option1').replace(p['option2'], 'option2'))
      
      if toprint > 0:
        print(sentences[-1])
        toprint -= 1

    build = {'sentence':sentences, 'labels':dataset['labels'], 'option1':dataset['option1'], 'option2':dataset['option2']}
    return Dataset.from_dict(build) # DON'T SHUFFLE
  #END
  
  return {"train":mask_copy(dataset['train']), "test":mask_copy(dataset['test']), "name":"Masked Dataset"}



In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from pprint import pprint
from tqdm import tqdm

""" 
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("roberta-large")

model = AutoModelForMaskedLM.from_pretrained("roberta-large")
"""


def test_datasets(tokenizer, std_datasets, masked_datasets, model=False,):

  print("\nTESTING TESTING  1 2 3 ...")
  delete = False
  if not model:
    model = AutoModelForSequenceClassification.from_pretrained("DeepPavlov/roberta-large-winogrande")
    delete = True

  elif isinstance(model, str):
    delete = True
    with torch.no_grad():
      torch.cuda.empty_cache()

    model = torch.load(model) # open(model, "rb"))

    with torch.no_grad():
      torch.cuda.empty_cache()
  try:
    for ds in (std_datasets, masked_datasets):
      print("\nPERFORMING", ds["name"])
      
      #combinedtraintest = concatenate_datasets([ds['train'], ds['test']])
      #encoded_train = combinedtraintest.map(lambda examples: tokenizer(examples['sentence'], padding='max_length'), batched=True) # , return_tensors='pt'
      encoded_test = ds['test'].map(lambda examples: tokenizer(examples['sentence'], padding='max_length'), batched=True)

      #encoded_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
      encoded_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
      #dataloader_train = torch.utils.data.DataLoader(encoded_train, batch_size=32)
      dataloader_test = torch.utils.data.DataLoader(encoded_test, batch_size=32)

      device = 'cuda' if torch.cuda.is_available() else 'cpu' 
      #model.train().to(device)
      model.to(device)
      #optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

      correct = 0
      total = 0
      for i, batch in enumerate(tqdm(dataloader_test)):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        #for oo, lab in zip(outputs.logits, batch['labels'] ):
        #  print(oo.argmax().item(), lab.items())
        correct += sum((int(x.argmax().item() == y.item()) for x, y in zip(outputs.logits, batch['labels'])))
        total += len(batch['labels'])
        
        
      print("\nSCORE", correct / total, '/ 1.00')

    if delete:
      del model
      print("Deleted model")

  except Exception as e:
    del model
    print(e, e.__str__)
    print("Deleted model")




In [5]:

def train_model(trainingset, compareset, tokenizer, model=False, copy=False, epochs=1):
  
  if copy: # ie, if copy and model
    print('copy')

    with torch.no_grad():
      torch.cuda.empty_cache()
    model = torch.load(model) # open(model, "rb"))
    with torch.no_grad():
      torch.cuda.empty_cache()
    
  elif model and isinstance(model, str): # ie if str(model) AND NOT copy:
    
    # download new model of name
    model = AutoModelForSequenceClassification.from_pretrained(model)

  try:
    encoded_train = trainingset['train'].map(lambda examples: tokenizer(examples['sentence'], padding='max_length'), batched=True) # , return_tensors='pt'
    encoded_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    dataloader_train = torch.utils.data.DataLoader(encoded_train, batch_size=32)

    device = 'cuda' if torch.cuda.is_available() else 'cpu' 
    model.train().to(device)
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

    for epoch in range(epochs):
      print("Epoch", epoch)
      correct = 0
      total = 0
      for i, batch in enumerate(tqdm(dataloader_train)):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        #for oo, lab in zip(outputs.logits, batch['labels'] ):
        #  print(oo.argmax().item(), lab.items())
        correct += sum((int(x.argmax().item() == y.item()) for x, y in zip(outputs.logits, batch['labels'])))
        total += len(batch['labels'])
        
      print("Score", correct / total, '/ 1.00')
      test_datasets(tokenizer, trainingset, compareset, model)
      model.train().to(device)
  except Exception as e:
    del model
    print(e, e.__str__)
    print("deleted model")
  if copy:
    del model
  else:
    return model




In [6]:
import re

def mask_copy_1(dataset, unk):
  sentences = []
  toprint = 1

  for p in dataset:
    # This is annoying because option1 can be substrings of other words, usually option2. They can also have uppercase letters. 
    # If one is a substring, then I will cover up the larger word with a temporary mask to not confuse anything else.
    sentence = p['sentence']
    option1, option2 = re.compile(p['option1'], re.IGNORECASE), re.compile(p['option2'], re.IGNORECASE)
    maskedoption1, maskedoption2 = "OPTION_ONE", "OPTION_TWO"
    first_search, second_search, first_mask, second_mask = None, None, None, None

    # "table" in "tablecloth" --> cover bigger one
    if len(p['option1']) > len(p['option2']):
      # cover option1 first
      first_search, second_search = option1, option2
      first_mask, second_mask = maskedoption1, maskedoption2
    else:
      first_search, second_search = option2, option1
      first_mask, second_mask = maskedoption2, maskedoption1
    
    sentence = first_search.sub(first_mask, sentence) #Mask the longer word with OPTION_ mask
    sentence = second_search.sub(second_mask, sentence) #then the shorter one

    # IT gets kinda confusing which word now to <IGNORE> because it's language and some words appear more than two, more than three, times. 
    # I think the smartest approach is to assume the final word used is the question word and mask that one. 

    # Find final word used
    # maskedoption1 is the final word
    if sentence.rfind(maskedoption1) > sentence.rfind(maskedoption2):
      # IGNORE final word
      # Convert other word back to original
      sentence = sentence.replace(maskedoption1, unk)
      sentence = sentence.replace(maskedoption2, p['option2'])
    else:
      sentence = sentence.replace(maskedoption2, unk)
      sentence = sentence.replace(maskedoption1, p['option2'])


    if toprint > 0:
      print(f"[{p['option1']}], [{p['option2']}], [{p['sentence']}]")

    sentences.append(sentence)
    
    if toprint > 0:
      print(sentences[-1])
      toprint -= 1

  build = {'sentence':sentences, 'labels':dataset['labels'], 'option1':dataset['option1'], 'option2':dataset['option2']}
  return Dataset.from_dict(build) # DON'T SHUFFLE





def mask_datasets_1(dataset, tokenizer):
  unk = tokenizer.unk_token
  return {"train":mask_copy_1(dataset['train'], unk), "test":mask_copy_1(dataset['test'], unk), "name":"Double <Unk> Masked Dataset"}

In [7]:
import re


# This masked <unk> tokens on the option which is NOT involved in the question. 
# If the option involved in the question is wrong, it will need to identify the <unk> token as having importance.
def mask_copy_2(dataset, unk):
  sentences = []
  toprint = 1

  for p in dataset:
    # This is annoying because option1 can be substrings of other words, usually option2. They can also have uppercase letters. 
    # If one is a substring, then I will cover up the larger word with a temporary mask to not confuse anything else.
    sentence = p['sentence']
    option1, option2 = re.compile(p['option1'], re.IGNORECASE), re.compile(p['option2'], re.IGNORECASE)
    maskedoption1, maskedoption2 = "OPTION_ONE", "OPTION_TWO"
    first_search, second_search, first_mask, second_mask = None, None, None, None

    # "table" in "tablecloth" --> cover bigger one
    if len(p['option1']) > len(p['option2']):
      # cover option1 first
      first_search, second_search = option1, option2
      first_mask, second_mask = maskedoption1, maskedoption2
    else:
      first_search, second_search = option2, option1
      first_mask, second_mask = maskedoption2, maskedoption1
    
    sentence = first_search.sub(first_mask, sentence) #Mask the longer word with OPTION_ mask
    sentence = second_search.sub(second_mask, sentence) #then the shorter one

    # IT gets kinda confusing which word now to <IGNORE> because it's language and some words appear more than two, more than three, times. 
    # I think the smartest approach is to assume the final word used is the question word and mask that one. 

    # Find final word used
    # maskedoption1 is NOT the final word
    if sentence.rfind(maskedoption1) < sentence.rfind(maskedoption2):
      # IGNORE final word
      # Convert other word back to original
      sentence = sentence.replace(maskedoption1, unk)
      sentence = sentence.replace(maskedoption2, p['option2'])
    else:
      sentence = sentence.replace(maskedoption2, unk)
      sentence = sentence.replace(maskedoption1, p['option2'])


    if toprint > 0:
      print(f"[{p['option1']}], [{p['option2']}], [{p['sentence']}]")

    sentences.append(sentence)
    
    if toprint > 0:
      print(sentences[-1])
      toprint -= 1

  build = {'sentence':sentences, 'labels':dataset['labels'], 'option1':dataset['option1'], 'option2':dataset['option2']}
  return Dataset.from_dict(build) # DON'T SHUFFLE





def mask_datasets_2(dataset, tokenizer):
  unk = tokenizer.unk_token
  return {"train":mask_copy_2(dataset['train'], unk), "test":mask_copy_2(dataset['test'], unk), "name":"Single <Unk> Masked Dataset"}

In [8]:
stupid_tries = [
  ("dog", "doggy"),
  ("red", "blue"),
  ("flavor", "flavour"),
  ('A', 'B'),
  ('X', 'Y'),
  ('1', '2'),
  ('first', 'second'),
  ('alpha', 'beta'),
  ('#', '@'),
  ('primero', 'secundo'), # yes its true, I dont speak spanish. Having it spelled wrong just makes the model's task that much more difficult ;)
  ('Alice', 'Bob'),
  ('_', '__'),
  ('mask1', 'mask2'),
  ('thing1', 'thing2'),
  ('mask_a', 'mask_b'),
  ("thing_a", "thing_b")
]


In [9]:
import re


# This masked <unk> tokens on the option which is NOT involved in the question. 
# If the option involved in the question is wrong, it will need to identify the <unk> token as having importance.
def stupid_masking(dataset, mask1, mask2):
  sentences = []
  toprint = 1

  for p in dataset:
    # This is annoying because option1 can be substrings of other words, usually option2. They can also have uppercase letters. 
    # If one is a substring, then I will cover up the larger word with a temporary mask to not confuse anything else.
    sentence = p['sentence']
    option1, option2 = re.compile(p['option1'], re.IGNORECASE), re.compile(p['option2'], re.IGNORECASE)
    maskedoption1, maskedoption2 = "OPTION_ONE", "OPTION_TWO"
    first_search, second_search, first_mask, second_mask = None, None, None, None

    # "table" in "tablecloth" --> cover bigger one
    if len(p['option1']) > len(p['option2']):
      # cover option1 first
      first_search, second_search = option1, option2
      first_mask, second_mask = maskedoption1, maskedoption2
    else:
      first_search, second_search = option2, option1
      first_mask, second_mask = maskedoption2, maskedoption1
    
    sentence = first_search.sub(first_mask, sentence) #Mask the longer word with OPTION_ mask
    sentence = second_search.sub(second_mask, sentence) #then the shorter one

    sentence = sentence.replace(maskedoption1, mask1)
    sentence = sentence.replace(maskedoption2, mask2)


    if toprint > 0:
      print(f"[{p['option1']}], [{p['option2']}], [{p['sentence']}]")

    sentences.append(sentence)
    
    if toprint > 0:
      print(sentences[-1])
      toprint -= 1

  build = {'sentence':sentences, 'labels':dataset['labels'], 'option1':dataset['option1'], 'option2':dataset['option2']}
  return Dataset.from_dict(build) # DON'T SHUFFLE





def stupid_datasets(dataset, tokenizer, masks):
  mask1, mask2 = masks
  return {"train":stupid_masking(dataset['train'], mask1, mask2), "test":stupid_masking(dataset['test'], mask1, mask2), "name":f"({mask1}, {mask2}) Masked Dataset"}

In [10]:
%%capture
from datasets import load_dataset, concatenate_datasets

# You can switch between these two datasets
dataset = load_dataset("winogrande", 'winogrande_debiased')
#dataset = load_dataset("winogrande", 'winogrande_l')

# dataset is dict with keys ['train', 'test', 'validation']
# Each with an enumerable of 
"""
{'answer': '2',
 'option1': 'Kyle',
 'option2': 'Logan',
 'sentence': "Kyle doesn't wear leg warmers to bed, while Logan almost always does. _ is more likely to live in a colder climate."}

"""
# Use Validation instead of Test because Test lacks labels.


#dicts of {'train':_, 'test':_}
std_datasets = prepare_data(dataset)

masked_datasets = mask_datasets(std_datasets)

In [14]:
import os

# This model will become the base model for everything we build upon, 
# so we will save it to disk to load it fresh for each experiment. 
model_list = [#"DeepPavlov/roberta-large-winogrande", 
              "distilbert-base-uncased", 
              "roberta-large",
              "albert-base-v2", 
              'bert-base-cased', 
              'xlnet-base-cased', 
              'xlm-mlm-en-2048']

for model_name in model_list:
  try:
    print("NOW DOING", model_name)
    # download model and save to memory. Models must be removed from cache to prevent gpu mem errors.
    # fine tune the model to the debiased dataset
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=64)
    print("BEGIN OG TRAIN")
    debiased_tuned_model = train_model(trainingset=std_datasets, compareset=masked_datasets, tokenizer=tokenizer, model=model_name, copy=False, epochs=3)
    print("SAVED")
    model_name = ''.join([c for c in model_name if c.isalpha()])
    torch.save(debiased_tuned_model, f=model_name)
    debiased_tuned_model = None


    # Run individual tests 
    print("STARTING FIRST UNK")
    unk_datasets = mask_datasets_1(std_datasets, tokenizer)
    test_datasets(tokenizer, std_datasets, unk_datasets, debiased_tuned_model)
    train_model(unk_datasets, std_datasets, tokenizer, model=model_name, copy = True)
    print("ENDING FIRST UNK & STARTING SECOND")

    unk_dataset_2 = mask_datasets_2(std_datasets, tokenizer)
    test_datasets(tokenizer, std_datasets, unk_dataset_2, debiased_tuned_model)
    train_model(unk_dataset_2, std_datasets, tokenizer, model=model_name, copy = True)

    print("ENDING UNK TESTS AND STARTING STUPID MASKS")

    for masks in stupid_tries[-2:]:
      stupid_ds = stupid_datasets(std_datasets, tokenizer, masks)
      print("TESTING:")
      test_datasets(tokenizer, std_datasets, stupid_ds, model=model_name)
      print("POST TRAINING:")
      train_model(stupid_ds, std_datasets, tokenizer, model=model_name, copy = True)

  finally:
    print("REMOVING MODEL")
    del tokenizer
    os.remove(model_name)

NOW DOING distilbert-base-uncased


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

BEGIN OG TRAIN


Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [01:06<00:00,  8.67it/s]

Score 0.49210640138408307 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 24.22it/s]



SCORE 0.5130228887134964 / 1.00

PERFORMING Masked Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 23.46it/s]



SCORE 0.5047355958958168 / 1.00
Epoch 1


100%|██████████| 578/578 [01:07<00:00,  8.59it/s]

Score 0.49962153979238755 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 23.99it/s]


SCORE 0.5019731649565904 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 24.37it/s]



SCORE 0.5086819258089976 / 1.00
Epoch 2


100%|██████████| 578/578 [01:07<00:00,  8.58it/s]

Score 0.5056769031141869 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 23.95it/s]



SCORE 0.4913180741910024 / 1.00

PERFORMING Masked Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 23.66it/s]



SCORE 0.4996053670086819 / 1.00
SAVED
STARTING FIRST UNK
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at [UNK] and 6PM so he's cutting it close. Luckily, the [UNK] event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both [UNK] and contacts, but the cleaning solution was empty so I wore the [UNK] today.

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.36it/s]


SCORE 0.4968429360694554 / 1.00

PERFORMING Double <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.36it/s]



SCORE 0.4972375690607735 / 1.00
Deleted model
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [01:07<00:00,  8.56it/s]

Score 0.5019463667820069 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Double <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 23.88it/s]



SCORE 0.5067087608524072 / 1.00

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 24.04it/s]



SCORE 0.4952644041041831 / 1.00
ENDING FIRST UNK & STARTING SECOND
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at 6PM and [UNK] so he's cutting it close. Luckily, the 6PM event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both contacts and [UNK], but the cleaning solution was empty so I wore the contacts today.

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.35it/s]


SCORE 0.4968429360694554 / 1.00

PERFORMING Single <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.36it/s]



SCORE 0.4972375690607735 / 1.00
Deleted model
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [01:07<00:00,  8.55it/s]

Score 0.5032439446366782 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Single <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 23.69it/s]



SCORE 0.5059194948697711 / 1.00

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 23.69it/s]



SCORE 0.4984214680347277 / 1.00
ENDING UNK TESTS AND STARTING STUPID MASKS
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at mask_a and mask_b so he's cutting it close. Luckily, the mask_a event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both mask_a and mask_b, but the cleaning solution was empty so I wore the mask_a today.
TESTING:

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 23.66it/s]



SCORE 0.494869771112865 / 1.00

PERFORMING (mask_a, mask_b) Masked Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 23.60it/s]



SCORE 0.5023677979479084 / 1.00
Deleted model
POST TRAINING:
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [01:07<00:00,  8.53it/s]

Score 0.5018382352941176 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING (mask_a, mask_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 24.12it/s]



SCORE 0.505130228887135 / 1.00

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 24.16it/s]



SCORE 0.5071033938437254 / 1.00
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at thing_a and thing_b so he's cutting it close. Luckily, the thing_a event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both thing_a and thing_b, but the cleaning solution was empty so I wore the thing_a today.
TESTING:

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 24.14it/s]


SCORE 0.5047355958958168 / 1.00

PERFORMING (thing_a, thing_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 24.15it/s]



SCORE 0.4960536700868193 / 1.00
Deleted model
POST TRAINING:
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [01:07<00:00,  8.57it/s]

Score 0.4978373702422145 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING (thing_a, thing_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 24.08it/s]



SCORE 0.5011838989739542 / 1.00

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:03<00:00, 24.04it/s]



SCORE 0.4988161010260458 / 1.00
REMOVING MODEL
NOW DOING roberta-large


Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

BEGIN OG TRAIN


Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'clas

  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [06:48<00:00,  1.41it/s]

Score 0.49891868512110726 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.30it/s]


SCORE 0.5067087608524072 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.30it/s]



SCORE 0.505524861878453 / 1.00
Epoch 1


100%|██████████| 578/578 [06:48<00:00,  1.41it/s]

Score 0.4992971453287197 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.31it/s]


SCORE 0.48579321231254935 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.30it/s]



SCORE 0.4925019731649566 / 1.00
Epoch 2


100%|██████████| 578/578 [06:48<00:00,  1.41it/s]

Score 0.49967560553633217 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.29it/s]


SCORE 0.5142067876874507 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.30it/s]



SCORE 0.5007892659826362 / 1.00
SAVED
STARTING FIRST UNK
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at <unk> and 6PM so he's cutting it close. Luckily, the <unk> event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both <unk> and contacts, but the cleaning solution was empty so I wore the <unk> today.

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.36it/s]


SCORE 0.6953433307024467 / 1.00

PERFORMING Double <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.37it/s]



SCORE 0.5887924230465666 / 1.00
Deleted model
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [06:48<00:00,  1.41it/s]

Score 0.5031898788927336 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Double <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.30it/s]


SCORE 0.5193370165745856 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.30it/s]



SCORE 0.5039463299131808 / 1.00
ENDING FIRST UNK & STARTING SECOND
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at 6PM and <unk> so he's cutting it close. Luckily, the 6PM event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both contacts and <unk>, but the cleaning solution was empty so I wore the contacts today.

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.36it/s]


SCORE 0.6953433307024467 / 1.00

PERFORMING Single <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.36it/s]



SCORE 0.5481452249408051 / 1.00
Deleted model
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [06:48<00:00,  1.41it/s]

Score 0.4989727508650519 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Single <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.30it/s]


SCORE 0.5047355958958168 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.30it/s]



SCORE 0.4877663772691397 / 1.00
ENDING UNK TESTS AND STARTING STUPID MASKS
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at mask_a and mask_b so he's cutting it close. Luckily, the mask_a event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both mask_a and mask_b, but the cleaning solution was empty so I wore the mask_a today.
TESTING:

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.30it/s]


SCORE 0.48421468034727705 / 1.00

PERFORMING (mask_a, mask_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.30it/s]



SCORE 0.4988161010260458 / 1.00
Deleted model
POST TRAINING:
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [06:48<00:00,  1.41it/s]

Score 0.501189446366782 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING (mask_a, mask_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.31it/s]


SCORE 0.48382004735595896 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.31it/s]



SCORE 0.4972375690607735 / 1.00
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at thing_a and thing_b so he's cutting it close. Luckily, the thing_a event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both thing_a and thing_b, but the cleaning solution was empty so I wore the thing_a today.
TESTING:

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.30it/s]


SCORE 0.5094711917916338 / 1.00

PERFORMING (thing_a, thing_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.31it/s]



SCORE 0.5134175217048145 / 1.00
Deleted model
POST TRAINING:
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [06:48<00:00,  1.41it/s]

Score 0.5015679065743944 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING (thing_a, thing_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.31it/s]


SCORE 0.4976322020520916 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.31it/s]



SCORE 0.5063141278610892 / 1.00
REMOVING MODEL
NOW DOING albert-base-v2


Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

BEGIN OG TRAIN


Downloading:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.bias', 'predictions.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:03<00:00,  4.69it/s]

Score 0.49648572664359863 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.63it/s]


SCORE 0.494869771112865 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.64it/s]



SCORE 0.5078926598263614 / 1.00
Epoch 1


100%|██████████| 578/578 [02:03<00:00,  4.69it/s]

Score 0.5033520761245674 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.65it/s]


SCORE 0.521310181531176 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.62it/s]



SCORE 0.5082872928176796 / 1.00
Epoch 2


100%|██████████| 578/578 [02:03<00:00,  4.69it/s]

Score 0.49935121107266434 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.64it/s]


SCORE 0.4956590370955012 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.64it/s]



SCORE 0.4925019731649566 / 1.00
SAVED
STARTING FIRST UNK
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at <unk> and 6PM so he's cutting it close. Luckily, the <unk> event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both <unk> and contacts, but the cleaning solution was empty so I wore the <unk> today.

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.36it/s]


SCORE 0.494869771112865 / 1.00

PERFORMING Double <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.36it/s]



SCORE 0.4988161010260458 / 1.00
Deleted model
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:03<00:00,  4.69it/s]

Score 0.5026492214532872 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Double <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.62it/s]


SCORE 0.4913180741910024 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.65it/s]



SCORE 0.5205209155485399 / 1.00
ENDING FIRST UNK & STARTING SECOND
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at 6PM and <unk> so he's cutting it close. Luckily, the 6PM event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both contacts and <unk>, but the cleaning solution was empty so I wore the contacts today.

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.35it/s]


SCORE 0.494869771112865 / 1.00

PERFORMING Single <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.35it/s]



SCORE 0.5035516969218626 / 1.00
Deleted model
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:03<00:00,  4.67it/s]

Score 0.501784169550173 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Single <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.57it/s]


SCORE 0.5071033938437254 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.57it/s]



SCORE 0.4905288082083662 / 1.00
ENDING UNK TESTS AND STARTING STUPID MASKS
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at mask_a and mask_b so he's cutting it close. Luckily, the mask_a event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both mask_a and mask_b, but the cleaning solution was empty so I wore the mask_a today.
TESTING:

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.55it/s]


SCORE 0.5071033938437254 / 1.00

PERFORMING (mask_a, mask_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.58it/s]



SCORE 0.48539857932123126 / 1.00
Deleted model
POST TRAINING:
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:03<00:00,  4.68it/s]

Score 0.5043793252595156 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING (mask_a, mask_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.62it/s]


SCORE 0.5035516969218626 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.66it/s]



SCORE 0.5 / 1.00
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at thing_a and thing_b so he's cutting it close. Luckily, the thing_a event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both thing_a and thing_b, but the cleaning solution was empty so I wore the thing_a today.
TESTING:

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.67it/s]


SCORE 0.5011838989739542 / 1.00

PERFORMING (thing_a, thing_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.64it/s]



SCORE 0.4913180741910024 / 1.00
Deleted model
POST TRAINING:
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:03<00:00,  4.67it/s]

Score 0.4969182525951557 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING (thing_a, thing_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.57it/s]


SCORE 0.51026045777427 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:06<00:00, 12.57it/s]



SCORE 0.5063141278610892 / 1.00
REMOVING MODEL
NOW DOING bert-base-cased


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

BEGIN OG TRAIN


Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:02<00:00,  4.72it/s]

Score 0.5060553633217993 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.94it/s]


SCORE 0.4928966061562747 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 14.01it/s]



SCORE 0.48855564325177586 / 1.00
Epoch 1


100%|██████████| 578/578 [02:02<00:00,  4.73it/s]

Score 0.4969182525951557 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.98it/s]


SCORE 0.5177584846093133 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.97it/s]



SCORE 0.48184688239936857 / 1.00
Epoch 2


100%|██████████| 578/578 [02:02<00:00,  4.72it/s]

Score 0.5068122837370242 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.98it/s]


SCORE 0.4936858721389108 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.88it/s]



SCORE 0.5063141278610892 / 1.00
SAVED
STARTING FIRST UNK
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at [UNK] and 6PM so he's cutting it close. Luckily, the [UNK] event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both [UNK] and contacts, but the cleaning solution was empty so I wore the [UNK] today.

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.34it/s]


SCORE 0.5043409629044988 / 1.00

PERFORMING Double <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.32it/s]



SCORE 0.5015785319652722 / 1.00
Deleted model
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:02<00:00,  4.71it/s]

Score 0.5014597750865052 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Double <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.94it/s]


SCORE 0.48855564325177586 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.74it/s]



SCORE 0.4940805051302289 / 1.00
ENDING FIRST UNK & STARTING SECOND
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at 6PM and [UNK] so he's cutting it close. Luckily, the 6PM event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both contacts and [UNK], but the cleaning solution was empty so I wore the contacts today.

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.35it/s]


SCORE 0.5043409629044988 / 1.00

PERFORMING Single <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.35it/s]



SCORE 0.505130228887135 / 1.00
Deleted model
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:02<00:00,  4.71it/s]

Score 0.4937824394463668 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Single <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.96it/s]


SCORE 0.49171270718232046 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.94it/s]



SCORE 0.47868981846882397 / 1.00
ENDING UNK TESTS AND STARTING STUPID MASKS
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at mask_a and mask_b so he's cutting it close. Luckily, the mask_a event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both mask_a and mask_b, but the cleaning solution was empty so I wore the mask_a today.
TESTING:

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.93it/s]


SCORE 0.4996053670086819 / 1.00

PERFORMING (mask_a, mask_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.97it/s]



SCORE 0.4905288082083662 / 1.00
Deleted model
POST TRAINING:
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:02<00:00,  4.72it/s]

Score 0.49789143598615915 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING (mask_a, mask_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.91it/s]


SCORE 0.515785319652723 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.91it/s]



SCORE 0.5126282557221784 / 1.00
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at thing_a and thing_b so he's cutting it close. Luckily, the thing_a event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both thing_a and thing_b, but the cleaning solution was empty so I wore the thing_a today.
TESTING:

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.92it/s]


SCORE 0.49329123914759276 / 1.00

PERFORMING (thing_a, thing_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.90it/s]



SCORE 0.4936858721389108 / 1.00
Deleted model
POST TRAINING:
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:02<00:00,  4.72it/s]

Score 0.5037305363321799 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING (thing_a, thing_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.93it/s]


SCORE 0.5098658247829518 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:05<00:00, 13.90it/s]



SCORE 0.505524861878453 / 1.00
REMOVING MODEL
NOW DOING xlnet-base-cased


Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

BEGIN OG TRAIN


Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:25<00:00,  3.97it/s]

Score 0.49594506920415227 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.92it/s]


SCORE 0.48500394632991317 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.90it/s]



SCORE 0.4996053670086819 / 1.00
Epoch 1


100%|██████████| 578/578 [02:25<00:00,  3.98it/s]

Score 0.4986483564013841 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.93it/s]


SCORE 0.5063141278610892 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.91it/s]



SCORE 0.4928966061562747 / 1.00
Epoch 2


100%|██████████| 578/578 [02:25<00:00,  3.98it/s]

Score 0.5084883217993079 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.88it/s]


SCORE 0.5094711917916338 / 1.00

PERFORMING Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.94it/s]



SCORE 0.5035516969218626 / 1.00
SAVED
STARTING FIRST UNK
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at <unk> and 6PM so he's cutting it close. Luckily, the <unk> event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both <unk> and contacts, but the cleaning solution was empty so I wore the <unk> today.

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.34it/s]


SCORE 0.4996053670086819 / 1.00

PERFORMING Double <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.33it/s]



SCORE 0.4984214680347277 / 1.00
Deleted model
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:25<00:00,  3.97it/s]

Score 0.5033520761245674 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Double <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.86it/s]


SCORE 0.4956590370955012 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.89it/s]



SCORE 0.4972375690607735 / 1.00
ENDING FIRST UNK & STARTING SECOND
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at 6PM and <unk> so he's cutting it close. Luckily, the 6PM event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both contacts and <unk>, but the cleaning solution was empty so I wore the contacts today.

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.33it/s]


SCORE 0.4996053670086819 / 1.00

PERFORMING Single <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:18<00:00,  4.34it/s]



SCORE 0.5067087608524072 / 1.00
Deleted model
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:25<00:00,  3.97it/s]

Score 0.5012975778546713 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING Single <Unk> Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.86it/s]


SCORE 0.5 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.88it/s]



SCORE 0.5094711917916338 / 1.00
ENDING UNK TESTS AND STARTING STUPID MASKS
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at mask_a and mask_b so he's cutting it close. Luckily, the mask_a event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both mask_a and mask_b, but the cleaning solution was empty so I wore the mask_a today.
TESTING:

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.91it/s]


SCORE 0.5126282557221784 / 1.00

PERFORMING (mask_a, mask_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.93it/s]



SCORE 0.4925019731649566 / 1.00
Deleted model
POST TRAINING:
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:25<00:00,  3.97it/s]

Score 0.5003243944636678 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING (mask_a, mask_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.88it/s]


SCORE 0.5007892659826362 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.89it/s]



SCORE 0.5023677979479084 / 1.00
[4PM], [6PM], [Joe had two events to go to at 4PM and 6PM so he's cutting it close. Luckily, the 4PM event started late.]
Joe had two events to go to at thing_a and thing_b so he's cutting it close. Luckily, the thing_a event started late.
[glasses], [contacts], [I have both glasses and contacts, but the cleaning solution was empty so I wore the glasses today.]
I have both thing_a and thing_b, but the cleaning solution was empty so I wore the thing_a today.
TESTING:

TESTING TESTING  1 2 3 ...

PERFORMING Standard Dataset


  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.87it/s]


SCORE 0.4925019731649566 / 1.00

PERFORMING (thing_a, thing_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.90it/s]



SCORE 0.49013417521704816 / 1.00
Deleted model
POST TRAINING:
copy


  0%|          | 0/19 [00:00<?, ?ba/s]

Epoch 0


100%|██████████| 578/578 [02:25<00:00,  3.97it/s]

Score 0.5012975778546713 / 1.00

TESTING TESTING  1 2 3 ...

PERFORMING (thing_a, thing_b) Masked Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.86it/s]


SCORE 0.4996053670086819 / 1.00

PERFORMING Standard Dataset





  0%|          | 0/3 [00:00<?, ?ba/s]

100%|██████████| 80/80 [00:07<00:00, 10.81it/s]



SCORE 0.489344909234412 / 1.00
REMOVING MODEL
NOW DOING xlm-mlm-en-2048


Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/840 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/631k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475k [00:00<?, ?B/s]

REMOVING MODEL


NameError: ignored

Results:

| Masking Scheme | Accuracy (before training) | Accuracy (after training on silly set) |
|:-------:|:------------------:|:----------------:|
|Standard set| 75% | 75% |
|(Alice, Bob)| 73% | 73% |
|(primero, secundo)| 72% | 75% |
| (X, Y) | 72% | 74% |
| (A, B) | 72% | 73% |
| (1, 2) | 71% | 74% |
|(red, blue) | 71% | 73% |
|(#, @)| 71% | 73% |
|(alpha, beta)| 71% | 73% |
|(mask_a, mask_b) | 70% | 75% | 
|(thing_a, thing_b) | 70% | 73% |
|(first, second)| 69% | 73% |
|(\_, \_\_)| 67% | 73% |
|(dog, doggy)| 65% | 73% | 
|(flavor, flavour)| 53% | 74% |



| Masking Scheme | Accuracy (before training) | Accuracy (after training on \<unk\> masked set) |
|:-------:|:------------------:|:----------------:|
|\<unk\> as subj| 65% | 73% |
|\<unk\> as obj| 68% | 73% |


