<a href="https://colab.research.google.com/github/jsvan/MaskTests4LanguageModels/blob/main/Extension_of_Summerwinograndeattempt_to_more_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this extension I will try out more models, first tuning generic language models on the debiased dataset. 

In [1]:
%%capture
!pip install transformers
!pip install datasets 

In [2]:
# Textizer
"""
    Each sentence was split on "_" placeholder symbol.
    Each option was concatenated with the second part of the split, thus transforming each example into two text segment pairs.
    Text segment pairs corresponding to correct and incorrect options were marked with True and False labels accordingly.
    Text segment pairs were shuffled thereafter.

"""

from datasets import Dataset

def prepare_data(dataset):

  # internal function
  def prep_ds(dataset):
    sentences, answers, o1, o2 = [], [], [], []
    for p in dataset:
      s1 = p['option1'].join(p['sentence'].split('_'))
      s2 = p['option2'].join(p['sentence'].split('_'))
      a1 = int(p['answer'] == '1')
      a2 = int(p['answer'] == '2')

      
      sentences.append(s1)
      answers.append(a1)
      sentences.append(s2)
      answers.append(a2)
      o1.append(p['option1'])
      o2.append(p['option2'])
      o1.append(p['option1'])
      o2.append(p['option2'])

    return {'sentence':sentences, 'labels':answers, 'option1':o1, 'option2':o2}
    # end internal function

  train = prep_ds(dataset["train"])
  test = prep_ds(dataset["validation"])
  trainds = Dataset.from_dict( train ).shuffle()
  testds = Dataset.from_dict( test ).shuffle()

  return {"train":trainds, "test":testds, "name":"Standard Dataset"}


In [3]:

def mask_datasets(dataset):

  # internal method BEGIN
  def mask_copy(dataset):
    sentences = []
    toprint = 1

    for p in dataset:
      if toprint > 0:
        print(f"[{p['option1']}], [{p['option2']}], [{p['sentence']}]")

      sentences.append(p['sentence'].replace(p['option1'], 'option1').replace(p['option2'], 'option2'))
      
      if toprint > 0:
        print(sentences[-1])
        toprint -= 1

    build = {'sentence':sentences, 'labels':dataset['labels'], 'option1':dataset['option1'], 'option2':dataset['option2']}
    return Dataset.from_dict(build) # DON'T SHUFFLE
  #END
  
  return {"train":mask_copy(dataset['train']), "test":mask_copy(dataset['test']), "name":"Masked Dataset"}



In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from pprint import pprint
from tqdm import tqdm

""" 
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("roberta-large")

model = AutoModelForMaskedLM.from_pretrained("roberta-large")
"""


def test_datasets(tokenizer, std_datasets, masked_datasets, model=False,):

  print("\nTESTING TESTING  1 2 3 ...")
  delete = False
  if not model:
    model = AutoModelForSequenceClassification.from_pretrained("DeepPavlov/roberta-large-winogrande")
    delete = True

  elif isinstance(model, str):
    delete = True
    with torch.no_grad():
      torch.cuda.empty_cache()

    model = torch.load(model) # open(model, "rb"))

    with torch.no_grad():
      torch.cuda.empty_cache()
  try:
    for ds in (std_datasets, masked_datasets):
      print("\nPERFORMING", ds["name"])
      
      #combinedtraintest = concatenate_datasets([ds['train'], ds['test']])
      #encoded_train = combinedtraintest.map(lambda examples: tokenizer(examples['sentence'], padding='max_length'), batched=True) # , return_tensors='pt'
      encoded_test = ds['test'].map(lambda examples: tokenizer(examples['sentence'], padding='max_length'), batched=True)

      #encoded_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
      encoded_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
      #dataloader_train = torch.utils.data.DataLoader(encoded_train, batch_size=32)
      dataloader_test = torch.utils.data.DataLoader(encoded_test, batch_size=32)

      device = 'cuda' if torch.cuda.is_available() else 'cpu' 
      #model.train().to(device)
      model.to(device)
      #optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

      correct = 0
      total = 0
      for i, batch in enumerate(tqdm(dataloader_test)):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        #for oo, lab in zip(outputs.logits, batch['labels'] ):
        #  print(oo.argmax().item(), lab.items())
        correct += sum((int(x.argmax().item() == y.item()) for x, y in zip(outputs.logits, batch['labels'])))
        total += len(batch['labels'])
        
        
      print("\nSCORE", correct / total, '/ 1.00')

    if delete:
      del model
      print("Deleted model")

  except Exception as e:
    del model
    print(e, e.__str__)
    print("Deleted model")




In [None]:

def train_model(trainingset, compareset, tokenizer, model=False, copy=False, epochs=1):
  print('\n\nTRAIN:\n', '\n'.join(trainingset['train']['sentence'][0:10]))
  print('\n\nLABELS:\n', '\n'.join([str(x) for x in trainingset['train']['labels'][0:10]]))

  print('\n\nTEST:\n', '\n'.join(trainingset['test']['sentence'][0:10]))
  print('\n\nLABELS:\n', '\n'.join([str(x) for x in trainingset['test']['labels'][0:10]]))


  if copy: # ie, if copy and model
    print('DUPLICATING MODEL')

    with torch.no_grad():
      torch.cuda.empty_cache()
    model = torch.load(model) # open(model, "rb"))
    
  elif model and isinstance(model, str): # ie if str(model) AND NOT copy:
    print("DOWNLOADING MODEL")
    # download new model of name
    model = AutoModelForSequenceClassification.from_pretrained(model)

  try:
    encoded_train = trainingset['train'].map(lambda examples: tokenizer(examples['sentence'], padding='max_length'), batched=True) # , return_tensors='pt'
    encoded_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    dataloader_train = torch.utils.data.DataLoader(encoded_train, batch_size=32)

    device = 'cuda' if torch.cuda.is_available() else 'cpu' 
    model.train().to(device)
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

    for epoch in range(epochs):
      print("EPOCH", epoch+1, '/', epochs)
      correct = 0
      total = 0
      for i, batch in enumerate(tqdm(dataloader_train)):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i < 1:
          for oo, lab in zip(outputs.logits, batch['labels'] ):
            print(oo.argmax().item(), lab.item())
        correct += sum((int(x.argmax().item() == y.item()) for x, y in zip(outputs.logits, batch['labels'])))
        total += len(batch['labels'])
        
      print("Score", correct / total, '/ 1.00')
      test_datasets(tokenizer, trainingset, compareset, model)
      model.train().to(device)
  except Exception as e:
    del model
    print(e, e.__str__)
    print("deleted model")
  if copy:
    del model
  else:
    return model




In [6]:
import re

def mask_copy_1(dataset, unk):
  sentences = []
  toprint = 1

  for p in dataset:
    # This is annoying because option1 can be substrings of other words, usually option2. They can also have uppercase letters. 
    # If one is a substring, then I will cover up the larger word with a temporary mask to not confuse anything else.
    sentence = p['sentence']
    option1, option2 = re.compile(p['option1'], re.IGNORECASE), re.compile(p['option2'], re.IGNORECASE)
    maskedoption1, maskedoption2 = "OPTION_ONE", "OPTION_TWO"
    first_search, second_search, first_mask, second_mask = None, None, None, None

    # "table" in "tablecloth" --> cover bigger one
    if len(p['option1']) > len(p['option2']):
      # cover option1 first
      first_search, second_search = option1, option2
      first_mask, second_mask = maskedoption1, maskedoption2
    else:
      first_search, second_search = option2, option1
      first_mask, second_mask = maskedoption2, maskedoption1
    
    sentence = first_search.sub(first_mask, sentence) #Mask the longer word with OPTION_ mask
    sentence = second_search.sub(second_mask, sentence) #then the shorter one

    # IT gets kinda confusing which word now to <IGNORE> because it's language and some words appear more than two, more than three, times. 
    # I think the smartest approach is to assume the final word used is the question word and mask that one. 

    # Find final word used
    # maskedoption1 is the final word
    if sentence.rfind(maskedoption1) > sentence.rfind(maskedoption2):
      # IGNORE final word
      # Convert other word back to original
      sentence = sentence.replace(maskedoption1, unk)
      sentence = sentence.replace(maskedoption2, p['option2'])
    else:
      sentence = sentence.replace(maskedoption2, unk)
      sentence = sentence.replace(maskedoption1, p['option2'])


    if toprint > 0:
      print(f"[{p['option1']}], [{p['option2']}], [{p['sentence']}]")

    sentences.append(sentence)
    
    if toprint > 0:
      print(sentences[-1])
      toprint -= 1

  build = {'sentence':sentences, 'labels':dataset['labels'], 'option1':dataset['option1'], 'option2':dataset['option2']}
  return Dataset.from_dict(build) # DON'T SHUFFLE





def mask_datasets_1(dataset, tokenizer):
  unk = tokenizer.unk_token
  return {"train":mask_copy_1(dataset['train'], unk), "test":mask_copy_1(dataset['test'], unk), "name":"Double <Unk> Masked Dataset"}

In [7]:
import re


# This masked <unk> tokens on the option which is NOT involved in the question. 
# If the option involved in the question is wrong, it will need to identify the <unk> token as having importance.
def mask_copy_2(dataset, unk):
  sentences = []
  toprint = 1

  for p in dataset:
    # This is annoying because option1 can be substrings of other words, usually option2. They can also have uppercase letters. 
    # If one is a substring, then I will cover up the larger word with a temporary mask to not confuse anything else.
    sentence = p['sentence']
    option1, option2 = re.compile(p['option1'], re.IGNORECASE), re.compile(p['option2'], re.IGNORECASE)
    maskedoption1, maskedoption2 = "OPTION_ONE", "OPTION_TWO"
    first_search, second_search, first_mask, second_mask = None, None, None, None

    # "table" in "tablecloth" --> cover bigger one
    if len(p['option1']) > len(p['option2']):
      # cover option1 first
      first_search, second_search = option1, option2
      first_mask, second_mask = maskedoption1, maskedoption2
    else:
      first_search, second_search = option2, option1
      first_mask, second_mask = maskedoption2, maskedoption1
    
    sentence = first_search.sub(first_mask, sentence) #Mask the longer word with OPTION_ mask
    sentence = second_search.sub(second_mask, sentence) #then the shorter one

    # IT gets kinda confusing which word now to <IGNORE> because it's language and some words appear more than two, more than three, times. 
    # I think the smartest approach is to assume the final word used is the question word and mask that one. 

    # Find final word used
    # maskedoption1 is NOT the final word
    if sentence.rfind(maskedoption1) < sentence.rfind(maskedoption2):
      # IGNORE final word
      # Convert other word back to original
      sentence = sentence.replace(maskedoption1, unk)
      sentence = sentence.replace(maskedoption2, p['option2'])
    else:
      sentence = sentence.replace(maskedoption2, unk)
      sentence = sentence.replace(maskedoption1, p['option2'])


    if toprint > 0:
      print(f"[{p['option1']}], [{p['option2']}], [{p['sentence']}]")

    sentences.append(sentence)
    
    if toprint > 0:
      print(sentences[-1])
      toprint -= 1

  build = {'sentence':sentences, 'labels':dataset['labels'], 'option1':dataset['option1'], 'option2':dataset['option2']}
  return Dataset.from_dict(build) # DON'T SHUFFLE





def mask_datasets_2(dataset, tokenizer):
  unk = tokenizer.unk_token
  return {"train":mask_copy_2(dataset['train'], unk), "test":mask_copy_2(dataset['test'], unk), "name":"Single <Unk> Masked Dataset"}

In [8]:
stupid_tries = [
  ("dog", "doggy"),
  ("red", "blue"),
  ("flavor", "flavour"),
  ('A', 'B'),
  ('X', 'Y'),
  ('1', '2'),
  ('first', 'second'),
  ('alpha', 'beta'),
  ('#', '@'),
  ('primero', 'secundo'), # yes its true, I dont speak spanish. Having it spelled wrong just makes the model's task that much more difficult ;)
  ('Alice', 'Bob'),
  ('_', '__'),
  ('mask1', 'mask2'),
  ('thing1', 'thing2'),
  ('mask_a', 'mask_b'),
  ("thing_a", "thing_b")
]


In [9]:
import re


# This masked <unk> tokens on the option which is NOT involved in the question. 
# If the option involved in the question is wrong, it will need to identify the <unk> token as having importance.
def stupid_masking(dataset, mask1, mask2):
  sentences = []
  toprint = 1

  for p in dataset:
    # This is annoying because option1 can be substrings of other words, usually option2. They can also have uppercase letters. 
    # If one is a substring, then I will cover up the larger word with a temporary mask to not confuse anything else.
    sentence = p['sentence']
    option1, option2 = re.compile(p['option1'], re.IGNORECASE), re.compile(p['option2'], re.IGNORECASE)
    maskedoption1, maskedoption2 = "OPTION_ONE", "OPTION_TWO"
    first_search, second_search, first_mask, second_mask = None, None, None, None

    # "table" in "tablecloth" --> cover bigger one
    if len(p['option1']) > len(p['option2']):
      # cover option1 first
      first_search, second_search = option1, option2
      first_mask, second_mask = maskedoption1, maskedoption2
    else:
      first_search, second_search = option2, option1
      first_mask, second_mask = maskedoption2, maskedoption1
    
    sentence = first_search.sub(first_mask, sentence) #Mask the longer word with OPTION_ mask
    sentence = second_search.sub(second_mask, sentence) #then the shorter one

    sentence = sentence.replace(maskedoption1, mask1)
    sentence = sentence.replace(maskedoption2, mask2)


    if toprint > 0:
      print(f"[{p['option1']}], [{p['option2']}], [{p['sentence']}]")

    sentences.append(sentence)
    
    if toprint > 0:
      print(sentences[-1])
      toprint -= 1

  build = {'sentence':sentences, 'labels':dataset['labels'], 'option1':dataset['option1'], 'option2':dataset['option2']}
  return Dataset.from_dict(build) # DON'T SHUFFLE





def stupid_datasets(dataset, tokenizer, masks):
  mask1, mask2 = masks
  return {"train":stupid_masking(dataset['train'], mask1, mask2), "test":stupid_masking(dataset['test'], mask1, mask2), "name":f"({mask1}, {mask2}) Masked Dataset"}

In [10]:
%%capture
from datasets import load_dataset, concatenate_datasets

# You can switch between these two datasets
dataset = load_dataset("winogrande", 'winogrande_debiased')
#dataset = load_dataset("winogrande", 'winogrande_l')

# dataset is dict with keys ['train', 'test', 'validation']
# Each with an enumerable of 
"""
{'answer': '2',
 'option1': 'Kyle',
 'option2': 'Logan',
 'sentence': "Kyle doesn't wear leg warmers to bed, while Logan almost always does. _ is more likely to live in a colder climate."}

"""
# Use Validation instead of Test because Test lacks labels.


#dicts of {'train':_, 'test':_}
std_datasets = prepare_data(dataset)

masked_datasets = mask_datasets(std_datasets)

In [None]:
import os

# This model will become the base model for everything we build upon, 
# so we will save it to disk to load it fresh for each experiment. 
model_list = [#"DeepPavlov/roberta-large-winogrande", 
              "roberta-large",
              "distilbert-base-uncased", 
              "albert-base-v2", 
              'bert-base-cased', 
              'xlnet-base-cased', 
            ]

for model_name_orig in model_list:
  try:
    print("NOW DOING", model_name_orig)
    # download model and save to memory. Models must be removed from cache to prevent gpu mem errors.
    # fine tune the model to the debiased dataset
    tokenizer = AutoTokenizer.from_pretrained(model_name_orig, model_max_length=64)
    print("BEGIN OG TRAIN")
    debiased_tuned_model = train_model(trainingset=std_datasets, compareset=masked_datasets, tokenizer=tokenizer, model=model_name_orig, copy=False, epochs=3)
    break



    print("SAVED")
    model_name = ''.join([c for c in model_name if c.isalpha()])
    torch.save(debiased_tuned_model, f=model_name)
    debiased_tuned_model = None


    # Run individual tests 
    print("STARTING FIRST UNK")
    unk_datasets = mask_datasets_1(std_datasets, tokenizer)
    test_datasets(tokenizer, std_datasets, unk_datasets, debiased_tuned_model)
    train_model(unk_datasets, std_datasets, tokenizer, model=model_name, copy = True)
    print("ENDING FIRST UNK & STARTING SECOND")

    unk_dataset_2 = mask_datasets_2(std_datasets, tokenizer)
    test_datasets(tokenizer, std_datasets, unk_dataset_2, debiased_tuned_model)
    train_model(unk_dataset_2, std_datasets, tokenizer, model=model_name, copy = True)

    print("ENDING UNK TESTS AND STARTING STUPID MASKS")

    for masks in stupid_tries[-2:]:
      stupid_ds = stupid_datasets(std_datasets, tokenizer, masks)
      print("TESTING:")
      test_datasets(tokenizer, std_datasets, stupid_ds, model=model_name)
      print("POST TRAINING:")
      train_model(stupid_ds, std_datasets, tokenizer, model=model_name, copy = True)

  finally:
    print("REMOVING MODEL")
    del tokenizer
    os.remove(model_name)


  break

NOW DOING roberta-large
BEGIN OG TRAIN


TRAIN:
 Joel didn't want to let Logan play with the slot machine seeing as how Logan was on a hot streak.
In geometry class Craig commented that Brett's nose is too big, resulting in Craig giving him a shiner.
Making sure to not have any weeds in the garden was important to Victoria but not Elena because Elena entertained a lot in the garden.
Maria asked Megan if she would help clean the garage because Megan wanted to get work done around the house.
Wendy preferred to read the story than to watch the TV because the TV was quiet and wouldn't disturb anyone.
James took the butter over the fire and poured it over the juice from the freezer. The butter is frozen.
Eric found a lot of success in telemarketing products while Randy was reclusive since Randy was extroverted.
Mike and friends practiced basketball out in the field instead of gym during rain, even though the gym is dry.
Monica stretched their muscles before they worked out but Sarah didn't 

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

  0%|          | 0/19 [00:00<?, ?ba/s]

EPOCH 1 / 3


  0%|          | 1/578 [00:00<07:19,  1.31it/s]

0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 1
0 1
0 1
1 0
0 0
0 1
1 0
1 1
0 1
0 1
0 0
0 1
0 0
0 1
0 1
0 1
1 0
0 0
0 1
0 0
0 0
0 0
0 0
0 1
0 0


  0%|          | 2/578 [00:01<07:12,  1.33it/s]

0 0
0 0
0 0
0 0
0 1
0 0
0 1
1 0
0 1
0 0
0 1
0 0
0 1
0 0
0 0
0 1
0 0
1 1
0 1
0 1
0 1
0 0
0 1
0 1
0 1
0 0
1 0
0 0
0 0
0 0
1 0
0 0


  1%|          | 3/578 [00:02<07:09,  1.34it/s]

0 0
0 1
0 0
0 1
0 0
1 1
0 0
0 1
0 1
0 0
1 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 1
1 0
0 0
0 0
0 0
0 1
0 1
0 0
0 1
0 0
0 0
0 1


  1%|          | 4/578 [00:02<07:08,  1.34it/s]

0 1
0 0
0 0
0 0
0 0
0 1
0 1
1 0
0 0
0 0
0 0
0 1
0 1
0 0
0 1
0 1
0 1
0 1
0 1
0 0
0 1
1 1
0 1
0 1
0 0
0 0
1 1
0 0
0 0
0 1
0 1
0 1


  1%|          | 5/578 [00:03<07:01,  1.36it/s]

0 1
0 0
0 0
1 0
0 0
0 1
1 0
0 1
0 0
0 0
1 0
0 1
0 1
0 1
0 1
0 1
0 0
0 0
0 1
0 1
0 1
0 0
1 1
1 0
1 0
0 1
1 0
1 1
0 1
0 1
0 0
0 1


  1%|          | 6/578 [00:04<07:06,  1.34it/s]

0 0
0 0
0 1
0 1
0 0
0 0
1 0
1 1
0 0
0 1
1 1
1 0
1 1
1 0
0 1
1 0
1 0
1 0
1 0
0 0
0 0
1 1
0 1
1 0
0 1
1 0
1 1
0 0
0 0
1 1
1 0
1 0


  1%|          | 7/578 [00:05<07:02,  1.35it/s]

1 0
1 1
0 0
1 1
0 0
1 0
0 1
0 0
0 0
0 0
0 1
0 0
0 1
0 1
0 1
0 0
0 1
0 0
0 0
0 0
0 0
0 1
0 1
0 1
1 0
1 0
0 1
1 1
0 0
0 0
0 1
0 1


  1%|▏         | 8/578 [00:05<07:05,  1.34it/s]

0 1
1 1
0 0
0 1
0 1
0 0
1 1
0 0
0 0
0 1
0 1
1 1
1 0
0 0
0 0
0 1
0 0
0 0
1 1
1 0
0 0
1 1
1 1
1 1
0 0
0 0
0 0
0 1
0 0
0 1
1 0
0 0


  2%|▏         | 9/578 [00:06<07:03,  1.34it/s]

1 1
0 0
1 1
0 0
0 0
1 1
1 1
0 1
1 0
1 0
1 0
0 1
1 0
0 1
1 0
0 0
0 0
0 0
1 0
0 1
1 0
0 0
1 1
1 0
0 0
0 1
0 0
1 1
1 0
0 1
0 0
0 0


  2%|▏         | 10/578 [00:07<07:02,  1.34it/s]

1 1
0 0
0 0
1 0
0 1
0 1
1 1
0 1
0 0
0 1
0 1
0 1
0 1
0 1
0 1
0 1
0 0
0 1
0 0
0 0
0 0
0 1
0 0
0 0
0 1
0 0
0 0
0 1
0 0
0 0
0 0
0 1


 46%|████▌     | 266/578 [03:08<03:40,  1.41it/s]

Results:

| Masking Scheme | Accuracy (before training) | Accuracy (after training on silly set) |
|:-------:|:------------------:|:----------------:|
|Standard set| 75% | 75% |
|(Alice, Bob)| 73% | 73% |
|(primero, secundo)| 72% | 75% |
| (X, Y) | 72% | 74% |
| (A, B) | 72% | 73% |
| (1, 2) | 71% | 74% |
|(red, blue) | 71% | 73% |
|(#, @)| 71% | 73% |
|(alpha, beta)| 71% | 73% |
|(mask_a, mask_b) | 70% | 75% | 
|(thing_a, thing_b) | 70% | 73% |
|(first, second)| 69% | 73% |
|(\_, \_\_)| 67% | 73% |
|(dog, doggy)| 65% | 73% | 
|(flavor, flavour)| 53% | 74% |



| Masking Scheme | Accuracy (before training) | Accuracy (after training on \<unk\> masked set) |
|:-------:|:------------------:|:----------------:|
|\<unk\> as subj| 65% | 73% |
|\<unk\> as obj| 68% | 73% |


