In [None]:
# Group: John Strenio, Scott Klinn, Tuan Nguyen
# Commonsense QA Finetuning v4
# CS 510: Adventures in NLP
# Professor Ameeta Agrawal
# Contributors: John Strenio

# =========== Summary & Instructions ===============================
# This notebook provides a number of functions revolving around the
# fine-tuning of a pretrained BERT model. The first 2 cells train and
# save a model, but because the notebook causes system failures at
# scale they are generally not used; 'commonsense_train.py' is
# currently used for training instead. Testing, however, is performed
# here using the final 3 cells, which load the CommonsenseQA
# validation set to the standard specified by the authors of our
# paper (the last 611 examples of the validation set), define a
# function that builds a test question from a dataset example, and
# test a specified model. The final 3 cells can be run on their own
# for testing only.
# ==================================================================

# This cell uses huggingface/pytorch to prepare the commonsense qa dataset
# for ingestion into a BERT model with a next-sentence-prediction (NSP) head.

# Checkpoint shared by the tokenizer and by the model loaded further down.
# It has to be a BERT checkpoint: the model is instantiated with
# BertForNextSentencePrediction, and a DistilBERT checkpoint has neither an NSP
# head nor weight names that match the BERT architecture, so loading one would
# leave the network (almost) randomly initialised instead of pretrained.
# The DistilBERT tokenizer also does not emit token_type_ids, which BERT uses to
# tell the question apart from the answer. This is the same checkpoint the
# evaluation cell uses for its tokenizer.
model_checkpoint = 'bert-base-uncased'

# loads the dataset (downloads it if you don't have it)
from datasets import load_dataset, load_metric, Dataset
dataset = load_dataset("commonsense_qa")

# AutoTokenizer selects the tokenizer class that matches model_checkpoint
# (a BertTokenizerFast for the checkpoint above)
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# make sure we got the optimized "fast" tokenizer implementation
import transformers
import random
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

def build_sentences(example):
    """Turn one CommonsenseQA example into a correct and an incorrect sentence pair.

    Each example yields exactly two (question, answer) pairs: the correct answer
    and one wrong answer picked at random from the remaining choices (an earlier
    version produced one pair per possible answer).

    Args:
        example: mapping with the keys 'question' (str), 'answerKey' (label of
            the correct choice) and 'choices' (mapping with a 'text' list and,
            normally, a parallel 'label' list).

    Returns:
        Tuple ``(first_sentences, second_sentences, labels)`` where
        ``first_sentences`` is the question repeated twice,
        ``second_sentences`` is ``[correct_answer, wrong_answer]`` and
        ``labels`` is ``[0, 1]`` (0 = correct pairing, 1 = incorrect pairing).

    Raises:
        ValueError: if 'answerKey' does not identify one of the choices (for
            example an example without an answer key).
        IndexError: if there is no wrong choice left to sample from.
    """
    question = example['question']
    answer_key = example['answerKey']
    # work on a copy so the caller's example is left untouched
    remaining = list(example['choices']['text'])

    if 'label' in example['choices']:
        # locate the answer through the label list instead of assuming that the
        # labels are exactly 'A', 'B', 'C', ... in order
        choice_labels = list(example['choices']['label'])
        if answer_key not in choice_labels:
            raise ValueError(
                'answerKey %r is not one of the choice labels %r' % (answer_key, choice_labels)
            )
        answer_idx = choice_labels.index(answer_key)
    else:
        # no label list available: fall back to the positional 'A' -> 0, 'B' -> 1, ... scheme
        if len(answer_key) != 1 or not 0 <= ord(answer_key) - ord('A') < len(remaining):
            raise ValueError('answerKey %r does not identify a choice' % (answer_key,))
        answer_idx = ord(answer_key) - ord('A')

    # the correct answer goes first, the other answer is drawn randomly from what is left
    correct_answer = remaining.pop(answer_idx)
    wrong_answer = random.choice(remaining)

    # 1st sentence is the question, 2nd is the candidate answer
    first_sentences = [question, question]
    second_sentences = [correct_answer, wrong_answer]

    # one correct (0 label) and one incorrect (1 label) pairing per example
    labels = [0, 1]

    return first_sentences, second_sentences, labels

def encode_dataset(dataset, dset_size):
    """Tokenize the first ``dset_size`` examples of ``dataset`` as sentence pairs.

    Every example is expanded by ``build_sentences`` into its
    (question, answer) pairs, and all pairs are tokenized in a single call to
    the module-level ``tokenizer``.

    Args:
        dataset: indexable collection of examples accepted by ``build_sentences``.
        dset_size: number of leading examples to encode.

    Returns:
        Tuple ``(encodings, labels)``: the tokenizer output (PyTorch tensors,
        padded to the longest pair, truncated where needed) and a
        ``torch.LongTensor`` holding one label per encoded pair.
    """
    questions, answers, pair_labels = [], [], []

    for row_idx in range(dset_size):
        row_questions, row_answers, row_labels = build_sentences(dataset[row_idx])
        questions.extend(row_questions)
        answers.extend(row_answers)
        pair_labels.extend(row_labels)

    encodings = tokenizer(
        questions,
        answers,
        return_tensors='pt',
        padding="longest",
        truncation=True,
    )

    return encodings, torch.LongTensor(pair_labels)

import torch

class QA_Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name (e.g. 'input_ids') to a sequence or
            tensor that can be indexed by example.
        labels: sequence or tensor with one label per example.

    ``dataset[idx]`` returns a dict with one tensor per encoding field plus a
    'labels' entry; ``len(dataset)`` is the number of labels.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value):
        """Return ``value`` as a tensor that owns its own data."""
        # torch.tensor() on something that already is a tensor emits a
        # copy-construct UserWarning for every item fetched; detach().clone()
        # is the recommended way to make the same independent copy.
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Fixed seed so that the shuffles below and the random wrong-answer picks made
# by build_sentences are repeatable from run to run.
SEED = 42
random.seed(SEED)

dataset['train'] = dataset['train'].shuffle(seed=SEED)
dataset['validation'] = dataset['validation'].shuffle(seed=SEED)
dataset['test'] = dataset['test'].shuffle(seed=SEED)

# I've been adjusting the dataset sizes to try to better loss without having to retrain for 6 hours
encodings, encoded_labels = encode_dataset(dataset['train'], len(dataset['train'])) # normally the dataset size or some large fraction
encodings2, encoded_labels2 = encode_dataset(dataset['validation'], len(dataset['validation']))

# encode_dataset already returns the labels as a LongTensor, so they are passed through as-is
train_dataset = QA_Dataset(encodings, encoded_labels)
val_dataset = QA_Dataset(encodings2, encoded_labels2)

# fine tuning (warning references that we're removing the classification head for finetuning)
from transformers import BertForNextSentencePrediction, TrainingArguments, Trainer

model = BertForNextSentencePrediction.from_pretrained(model_checkpoint)

# training arguments
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` -- confirm against the installed version.
batch_size = 2
args = TrainingArguments(
    "test-qa",  # output directory for checkpoints
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3, #6
    weight_decay=0.01,
    save_total_limit=5
)

# data collator
from transformers import default_data_collator
data_collator = default_data_collator

# build trainer
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# this was the bandaid I found online for getting the batching to be accepted:
# release cached GPU memory and run the garbage collector before training starts
import torch
torch.cuda.empty_cache()
import gc
#del variables
gc.collect()

# training
output = trainer.train()
print(output)


In [None]:
# This cell writes the fine-tuned model to disk.
# Adjust the target directory below to wherever the model should be stored.
MODEL_SAVE_DIR = 'D:/project/test_model1'
model.save_pretrained(MODEL_SAVE_DIR)


In [None]:
# This cell defines the function that retrieves random questions from the validation set
# (currently not being used)
from datasets import load_dataset, load_metric, Dataset
dataset = load_dataset("commonsense_qa")

def get_random_question(n):
    """Print and return a question with its correct answer and one wrong answer.

    The validation split is reshuffled on every call and the example at
    position ``n`` of the shuffled split is used.

    Args:
        n: index into the freshly shuffled validation split.

    Returns:
        Tuple ``(question, cor_ans, wrg_ans)``. ``cor_ans`` is the text of the
        choice whose label equals the example's answer key; ``wrg_ans`` is the
        text of the last choice whose label differs from it. Either is ``None``
        if no such choice exists.
    """
    # fetch the row once instead of re-reading it from the dataset for every field
    row = dataset['validation'].shuffle()[n]
    question = row['question']
    answer_key = row['answerKey']

    cor_ans = None
    wrg_ans = None
    # walk over however many choices the example has (no hard-coded count)
    for label, text in zip(row['choices']['label'], row['choices']['text']):
        if label == answer_key:
            cor_ans = text
        else:
            wrg_ans = text

    print('q: ' + question)
    # f-string so that a missing answer is printed as None instead of raising a TypeError
    print(f'c: {cor_ans} w: {wrg_ans}')
    return question, cor_ans, wrg_ans



In [1]:
# This cell loads the slice of the commonsense_qa validation split that is used
# for testing and provides the helper that extracts the necessary info from a
# single instance of that dataset.

import random
from datasets import load_dataset, load_metric, Dataset

# only the last 611 examples of the validation split are used for evaluation
valid_set = load_dataset('commonsense_qa', split='validation[-611:]')

def get_question(example):
    """Extract the question, its correct answer and one random wrong answer.

    Args:
        example: mapping with the keys 'question' (str), 'answerKey' (label of
            the correct choice) and 'choices' (mapping with a 'text' list and,
            normally, a parallel 'label' list).

    Returns:
        Tuple ``(question, correct_answer, wrong_answer)``; the wrong answer is
        drawn at random from the choices that are not the correct one.

    Raises:
        ValueError: if 'answerKey' does not identify one of the choices.
        IndexError: if there is no wrong choice left to sample from.
    """
    question = example['question']
    answer_key = example['answerKey']
    # work on a copy so the caller's example is left untouched
    remaining = list(example['choices']['text'])

    if 'label' in example['choices']:
        # locate the answer through the label list instead of assuming that the
        # labels are exactly 'A', 'B', 'C', ... in order
        choice_labels = list(example['choices']['label'])
        if answer_key not in choice_labels:
            raise ValueError(
                'answerKey %r is not one of the choice labels %r' % (answer_key, choice_labels)
            )
        answer_idx = choice_labels.index(answer_key)
    else:
        # no label list available: fall back to the positional 'A' -> 0, 'B' -> 1, ... scheme
        if len(answer_key) != 1 or not 0 <= ord(answer_key) - ord('A') < len(remaining):
            raise ValueError('answerKey %r does not identify a choice' % (answer_key,))
        answer_idx = ord(answer_key) - ord('A')

    # the correct answer is taken out first, the wrong answer is selected randomly from what is left
    correct_answer = remaining.pop(answer_idx)
    wrong_answer = random.choice(remaining)

    return question, correct_answer, wrong_answer

Using custom data configuration default
Reusing dataset commonsense_qa (C:\Users\johns\.cache\huggingface\datasets\commonsense_qa\default\0.1.0\1ca2d7b680c5bd93c0dc85f9cb65c0c8817e759ff82e405b28de54e83efa80f7)


In [2]:
# This cell tests the specified model on question/answer pairings from the validation set.
# Because the original dataset has been modified, the testing structure has been as well,
# to accommodate the 2 option format. This cell can be run alone only after the load
# validation cell directly above has been executed.

from transformers import DistilBertTokenizerFast, AutoTokenizer, BertForNextSentencePrediction, AutoConfig
import torch
import random
from random import randrange

# specify the location of the model you wish to test here
model_to_test = r'finetuned_bert'

saved_model = model_to_test # this is the model being tested
model_class = 'bert-base-uncased' # this is necessary for the autotokenizer to determine which tokenizer to provide
config = AutoConfig.from_pretrained(saved_model)
model = BertForNextSentencePrediction.from_pretrained(saved_model)
tokenizer = AutoTokenizer.from_pretrained(model_class)
model.eval() # make sure dropout is disabled while scoring
cor = wrg = 0

# get_question draws the wrong answer at random; seeding here makes the reported
# accuracy repeatable every time this cell is run
random.seed(42)

def nsp_probs(question, answer):
    """Return the model's two softmax probabilities for one (question, answer) pair."""
    encoding = tokenizer(question, answer, return_tensors='pt')
    # inference only: skip building the autograd graph
    with torch.no_grad():
        logits = model(**encoding).logits
    return logits.softmax(dim=-1)[0].tolist()

# predict whether question answers are correct or not
for example in valid_set:
    question, cor_ans, wrg_ans = get_question(example)

    cor_probs = nsp_probs(question, cor_ans)
    wrg_probs = nsp_probs(question, wrg_ans)

    # custom test for 1 to 1 correct:wrong answer pairing
    print('probs for cor pair: ' + str(cor_probs) + ' ' + 'probs for wrg pair: ' + str(wrg_probs))

    # the prediction counts as correct when the correct pairing gets a higher
    # index-0 ("correct pairing") probability than the incorrect pairing
    if cor_probs[0] > wrg_probs[0]:
        cor += 1
    else:
        wrg += 1

total = cor + wrg
if total:
    print('acc: ' + str(cor / total))
else:
    # nothing was evaluated; avoid dividing by zero
    print('acc: n/a (no examples evaluated)')



310791, 0.000854235258884728] probs for wrg pair: [0.00020187886548228562, 0.9997981190681458]
probs for cor pair: [0.0055663688108325005, 0.9944337010383606] probs for wrg pair: [0.9983575940132141, 0.0016424076166003942]
probs for cor pair: [0.9950774312019348, 0.004922543186694384] probs for wrg pair: [0.00032255580299533904, 0.9996774196624756]
probs for cor pair: [0.9985520243644714, 0.0014479415258392692] probs for wrg pair: [0.9990559220314026, 0.000944117025937885]
probs for cor pair: [0.9957597851753235, 0.00424027256667614] probs for wrg pair: [0.9872180819511414, 0.012781966477632523]
probs for cor pair: [0.9940078258514404, 0.005992190446704626] probs for wrg pair: [0.0009792187483981252, 0.9990208148956299]
probs for cor pair: [0.9802155494689941, 0.019784417003393173] probs for wrg pair: [0.00012774296919815242, 0.9998722076416016]
probs for cor pair: [0.9949230551719666, 0.005076919216662645] probs for wrg pair: [0.0007006016094237566, 0.9992994070053101]
probs for cor p

In [None]:
# playing with my own examples
# (uses the `model` and `tokenizer` loaded by the testing cell above)
import torch

question = 'where would you likely get wet?'
cor_ans = 'the beach'
wrg_ans = 'the bank'

# inference only: skip building the autograd graph
with torch.no_grad():
    encoding = tokenizer(question, cor_ans, return_tensors='pt')
    outputs = model(**encoding)

    encoding2 = tokenizer(question, wrg_ans, return_tensors='pt')
    outputs2 = model(**encoding2)

# custom test for 1 to 1 correct:wrong answer pairing
# check if correct sentence pairing has a higher correct prob than incorrect pairing
out = outputs.logits.softmax(dim=-1).tolist()
out2 = outputs2.logits.softmax(dim=-1).tolist()
print('probs for cor pair: ' + str(out[0]) + ' ' + 'probs for wrg pair: ' + str(out2[0]))

# did the correct pairing score higher than the incorrect pairing?
if out[0][0] > out2[0][0]:
    print('guessed correctly')
else:
    print('guessed wrong')
