In [1]:
#!pip install sentencepiece

In [2]:
from transformers import AutoTokenizer, RobertaForSequenceClassification, AdamW
from transformers import T5Tokenizer, T5ForSequenceClassification
from transformers import AutoTokenizer, YosoForSequenceClassification

import torch
from torch import cuda
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm
import sentencepiece

In [3]:
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

from google.colab import drive
drive.mount('/content/drive')

cuda
Mounted at /content/drive


In [4]:
# Load the data
train = pd.DataFrame.from_records(np.load('/content/drive/MyDrive/NLP_Project_Dataset/data/SP-train.npy', allow_pickle=True))
train, test = train_test_split(train, test_size=0.1, random_state=42)

In [5]:
def preprocess_data(df):
    processed_data = []
    for _, row in df.iterrows():
        question = row['question']
        choices = [row['answer'], row['distractor1'], row['distractor2'], row['distractor(unsure)']]
        for choice in choices:
            label = 1 if choice == row['answer'] else 0
            processed_data.append((question, choice, label))
    return pd.DataFrame(processed_data, columns=['question', 'choice', 'label'])


train = preprocess_data(train)
val_data = preprocess_data(test)

In [6]:
class QADataset(Dataset):
    def __init__(self, questions, answers, labels, tokenizer, max_len):
        self.questions = questions
        self.answers = answers
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, item):
        question = str(self.questions[item])
        answer = str(self.answers[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            question,
            answer,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'question_answer_text': question + " " + answer,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [7]:
class ModelPredictor:
    def __init__(self, model, tokenizer, device, val_loader, max_len=256):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.val_loader = val_loader
        self.max_len = max_len

    def evaluate(self):
        self.model.eval()
        correct_predictions = 0
        total_predictions = 0

        with torch.no_grad():
            for batch in self.val_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                predictions = torch.argmax(torch.softmax(logits, dim=1), dim=1)
                correct_predictions += (predictions == labels).sum().item()
                total_predictions += labels.size(0)

        return correct_predictions / total_predictions

In [8]:
max_len = 256
train.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)

def get_model(tokenizer_path='roberta-base', model_path='roberta-base'):
    model, tokenizer = None, None
    if tokenizer_path == 'roberta-base':
        print(tokenizer_path)
        tokenizer = AutoTokenizer.from_pretrained('roberta-base')
        model = RobertaForSequenceClassification.from_pretrained('roberta-base')
    elif tokenizer_path == 't5-base':
        print(tokenizer_path)
        tokenizer = T5Tokenizer.from_pretrained('t5-base')
        model = T5ForSequenceClassification.from_pretrained('t5-base')
    else:
        print('yoso')
        tokenizer = AutoTokenizer.from_pretrained('uw-madison/yoso-4096')
        model = YosoForSequenceClassification.from_pretrained('uw-madison/yoso-4096')

    train_dataset = QADataset(train['question'], train['choice'], train['label'], tokenizer, max_len)
    val_dataset = QADataset(val_data['question'], val_data['choice'], val_data['label'], tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    # Training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5)

    last_correctness = 0
    epsilon = 0.0001

    num_epochs = 10
    for epoch in range(num_epochs):
        print(f'\n------------ Epoch: {epoch} ------------')
        model.train()
        losses = np.array([])
        for batch in tqdm(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            losses = np.append(losses, loss.item())
        print(f"Epoch: {epoch} loss: {np.mean(losses)}")
        predictor = ModelPredictor(model, tokenizer, device, val_loader, max_len)

        # Evaluate the model
        mean_correctness = predictor.evaluate()
        print(f"Mean Correctness on Validation Set: {mean_correctness}")
        print(f"Change in correctness on Validation Set: {mean_correctness - last_correctness}")
        print(f"----------------------------------\n")
        if mean_correctness - last_correctness <= epsilon:
            break
        else:
            last_correctness = mean_correctness
    return (model, tokenizer, device, max_len)

In [9]:
class ModelPredictorQA:
    def __init__(self, model, tokenizer, device, max_len=256):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.max_len = max_len

    def predict(self, row):
        question = row['question']
        choices = row['choice_list']
        max_score = -1
        answer_index = -1

        for i, choice in enumerate(choices):
            # Tokenize the question and choice
            encoding = self.tokenizer.encode_plus(
                question,
                choice,
                add_special_tokens=True,
                max_length=self.max_len,
                return_token_type_ids=False,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt',
                truncation=True
            )

            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            # Get model predictions
            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                predictions = torch.softmax(logits, dim=1)[:, 1]
                score = predictions.item()

            if score > max_score:
                max_score = score
                answer_index = i

        return answer_index, row['label']

In [10]:

t5_model, t5_tokenizer, t5_device, t5_max_len = get_model(tokenizer_path='t5-base', model_path='t5-base')
roberta_model, roberta_tokenizer, roberta_device, roberta_max_len = get_model(tokenizer_path='roberta-base', model_path='roberta-base')
yoso_model, yoso_tokenizer, yoso_device, yoso_max_len = get_model(tokenizer_path='uw-madison/yoso-4096', model_path='uw-madison/yoso-4096')
predictors = [ModelPredictorQA(yoso_model, yoso_tokenizer, yoso_device, yoso_max_len),
              ModelPredictorQA(t5_model, t5_tokenizer, t5_device, t5_max_len),
              ModelPredictorQA(roberta_model, roberta_tokenizer, roberta_device, roberta_max_len)]


for i, predictor in enumerate(predictors):
    print(f'\n\n--------------- Predictor: {i} ---------------\n\n')
    results = []
    for i, row in test.iterrows():
        answer_index, label = predictor.predict(row)
        if answer_index != label:
            print(f"Row: {i}, Predicted Answer Index: {answer_index}, Correct Answer Index: {label}")
        results.append(answer_index == label)
    print(round(sum(results) / len(results),4))

t5-base


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['classification_head.dense.weight', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



------------ Epoch: 0 ------------


100%|██████████| 114/114 [03:07<00:00,  1.64s/it]


Epoch: 0 loss: 0.5455268922082165
Mean Correctness on Validation Set: 0.75
Change in correctness on Validation Set: 0.75
----------------------------------


------------ Epoch: 1 ------------


100%|██████████| 114/114 [03:06<00:00,  1.64s/it]


Epoch: 1 loss: 0.4682947258677399
Mean Correctness on Validation Set: 0.7941176470588235
Change in correctness on Validation Set: 0.044117647058823484
----------------------------------


------------ Epoch: 2 ------------


100%|██████████| 114/114 [03:06<00:00,  1.64s/it]


Epoch: 2 loss: 0.3862094766738122
Mean Correctness on Validation Set: 0.8431372549019608
Change in correctness on Validation Set: 0.0490196078431373
----------------------------------


------------ Epoch: 3 ------------


100%|██████████| 114/114 [03:06<00:00,  1.64s/it]


Epoch: 3 loss: 0.29329943709206163
Mean Correctness on Validation Set: 0.8578431372549019
Change in correctness on Validation Set: 0.014705882352941124
----------------------------------


------------ Epoch: 4 ------------


100%|██████████| 114/114 [03:06<00:00,  1.64s/it]


Epoch: 4 loss: 0.234428840934446
Mean Correctness on Validation Set: 0.8921568627450981
Change in correctness on Validation Set: 0.03431372549019618
----------------------------------


------------ Epoch: 5 ------------


100%|██████████| 114/114 [03:06<00:00,  1.64s/it]


Epoch: 5 loss: 0.1913296826379864
Mean Correctness on Validation Set: 0.8921568627450981
Change in correctness on Validation Set: 0.0
----------------------------------

roberta-base


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



------------ Epoch: 0 ------------


100%|██████████| 114/114 [01:16<00:00,  1.50it/s]


Epoch: 0 loss: 0.5697394606837055
Mean Correctness on Validation Set: 0.75
Change in correctness on Validation Set: 0.75
----------------------------------


------------ Epoch: 1 ------------


100%|██████████| 114/114 [01:15<00:00,  1.51it/s]


Epoch: 1 loss: 0.4849867805054313
Mean Correctness on Validation Set: 0.8284313725490197
Change in correctness on Validation Set: 0.07843137254901966
----------------------------------


------------ Epoch: 2 ------------


100%|██████████| 114/114 [01:15<00:00,  1.51it/s]


Epoch: 2 loss: 0.29453173309172453
Mean Correctness on Validation Set: 0.8823529411764706
Change in correctness on Validation Set: 0.0539215686274509
----------------------------------


------------ Epoch: 3 ------------


100%|██████████| 114/114 [01:15<00:00,  1.51it/s]


Epoch: 3 loss: 0.14526937485329414
Mean Correctness on Validation Set: 0.9215686274509803
Change in correctness on Validation Set: 0.039215686274509776
----------------------------------


------------ Epoch: 4 ------------


100%|██████████| 114/114 [01:15<00:00,  1.51it/s]


Epoch: 4 loss: 0.08065344706757746
Mean Correctness on Validation Set: 0.946078431372549
Change in correctness on Validation Set: 0.02450980392156865
----------------------------------


------------ Epoch: 5 ------------


100%|██████████| 114/114 [01:15<00:00,  1.51it/s]


Epoch: 5 loss: 0.05653446109295545
Mean Correctness on Validation Set: 0.9411764705882353
Change in correctness on Validation Set: -0.004901960784313708
----------------------------------

yoso


tokenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Some weights of YosoForSequenceClassification were not initialized from the model checkpoint at uw-madison/yoso-4096 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



------------ Epoch: 0 ------------


100%|██████████| 114/114 [01:25<00:00,  1.34it/s]


Epoch: 0 loss: 0.5742273884907103
Mean Correctness on Validation Set: 0.75
Change in correctness on Validation Set: 0.75
----------------------------------


------------ Epoch: 1 ------------


100%|██████████| 114/114 [01:24<00:00,  1.35it/s]


Epoch: 1 loss: 0.5697718497953916
Mean Correctness on Validation Set: 0.75
Change in correctness on Validation Set: 0.0
----------------------------------



--------------- Predictor: 0 ---------------


Row: 173, Predicted Answer Index: 2, Correct Answer Index: 0
Row: 274, Predicted Answer Index: 2, Correct Answer Index: 0
Row: 492, Predicted Answer Index: 0, Correct Answer Index: 1
Row: 72, Predicted Answer Index: 2, Correct Answer Index: 1
Row: 453, Predicted Answer Index: 2, Correct Answer Index: 1
Row: 316, Predicted Answer Index: 0, Correct Answer Index: 1
Row: 140, Predicted Answer Index: 0, Correct Answer Index: 1
Row: 218, Predicted Answer Index: 0, Correct Answer Index: 3
Row: 78, Predicted Answer Index: 2, Correct Answer Index: 1
Row: 474, Predicted Answer Index: 1, Correct Answer Index: 0
Row: 124, Predicted Answer Index: 0, Correct Answer Index: 2
Row: 424, Predicted Answer Index: 3, Correct Answer Index: 2
Row: 195, Predicted Answer Index: 2, Correct Answer Index: 3
Row:

In [11]:
# np.save('/content/drive/MyDrive/NLP_Project_Dataset/data/semeval-train-split.npy', train.to_numpy())
# np.save('/content/drive/MyDrive/NLP_Project_Dataset/data/semeval-test-split.npy', test.to_numpy())