In [1]:
# Mount Google Drive inside the Colab runtime; the dataset is read from
# /content/drive/MyDrive further down in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch import nn
import numpy as np
import pandas as pd
from tqdm import tqdm

In [8]:

# Load the SP-train records into a DataFrame.
# NOTE(review): allow_pickle=True makes np.load unpickle Python objects, which can
# execute arbitrary code -- only load this file from a trusted source.
train = pd.DataFrame.from_records(np.load('/content/drive/MyDrive/Colab Notebooks/NLP_Dataset/SP-train.npy',allow_pickle=True))
# Hold out 10% of the questions with a fixed seed so the split is repeatable.
# `test` keeps the raw held-out rows: later cells both derive the validation pairs
# from it and score the final per-question predictions on it.
train, test = train_test_split(train, test_size=0.1, random_state=42)
print(train.columns, train.shape)

Index(['id', 'question', 'answer', 'distractor1', 'distractor2',
       'distractor(unsure)', 'label', 'choice_list', 'choice_order'],
      dtype='object') (456, 9)


In [9]:
def preprocess_data(df):
    """Expand each question row into one (question, choice, label) row per option.

    For every row of ``df`` the four option columns ('answer', 'distractor1',
    'distractor2', 'distractor(unsure)') are emitted in that order. ``label`` is 1
    when the option text equals the row's 'answer' value and 0 otherwise.

    Args:
        df: DataFrame with 'question' and the four option columns.

    Returns:
        A new DataFrame with columns ['question', 'choice', 'label'] holding
        four rows per input row.
    """
    option_columns = ['answer', 'distractor1', 'distractor2', 'distractor(unsure)']
    pairs = [
        (entry['question'], entry[column], 1 if entry[column] == entry['answer'] else 0)
        for _, entry in df.iterrows()
        for column in option_columns
    ]
    return pd.DataFrame(pairs, columns=['question', 'choice', 'label'])


# Expand both splits into (question, choice, label) rows. `train` is rebound to its
# expanded form, so this cell cannot be re-run without re-running the load cell;
# the raw held-out rows remain available as `test`.
train, val_data = map(preprocess_data, (train, test))

In [10]:
class QADataset(Dataset):
    """Dataset of (question, candidate answer) pairs that are tokenised on access.

    Args:
        questions: Sequence of question texts (list, array or pandas Series).
        answers: Sequence of candidate answers, aligned with ``questions``.
        labels: Sequence of integer labels, aligned with ``questions``.
        tokenizer: Object exposing ``encode_plus(text, text_pair, **kwargs)`` that
            returns a mapping with 'input_ids' and 'attention_mask' tensors.
        max_len: Length every encoded pair is padded / truncated to.

    Raises:
        ValueError: If the three sequences do not have the same length.
    """

    def __init__(self, questions, answers, labels, tokenizer, max_len):
        # Materialise as plain lists so __getitem__ indexes by *position*.
        # Integer indexing on a pandas Series is a label lookup, which raises
        # KeyError (or returns the wrong row) when the Series carries a
        # non-default index, e.g. straight out of train_test_split.
        self.questions = list(questions)
        self.answers = list(answers)
        self.labels = list(labels)
        if not (len(self.questions) == len(self.answers) == len(self.labels)):
            raise ValueError(
                "questions, answers and labels must have the same length, got "
                f"{len(self.questions)}, {len(self.answers)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, item):
        """Return the encoded pair at position ``item``.

        Returns:
            dict with 'question_answer_text' (str), 'input_ids' and
            'attention_mask' (flattened tensors) and 'labels' (long tensor).
        """
        question = str(self.questions[item])
        answer = str(self.answers[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            question,
            answer,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'question_answer_text': question + " " + answer,
            # return_tensors='pt' yields a leading batch dimension; flatten drops
            # it so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [11]:
class ModelPredictor:
    """Computes classification accuracy of a model over a validation loader.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` returning an
            object with a ``logits`` attribute; must also provide ``eval()``.
        tokenizer: Stored for reference; not used by ``evaluate``.
        device: Device the batch tensors are moved to before the forward pass.
        val_loader: Iterable of batches, each a mapping with 'input_ids',
            'attention_mask' and 'labels' tensors.
        max_len: Stored for reference; not used by ``evaluate``.
    """

    def __init__(self, model, tokenizer, device, val_loader, max_len=256):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.val_loader = val_loader
        self.max_len = max_len

    def evaluate(self):
        """Return the fraction of examples whose arg-max class equals the label.

        The model is switched to eval mode and left there; callers that continue
        training must call ``model.train()`` again. Accuracy is computed per
        example in the loader, not per grouped question.

        Returns:
            Accuracy in [0, 1], or 0.0 when the loader yields no examples.
        """
        self.model.eval()
        correct_predictions = 0
        total_predictions = 0

        with torch.no_grad():
            for batch in self.val_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(input_ids, attention_mask=attention_mask)
                # Softmax is monotonic, so the arg-max of the raw logits is the
                # predicted class; no need to normalise first.
                predictions = torch.argmax(outputs.logits, dim=1)
                correct_predictions += (predictions == labels).sum().item()
                total_predictions += labels.size(0)

        # An empty loader would otherwise raise ZeroDivisionError.
        if total_predictions == 0:
            return 0.0
        return correct_predictions / total_predictions

In [14]:

from transformers import AutoTokenizer, YosoForSequenceClassification

# Seed PyTorch so the newly initialised classifier head and the batch shuffling
# repeat from run to run (same value as the random_state used for the data split).
SEED = 42
torch.manual_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained("uw-madison/yoso-4096")
model = YosoForSequenceClassification.from_pretrained("uw-madison/yoso-4096")

max_len = 256
# The datasets are indexed with integers 0..len-1, so replace whatever index the
# frames currently carry with a fresh RangeIndex.
train.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)

train_dataset = QADataset(train['question'], train['choice'], train['label'], tokenizer, max_len)
val_dataset = QADataset(val_data['question'], val_data['choice'], val_data['label'], tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# transformers.AdamW is deprecated and no longer exported by recent releases, so
# use the PyTorch optimizer. eps and weight_decay are pinned to the values the
# transformers implementation used by default, keeping the update rule unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# The evaluator holds no per-epoch state, so build it once.
predictor = ModelPredictor(model, tokenizer, device, val_loader, max_len)

# Early stopping: training ends at the first epoch whose validation accuracy does
# not improve on the previous epoch by more than `epsilon`.
last_correctness = 0
epsilon = 0.0001

num_epochs = 10
for epoch in range(num_epochs):
    print(f'\n------------ Epoch: {epoch} ------------')
    model.train()  # evaluate() below leaves the model in eval mode
    losses = []  # list append is O(1); np.append re-copies the array every batch
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())
    print(f"Epoch: {epoch} loss: {np.mean(losses)}")

    # Evaluate the model. This is accuracy over individual (question, choice)
    # pairs; with one positive among four choices per question, a model that
    # always predicts label 0 already scores 0.75.
    mean_correctness = predictor.evaluate()
    print(f"Mean Correctness on Validation Set: {mean_correctness}")
    print(f"Change in correctness on Validation Set: {mean_correctness - last_correctness}")
    print(f"----------------------------------\n")
    if mean_correctness - last_correctness <= epsilon:
        break
    else:
        last_correctness = mean_correctness

Some weights of YosoForSequenceClassification were not initialized from the model checkpoint at uw-madison/yoso-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



------------ Epoch: 0 ------------


100%|██████████| 114/114 [01:24<00:00,  1.35it/s]


Epoch: 0 loss: 0.574893490787138
Mean Correctness on Validation Set: 0.75
Change in correctness on Validation Set: 0.75
----------------------------------


------------ Epoch: 1 ------------


100%|██████████| 114/114 [01:26<00:00,  1.32it/s]


Epoch: 1 loss: 0.569563616263239
Mean Correctness on Validation Set: 0.75
Change in correctness on Validation Set: 0.0
----------------------------------



In [15]:
class ModelPredictorQA:
    """Picks the most likely choice for a multiple-choice question.

    Each (question, choice) pair is scored independently by a two-class sequence
    classifier; the choice with the highest class-1 probability wins.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` returning an
            object with a ``logits`` attribute of shape (1, 2); must also provide
            ``eval()``.
        tokenizer: Object exposing ``encode_plus(text, text_pair, **kwargs)``.
        device: Device the encoded tensors are moved to.
        max_len: Length every encoded pair is padded / truncated to.
    """

    def __init__(self, model, tokenizer, device, max_len=256):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.max_len = max_len

    def predict(self, row):
        """Score every choice of ``row`` and return the best index with the gold label.

        The model is switched to eval mode and left there; callers that continue
        training must call ``model.train()`` again.

        Args:
            row: Mapping with 'question' (str), 'choice_list' (sequence of str)
                and 'label' (returned unchanged).

        Returns:
            ``(answer_index, row['label'])``. ``answer_index`` is -1 when
            'choice_list' is empty. Ties keep the earliest choice.
        """
        question = row['question']
        choices = row['choice_list']
        # Start below any attainable score so the first choice always wins.
        max_score = float('-inf')
        answer_index = -1

        # Do not rely on the caller having left the model in eval mode: with
        # dropout active the scores would be noisy and non-deterministic.
        self.model.eval()

        for i, choice in enumerate(choices):
            # Tokenize the question and choice
            encoding = self.tokenizer.encode_plus(
                question,
                choice,
                add_special_tokens=True,
                max_length=self.max_len,
                return_token_type_ids=False,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt',
                truncation=True
            )

            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            # Get model predictions
            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=attention_mask)
                # Probability assigned to class 1 for this single pair.
                score = torch.softmax(outputs.logits, dim=1)[:, 1].item()

            if score > max_score:
                max_score = score
                answer_index = i

        return answer_index, row['label']

In [16]:
# Per-question evaluation on the raw held-out rows: the predictor scores every
# entry of 'choice_list' and its pick is compared with the row's 'label' index.
# Note that these are the same questions the validation pairs were built from.
predictor = ModelPredictorQA(model, tokenizer, device, max_len)
results = []

for i, row in test.iterrows():
    answer_index, label = predictor.predict(row)
    results.append(answer_index == label)
    # Report each miss together with its DataFrame index.
    if not results[-1]:
        print(f"Row: {i}, Predicted Answer Index: {answer_index}, Correct Answer Index: {label}")

# Fraction of questions answered correctly, rounded to four decimals.
print(round(sum(results) / len(results),4))

Row: 173, Predicted Answer Index: 2, Correct Answer Index: 0
Row: 274, Predicted Answer Index: 2, Correct Answer Index: 0
Row: 492, Predicted Answer Index: 0, Correct Answer Index: 1
Row: 72, Predicted Answer Index: 2, Correct Answer Index: 1
Row: 453, Predicted Answer Index: 2, Correct Answer Index: 1
Row: 316, Predicted Answer Index: 2, Correct Answer Index: 1
Row: 218, Predicted Answer Index: 0, Correct Answer Index: 3
Row: 9, Predicted Answer Index: 1, Correct Answer Index: 2
Row: 415, Predicted Answer Index: 3, Correct Answer Index: 1
Row: 78, Predicted Answer Index: 2, Correct Answer Index: 1
Row: 323, Predicted Answer Index: 2, Correct Answer Index: 0
Row: 474, Predicted Answer Index: 1, Correct Answer Index: 0
Row: 424, Predicted Answer Index: 1, Correct Answer Index: 2
Row: 195, Predicted Answer Index: 2, Correct Answer Index: 3
Row: 278, Predicted Answer Index: 1, Correct Answer Index: 0
Row: 422, Predicted Answer Index: 0, Correct Answer Index: 2
Row: 79, Predicted Answer In