In [1]:
!pip install sentencepiece



In [2]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, BartForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
# Load the data
train = pd.DataFrame.from_records(np.load('data/SP-train.npy', allow_pickle=True))
train, test = train_test_split(train, test_size=0.1, random_state=42)

In [4]:
def preprocess_data(df):
    processed_data = []
    for _, row in df.iterrows():
        question = row['question']
        choices = [row['answer'], row['distractor1'], row['distractor2'], row['distractor(unsure)']]
        for choice in choices:
            label = 1 if choice == row['answer'] else 0
            processed_data.append((question, choice, label))
    return pd.DataFrame(processed_data, columns=['question', 'choice', 'label'])


train = preprocess_data(train)
val_data = preprocess_data(test)

In [5]:
class QADataset(Dataset):
    def __init__(self, questions, answers, labels, tokenizer, max_len):
        self.questions = questions
        self.answers = answers
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, item):
        question = str(self.questions[item])
        answer = str(self.answers[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            question,
            answer,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'question_answer_text': question + " " + answer,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [6]:
class ModelPredictor:
    def __init__(self, model, tokenizer, device, val_loader, max_len=256):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.val_loader = val_loader
        self.max_len = max_len

    def evaluate(self):
        self.model.eval()
        correct_predictions = 0
        total_predictions = 0

        with torch.no_grad():
            for batch in self.val_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                predictions = torch.argmax(torch.softmax(logits, dim=1), dim=1)
                correct_predictions += (predictions == labels).sum().item()
                total_predictions += labels.size(0)

        return correct_predictions / total_predictions

In [12]:
max_len = 256
train.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)

def get_model(tokenizer_path='xlnet-base-cased', model_path='xlnet-base-cased'):
    model, tokenizer = None, None
    if tokenizer_path == 'xlnet-base-cased':
        print(tokenizer_path)
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_path)
        model = XLNetForSequenceClassification.from_pretrained(model_path)
    elif tokenizer_path == 'bert-base-uncased':
        print(tokenizer_path)
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    else:
        print('bart')
        tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
        model = BartForSequenceClassification.from_pretrained('facebook/bart-base')

    train_dataset = QADataset(train['question'], train['choice'], train['label'], tokenizer, max_len)
    val_dataset = QADataset(val_data['question'], val_data['choice'], val_data['label'], tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    # Training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5)

    last_correctness = 0
    epsilon = 0.0001

    num_epochs = 10
    for epoch in range(num_epochs):
        print(f'\n------------ Epoch: {epoch} ------------')
        model.train()
        losses = np.array([])
        for batch in tqdm(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            losses = np.append(losses, loss.item())
        print(f"Epoch: {epoch} loss: {np.mean(losses)}")
        predictor = ModelPredictor(model, tokenizer, device, val_loader, max_len)

        # Evaluate the model
        mean_correctness = predictor.evaluate()
        print(f"Mean Correctness on Validation Set: {mean_correctness}")
        print(f"Change in correctness on Validation Set: {mean_correctness - last_correctness}")
        print(f"----------------------------------\n")
        if mean_correctness - last_correctness <= epsilon:
            break
        else:
            last_correctness = mean_correctness
    return (model, tokenizer, device, max_len)

In [8]:
class ModelPredictorQA:
    def __init__(self, model, tokenizer, device, max_len=256):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.max_len = max_len

    def predict(self, row):
        question = row['question']
        choices = row['choice_list']
        max_score = -1
        answer_index = -1

        for i, choice in enumerate(choices):
            # Tokenize the question and choice
            encoding = self.tokenizer.encode_plus(
                question,
                choice,
                add_special_tokens=True,
                max_length=self.max_len,
                return_token_type_ids=False,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt',
                truncation=True
            )

            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            # Get model predictions
            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                predictions = torch.softmax(logits, dim=1)[:, 1]
                score = predictions.item()

            if score > max_score:
                max_score = score
                answer_index = i

        return answer_index, row['label']

In [13]:
xlnet_model, xlnet_tokenizer, xlnet_device, xlnet_max_len = get_model(tokenizer_path='xlnet-base-cased', model_path='xlnet-base-cased')
bart_model, bart_tokenizer, bart_device, bart_max_len = get_model(tokenizer_path='bart-base-uncased', model_path='bart-base-uncased')
bert_model, bert_tokenizer, bert_device, bert_max_len = get_model(tokenizer_path='bert-base-uncased', model_path='bert-base-uncased')
predictors = [ModelPredictorQA(bert_model, bert_tokenizer, bert_device, bert_max_len),
              ModelPredictorQA(bart_model, bart_tokenizer, bart_device, bart_max_len),
              ModelPredictorQA(xlnet_model, xlnet_tokenizer, xlnet_device, xlnet_max_len)]


for i, predictor in enumerate(predictors):
    print(f'\n\n--------------- Predictor: {i} ---------------\n\n')
    results = []
    for i, row in test.iterrows():
        answer_index, label = predictor.predict(row)
        if answer_index != label:
            print(f"Row: {i}, Predicted Answer Index: {answer_index}, Correct Answer Index: {label}")
        results.append(answer_index == label)
    print(round(sum(results) / len(results),4))

xlnet-base-cased


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'logits_proj.bias', 'sequence_summary.summary.weight', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



------------ Epoch: 0 ------------


100%|██████████| 114/114 [02:01<00:00,  1.07s/it]


Epoch: 0 loss: 0.5130856047596848
Mean Correctness on Validation Set: 0.8774509803921569
Change in correctness on Validation Set: 0.8774509803921569
----------------------------------


------------ Epoch: 1 ------------


100%|██████████| 114/114 [02:02<00:00,  1.07s/it]


Epoch: 1 loss: 0.3464401493404518
Mean Correctness on Validation Set: 0.8872549019607843
Change in correctness on Validation Set: 0.009803921568627416
----------------------------------


------------ Epoch: 2 ------------


100%|██████████| 114/114 [02:02<00:00,  1.07s/it]


Epoch: 2 loss: 0.210968724729722
Mean Correctness on Validation Set: 0.9068627450980392
Change in correctness on Validation Set: 0.019607843137254943
----------------------------------


------------ Epoch: 3 ------------


100%|██████████| 114/114 [02:01<00:00,  1.07s/it]


Epoch: 3 loss: 0.1309525005748136
Mean Correctness on Validation Set: 0.9264705882352942
Change in correctness on Validation Set: 0.019607843137254943
----------------------------------


------------ Epoch: 4 ------------


100%|██████████| 114/114 [02:01<00:00,  1.07s/it]


Epoch: 4 loss: 0.08335018951012835
Mean Correctness on Validation Set: 0.9509803921568627
Change in correctness on Validation Set: 0.02450980392156854
----------------------------------


------------ Epoch: 5 ------------


100%|██████████| 114/114 [02:02<00:00,  1.07s/it]


Epoch: 5 loss: 0.07750344860751443
Mean Correctness on Validation Set: 0.9166666666666666
Change in correctness on Validation Set: -0.03431372549019607
----------------------------------

bart


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.bias', 'classification_head.out_proj.weight', 'classification_head.dense.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



------------ Epoch: 0 ------------


100%|██████████| 114/114 [01:35<00:00,  1.20it/s]


Epoch: 0 loss: 0.5194778221479633
Mean Correctness on Validation Set: 0.8529411764705882
Change in correctness on Validation Set: 0.8529411764705882
----------------------------------


------------ Epoch: 1 ------------


100%|██████████| 114/114 [01:34<00:00,  1.20it/s]


Epoch: 1 loss: 0.3437388958246039
Mean Correctness on Validation Set: 0.8970588235294118
Change in correctness on Validation Set: 0.044117647058823595
----------------------------------


------------ Epoch: 2 ------------


100%|██████████| 114/114 [01:34<00:00,  1.20it/s]


Epoch: 2 loss: 0.20861074914992378
Mean Correctness on Validation Set: 0.9215686274509803
Change in correctness on Validation Set: 0.02450980392156854
----------------------------------


------------ Epoch: 3 ------------


100%|██████████| 114/114 [01:34<00:00,  1.21it/s]


Epoch: 3 loss: 0.12520778722440204
Mean Correctness on Validation Set: 0.9362745098039216
Change in correctness on Validation Set: 0.014705882352941235
----------------------------------


------------ Epoch: 4 ------------


100%|██████████| 114/114 [01:34<00:00,  1.20it/s]


Epoch: 4 loss: 0.09644726244732738
Mean Correctness on Validation Set: 0.946078431372549
Change in correctness on Validation Set: 0.009803921568627416
----------------------------------


------------ Epoch: 5 ------------


100%|██████████| 114/114 [01:34<00:00,  1.20it/s]


Epoch: 5 loss: 0.06944545895563797
Mean Correctness on Validation Set: 0.9313725490196079
Change in correctness on Validation Set: -0.014705882352941124
----------------------------------

bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



------------ Epoch: 0 ------------


100%|██████████| 114/114 [01:19<00:00,  1.43it/s]


Epoch: 0 loss: 0.49798496930222763
Mean Correctness on Validation Set: 0.8774509803921569
Change in correctness on Validation Set: 0.8774509803921569
----------------------------------


------------ Epoch: 1 ------------


100%|██████████| 114/114 [01:19<00:00,  1.43it/s]


Epoch: 1 loss: 0.2768408387180483
Mean Correctness on Validation Set: 0.9264705882352942
Change in correctness on Validation Set: 0.0490196078431373
----------------------------------


------------ Epoch: 2 ------------


100%|██████████| 114/114 [01:19<00:00,  1.44it/s]


Epoch: 2 loss: 0.14257635331402221
Mean Correctness on Validation Set: 0.946078431372549
Change in correctness on Validation Set: 0.019607843137254832
----------------------------------


------------ Epoch: 3 ------------


100%|██████████| 114/114 [01:19<00:00,  1.44it/s]


Epoch: 3 loss: 0.0760727740248273
Mean Correctness on Validation Set: 0.9264705882352942
Change in correctness on Validation Set: -0.019607843137254832
----------------------------------



--------------- Predictor: 0 ---------------


Row: 497, Predicted Answer Index: 3, Correct Answer Index: 2
0.9804


--------------- Predictor: 1 ---------------


Row: 424, Predicted Answer Index: 3, Correct Answer Index: 2
Row: 153, Predicted Answer Index: 3, Correct Answer Index: 0
Row: 497, Predicted Answer Index: 1, Correct Answer Index: 2
0.9412


--------------- Predictor: 2 ---------------


Row: 424, Predicted Answer Index: 3, Correct Answer Index: 2
Row: 497, Predicted Answer Index: 1, Correct Answer Index: 2
0.9608


In [14]:
np.save('semeval-train-split.npy', train.to_numpy())
np.save('semeval-test-split.npy', test.to_numpy())