In [1]:
import json
import tqdm
import torch
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import f1_score, cohen_kappa_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

ModuleNotFoundError: No module named 'transformers'

In [3]:
def generate_metrics(predicted_labels_raw, true_labels, return_metrics=False, thresholds=None):
    """Pick the score threshold that maximises F1 and report metrics at it.

    A histogram of the raw scores is always shown first. Every candidate
    threshold turns the raw scores into binary labels (score >= threshold
    becomes 1, anything else 0). The threshold with the highest F1 against
    ``true_labels`` wins; on ties the later candidate is kept.

    Args:
        predicted_labels_raw: Re-iterable collection of raw scores, one per
            example (it is traversed once per candidate threshold).
        true_labels: Binary gold labels aligned with ``predicted_labels_raw``.
        return_metrics: When True, return the metrics instead of printing a
            report and plotting the confusion matrix.
        thresholds: Optional iterable of candidate thresholds, used exactly as
            given. Defaults to 0.50, 0.55, ..., 0.95.

    Returns:
        ``(threshold, f1, cohen_kappa)`` when ``return_metrics`` is True,
        otherwise None.

    Raises:
        ValueError: If ``thresholds`` is supplied but contains no values.
    """
    if thresholds is None:
        # round() strips the floating-point drift that np.arange accumulates,
        # so the reported threshold prints as e.g. 0.7 rather than 0.7000000000000002.
        candidate_thresholds = [round(th, 2) for th in np.arange(0.5, 1.0, 0.05)]
    else:
        candidate_thresholds = list(thresholds)
        if not candidate_thresholds:
            raise ValueError('thresholds must contain at least one value')

    def _binarize(threshold):
        # Scores exactly on the threshold are assigned to the positive class.
        return [1 if bleurt_score >= threshold else 0 for bleurt_score in predicted_labels_raw]

    plt.hist(predicted_labels_raw)
    plt.title('BLEURT Score Distribution')
    plt.show()

    max_f1_at_th = -1.0
    max_f1 = 0.0

    for th in candidate_thresholds:
        temp_f1 = f1_score(true_labels, _binarize(th))

        # `>=` (not `>`) so that ties resolve to the later candidate.
        if temp_f1 >= max_f1:
            max_f1 = temp_f1
            max_f1_at_th = th

    # Metrics at the threshold for which we got the max F1-Score, computed once
    # and shared by both output modes.
    predicted_labels = _binarize(max_f1_at_th)
    best_f1 = f1_score(true_labels, predicted_labels)
    kappa = cohen_kappa_score(true_labels, predicted_labels)

    if return_metrics:
        return max_f1_at_th, best_f1, kappa

    print(f'Optimal Threshold: {max_f1_at_th} \n')
    print(f'Predicted Label Count: {Counter(predicted_labels)}\n')
    print('Classification Report:')
    print(classification_report(true_labels, predicted_labels), '\n')
    print('F1 Score: ', best_f1, '\n')
    print('Cohen Kappa: ', kappa, '\n')
    cm_display = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix(true_labels, predicted_labels), display_labels = ['incorrect', 'correct'])
    cm_display.plot()
    plt.show()

In [4]:
class BLEURTDataset(Dataset):
    """Map-style dataset that tokenizes (candidate, reference) pairs on demand.

    Each element of ``data`` must be indexable by the keys ``"candidate"``,
    ``"reference"`` and ``"score"``.
    """

    def __init__(self, data, tokenizer, max_length: int = 128):
        """Store the samples and the tokenizer used to encode them.

        Args:
            data: Indexable collection of sample records.
            tokenizer: Callable invoked as ``tokenizer(candidate, reference, ...)``
                that returns a mapping with ``'input_ids'`` and
                ``'attention_mask'`` entries.
            max_length: Length every pair is padded/truncated to. Defaults to
                128, the value that used to be hard-coded.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode sample ``idx`` and return its model inputs plus label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (flattened
            tokenizer outputs) and ``'labels'`` (the score as a float tensor).
        """
        item = self.data[idx]
        candidate = item["candidate"]
        reference = item["reference"]
        score = item["score"]
        encoding = self.tokenizer(
            candidate,
            reference,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        return {
            # flatten() drops the leading dimension of the single-pair encoding
            # (presumably shape (1, max_length) with return_tensors='pt' — TODO confirm).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(score, dtype=torch.float)
        }

In [5]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
# (force "cpu" here when debugging without a GPU).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fine-tuned BLEURT-large weights pulled from the Hugging Face Hub. A local copy
# can be used instead by pointing from_pretrained at
# /home/jovyan/active-projects/textbook-question-generation/src/multirc-bleurt/model_large_v0
model = AutoModelForSequenceClassification.from_pretrained("vaiibhavgupta/finetuned-bleurt-large")
# Move the weights to the chosen device and switch to inference mode.
model.to(device).eval()

# Matching tokenizer (local alternative:
# /home/jovyan/active-projects/textbook-question-generation/src/multirc-bleurt/tokenizer_large_v0)
tokenizer = AutoTokenizer.from_pretrained("vaiibhavgupta/finetuned-bleurt-large")

Downloading (…)lve/main/config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [6]:
# Template for the dataset split files; `{}` is filled with the split name.
data_path = "/home/jovyan/active-projects/textbook-question-generation/bleurt-models/bleurtmodel/bleurt/bleurt/test_data/multirc-dataset/{}_samples.jsonl"

# One JSON object per line. The encoding is pinned to UTF-8 so decoding does not
# depend on the machine's locale, and blank lines (e.g. a trailing newline at
# the end of the file) are skipped instead of crashing json.loads.
with open(data_path.format('test'), 'r', encoding='utf-8') as file:
    test_samples = [json.loads(line) for line in file if line.strip()]

# Wrap the samples for batched evaluation, eight pairs per batch.
test_dataset = BLEURTDataset(test_samples, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8)

In [None]:
# Score every test batch with the model, collecting the raw outputs and the
# gold labels in matching order.
predictions, true_labels = [], []

with torch.no_grad():  # inference only, no gradients needed
    for batch in tqdm.tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        logits = model(input_ids, attention_mask=attention_mask).logits

        # One NumPy row per example is appended to `predictions`;
        # one label element per example is appended to `true_labels`.
        predictions.extend(logits.detach().cpu().numpy())
        true_labels.extend(batch['labels'])

In [None]:
# Reduce each per-example output row to its first entry, and each label tensor
# to a plain Python number, before computing metrics.
pr = [row[0] for row in predictions]
tl = [label.item() for label in true_labels]

generate_metrics(predicted_labels_raw=pr, true_labels=tl)

In [None]:
# Raw scores of every test example whose thresholded prediction disagrees with
# its gold label.
# The cut-off uses `>=` to match the decision rule in generate_metrics, where a
# score exactly on the threshold is assigned to class 1.
# NOTE(review): 0.7 is hard-coded — confirm it matches the optimal threshold
# reported by generate_metrics for this run.
misclassfified_points = [p for p, t in zip(pr, tl) if int(p >= 0.7) != t]
len(misclassfified_points)

In [None]:
# Scratch check: total of two hand-copied counts (presumably the two
# off-diagonal confusion-matrix cells, i.e. the misclassified total — TODO confirm).
sum((480, 364))

In [None]:
# Overlay the score histogram of the misclassified examples on the histogram of
# all predictions, with a vertical marker at the 0.7 cut-off.
fig, ax = plt.subplots(figsize=(10, 7))

ax.hist(misclassfified_points, alpha=0.7, color='red', label='Misclassified Points')
ax.hist(pr, alpha=0.5, color='blue', label='All Predicted Points')

# The marker is a polyline whose points all share x = 0.7.
marker_heights = [0, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1750]
ax.plot([0.7] * len(marker_heights), marker_heights, label='Optimal Threshold')

ax.set_title('BLEURT scores for all vs misclassified datapoints')
ax.legend(loc='upper right')
plt.show()

In [15]:
# Materialise every batch produced by the test loader into a list.
list_of_items = list(test_loader)

[{'candidate': "The Women's Haven of Tarrant County",
  'reference': "The Women's Haven of Tarrant County will be able to add a second full-time attorney to its staff and contract with Texas Wesleyan University for pro bono and student services thanks to a two-year, $350,000 grant from the federal Violence Against Women Office.",
  'score': 1},
 {'candidate': 'Tarrant county shelters',
  'reference': "The Women's Haven of Tarrant County will be able to add a second full-time attorney to its staff and contract with Texas Wesleyan University for pro bono and student services thanks to a two-year, $350,000 grant from the federal Violence Against Women Office.",
  'score': 0},
 {'candidate': "Female's Safe House of Haven County",
  'reference': "The Women's Haven of Tarrant County will be able to add a second full-time attorney to its staff and contract with Texas Wesleyan University for pro bono and student services thanks to a two-year, $350,000 grant from the federal Violence Against Wo