# MPNet Eval

## Using pipeline

In [1]:
from transformers import pipeline
from datasets import DatasetDict, Dataset
import pandas as pd
from tqdm import tqdm
import torch
from sklearn import metrics

dataset_path = '../../bin/multirc_dataset.hf'

tqdm.pandas()

pipe = pipeline('text-classification', model='tiedaar/short-answer-classification', device=0)
ds = DatasetDict.load_from_disk(dataset_path)

test_df = ds['test'].to_pandas()
test_df['labels'] = test_df['labels'].progress_apply(lambda x: 'correct_answer' if x==1 else 'incorrect_answer')
test_df['preds'] = test_df['text'].progress_apply(lambda x: pipe(x)[0]['label'])
print(metrics.classification_report(test_df['labels'], test_df['preds']))

Device set to use cuda:0
100%|██████████| 3962/3962 [00:00<00:00, 1198459.00it/s]
  0%|          | 2/3962 [00:00<09:05,  7.26it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 3962/3962 [00:35<00:00, 111.87it/s]

                  precision    recall  f1-score   support

  correct_answer       0.80      0.76      0.78      1722
incorrect_answer       0.82      0.85      0.84      2240

        accuracy                           0.81      3962
       macro avg       0.81      0.81      0.81      3962
    weighted avg       0.81      0.81      0.81      3962






## Without pipeline

In [3]:
import torch
import gc
from time import perf_counter
from sklearn import metrics
from transformers import AutoModelForSequenceClassification, AutoTokenizer

dataset_path = '../../bin/multirc_dataset.hf'

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

model = AutoModelForSequenceClassification.from_pretrained("wesleymorris/short-answer-classification").to(device)
tokenizer = AutoTokenizer.from_pretrained("wesleymorris/short-answer-classification")

def preprocess_function(example):
    return tokenizer(example["text"], padding=True, truncation=True)
    
ds = DatasetDict.load_from_disk(dataset_path)
ds = ds.map(preprocess_function, batched=False)


preds = []
times = []

for text in ds['test']['text']:
    start_time = perf_counter()
    inputs = tokenizer(text, return_tensors='pt').to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    preds.append(model.config.id2label[predicted_class_id])
    times.append(perf_counter()-start_time)

df = ds['test'].to_pandas()
df['preds']=preds
df['times']=times

labels = []
for x in ds['test']['labels']:
    if x == 1:
        labels.append('correct_answer' )
    else: 
        labels.append('incorrect_answer') 

print(metrics.classification_report(labels, preds))

cuda


Map:   0%|          | 0/19170 [00:00<?, ? examples/s]

Map:   0%|          | 0/4080 [00:00<?, ? examples/s]

Map:   0%|          | 0/3962 [00:00<?, ? examples/s]

                  precision    recall  f1-score   support

  correct_answer       0.80      0.76      0.78      1722
incorrect_answer       0.82      0.85      0.84      2240

        accuracy                           0.81      3962
       macro avg       0.81      0.81      0.81      3962
    weighted avg       0.81      0.81      0.81      3962

