## Evaluate Model

In [1]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import pandas as pd
import random
import torch

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

**Read data:**

In [2]:
# Load the validation split from disk and preview the first rows as a sanity check.
val_csv_path = "./data/val_set.csv"
val_data = pd.read_csv(val_csv_path)
val_data.head()

Unnamed: 0.1,Unnamed: 0,#1 String,#2 String,label
0,8675,They were killed over a few days in April 2001...,"""For customers to get the most of the Internet...",0
1,9755,Such a step could put the issue before the UN ...,The matter could then be sent to the U.N. Secu...,1
2,9573,Thousands of pounds of fireworks inside the wa...,Thousands of pounds of fireworks inside the La...,1
3,11138,Some Wall Street experts believe stocks have r...,Some Wall Street analysts believe stocks have ...,1
4,10987,"""We will work with the board to ensure a smoot...",He said federal regulators would work with the...,1


**Load Model:**

In [3]:
# Reloading the saved model and tokenizer
model_path = "./models/bert_paraphrase_model"  # Change this to your last saved epoch's path
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Pick the accelerator that is actually present. Each backend is probed with
# its own availability check: CUDA first, then Apple's MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model = model.to(device)

**Convert dataset:**

In [4]:
class ParaphraseDataset(Dataset):
    """Map-style dataset that tokenizes sentence pairs read from a DataFrame.

    Each item is built from one row of ``data``: the ``'#1 String'`` and
    ``'#2 String'`` columns are tokenized together as a pair, and the
    ``'label'`` column becomes an integer tensor.

    Args:
        data: DataFrame with ``'#1 String'``, ``'#2 String'`` and ``'label'``
            columns; rows are accessed by position.
        tokenizer: Callable invoked as ``tokenizer(sentence1, sentence2, ...)``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length passed to the tokenizer as ``max_length``; inputs are
            truncated and padded to it.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the sentence pair at position ``idx``.

        Returns:
            dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
            tensors for a single example.
        """
        # Fetch the row once instead of repeating the positional lookup per column.
        row = self.data.iloc[idx]

        encoding = self.tokenizer(
            str(row['#1 String']),
            str(row['#2 String']),
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors='pt'
        )

        # The tokenizer is called on a single pair with return_tensors='pt', so
        # only the leading axis is dropped. A bare squeeze() would also collapse
        # the sequence axis whenever it has length 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(row['label'], dtype=torch.long)
        }

In [5]:
max_len = 128

# The frame wrapped here is the validation split, so the objects are named for it.
val_dataset = ParaphraseDataset(val_data, tokenizer, max_len)
# Shuffling has no effect on the aggregate metrics; a fixed order keeps
# evaluation runs repeatable.
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Legacy aliases: the evaluation cell still refers to these names.
train_dataset = val_dataset
train_loader = val_loader

**Make predictions:**

In [6]:
# Evaluate function
def evaluate_model(model, data_loader, device):
    """Run inference over ``data_loader`` and collect predictions with labels.

    The model is switched to eval mode and every batch is scored with gradient
    tracking disabled. The predicted class is the argmax over dimension 1 of
    the model's logits.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes a ``logits`` attribute.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        ``(all_preds, all_labels)``: two lists with one entry per example, in
        the order the loader yielded them.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            # Labels are only collected for host-side scoring, so they are not
            # copied to `device` and back.
            all_labels.extend(batch['labels'].cpu().numpy())

    return all_preds, all_labels

# Evaluate on the validation set (`train_loader` is built from `val_data`, despite its name)
preds, labels = evaluate_model(model, train_loader, device)

Evaluating: 100%|██████████| 73/73 [00:49<00:00,  1.47it/s]


**Calculate metrics:**

In [7]:
# Score the predictions against the reference labels.
accuracy = accuracy_score(labels, preds)
# The three remaining scorers share one call signature, so compute them together.
precision, recall, f1 = (
    scorer(labels, preds, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report each metric on its own line.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{metric_name}: {metric_value}")


Accuracy: 0.9474590869939707
Precision: 0.9667250437828371
Recall: 0.9292929292929293
F1-Score: 0.9476394849785408
