# Evaluation Code

Evaluation for the trained model is conducted in this code.

In [15]:
# COLAB
from google.colab import drive
drive.mount('/content/drive')

# COMMON
import numpy as np
import torch

# DATA
import pandas as pd

# SEED
import random
import os

# PREPROCESS DATA
import re
!pip install contractions
import contractions

# DATASET/LOADER
!pip install datasets
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

# MODEL
from transformers import AutoModelForSequenceClassification

# TEST
from tqdm.auto import tqdm
import torch.nn as nn
import sys
import pickle

# METRICS
import sklearn.metrics as sm

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Variables for several configurations

In [2]:
# define path
test_data_path = "/content/drive/MyDrive/nlu/data/dev.csv"
trained_model_path = "/content/drive/MyDrive/nlu/result/model/6_model.pt"

# define variables
MODEL_CHECKPOINT = "bert-base-cased"
MAX_LENGTH = 256
BATCH_SIZE = 32

# move model to device if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
# set seed
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed()

Random seed set as 42


# Prepare the Dataloader

In [17]:
# define preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return " "

    # convert to lower case
    text = text.lower()

    # expand contractions (ex. don't -> do not)
    text = contractions.fix(text)

    # remove multiple spaces
    text = re.sub(r'\s+', ' ', text, flags=re.I)

    return text

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(records):
    return tokenizer(records['text_1'], records['text_2'], truncation=True, return_token_type_ids=True, max_length = MAX_LENGTH)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# own dataset
class PairwiseDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        dataset = Dataset.from_pandas(data)
        # dataset = dataset.remove_columns('__index_level_0__')
        encoded_dataset = dataset.map(preprocess_function, batched=True)

        self.input_ids = encoded_dataset["input_ids"]
        self.token_type_ids = encoded_dataset["token_type_ids"]
        self.attention_mask = encoded_dataset["attention_mask"]
        self.labels = encoded_dataset["label"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'token_type_ids': torch.tensor(self.token_type_ids[idx]),
            'attention_mask': torch.tensor(self.attention_mask[idx]),
            'label': torch.tensor(self.labels[idx])
        }

In [18]:
# read data
test_data = pd.read_csv(test_data_path)

# preprocess test data
test_data['text_1'] = test_data['text_1'].apply(preprocess_text)
test_data['text_2'] = test_data['text_2'].apply(preprocess_text)

# prepare dataset
test_dataset = PairwiseDataset(test_data)

# define dataloader
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator
)

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

# Model

In [19]:
# import trained model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=1).to(device)
model.load_state_dict(torch.load(trained_model_path))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

# Evaluate

In [20]:
# return metrics
def metrics(y_true, y_pred):
    acc = sm.accuracy_score(y_true, y_pred)
    macro_p = sm.precision_score(y_true, y_pred, average='macro')
    macro_r = sm.recall_score(y_true, y_pred, average='macro')
    macro_f1 = sm.f1_score(y_true, y_pred, average='macro')
    w_macro_p = sm.precision_score(y_true, y_pred, average='weighted')
    w_macro_r = sm.recall_score(y_true, y_pred, average='weighted')
    w_macro_f1 = sm.f1_score(y_true, y_pred, average='weighted')
    mcc = sm.matthews_corrcoef(y_true, y_pred)

    return {"Accuracy":acc, "Macro-P":macro_p, "Macro-R":macro_r, "Macro-F1":macro_f1, "W Macro-P":w_macro_p, "W Macro-R":w_macro_r, "W Macro-F1":w_macro_f1, "MCC":mcc}

# test/validation function
def test(model, dataloader, criterion, device):
    model.eval()
    test_loss = 0.0
    y_true = []
    y_pred = []

    with torch.inference_mode():
        for batch in tqdm(dataloader):
            # move batch to device
            input_ids = batch['input_ids'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # forward pass
            outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
            logits = outputs.logits
            logits = logits.reshape(logits.shape[0])

            # calculate loss
            loss = criterion(logits, labels.float())

            # compute running loss and metrics
            test_loss += loss.item()
            predicted = (torch.sigmoid(logits) > 0.5).float()

            y_true.extend(labels.detach().cpu().tolist())
            y_pred.extend(predicted.detach().cpu().tolist())

    # calculate epoch loss and accuracy
    test_loss /= len(dataloader)

    test_metrics = metrics(y_true, y_pred)

    return test_loss, test_metrics

In [21]:
# check the metrics
criterion = nn.BCEWithLogitsLoss()

test_loss, test_metrics = test(model, test_dataloader, criterion, device=device)
test_loss, test_metrics

  0%|          | 0/188 [00:00<?, ?it/s]

(0.5135706663924329,
 {'Accuracy': 0.7775,
  'Macro-P': 0.7801178451178451,
  'Macro-R': 0.7773203950853117,
  'Macro-F1': 0.7769006000816805,
  'W Macro-P': 0.780021857463524,
  'W Macro-R': 0.7775,
  'W Macro-F1': 0.7769430013620762,
  'MCC': 0.5574312207924021})