In [1]:
# common imports

import sys
sys.path.append("../../ReqSeek/")

import os
import random
import mapper
import datasets
import numpy as np
import pandas as pd
import tensorflow as tf
from pathlib import Path
from tensorflow import keras

# One shared seed for every random number generator this notebook touches:
# Python's `random`, NumPy's legacy global RNG, and TensorFlow.
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Model Setup

In [2]:
# Load the dataset saved on disk and keep only its 'test' split for evaluation.
evaluation_set = datasets.load_from_disk(
    '../../datasets/ARID_supporting_scripts/5_1_training_set'
)['test']

In [3]:
# Class names exposed by the dataset's 'label' feature; list position is the class id.
lbl_ = evaluation_set.features['label'].names

# Two lookup tables: class name -> id, and its inverse id -> class name.
label2id = {name: index for index, name in enumerate(lbl_)}
id2label = {index: name for name, index in label2id.items()}

In [4]:
def preprocess_function(dataset):
    """Tokenize the 'Requirement Sentences' column of a batch.

    Uses the notebook-level ``tokenizer``. Sequences are padded to, and
    truncated at, a fixed length of 256 tokens, and the encoding is requested
    as TensorFlow tensors.

    Args:
        dataset: mapping (e.g. a batch handed over by ``Dataset.map``) that
            contains a 'Requirement Sentences' entry.

    Returns:
        Whatever the tokenizer call returns for those sentences.
    """
    sentences = dataset['Requirement Sentences']
    encoding = tokenizer(
        sentences,
        padding='max_length',
        max_length=256,
        truncation=True,
        return_tensors='tf',
    )
    return encoding

In [5]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

def forward_pass_with_label(batch):
    """Run ReqSeek on one tokenized batch and report loss, prediction and confidence.

    Written to be used with ``Dataset.map(..., batched=True)``. Relies on the
    notebook-level ``ReqSeek`` model and ``id2label`` mapping.

    Args:
        batch: mapping with 'input_ids', 'attention_mask' and 'label' entries
            (labels are passed to a sparse cross-entropy, i.e. integer class ids).

    Returns:
        dict with one value per example in the batch:
            "loss":     sparse categorical cross-entropy between the true label
                        and the model's logits,
            "y_preds":  predicted class name (argmax of the logits, via ``id2label``),
            "y_probas": softmax probability assigned to the predicted class.
    """
    inputs = {'input_ids': tf.convert_to_tensor(batch['input_ids']),
              'attention_mask': tf.convert_to_tensor(batch['attention_mask'])}
    true_labels = tf.convert_to_tensor(batch['label'])

    # Pure inference: no gradient is ever taken, so no GradientTape is opened.
    logits = ReqSeek(**inputs).logits

    predicted_labels = tf.argmax(logits, axis = -1).numpy()
    probas = tf.nn.softmax(logits, axis = -1).numpy()

    # The model emits raw (pre-softmax) logits, so the loss must be told so;
    # with the default from_logits=False the logits would be treated as
    # probabilities and the reported loss would be meaningless.
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        true_labels, logits, from_logits = True
    ).numpy()

    return {"loss": loss,
            "y_preds": [id2label[lbl] for lbl in predicted_labels],
            "y_probas": [row[lbl] for row, lbl in zip(probas, predicted_labels)]}

# Evaluating ReqSeek

In [6]:
# Relative path handed to both `from_pretrained` calls (tokenizer and model) below.
ReqSeek_path = '../../ReqSeek/'

In [7]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification


# Tokenizer and sequence-classification model are both loaded from the same path.
tokenizer = AutoTokenizer.from_pretrained(ReqSeek_path)
ReqSeek = TFAutoModelForSequenceClassification.from_pretrained(ReqSeek_path)

# Step 1: tokenize the evaluation split (batched map).
evaluation_set_encoded = evaluation_set.map(
    preprocess_function,
    batched=True,
)

# Step 2: run the model over the encoded split, 8 examples per forward pass.
evaluation_set_predicted = evaluation_set_encoded.map(
    forward_pass_with_label,
    batched=True,
    batch_size=8,
)

Metal device set to: Apple M4 Pro


2026-01-18 23:16:58.563600: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2026-01-18 23:16:58.563720: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at ../../ReqSeek/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


INFO:tensorflow:Assets written to: ram://caa204ba-1de3-4eda-a5fc-29a1c4fb51cf/assets


INFO:tensorflow:Assets written to: ram://caa204ba-1de3-4eda-a5fc-29a1c4fb51cf/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate(y_true, y_pred, average = 'macro'):
    """Print precision, recall and F1-score of ``y_pred`` against ``y_true``.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.
        average: averaging mode forwarded unchanged to each scikit-learn
            metric (defaults to 'macro').

    Returns:
        None; the three scores are written to stdout, one per line.
    """
    scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    for title, scorer in scorers:
        print(f"{title}: {scorer(y_true, y_pred, average = average)}")

In [9]:
# Report macro-averaged precision / recall / F1 for ReqSeek on the evaluation split.
print('ReqSeek Performance:\n')

# Both columns go through the project-local `mapper.map` before scoring
# (presumably to bring them into one comparable label space — TODO confirm).
# NOTE(review): ground truth is read from 'signal_keyword' here, whereas
# forward_pass_with_label computes its loss against 'label' — confirm the two agree.
y_true = mapper.map(evaluation_set_predicted['signal_keyword'])
y_pred = mapper.map(evaluation_set_predicted['y_preds'])
evaluate(y_true, y_pred)

ReqSeek Performance:

Precision: 0.9541119708891043
Recall: 0.9517460317460317
F1-Score: 0.9528933151427276
