In [1]:
# path parsing and data reading
import os
import re
import sys
import datasets
from pathlib import Path
from typing import Optional, Dict

# common ml imports
import sklearn
import numpy as np
import pandas as pd


sys.path.append("../../ReqSeek/")
import mapper

np.random.seed(42)

# some imports for visualisation
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Reading & Listing Paths

In [2]:
# Locations of the trained models and of the per-fold dataset splits,
# both resolved relative to the directory the notebook is started from.
notebook_dir = Path.cwd()

data_root = notebook_dir.joinpath(
    './training_scripts_and_models/models_10fold_dataset_splits'
).resolve()
model_root = notebook_dir.joinpath(
    './training_scripts_and_models/models/tuned_10_fold/'
).resolve()

# Directory names under data_root that hold per-fold dataset splits.
target_data_dirs = set((
    'kfold_all_mini_l6v2_data',
    'kfold_bert_base_cased_data',
    'kfold_gpt2_data',
    'kfold_roberta_base_data',
))

In [3]:
# Matches "fold", an optional "_" or "-", then the fold index (case-insensitive).
fold_re = re.compile(r'fold[_-]?(?P<n>\d+)', re.IGNORECASE)

def extract_fold_number(path: Path) -> Optional[int]:
    """Return the fold index encoded in the last component of ``path``.

    Only ``path.name`` is inspected; parent directories are ignored.
    Returns ``None`` when the name carries no ``fold<N>`` marker.
    """
    found = fold_re.search(path.name)
    return int(found.group('n')) if found else None

In [4]:
def infer_model_name(path: Path) -> Optional[str]:
    """Map a model or data path to its short model label.

    A label is chosen when any component of ``path`` equals one of the
    directory names registered for it below; entries are tried in order and
    the first hit wins. Returns ``None`` if nothing matches.

    Note: the all_mini_l6v2 directories are labelled ``'l6v6'``; the
    evaluation cells select that model using this exact key.
    """
    components = frozenset(path.parts)
    # (label, directory names that identify it) -- order is significant
    known_dirs = (
        ('bert_base_cased', ('reqseek_bert_base_cased_kfold_trained', 'kfold_bert_base_cased_data')),
        ('roberta_base', ('reqseek_roberta_base_kfold_trained', 'kfold_roberta_base_data')),
        ('gpt2', ('reqseek_gpt2_kfold_trained', 'kfold_gpt2_data')),
        ('l6v6', ('reqseek_all_mini_l6v2_kfold_trained', 'kfold_all_mini_l6v2_data')),
    )
    for label, markers in known_dirs:
        if not components.isdisjoint(markers):
            return label
    return None

In [5]:
def discover_model_folds(model_root: Path) -> dict:
    """Index trained-fold directories found anywhere below ``model_root``.

    Every directory whose name matches ``trained_fold_*`` is considered.
    Directories for which either the fold number or the model label cannot
    be determined are skipped.

    Returns a dict mapping ``(model_name, fold)`` to the directory path; a
    later match with the same key replaces an earlier one.
    """
    found = {}
    fold_dirs = (d for d in model_root.rglob('trained_fold_*') if d.is_dir())
    for fold_dir in fold_dirs:
        fold = extract_fold_number(fold_dir)
        model_name = infer_model_name(fold_dir)
        if fold is not None and model_name is not None:
            found[(model_name, fold)] = fold_dir
    return found

In [6]:
def discover_data_folds(data_root: Path) -> dict:
    """Index per-fold dataset directories found below ``data_root``.

    The tree is searched for directories whose name is listed in the
    module-level ``target_data_dirs``; each immediate sub-directory of such a
    directory is treated as one fold. Entries whose fold number or model
    label cannot be determined are skipped.

    Returns a dict mapping ``(model_name, fold)`` to the fold directory; a
    later match with the same key replaces an earlier one.
    """
    found = {}
    split_roots = (
        d for d in data_root.rglob('*')
        if d.is_dir() and d.name in target_data_dirs
    )
    for split_root in split_roots:
        for fold_dir in split_root.iterdir():
            if not fold_dir.is_dir():
                continue
            fold = extract_fold_number(fold_dir)
            model_name = infer_model_name(fold_dir)
            if fold is not None and model_name is not None:
                found[(model_name, fold)] = fold_dir
    return found

# Pairing kFold Models & Test Splits

In [7]:
# Index what is on disk, keyed by (model_name, fold).
model_folds = discover_model_folds(model_root)
data_folds = discover_data_folds(data_root)

# paired[model_name][fold] -> {'model_dir': ..., 'data_dir': ...}
paired = {}
# keys for which one side of the pair could not be found
missing = {'models': [], 'data': []}

all_keys = model_folds.keys() | data_folds.keys()

for key in sorted(all_keys):
    model_name, fold = key
    model_path = model_folds.get(key)
    data_path = data_folds.get(key)
    if model_path is None:
        # a dataset split exists but no trained model was found for it
        missing['models'].append(key)
    elif data_path is None:
        # a trained model exists but its dataset split was not found
        missing['data'].append(key)
    else:
        paired.setdefault(model_name, {})[fold] = {
            'model_dir': model_path,
            'data_dir': data_path,
        }

In [8]:
# Sanity check: list the folds for which both a model and a dataset were found
print('Paired model and data folds:')
for model, folds in paired.items():
    print(f"  {model}: {sorted(folds)}")

# Sanity check: report any (model, fold) key that lacks one side of the pair
unpaired_models, unpaired_data = missing['models'], missing['data']
if unpaired_models or unpaired_data:
    print('missing entries:')
    if unpaired_models:
        print('  missing models:', unpaired_models)
    if unpaired_data:
        print('  missing data:', unpaired_data)

Paired model and data folds:
  bert_base_cased: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
  gpt2: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
  l6v6: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
  roberta_base: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


# Evaluation Setup & Helper Functions

In [9]:
import torch
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [10]:
# Inference settings
batch_size = 4          # number of texts tokenised and scored per forward pass
max_length_gpt2 = 64    # fixed padded/truncated length used for GPT-2 inputs

# Device choice: second GPU when more than one is visible, else the first
# GPU, else fall back to the CPU.
if not torch.cuda.is_available():
    device = torch.device('cpu')
elif torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
else:
    device = torch.device('cuda:0')

In [11]:
def collect_per_class_prf(report_dict, class_labels):
    """Pull (precision, recall, f1-score) triples out of a report dict.

    Parameters
    ----------
    report_dict : dict
        Mapping with a ``'macro avg'`` entry and one entry per class, each
        holding ``'precision'``, ``'recall'`` and ``'f1-score'`` values.
    class_labels : iterable
        Class names to extract; each must be a key of ``report_dict``.

    Returns
    -------
    (prf, macro)
        ``prf`` maps each class label to its triple; ``macro`` is the triple
        taken from the ``'macro avg'`` entry.
    """
    metric_keys = ('precision', 'recall', 'f1-score')

    def _triple(row):
        return tuple(row[key] for key in metric_keys)

    macro = _triple(report_dict['macro avg'])
    prf = {cls: _triple(report_dict[cls]) for cls in class_labels}
    return prf, macro

In [12]:
# a function to receive computed metrics and decorate
def print_cv_summary(all_folds, title = None):
    """Print a mean ± std table of precision/recall/F1 across folds.

    Parameters
    ----------
    all_folds : list
        Either a list of ``(precision, recall, f1)`` tuples/lists, one per
        fold (printed as a single ``Macro`` row), or a list of dicts mapping
        class name -> ``(precision, recall, f1)``, one dict per fold (printed
        as one row per class, using the classes of the first fold).
    title : str, optional
        Heading printed in bold above the table. When ``None`` no heading
        line is printed.

    Notes
    -----
    The standard deviation is the population std (NumPy's default ``ddof=0``).
    The label column is at least 10 characters wide and grows to fit the
    longest class name so the columns stay aligned.
    """
    if title is not None:
        print('\033[1m' + title + '\033[0m')

    # Normalise both accepted input shapes into {row label: per-fold triples}.
    if len(all_folds) == 0:
        rows = {}
    elif isinstance(all_folds[0], (list, tuple)):
        # Case 1: list of (p, r, f)
        rows = {'Macro': all_folds}
    else:
        # Case 2: list of dicts (per-class)
        rows = {
            cls: [fold[cls] for fold in all_folds]
            for cls in all_folds[0].keys()
        }

    # Widen the first column (and the rules) only when a label needs it, so
    # short labels render exactly as with the fixed 10-character layout.
    label_width = max([10] + [len(str(label)) for label in rows])
    rule_width = 60 + (label_width - 10)

    print('=' * rule_width)
    print(f"{'Class':<{label_width}} | {'Precision':<13} | {'Recall':<13} | {'F1-score':<10}")
    print("-" * rule_width)

    if not rows:
        print('(no folds to summarise)')

    for label, triples in rows.items():
        arr = np.asarray(triples, dtype=float)
        mean = arr.mean(axis=0)
        std = arr.std(axis=0)
        print(
            f"{label:<{label_width}} | "
            f"{mean[0]:.3f} ± {std[0]:.3f} | "
            f"{mean[1]:.3f} ± {std[1]:.3f} | "
            f"{mean[2]:.3f} ± {std[2]:.3f}"
        )
    print('=' * rule_width)

In [13]:
# helper functions to identify fold-trained misclassification
def majority_vote(preds_2d, n_classes):
    """Return the most frequent label of every column of ``preds_2d``.

    ``preds_2d`` is a 2-D array of non-negative integer labels; each column
    is voted on independently across its rows. Ties resolve to the smallest
    label, because ``argmax`` returns the first maximum of the counts.

    Returns a 1-D integer array with one winner per column.
    """
    n_items = preds_2d.shape[1]
    winners = (
        np.bincount(preds_2d[:, col], minlength=n_classes).argmax()
        for col in range(n_items)
    )
    return np.fromiter(winners, dtype = int, count = n_items)

def get_misclassified_reqids(promise_ds, y_true, y_pred):
    """Return the ``REQID`` values of the samples whose prediction is wrong.

    Parameters
    ----------
    promise_ds : mapping
        Anything indexable with ``'REQID'`` that yields one id per sample.
    y_true, y_pred : sequence
        True and predicted labels, aligned with the ids. Plain Python lists
        are accepted as well as NumPy arrays.

    Returns
    -------
    numpy.ndarray
        The ids at the positions where ``y_true`` and ``y_pred`` differ.

    Raises
    ------
    ValueError
        If the labels and ids do not line up one-to-one.
    """
    reqids = np.asarray(promise_ds['REQID'])
    # Coerce to arrays: comparing two Python lists with != yields a single
    # bool, which would index the whole id array instead of masking it.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape or y_true.shape[:1] != reqids.shape[:1]:
        raise ValueError(
            'REQID, y_true and y_pred must have the same length: '
            f'{reqids.shape}, {y_true.shape}, {y_pred.shape}'
        )
    return reqids[y_true != y_pred]

In [14]:
# run each fold and feed evaluation set
def eval_fold_torch(model_dir, data_dir = None, batch_size = 8, override_test_ds = None, override_label_names = None):
    """Run one fold's saved classifier over a test set and score it.

    Relies on the module-level ``device`` and ``max_length_gpt2`` settings.

    Parameters
    ----------
    model_dir : path-like
        Directory handed to ``AutoTokenizer`` / ``AutoModelForSequenceClassification``
        ``from_pretrained``. A path containing ``gpt2`` (case-insensitive)
        switches on the GPT-2 tokenizer settings below.
    data_dir : path-like, optional
        Dataset saved with ``datasets``; its ``'test'`` split is evaluated and
        its ``'train'`` label names provide the report's class names. Only
        read when ``override_test_ds`` is ``None``.
    batch_size : int
        Number of texts tokenised and scored per forward pass.
    override_test_ds : optional
        Test set to use instead of the one under ``data_dir``. It is indexed
        with ``'Requirement Sentences'`` and ``'label'``.
    override_label_names : optional
        Class names for the report when ``override_test_ds`` is used.

    Returns
    -------
    (y_true, y_pred, report_text, report_dict)
        Mapped true and predicted labels (lists), then the
        ``classification_report`` output as text and as a dict.
    """
    # load test dataset
    if override_test_ds is None:
        dataset = datasets.load_from_disk(str(data_dir))
        test_ds = dataset['test']
        # np.unique de-duplicates and sorts the mapped label names
        target_names = np.unique([mapper.map_hf(i) for i in dataset['train'].features['label'].names])
    else:
        test_ds = override_test_ds
        # NOTE(review): nothing enforces this; a None value is passed straight to classification_report
        target_names = override_label_names  # must be provided

    tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
    # from_tf = True: the checkpoints are loaded from TensorFlow weights
    model = AutoModelForSequenceClassification.from_pretrained(str(model_dir), from_tf = True).to(device)
    model.eval()

    # setting specific settings if the model is GPT
    is_gpt2 = 'gpt2' in str(model_dir).lower()
    if is_gpt2:
        # fall back to the EOS token when the tokenizer defines no pad token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = 'left'

    texts = test_ds['Requirement Sentences']
    y_pred = []
    # inference only: no gradients needed
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            # GPT-2: pad every batch to max_length_gpt2; other models: pad to the longest text in the batch
            enc = tokenizer(
                batch_texts,
                padding = 'max_length' if is_gpt2 else True,
                truncation = True,
                max_length = max_length_gpt2 if is_gpt2 else None,
                return_tensors = 'pt',
            )
            enc = {k: v.to(device) for k, v in enc.items()}
            logits = model(**enc).logits
            # predicted class id = index of the largest logit per row
            y_pred.extend(torch.argmax(logits, dim = 1).cpu().numpy())

    # translate class ids -> model label names -> mapper.map_hf names, for truth and predictions alike
    y_true = [mapper.map_hf(model.config.id2label[i]) for i in np.array(test_ds['label'])]
    y_pred = [mapper.map_hf(model.config.id2label[i]) for i in np.array(y_pred)]
    # NOTE(review): the report is computed twice, once as text and once as a dict
    report_text = classification_report(y_true, y_pred, target_names = target_names)
    report_dict = classification_report(y_true, y_pred, target_names = target_names, output_dict = True)

    return y_true, y_pred, report_text, report_dict

# Model Evaluation

In [15]:
from IPython.display import display, Markdown

model_to_run = 'gpt2'  # or 'l6v6' | 'roberta_base' | 'bert_base_cased'

# Per-fold results, collected for the cross-validation summary
fold_test_macro = []
fold_test_per_class_metrics = []

for fold in sorted(paired[model_to_run]):
    info = paired[model_to_run][fold]
    model_dir = Path(info['model_dir'])
    data_dir = Path(info['data_dir'])

    # class names of this fold, translated through mapper.map_hf
    hf_label_names = datasets.load_from_disk(str(data_dir))['train'].features['label'].names
    fold_label_names = list(map(mapper.map_hf, hf_label_names))
    print(f"\033[1m>>>>>\033[0mEvaluating {model_to_run.upper()} on fold-{fold}\033[1m<<<<<\033[0m")

    # 1) evaluate on the fold's own held-out test set
    y_true, y_pred, report_text, report_dict = eval_fold_torch(
        model_dir = model_dir,
        data_dir = data_dir,
        batch_size = batch_size,
    )

    prf_on_heldout, m_on_heldout = collect_per_class_prf(report_dict, fold_label_names)
    fold_test_macro.append(m_on_heldout)
    fold_test_per_class_metrics.append(prf_on_heldout)

    # classification report for each fold
    print(report_text)

[1m>>>>>[0mEvaluating GPT2 on fold-1[1m<<<<<[0m


2026-01-13 21:53:09.526899: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-13 21:53:09.526935: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-13 21:53:09.528158: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-13 21:53:09.535066: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-13 21:53:10.655377: I external/local_xla/xla/

                          precision    recall  f1-score   support

    contextual_auxiliary       0.96      0.90      0.93        60
             requirement       0.89      0.96      0.92       105
system_related_auxiliary       0.90      0.84      0.87        75

                accuracy                           0.91       240
               macro avg       0.92      0.90      0.91       240
            weighted avg       0.91      0.91      0.91       240

[1m>>>>>[0mEvaluating GPT2 on fold-2[1m<<<<<[0m


All TF 2.0 model weights were used when initializing GPT2ForSequenceClassification.

All the weights of GPT2ForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.97      0.93      0.95        60
             requirement       0.88      0.95      0.91       105
system_related_auxiliary       0.93      0.84      0.88        75

                accuracy                           0.91       240
               macro avg       0.92      0.91      0.91       240
            weighted avg       0.91      0.91      0.91       240

[1m>>>>>[0mEvaluating GPT2 on fold-3[1m<<<<<[0m


All TF 2.0 model weights were used when initializing GPT2ForSequenceClassification.

All the weights of GPT2ForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.92      0.95        60
             requirement       0.89      0.95      0.92       105
system_related_auxiliary       0.88      0.84      0.86        75

                accuracy                           0.91       240
               macro avg       0.92      0.90      0.91       240
            weighted avg       0.91      0.91      0.91       240

[1m>>>>>[0mEvaluating GPT2 on fold-4[1m<<<<<[0m


All TF 2.0 model weights were used when initializing GPT2ForSequenceClassification.

All the weights of GPT2ForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.90      0.95        60
             requirement       0.85      0.96      0.90       105
system_related_auxiliary       0.91      0.81      0.86        75

                accuracy                           0.90       240
               macro avg       0.92      0.89      0.90       240
            weighted avg       0.91      0.90      0.90       240

[1m>>>>>[0mEvaluating GPT2 on fold-5[1m<<<<<[0m


All TF 2.0 model weights were used when initializing GPT2ForSequenceClassification.

All the weights of GPT2ForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.93      0.97        60
             requirement       0.90      0.97      0.94       105
system_related_auxiliary       0.93      0.88      0.90        75

                accuracy                           0.93       240
               macro avg       0.94      0.93      0.94       240
            weighted avg       0.94      0.93      0.93       240

[1m>>>>>[0mEvaluating GPT2 on fold-6[1m<<<<<[0m


All TF 2.0 model weights were used when initializing GPT2ForSequenceClassification.

All the weights of GPT2ForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.88      0.94        60
             requirement       0.84      0.93      0.88       105
system_related_auxiliary       0.84      0.79      0.81        75

                accuracy                           0.88       240
               macro avg       0.89      0.87      0.88       240
            weighted avg       0.88      0.88      0.88       240

[1m>>>>>[0mEvaluating GPT2 on fold-7[1m<<<<<[0m


All TF 2.0 model weights were used when initializing GPT2ForSequenceClassification.

All the weights of GPT2ForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.90      0.95        60
             requirement       0.88      0.97      0.92       105
system_related_auxiliary       0.93      0.86      0.90        74

                accuracy                           0.92       239
               macro avg       0.94      0.91      0.92       239
            weighted avg       0.92      0.92      0.92       239

[1m>>>>>[0mEvaluating GPT2 on fold-8[1m<<<<<[0m


All TF 2.0 model weights were used when initializing GPT2ForSequenceClassification.

All the weights of GPT2ForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.95      0.97        60
             requirement       0.86      0.95      0.90       105
system_related_auxiliary       0.92      0.81      0.86        74

                accuracy                           0.91       239
               macro avg       0.92      0.90      0.91       239
            weighted avg       0.91      0.91      0.91       239

[1m>>>>>[0mEvaluating GPT2 on fold-9[1m<<<<<[0m


All TF 2.0 model weights were used when initializing GPT2ForSequenceClassification.

All the weights of GPT2ForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.92      0.95        60
             requirement       0.89      0.97      0.93       105
system_related_auxiliary       0.93      0.85      0.89        74

                accuracy                           0.92       239
               macro avg       0.93      0.91      0.92       239
            weighted avg       0.92      0.92      0.92       239

[1m>>>>>[0mEvaluating GPT2 on fold-10[1m<<<<<[0m


All TF 2.0 model weights were used when initializing GPT2ForSequenceClassification.

All the weights of GPT2ForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.95      0.90      0.92        60
             requirement       0.91      0.94      0.93       105
system_related_auxiliary       0.88      0.86      0.87        74

                accuracy                           0.91       239
               macro avg       0.91      0.90      0.91       239
            weighted avg       0.91      0.91      0.91       239



In [16]:
# Heading for the summary of the model evaluated in the previous cell
heldout_text = f'{model_to_run} Evaluation on Heldout Set:'
display(Markdown('### **' + heldout_text.upper() + '**'))

# Macro averages first, then the per-class breakdown, separated by blank lines
print_cv_summary(fold_test_macro, title = '10-folds Macro Average Mean±STD:')
print('\n\n')
print_cv_summary(
    fold_test_per_class_metrics,
    title = '10-folds Per Class P,R,F Mean±STD:',
)

### **GPT2 EVALUATION ON HELDOUT SET:**

[1m10-folds Macro Average Mean±STD:[0m
Class      | Precision     | Recall        | F1-score  
------------------------------------------------------------
Macro      | 0.921 ± 0.013 | 0.903 ± 0.015 | 0.911 ± 0.014



[1m10-folds Per Class P,R,F Mean±STD:[0m
Class      | Precision     | Recall        | F1-score  
------------------------------------------------------------
contextual_auxiliary | 0.982 ± 0.018 | 0.913 ± 0.019 | 0.946 ± 0.013
requirement | 0.878 ± 0.022 | 0.957 ± 0.012 | 0.916 ± 0.015
system_related_auxiliary | 0.904 ± 0.028 | 0.839 ± 0.027 | 0.870 ± 0.024


In [17]:
from IPython.display import display, Markdown

model_to_run = 'l6v6'  # or 'gpt2' | 'roberta_base' | 'bert_base_cased'

# Per-fold results, collected for the cross-validation summary
fold_test_macro = []
fold_test_per_class_metrics = []

for fold in sorted(paired[model_to_run]):
    info = paired[model_to_run][fold]
    model_dir = Path(info['model_dir'])
    data_dir = Path(info['data_dir'])

    # class names of this fold, translated through mapper.map_hf
    hf_label_names = datasets.load_from_disk(str(data_dir))['train'].features['label'].names
    fold_label_names = list(map(mapper.map_hf, hf_label_names))
    print(f"\033[1m>>>>>\033[0mEvaluating {model_to_run.upper()} on fold-{fold}\033[1m<<<<<\033[0m")

    # 1) evaluate on the fold's own held-out test set
    y_true, y_pred, report_text, report_dict = eval_fold_torch(
        model_dir = model_dir,
        data_dir = data_dir,
        batch_size = batch_size,
    )

    prf_on_heldout, m_on_heldout = collect_per_class_prf(report_dict, fold_label_names)
    fold_test_macro.append(m_on_heldout)
    fold_test_per_class_metrics.append(prf_on_heldout)

    # classification report for each fold
    print(report_text)

[1m>>>>>[0mEvaluating L6V6 on fold-1[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.93      0.97        60
             requirement       0.90      0.99      0.95       105
system_related_auxiliary       0.99      0.91      0.94        75

                accuracy                           0.95       240
               macro avg       0.96      0.94      0.95       240
            weighted avg       0.95      0.95      0.95       240

[1m>>>>>[0mEvaluating L6V6 on fold-2[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.97      0.95      0.96        60
             requirement       0.91      0.95      0.93       105
system_related_auxiliary       0.93      0.88      0.90        75

                accuracy                           0.93       240
               macro avg       0.93      0.93      0.93       240
            weighted avg       0.93      0.93      0.93       240

[1m>>>>>[0mEvaluating L6V6 on fold-3[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.97      0.97        60
             requirement       0.94      0.99      0.96       105
system_related_auxiliary       0.97      0.91      0.94        75

                accuracy                           0.96       240
               macro avg       0.96      0.95      0.96       240
            weighted avg       0.96      0.96      0.96       240

[1m>>>>>[0mEvaluating L6V6 on fold-4[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.97      0.97      0.97        60
             requirement       0.94      0.97      0.96       105
system_related_auxiliary       0.94      0.91      0.93        75

                accuracy                           0.95       240
               macro avg       0.95      0.95      0.95       240
            weighted avg       0.95      0.95      0.95       240

[1m>>>>>[0mEvaluating L6V6 on fold-5[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.98      0.98        60
             requirement       0.94      0.95      0.95       105
system_related_auxiliary       0.95      0.93      0.94        75

                accuracy                           0.95       240
               macro avg       0.96      0.96      0.96       240
            weighted avg       0.95      0.95      0.95       240

[1m>>>>>[0mEvaluating L6V6 on fold-6[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.92      0.96        60
             requirement       0.84      0.98      0.90       105
system_related_auxiliary       0.95      0.79      0.86        75

                accuracy                           0.90       240
               macro avg       0.93      0.89      0.91       240
            weighted avg       0.91      0.90      0.90       240

[1m>>>>>[0mEvaluating L6V6 on fold-7[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.92      0.95        60
             requirement       0.92      0.99      0.95       105
system_related_auxiliary       0.94      0.89      0.92        74

                accuracy                           0.94       239
               macro avg       0.95      0.93      0.94       239
            weighted avg       0.94      0.94      0.94       239

[1m>>>>>[0mEvaluating L6V6 on fold-8[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.97      0.97      0.97        60
             requirement       0.90      0.96      0.93       105
system_related_auxiliary       0.96      0.86      0.91        74

                accuracy                           0.93       239
               macro avg       0.94      0.93      0.94       239
            weighted avg       0.93      0.93      0.93       239

[1m>>>>>[0mEvaluating L6V6 on fold-9[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.90      0.95        60
             requirement       0.89      0.97      0.93       105
system_related_auxiliary       0.94      0.89      0.92        74

                accuracy                           0.93       239
               macro avg       0.94      0.92      0.93       239
            weighted avg       0.93      0.93      0.93       239

[1m>>>>>[0mEvaluating L6V6 on fold-10[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.93      0.97        60
             requirement       0.89      0.98      0.93       105
system_related_auxiliary       0.93      0.84      0.88        74

                accuracy                           0.92       239
               macro avg       0.94      0.92      0.93       239
            weighted avg       0.93      0.92      0.92       239



In [18]:
# Heading for the summary of the model evaluated in the previous cell
heldout_text = f'{model_to_run} Evaluation on Heldout Set:'
display(Markdown('### **' + heldout_text.upper() + '**'))

# Macro averages first, then the per-class breakdown, separated by blank lines
print_cv_summary(fold_test_macro, title = '10-folds Macro Average Mean±STD:')
print('\n\n')
print_cv_summary(
    fold_test_per_class_metrics,
    title = '10-folds Per Class P,R,F Mean±STD:',
)

### **L6V6 EVALUATION ON HELDOUT SET:**

[1m10-folds Macro Average Mean±STD:[0m
Class      | Precision     | Recall        | F1-score  
------------------------------------------------------------
Macro      | 0.947 ± 0.011 | 0.933 ± 0.018 | 0.939 ± 0.015



[1m10-folds Per Class P,R,F Mean±STD:[0m
Class      | Precision     | Recall        | F1-score  
------------------------------------------------------------
contextual_auxiliary | 0.985 ± 0.014 | 0.943 ± 0.026 | 0.963 ± 0.011
requirement | 0.907 ± 0.031 | 0.974 ± 0.014 | 0.939 ± 0.017
system_related_auxiliary | 0.949 ± 0.017 | 0.881 ± 0.040 | 0.913 ± 0.025


In [19]:
from IPython.display import display, Markdown

# Key into `paired` that selects which model's folds this cell evaluates.
model_to_run = 'roberta_base'

# Per-fold results; one entry is appended to each list per fold, in ascending fold order.
fold_test_macro, fold_test_per_class_metrics = [], []

folds_for_model = paired[model_to_run]
for fold in sorted(folds_for_model):
    info = folds_for_model[fold]
    model_dir, data_dir = Path(info['model_dir']), Path(info['data_dir'])

    # Label names are read from the 'label' feature of the fold's saved 'train' split
    # and passed through mapper.map_hf one by one.
    raw_label_names = datasets.load_from_disk(str(data_dir))['train'].features['label'].names
    fold_label_names = list(map(mapper.map_hf, raw_label_names))

    print(f"\033[1m>>>>>\033[0mEvaluating {model_to_run.upper()} on fold-{fold}\033[1m<<<<<\033[0m")

    # override_test_ds=None: evaluate on the fold's own held-out test set
    y_true, y_pred, report_text, report_dict = eval_fold_torch(
        model_dir = model_dir,
        data_dir = data_dir,
        batch_size = batch_size,
        override_test_ds = None,
    )

    # Keep both the per-class and the macro metrics of this fold for the summary cell.
    prf_on_heldout, m_on_heldout = collect_per_class_prf(report_dict, fold_label_names)
    fold_test_per_class_metrics.append(prf_on_heldout)
    fold_test_macro.append(m_on_heldout)

    # classification report for this fold
    print(report_text)

[1m>>>>>[0mEvaluating ROBERTA_BASE on fold-1[1m<<<<<[0m


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.97      0.93      0.95        60
             requirement       0.90      0.99      0.95       105
system_related_auxiliary       0.97      0.87      0.92        75

                accuracy                           0.94       240
               macro avg       0.95      0.93      0.94       240
            weighted avg       0.94      0.94      0.94       240

[1m>>>>>[0mEvaluating ROBERTA_BASE on fold-2[1m<<<<<[0m


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.95      0.97        60
             requirement       0.93      0.95      0.94       105
system_related_auxiliary       0.92      0.91      0.91        75

                accuracy                           0.94       240
               macro avg       0.94      0.94      0.94       240
            weighted avg       0.94      0.94      0.94       240

[1m>>>>>[0mEvaluating ROBERTA_BASE on fold-3[1m<<<<<[0m


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.95      0.95      0.95        60
             requirement       0.93      0.96      0.94       105
system_related_auxiliary       0.94      0.89      0.92        75

                accuracy                           0.94       240
               macro avg       0.94      0.94      0.94       240
            weighted avg       0.94      0.94      0.94       240

[1m>>>>>[0mEvaluating ROBERTA_BASE on fold-4[1m<<<<<[0m


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.93      0.96        60
             requirement       0.89      0.96      0.93       105
system_related_auxiliary       0.91      0.85      0.88        75

                accuracy                           0.92       240
               macro avg       0.93      0.92      0.92       240
            weighted avg       0.92      0.92      0.92       240

[1m>>>>>[0mEvaluating ROBERTA_BASE on fold-5[1m<<<<<[0m


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.98      0.99        60
             requirement       0.93      0.94      0.94       105
system_related_auxiliary       0.92      0.92      0.92        75

                accuracy                           0.95       240
               macro avg       0.95      0.95      0.95       240
            weighted avg       0.95      0.95      0.95       240

[1m>>>>>[0mEvaluating ROBERTA_BASE on fold-6[1m<<<<<[0m


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.93      0.97        60
             requirement       0.89      0.99      0.94       105
system_related_auxiliary       0.96      0.85      0.90        75

                accuracy                           0.93       240
               macro avg       0.95      0.93      0.93       240
            weighted avg       0.94      0.93      0.93       240

[1m>>>>>[0mEvaluating ROBERTA_BASE on fold-7[1m<<<<<[0m


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.95      0.97        60
             requirement       0.90      0.98      0.94       105
system_related_auxiliary       0.96      0.86      0.91        74

                accuracy                           0.94       239
               macro avg       0.95      0.93      0.94       239
            weighted avg       0.94      0.94      0.94       239

[1m>>>>>[0mEvaluating ROBERTA_BASE on fold-8[1m<<<<<[0m


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.97      0.97      0.97        60
             requirement       0.92      0.96      0.94       105
system_related_auxiliary       0.96      0.89      0.92        74

                accuracy                           0.94       239
               macro avg       0.95      0.94      0.94       239
            weighted avg       0.94      0.94      0.94       239

[1m>>>>>[0mEvaluating ROBERTA_BASE on fold-9[1m<<<<<[0m


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.95      0.97        60
             requirement       0.92      0.98      0.95       105
system_related_auxiliary       0.96      0.89      0.92        74

                accuracy                           0.95       239
               macro avg       0.95      0.94      0.95       239
            weighted avg       0.95      0.95      0.95       239

[1m>>>>>[0mEvaluating ROBERTA_BASE on fold-10[1m<<<<<[0m


All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.92      0.95        60
             requirement       0.91      0.96      0.94       105
system_related_auxiliary       0.90      0.88      0.89        74

                accuracy                           0.92       239
               macro avg       0.93      0.92      0.92       239
            weighted avg       0.93      0.92      0.92       239



In [20]:
# Caption for the summary below: the current model key followed by a fixed label.
heldout_text = f'{model_to_run} Evaluation on Heldout Set:'

# The caption is upper-cased and rendered as a bold level-3 Markdown heading.
heading_md = f"### **{heldout_text.upper()}**"
display(Markdown(heading_md))

# Macro-average table first, a few blank lines as a separator, then the per-class table.
print_cv_summary(
    fold_test_macro,
    title = '10-folds Macro Average Mean±STD:',
)
print('\n\n')
print_cv_summary(
    fold_test_per_class_metrics,
    title = '10-folds Per Class P,R,F Mean±STD:',
)

### **ROBERTA_BASE EVALUATION ON HELDOUT SET:**

[1m10-folds Macro Average Mean±STD:[0m
Class      | Precision     | Recall        | F1-score  
------------------------------------------------------------
Macro      | 0.944 ± 0.008 | 0.932 ± 0.010 | 0.937 ± 0.008



[1m10-folds Per Class P,R,F Mean±STD:[0m
Class      | Precision     | Recall        | F1-score  
------------------------------------------------------------
contextual_auxiliary | 0.981 ± 0.016 | 0.947 ± 0.018 | 0.964 ± 0.013
requirement | 0.912 ± 0.015 | 0.969 ± 0.015 | 0.939 ± 0.006
system_related_auxiliary | 0.939 ± 0.022 | 0.882 ± 0.021 | 0.909 ± 0.013


In [21]:
from IPython.display import display, Markdown

# Key into `paired` that selects which model's folds this cell evaluates.
model_to_run = 'bert_base_cased'

# Per-fold results; one entry is appended to each list per fold, in ascending fold order.
fold_test_macro, fold_test_per_class_metrics = [], []

folds_for_model = paired[model_to_run]
for fold in sorted(folds_for_model):
    info = folds_for_model[fold]
    model_dir, data_dir = Path(info['model_dir']), Path(info['data_dir'])

    # Label names are read from the 'label' feature of the fold's saved 'train' split
    # and passed through mapper.map_hf one by one.
    raw_label_names = datasets.load_from_disk(str(data_dir))['train'].features['label'].names
    fold_label_names = list(map(mapper.map_hf, raw_label_names))

    print(f"\033[1m>>>>>\033[0mEvaluating {model_to_run.upper()} on fold-{fold}\033[1m<<<<<\033[0m")

    # override_test_ds=None: evaluate on the fold's own held-out test set
    y_true, y_pred, report_text, report_dict = eval_fold_torch(
        model_dir = model_dir,
        data_dir = data_dir,
        batch_size = batch_size,
        override_test_ds = None,
    )

    # Keep both the per-class and the macro metrics of this fold for the summary cell.
    prf_on_heldout, m_on_heldout = collect_per_class_prf(report_dict, fold_label_names)
    fold_test_per_class_metrics.append(prf_on_heldout)
    fold_test_macro.append(m_on_heldout)

    # classification report for this fold
    print(report_text)

[1m>>>>>[0mEvaluating BERT_BASE_CASED on fold-1[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.95      0.97        60
             requirement       0.91      0.98      0.94       105
system_related_auxiliary       0.97      0.91      0.94        75

                accuracy                           0.95       240
               macro avg       0.96      0.95      0.95       240
            weighted avg       0.95      0.95      0.95       240

[1m>>>>>[0mEvaluating BERT_BASE_CASED on fold-2[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.97      0.97      0.97        60
             requirement       0.92      0.99      0.95       105
system_related_auxiliary       0.97      0.87      0.92        75

                accuracy                           0.95       240
               macro avg       0.95      0.94      0.95       240
            weighted avg       0.95      0.95      0.95       240

[1m>>>>>[0mEvaluating BERT_BASE_CASED on fold-3[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.97      0.98        60
             requirement       0.92      0.92      0.92       105
system_related_auxiliary       0.88      0.89      0.89        75

                accuracy                           0.93       240
               macro avg       0.93      0.93      0.93       240
            weighted avg       0.93      0.93      0.93       240

[1m>>>>>[0mEvaluating BERT_BASE_CASED on fold-4[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.97      0.97        60
             requirement       0.90      0.97      0.94       105
system_related_auxiliary       0.96      0.87      0.91        75

                accuracy                           0.94       240
               macro avg       0.95      0.93      0.94       240
            weighted avg       0.94      0.94      0.94       240

[1m>>>>>[0mEvaluating BERT_BASE_CASED on fold-5[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.98      0.99        60
             requirement       0.92      0.94      0.93       105
system_related_auxiliary       0.92      0.89      0.91        75

                accuracy                           0.94       240
               macro avg       0.94      0.94      0.94       240
            weighted avg       0.94      0.94      0.94       240

[1m>>>>>[0mEvaluating BERT_BASE_CASED on fold-6[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.92      0.96        60
             requirement       0.86      0.97      0.91       105
system_related_auxiliary       0.94      0.83      0.88        75

                accuracy                           0.91       240
               macro avg       0.93      0.90      0.92       240
            weighted avg       0.92      0.91      0.91       240

[1m>>>>>[0mEvaluating BERT_BASE_CASED on fold-7[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.93      0.96        60
             requirement       0.91      0.97      0.94       105
system_related_auxiliary       0.94      0.89      0.92        74

                accuracy                           0.94       239
               macro avg       0.95      0.93      0.94       239
            weighted avg       0.94      0.94      0.94       239

[1m>>>>>[0mEvaluating BERT_BASE_CASED on fold-8[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       0.98      0.98      0.98        60
             requirement       0.90      0.97      0.94       105
system_related_auxiliary       0.97      0.86      0.91        74

                accuracy                           0.94       239
               macro avg       0.95      0.94      0.94       239
            weighted avg       0.94      0.94      0.94       239

[1m>>>>>[0mEvaluating BERT_BASE_CASED on fold-9[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.95      0.97        60
             requirement       0.90      0.96      0.93       105
system_related_auxiliary       0.93      0.88      0.90        74

                accuracy                           0.93       239
               macro avg       0.94      0.93      0.94       239
            weighted avg       0.93      0.93      0.93       239

[1m>>>>>[0mEvaluating BERT_BASE_CASED on fold-10[1m<<<<<[0m


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


                          precision    recall  f1-score   support

    contextual_auxiliary       1.00      0.98      0.99        60
             requirement       0.91      0.96      0.94       105
system_related_auxiliary       0.94      0.88      0.91        74

                accuracy                           0.94       239
               macro avg       0.95      0.94      0.95       239
            weighted avg       0.94      0.94      0.94       239



In [22]:
# Caption for the summary below: the current model key followed by a fixed label.
heldout_text = f'{model_to_run} Evaluation on Heldout Set:'

# The caption is upper-cased and rendered as a bold level-3 Markdown heading.
heading_md = f"### **{heldout_text.upper()}**"
display(Markdown(heading_md))

# Macro-average table first, a few blank lines as a separator, then the per-class table.
print_cv_summary(
    fold_test_macro,
    title = '10-folds Macro Average Mean±STD:',
)
print('\n\n')
print_cv_summary(
    fold_test_per_class_metrics,
    title = '10-folds Per Class P,R,F Mean±STD:',
)

### **BERT_BASE_CASED EVALUATION ON HELDOUT SET:**

[1m10-folds Macro Average Mean±STD:[0m
Class      | Precision     | Recall        | F1-score  
------------------------------------------------------------
Macro      | 0.946 ± 0.008 | 0.934 ± 0.011 | 0.939 ± 0.010



[1m10-folds Per Class P,R,F Mean±STD:[0m
Class      | Precision     | Recall        | F1-score  
------------------------------------------------------------
contextual_auxiliary | 0.992 ± 0.011 | 0.960 ± 0.021 | 0.975 ± 0.012
requirement | 0.905 ± 0.017 | 0.965 ± 0.018 | 0.934 ± 0.012
system_related_auxiliary | 0.942 ± 0.027 | 0.877 ± 0.021 | 0.908 ± 0.015
