In [1]:
# common imports

import sys
sys.path.append("../datasets/ARID_supporting_scripts")

import os
import mapper
import random
import datasets
import numpy as np
import pandas as pd
import tensorflow as tf
from pathlib import Path
from tensorflow import keras

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [2]:
# Loading the dataset

evaluation_set = datasets.load_from_disk('../datasets/ARID_supporting_scripts/5_1_training_set')['test']

# Model Setup

In [3]:
def preprocess_function(dataset):
    return tokenizer(dataset['Requirement Sentences'], truncation = True)

In [4]:
lbl_ = evaluation_set.features['label'].names
label2id = {lbl: idx for idx, lbl in enumerate(lbl_)}
id2label = {val: key for key, val in label2id.items()}

In [5]:
def forward_pass_with_label(batch):
    input_ids = batch['input_ids']
    attention_masks = batch['attention_mask']
    true_labels = tf.convert_to_tensor( batch['label'])
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, padding = "post")
    attention_masks = tf.keras.preprocessing.sequence.pad_sequences(attention_masks, padding = "post")
    with tf.GradientTape() as tape:
        output = model(input_ids, attention_masks)
        probas = tf.nn.softmax(output.logits, axis = -1).numpy()
        predicted_labels = tf.argmax(output.logits, axis = -1).numpy()
        loss = tf.keras.losses.sparse_categorical_crossentropy(true_labels, output.logits)
    loss = loss.numpy()
    return {"loss": loss, 
            "y_preds": [id2label[lbl] for lbl in predicted_labels], 
            "y_probas": [probas[i][predicted_labels[i]] for i in range(len(predicted_labels))]}

# Benchmark Noisy Models

In [6]:
# Loading noisy models
noisy_models_path = Path('./training_scripts_and_models/models_with_inconsistency_levels/')

noisy_models = [folder for folder in noisy_models_path.iterdir() if folder.is_dir() and not folder.name.startswith('.')]
noisy_models

[PosixPath('training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.1_seed_100'),
 PosixPath('training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.1_seed_19'),
 PosixPath('training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.1_seed_42'),
 PosixPath('training_scripts_and_models/models_with_inconsistency_levels/clean_roberta_base_noise_rate_0_seed_100'),
 PosixPath('training_scripts_and_models/models_with_inconsistency_levels/clean_roberta_base_noise_rate_0_seed_42'),
 PosixPath('training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.2_seed_42'),
 PosixPath('training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.2_seed_19'),
 PosixPath('training_scripts_and_models/models_with_inconsistency_levels/clean_roberta_base_noise_rate_0_seed_19'),
 PosixPath('training_scripts_and_models/models_with_inconsis

In [7]:
all_models = {}

for mdl in noisy_models:
    split_mdl_id = mdl.name.split('_')
    seed_number = split_mdl_id[-1]
    noise_level = split_mdl_id[-3]
    all_models[noise_level + '_' + seed_number] = mdl

In [8]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

all_models_prediction = {}
for mdl_id in list(all_models.keys()):
    model_path = all_models[mdl_id]
    model = TFAutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    evaluation_set_encoded = evaluation_set.map(preprocess_function, batched = True)
    evaluation_set_predicted = evaluation_set_encoded.map(forward_pass_with_label, batched = True, batch_size = 8)
    all_models_prediction[mdl_id] = evaluation_set_predicted

Metal device set to: Apple M4 Pro


2025-05-26 14:27:09.392683: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-26 14:27:09.392804: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.1_seed_100.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://742a3e2d-0c7b-4dd4-abbd-6a6fc4e56d5c/assets


INFO:tensorflow:Assets written to: ram://742a3e2d-0c7b-4dd4-abbd-6a6fc4e56d5c/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.1_seed_19.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://3178cdaf-a041-4a09-a0de-3bf7e0eb4a63/assets


INFO:tensorflow:Assets written to: ram://3178cdaf-a041-4a09-a0de-3bf7e0eb4a63/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.1_seed_42.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://fe182109-8791-41cc-835a-f909638a904a/assets


INFO:tensorflow:Assets written to: ram://fe182109-8791-41cc-835a-f909638a904a/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/clean_roberta_base_noise_rate_0_seed_100.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://d556755a-97c0-44d1-957c-5a10b90937a6/assets


INFO:tensorflow:Assets written to: ram://d556755a-97c0-44d1-957c-5a10b90937a6/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/clean_roberta_base_noise_rate_0_seed_42.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://962c4206-dfb2-4be1-9635-f7ad3ea1e851/assets


INFO:tensorflow:Assets written to: ram://962c4206-dfb2-4be1-9635-f7ad3ea1e851/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.2_seed_42.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://bf84d5ac-009d-4d2d-9bbf-0d3be4b30ef3/assets


INFO:tensorflow:Assets written to: ram://bf84d5ac-009d-4d2d-9bbf-0d3be4b30ef3/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.2_seed_19.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://02c3cbdc-87e7-4296-9c65-0121dae462df/assets


INFO:tensorflow:Assets written to: ram://02c3cbdc-87e7-4296-9c65-0121dae462df/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/clean_roberta_base_noise_rate_0_seed_19.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://1082938a-6297-42bb-9fcb-3ff1c9fbbadb/assets


INFO:tensorflow:Assets written to: ram://1082938a-6297-42bb-9fcb-3ff1c9fbbadb/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.3_seed_19.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://c2bf87ef-0df3-4937-8c47-005a5e36276a/assets


INFO:tensorflow:Assets written to: ram://c2bf87ef-0df3-4937-8c47-005a5e36276a/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.4_seed_15.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://817967da-8210-4e8a-8365-7bbea29402f1/assets


INFO:tensorflow:Assets written to: ram://817967da-8210-4e8a-8365-7bbea29402f1/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.3_seed_42.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://2a60860c-7127-4831-93ab-bf45f0ed07d5/assets


INFO:tensorflow:Assets written to: ram://2a60860c-7127-4831-93ab-bf45f0ed07d5/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.4_seed_13.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://6d11963c-efe5-4c77-8ea1-83e2e0dc4cd1/assets


INFO:tensorflow:Assets written to: ram://6d11963c-efe5-4c77-8ea1-83e2e0dc4cd1/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.1_seed_13.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://5263e0e4-9d61-4b7c-91e6-74c441cc188c/assets


INFO:tensorflow:Assets written to: ram://5263e0e4-9d61-4b7c-91e6-74c441cc188c/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.1_seed_15.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://48b4ed57-3496-42cc-be7c-3c594cdeb1f8/assets


INFO:tensorflow:Assets written to: ram://48b4ed57-3496-42cc-be7c-3c594cdeb1f8/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.2_seed_100.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://309b2b24-546a-43c0-81c0-f4c9d7219123/assets


INFO:tensorflow:Assets written to: ram://309b2b24-546a-43c0-81c0-f4c9d7219123/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.3_seed_100.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://e6fa7804-97df-471f-b996-ecccd7435553/assets


INFO:tensorflow:Assets written to: ram://e6fa7804-97df-471f-b996-ecccd7435553/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.2_seed_15.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://1664326e-2f7a-4bec-a470-a690cc17a85a/assets


INFO:tensorflow:Assets written to: ram://1664326e-2f7a-4bec-a470-a690cc17a85a/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/clean_roberta_base_noise_rate_0_seed_15.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://bf7126bb-46a7-46f1-a679-00468b710d7d/assets


INFO:tensorflow:Assets written to: ram://bf7126bb-46a7-46f1-a679-00468b710d7d/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/clean_roberta_base_noise_rate_0_seed_13.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://26e1d6f4-981b-4a90-90f0-6d91440bed56/assets


INFO:tensorflow:Assets written to: ram://26e1d6f4-981b-4a90-90f0-6d91440bed56/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.2_seed_13.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://61f787d2-c107-49f1-856b-f7c79f6167b1/assets


INFO:tensorflow:Assets written to: ram://61f787d2-c107-49f1-856b-f7c79f6167b1/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.4_seed_100.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://0a1deaac-88a9-468d-8a05-bb85f4c60f16/assets


INFO:tensorflow:Assets written to: ram://0a1deaac-88a9-468d-8a05-bb85f4c60f16/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.3_seed_13.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://3a4db675-d573-4a6d-8e49-c87115e9a998/assets


INFO:tensorflow:Assets written to: ram://3a4db675-d573-4a6d-8e49-c87115e9a998/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.4_seed_42.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://1c9e1717-f552-4618-aa75-28709751e080/assets


INFO:tensorflow:Assets written to: ram://1c9e1717-f552-4618-aa75-28709751e080/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.3_seed_15.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://5dc7b108-88ce-457f-ba59-2539cbfde72a/assets


INFO:tensorflow:Assets written to: ram://5dc7b108-88ce-457f-ba59-2539cbfde72a/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at training_scripts_and_models/models_with_inconsistency_levels/noisy_roberta_base_noise_rate_0.4_seed_19.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://7984d756-fec4-44aa-8da9-d60fbf40c733/assets


INFO:tensorflow:Assets written to: ram://7984d756-fec4-44aa-8da9-d60fbf40c733/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

In [9]:
from collections import defaultdict

grouped = {}
for key, value in all_models_prediction.items():
    group_id = key.split('_')[0]
    if group_id not in grouped:
        grouped[group_id] = [] 
    grouped[group_id].append(value)

In [10]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate(y_true, y_pred, average = 'macro'):
    f1 = f1_score(y_true, y_pred, average = average)
    return f1

In [11]:
evaluation_results = {}

for key, value in grouped.items():
    evaluation_results[key] = [evaluate(mapper.map(y['signal_keyword']), mapper.map(y['y_preds'])) for y in value]

In [12]:
# Spliting F1 scores

f1_clean = evaluation_results['0']
f1_10 = evaluation_results['0.1']
f1_20 = evaluation_results['0.2']
f1_30 = evaluation_results['0.3']
f1_40 = evaluation_results['0.4']

noise_levels = ['0% Noise', '10% Noise', '20% Noise', '30% Noise', '40% Noise']
f1_scores = [f1_clean, f1_10, f1_20, f1_30, f1_40]

# Descriptive Statistics 

In [13]:
# Calculating basic statistics

def calculate_statistics(sample):
    n = len(sample)
    mean = np.mean(sample)
    std_dev = np.std(sample, ddof = 1)

    print(f'n {n}')
    print(f'Mean {mean:.3f}')
    print(f'Standard Deviation {std_dev:.3f}')
    
for s_name, f1 in zip(noise_levels, f1_scores):
    print(f'{s_name}:')
    calculate_statistics(f1)
    print('*' * 25)

0% Noise:
n 5
Mean 0.944
Standard Deviation 0.008
*************************
10% Noise:
n 5
Mean 0.938
Standard Deviation 0.008
*************************
20% Noise:
n 5
Mean 0.921
Standard Deviation 0.008
*************************
30% Noise:
n 5
Mean 0.887
Standard Deviation 0.017
*************************
40% Noise:
n 5
Mean 0.867
Standard Deviation 0.026
*************************


# Inferential Statistics

In [14]:
# Performing ANOVA

from scipy.stats import f_oneway
import matplotlib.pyplot as plt
import seaborn as sns

F_statistic, p_value = f_oneway(f1_clean, f1_10, f1_20, f1_30, f1_40)

print("ANOVA Results for RoBERTa F1-scores across noise levels:")

print(f"F-statistic: {F_statistic}")
print(f"p-value: {p_value:.5f}", )

ANOVA Results for RoBERTa F1-scores across noise levels:
F-statistic: 24.178287068132317
p-value: 0.00000


In [15]:
# Calculating effect size

def compute_partial_eta_squared(F, df_effect, df_error):
    eta_p_squared = (df_effect * F) / (df_effect * F + df_error)
    return eta_p_squared


df_effect = 4  # 5 groups: 5 - 1 = 4
df_error = 20  # 25 total samples: 25 - 5 = 20

eta_p2 = compute_partial_eta_squared(F_statistic, df_effect, df_error)
print("Partial eta squared (ηₚ²):", eta_p2)

Partial eta squared (ηₚ²): 0.8286397008732957


In [16]:
# Computing CI using bootstrap

def bootstrap_eta_p2(groups, n_bootstraps = 1000, random_state = None):
    if random_state is not None:
        np.random.seed(random_state)
    
    k = len(groups)
    n_per_group = [len(g) for g in groups]
    N = sum(n_per_group)
    df_effect = k - 1
    df_error = N - k
    
    boot_eta_p2 = []
    
    for i in range(n_bootstraps):
        boot_groups = []
        for group in groups:
            boot_sample = np.random.choice(group, size=len(group), replace = True)
            boot_groups.append(boot_sample)    
        F_stat, _ = f_oneway(*boot_groups)
        eta_p2 = compute_partial_eta_squared(F_stat, df_effect, df_error)
        boot_eta_p2.append(eta_p2)
    
    boot_eta_p2 = np.array(boot_eta_p2)
    ci_lower, ci_upper = np.percentile(boot_eta_p2, [2.5, 97.5])
    mean_eta_p2 = np.mean(boot_eta_p2)

    return {'ci_lower': ci_lower, 'ci_upper': ci_upper}

In [17]:
CI = bootstrap_eta_p2([f1_clean, f1_10, f1_20, f1_30, f1_40], random_state = 42)
print(f"95% Confidence Interval: [{CI['ci_lower']:.3f}, {CI['ci_upper']:.3f}]")

95% Confidence Interval: [0.774, 0.944]


In [18]:
# Performing Tukey's HSD post-hoc test

from statsmodels.stats.multicomp import pairwise_tukeyhsd

data = f1_clean + f1_10 + f1_20 + f1_30 + f1_40

groups = (["0% Noise"] * len(f1_clean) +
          ["10% Noise"] * len(f1_10) +
          ["20% Noise"] * len(f1_20) +
          ["30% Noise"] * len(f1_30) +
          ["40% Noise"] * len(f1_40))


df = pd.DataFrame({"F1_Score": data, "Noise_Level": groups})

tukey_results = pairwise_tukeyhsd(endog = df["F1_Score"],
                                  groups = df["Noise_Level"],
                                  alpha = 0.05)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
tukey_results.summary()

group1,group2,meandiff,p-adj,lower,upper,reject
0% Noise,10% Noise,-0.006,0.9697,-0.0348,0.0228,False
0% Noise,20% Noise,-0.0231,0.1573,-0.0519,0.0057,False
0% Noise,30% Noise,-0.0571,0.0001,-0.0859,-0.0283,True
0% Noise,40% Noise,-0.0776,0.0,-0.1064,-0.0488,True
10% Noise,20% Noise,-0.0171,0.4153,-0.0459,0.0117,False
10% Noise,30% Noise,-0.0511,0.0003,-0.0799,-0.0223,True
10% Noise,40% Noise,-0.0716,0.0,-0.1004,-0.0428,True
20% Noise,30% Noise,-0.034,0.0159,-0.0628,-0.0052,True
20% Noise,40% Noise,-0.0545,0.0001,-0.0833,-0.0257,True
30% Noise,40% Noise,-0.0205,0.2475,-0.0493,0.0083,False
