In [1]:
# common imports

import sys
sys.path.append("../datasets/ARID_supporting_scripts")

import os
import re
import random
import mapper
import datasets
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [2]:
evaluation_set = datasets.load_from_disk('../datasets/ARID_supporting_scripts/5_1_training_set')['test']

In [3]:
lbl_ = evaluation_set.features['label'].names
label2id = {lbl: idx for idx, lbl in enumerate(lbl_)}
id2label = {val: key for key, val in label2id.items()}

In [4]:
def preprocess_function(dataset):
    return tokenizer(dataset['Requirement Sentences'], padding = 'max_length', max_length = 256, truncation = True, return_tensors = 'tf')

In [5]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

def forward_pass_with_label(batch):
    inputs = {'input_ids': tf.convert_to_tensor(batch['input_ids']),
             'attention_mask': tf.convert_to_tensor(batch['attention_mask'])}
    true_labels = tf.convert_to_tensor(batch['label'])
    
    with tf.GradientTape() as tape:
        output = model(**inputs)
        predicted_labels = tf.argmax(output.logits, axis = -1).numpy()
        probas = tf.nn.softmax(output.logits, axis = -1).numpy()
        loss = tf.keras.losses.sparse_categorical_crossentropy(true_labels, output.logits)

    loss = loss.numpy()

    return {"loss": loss, 
            "y_preds": [id2label[lbl] for lbl in predicted_labels],
            "y_probas": [probas[i][predicted_labels[i]] for i in range(len(predicted_labels))]}

In [6]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

reqseek_path = '../ReqSeek/'

tokenizer = AutoTokenizer.from_pretrained(reqseek_path)
model = TFAutoModelForSequenceClassification.from_pretrained(reqseek_path)

Metal device set to: Apple M4 Pro


2025-05-26 14:56:00.516813: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-26 14:56:00.516942: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at ../ReqSeek/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn

nltk.download('wordnet')
nltk.data.path.append("/Users/mohammadkasrahabib/nltk_data")

random.seed(42)

def remove_modal_verbs(text):
    return re.sub(r'\b(shall|should|may|will|must)\b', '', text, flags = re.IGNORECASE)

def add_minor_details(text):
    nouns = [syn.lemmas()[0].name().replace('_', ' ') for syn in wn.all_synsets('n')]
    adjs = [syn.lemmas()[0].name().replace('_', ' ') for syn in wn.all_synsets('a')]
    verbs = [syn.lemmas()[0].name().replace('_', ' ') for syn in wn.all_synsets('v')]
    patterns = [
        lambda: f"{random.choice(verbs)} {random.choice(adjs)} {random.choice(nouns)}",
        lambda: f"in order to {random.choice(verbs)} {random.choice(nouns)}",
        lambda: f"which {random.choice(verbs)} {random.choice(adjs)} {random.choice(nouns)}",
        lambda: f"with {random.choice(adjs)} {random.choice(nouns)}",
        lambda: f"for {random.choice(adjs)} {random.choice(nouns)}"]
    phrase = random.choice(patterns)()
    ends_with_period = text.strip().endswith('.')
    if ends_with_period:
        base_text = text.strip()[:-1]
        return f"{base_text} {phrase}."
    else:
        return f"{text} {phrase}."

def reorder_words(text, num_swaps = 2):
    protected_words = {'shall', 'should', 'will', 'may', 'must'}
    if '.' in text:
        parts = text.split('.', 1)
        main_text = parts[0]
        suffix = '.' + parts[1] if parts[1] else '.'
    else:
        main_text = text
        suffix = ''
    words = main_text.split(' ')
    movable_indices = [
        i for i, word in enumerate(words) 
        if word.lower() not in protected_words
    ]
    num_swaps = min(num_swaps, max(0, len(movable_indices) - 1))
    for _ in range(num_swaps):
        i, j = random.sample(movable_indices, 2)
        words[i], words[j] = words[j], words[i]
    result = ' '.join(words) + suffix
    return result

def remove_minor_details(text, num_to_pop = 2):
    protected_words = {'shall', 'should', 'will', 'may', 'must'}
    words = text.split(' ')
    removable_indices = [i for i, word in enumerate(words) if word.lower() not in protected_words]
    num_to_pop = min(num_to_pop, len(removable_indices))
    if num_to_pop > 0:
        random_indices = sorted(random.sample(removable_indices, num_to_pop), reverse=True)
        for i in random_indices:
            words.pop(i)
    return ' '.join(words)

def apply_transformations_without_sk(text):
    text = add_minor_details(text)
    text = reorder_words(text)
    text = remove_minor_details(text)
    return text

def apply_transformations(text):
    text = remove_modal_verbs(text)
    text = add_minor_details(text)
    text = reorder_words(text)
    text = remove_minor_details(text)
    return text

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mohammadkasrahabib/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
target_column = 'Requirement Sentences'

t0 = evaluation_set
t1 = evaluation_set.map(lambda text: {target_column: remove_modal_verbs(text[target_column])})
t2 = evaluation_set.map(lambda text: {target_column: add_minor_details(text[target_column])})
t3 = evaluation_set.map(lambda text: {target_column: reorder_words(text[target_column])})
t4 = evaluation_set.map(lambda text: {target_column: remove_minor_details(text[target_column])})
t5_without_sk = evaluation_set.map(lambda text: {target_column: apply_transformations_without_sk(text[target_column])})
t5 = evaluation_set.map(lambda text: {target_column: apply_transformations(text[target_column])})



Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

In [9]:
ts = {'T0 -> No Transformation': t0,
      'T1 -> Model Verb Removed': t1, 
      'T2 -> Add Minor Details': t2, 
      'T3 -> Re-order Words': t3, 
      'T4 -> Remove Minor Details': t4,
      'T5 -> All Transformations Without SK': t5_without_sk,
      'T5 -> All Transformations': t5
     }

In [10]:
ts_prediction = {}

for k, v in ts.items():
    evaluation_set_encoded = v.map(preprocess_function, batched = True)
    evaluation_set_predicted = evaluation_set_encoded.map(forward_pass_with_label, batched = True, batch_size = 8)        
    ts_prediction[k] = evaluation_set_predicted



INFO:tensorflow:Assets written to: ram://64adb73e-8634-440f-8232-d9ec34ded19c/assets


INFO:tensorflow:Assets written to: ram://64adb73e-8634-440f-8232-d9ec34ded19c/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://a404a761-9b6f-4857-8a43-166f01af5aa6/assets


INFO:tensorflow:Assets written to: ram://a404a761-9b6f-4857-8a43-166f01af5aa6/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://e9ca20cd-8d5e-4173-8824-0a42ec6d4efb/assets


INFO:tensorflow:Assets written to: ram://e9ca20cd-8d5e-4173-8824-0a42ec6d4efb/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://19cf161a-afb7-4415-a3b3-66f9a9b8a3cd/assets


INFO:tensorflow:Assets written to: ram://19cf161a-afb7-4415-a3b3-66f9a9b8a3cd/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://cea694d1-a62a-4f6a-b7cb-d45b86898b27/assets


INFO:tensorflow:Assets written to: ram://cea694d1-a62a-4f6a-b7cb-d45b86898b27/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://1f76a1e9-22f9-4575-a6f2-8382d671b799/assets


INFO:tensorflow:Assets written to: ram://1f76a1e9-22f9-4575-a6f2-8382d671b799/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]



INFO:tensorflow:Assets written to: ram://84b5ccb8-58be-4b15-baae-3db0797ec509/assets


INFO:tensorflow:Assets written to: ram://84b5ccb8-58be-4b15-baae-3db0797ec509/assets


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

## Classification Performance

In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate(y_true, y_pred, average = 'macro'):
    print(f"Precision: {precision_score(y_true, y_pred, average = average)}")
    print(f"Recall: {recall_score(y_true, y_pred, average = average)}")
    print(f"F1-Score: {f1_score(y_true, y_pred, average = average)}")

In [12]:
def print_transformed_example(v, filter_id):
    v = v.filter(lambda x: x['REQID'] == filter_id)
    transformed_text = v[target_column]
    y_true = mapper.map(v['signal_keyword'])
    y_pred = mapper.map(v['y_preds'])
    y_proba = v['y_probas'][0]
    print(f'\t Transformed Text: {transformed_text}')
    print(f'\t Original Label: {y_true}')
    print(f'\t Predicted Label: {y_pred}')
    print(f'\t Prediction Probability: {y_proba}')  

In [13]:
for k, v in  ts_prediction.items():
    y_true = mapper.map(v['signal_keyword'])
    y_pred = mapper.map(v['y_preds'])
    print(k)
    evaluate(y_true, y_pred)
    print('\n')

T0 -> No Transformation
Precision: 0.9541119708891043
Recall: 0.9517460317460317
F1-Score: 0.9528933151427276


T1 -> Model Verb Removed
Precision: 0.7969257961682922
Recall: 0.6569047619047619
F1-Score: 0.5301279684734296


T2 -> Add Minor Details
Precision: 0.9414124860821428
Recall: 0.9450793650793651
F1-Score: 0.9430915837004745


T3 -> Re-order Words
Precision: 0.9259381338742395
Recall: 0.9253174603174603
F1-Score: 0.9251619770577624


T4 -> Remove Minor Details
Precision: 0.929897098586419
Recall: 0.9318253968253968
F1-Score: 0.9307804768331084


T5 -> All Transformations Without SK
Precision: 0.9081544667230291
Recall: 0.9070634920634921
F1-Score: 0.9055839649899057


T5 -> All Transformations
Precision: 0.7813630374895197
Recall: 0.6298412698412698
F1-Score: 0.5037054253010643




In [14]:
# Showing some example 
for k, v in  ts_prediction.items():
    print(k)
    print_transformed_example(v, '1700')
    print('\n')

T0 -> No Transformation


Filter:   0%|          | 0/480 [00:00<?, ? examples/s]

	 Transformed Text: ['The system shall provide the ability to print and electronically fax prescriptions.']
	 Original Label: ['requirement']
	 Predicted Label: ['requirement']
	 Prediction Probability: 0.9977893829345703


T1 -> Model Verb Removed


Filter:   0%|          | 0/480 [00:00<?, ? examples/s]

	 Transformed Text: ['The system  provide the ability to print and electronically fax prescriptions.']
	 Original Label: ['requirement']
	 Predicted Label: ['system_related_auxiliary']
	 Prediction Probability: 0.9973887801170349


T2 -> Add Minor Details


Filter:   0%|          | 0/480 [00:00<?, ? examples/s]

	 Transformed Text: ['The system shall provide the ability to print and electronically fax prescriptions appropriate even old-age pension.']
	 Original Label: ['requirement']
	 Predicted Label: ['requirement']
	 Prediction Probability: 0.9977896213531494


T3 -> Re-order Words


Filter:   0%|          | 0/480 [00:00<?, ? examples/s]

	 Transformed Text: ['The system shall provide and ability to prescriptions the electronically fax print.']
	 Original Label: ['requirement']
	 Predicted Label: ['requirement']
	 Prediction Probability: 0.9977931976318359


T4 -> Remove Minor Details


Filter:   0%|          | 0/480 [00:00<?, ? examples/s]

	 Transformed Text: ['The system shall provide the ability to and fax prescriptions.']
	 Original Label: ['requirement']
	 Predicted Label: ['requirement']
	 Prediction Probability: 0.9977967739105225


T5 -> All Transformations Without SK


Filter:   0%|          | 0/480 [00:00<?, ? examples/s]

	 Transformed Text: ['The system shall provide the in light and electronically fax prescriptions to to initiate print.']
	 Original Label: ['requirement']
	 Predicted Label: ['requirement']
	 Prediction Probability: 0.9977788329124451


T5 -> All Transformations


Filter:   0%|          | 0/480 [00:00<?, ? examples/s]

	 Transformed Text: ['The system  provide the ability to print electronically cast prescriptions become temptable fax']
	 Original Label: ['requirement']
	 Predicted Label: ['system_related_auxiliary']
	 Prediction Probability: 0.9976505637168884




In [15]:
class_names = np.unique(mapper.map(ts_prediction['T0 -> No Transformation']['y_preds']), return_counts = True)[0]
samples_per_class = np.unique(mapper.map(ts_prediction['T0 -> No Transformation']['y_preds']), return_counts = True)[1]

for name, ss in zip(class_names, samples_per_class):
    print(f"{name}: {ss}")

contextual_auxiliary: 119
requirement: 213
system_related_auxiliary: 148


## McNemar Bowker Test

In [16]:
def cohen_w(cn_table, stats):
    N = np.sum(cn_table) 
    w = np.sqrt(stats.statistic / N)
    return w

In [17]:
from scipy.stats import norm

def cohen_w_ci(w, cn_table, confidence_level = 0.95):
    n = np.sum(cn_table)
    se_w = w / np.sqrt(2 * n)
    z = norm.ppf(1 - (1 - confidence_level) / 2)
    ci_lower = w - z * se_w
    ci_upper = w + z * se_w
    return max(0, ci_lower), ci_upper

In [18]:
from sklearn.metrics import confusion_matrix


cn_tables = {}
y_t0 = mapper.map(ts_prediction['T0 -> No Transformation']['y_preds'])
transformation_names = list(ts_prediction.keys())[1:]

for name in transformation_names:
    y_tx = mapper.map(ts_prediction[name]['y_preds'])
    cm = confusion_matrix(y_t0, y_tx)
    cn_tables['T0 x ' + name] = cm

In [19]:
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.contingency_tables import SquareTable



dfs = []
names = []
p_values = []
statistics = []
effect_sizes = []
w_lower, w_upper = [], []


for name, cn_table in zip(cn_tables.keys(), cn_tables.values()):
    result = SquareTable(cn_table).symmetry()
    dfs.append(result.df)
    statistics.append(result.statistic)
    p_values.append(result.pvalue)
    names.append(name)


    w = cohen_w(cn_table, result)
    effect_sizes.append(w)
    
    low, up = cohen_w_ci(w, cn_table)
    w_lower.append(low)
    w_upper.append(up)
    
reject, adj_pvals, _, _ = multipletests(pvals = p_values, alpha = 0.05, method = 'holm')

In [20]:
for name, stats, df, p_raw, p_adj, efs, low, up, rej in zip(names, statistics, dfs, p_values, adj_pvals, effect_sizes, w_lower, w_upper, reject):
    print(f"{name}")
    print(f"Degree of freedom: {df}")
    print(f"Statistics: {stats}")
    print(f"Raw p-value: {p_raw:.10f}")
    print(f"Adjusted p-value: {p_adj:.10f}")
    print(f"Cohen's w: {efs:.3f}")
    print(f"Cohen's W 95% CI [{low:.3f}, {up:.3f}]")
    print(f"Significant (α=0.05): {'Yes' if rej else 'No'}")
    print('*' * 30, '\n')

T0 x T1 -> Model Verb Removed
Degree of freedom: 3.0
Statistics: 207.6714399363564
Raw p-value: 0.0000000000
Adjusted p-value: 0.0000000000
Cohen's w: 0.658
Cohen's W 95% CI [0.616, 0.699]
Significant (α=0.05): Yes
****************************** 

T0 x T2 -> Add Minor Details
Degree of freedom: 3.0
Statistics: 6.344444444444445
Raw p-value: 0.0960032906
Adjusted p-value: 0.2614358910
Cohen's w: 0.115
Cohen's W 95% CI [0.108, 0.122]
Significant (α=0.05): No
****************************** 

T0 x T3 -> Re-order Words
Degree of freedom: 3.0
Statistics: 6.564705882352941
Raw p-value: 0.0871452970
Adjusted p-value: 0.2614358910
Cohen's w: 0.117
Cohen's W 95% CI [0.110, 0.124]
Significant (α=0.05): No
****************************** 

T0 x T4 -> Remove Minor Details
Degree of freedom: 3.0
Statistics: 2.98974358974359
Raw p-value: 0.3932092177
Adjusted p-value: 0.3932092177
Cohen's w: 0.079
Cohen's W 95% CI [0.074, 0.084]
Significant (α=0.05): No
****************************** 

T0 x T5 -> All 