In [1]:
# common imports

import sys
sys.path.append("../datasets/ARID_supporting_scripts")

import mapper
import datasets
import numpy as np
import pandas as pd
import license_attribution

import os
import csv
import json


import re
from tqdm import tqdm

np.random.seed(42)

# Custom Util SRS Preprocessor

In [2]:
import re
from html import unescape
from sklearn.base import BaseEstimator, TransformerMixin

class SRSTextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that cleans one raw SRS text and splits it into sentences.

    The three processing stages run in a fixed order inside ``transform`` and
    can each be switched off through the constructor flags:

    1. HTML stripping (``strip_html``)
    2. removal of pipe-table markup (``strip_parsed_tabels``)
    3. sentence extraction (``parse_to_sent``)

    Parameters
    ----------
    strip_html : bool, default True
        Remove HTML tags and unescape HTML entities.
    parse_to_sent : bool, default True
        Return a list of extracted sentences instead of the cleaned text.
    strip_parsed_tabels : bool, default True
        Remove pipe-table separator rows and cell delimiters.
        (The spelling of the parameter name is kept for backward compatibility.)
    filter_by_length : int or None, default None
        When truthy, keep only sentences that have more than this many
        space-separated tokens.
    """

    def __init__(self, strip_html = True, parse_to_sent = True, strip_parsed_tabels = True, filter_by_length = None):
        self.strip_html = strip_html
        self.parse_to_sent = parse_to_sent
        self.strip_parsed_tabels = strip_parsed_tabels
        self.filter_by_length = filter_by_length

    def html_to_plain_text(self, text):
        """Return ``text`` (coerced with ``str``) without HTML markup.

        Drops the whole ``<head>`` element, then every remaining tag, collapses
        runs of whitespace-only line endings into a single newline and finally
        unescapes HTML entities.
        """
        # Raw strings throughout: '\s' inside a normal string literal is an
        # invalid escape sequence and triggers a warning on recent Pythons.
        text = re.sub(r'<head.*?>.*?</head>', '', str(text), flags=re.M | re.S | re.I)
        text = re.sub(r'<a\s.*?>', '', text, flags=re.M | re.S | re.I)
        text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
        text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
        return unescape(text)

    def remove_table_structures(self, text):
        """Return ``text`` with pipe-table markup flattened into plain text."""
        # Lines made only of '-', '|', ':' and spaces (table separator rows) are emptied.
        text = re.sub(r'^\|?[-|: ]+\|?$', '', text, flags=re.MULTILINE)
        # Leading / trailing cell delimiters of each line are dropped ...
        text = re.sub(r'^\|', '', text, flags=re.MULTILINE)
        text = re.sub(r'\|$', '', text, flags=re.MULTILINE)
        # ... inner delimiters become spaces, and runs of spaces are squeezed to one.
        text = re.sub(r'\|', '    ', text)
        text = re.sub(r' {2,}', ' ', text)
        return text

    def parse_to_sent_level(self, text):
        """Extract sentences from ``text`` and return them as a list of strings.

        A sentence starts at an uppercase ASCII letter on a word boundary and
        ends at the first following '.', '!' or '?' on the same line. Up to two
        leading colon-terminated prefixes are then stripped from each match.
        When ``filter_by_length`` is truthy, only sentences with more than that
        many space-separated tokens are kept.
        """
        candidates = re.findall(r'\b[A-Z].*?[.!?]', text)
        sentences = [re.sub(r'^[^:]*: *(?:[^:]*: *)?', '', sent, flags = re.MULTILINE) for sent in candidates]
        if self.filter_by_length:
            sentences = [sent for sent in sentences if len(sent.split(' ')) > self.filter_by_length]
        return sentences

    def fit(self, X, y = None):
        """No-op: the transformer has nothing to learn. Returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Run the enabled processing stages on the single text ``X``.

        Returns a list of sentences when ``parse_to_sent`` is enabled, otherwise
        the (possibly cleaned) text.
        """
        text = X
        if self.strip_html:
            text = self.html_to_plain_text(text)
        # Test the configuration flag, not the bound method of the same purpose:
        # a bound method is always truthy, which made the flag impossible to turn off.
        if self.strip_parsed_tabels:
            text = self.remove_table_structures(text)
        if self.parse_to_sent:
            text = self.parse_to_sent_level(text)
        return text

# Loading SWARD

In [3]:
# Load the UOCSSR spreadsheet and prepare it in one chain:
#   * REQID is assigned BEFORE de-duplication, so ids refer to rows of the raw sheet
#   * rows with a duplicated English requirement text are dropped (first one kept)
#   * the unused index / NoRBERT columns are removed and the index is renumbered
uocssr_df = (
    pd.read_excel('../datasets/ARID_supporting_scripts/6_requirement_identification/UOCSSR_Dataset_recent.xlsx')
    .assign(REQID = lambda frame: np.arange(len(frame)))
    .drop_duplicates(subset = ['Requirement (EN)'])
    .drop(columns = ['Unnamed: 0', 'Sub Category (NoRBERT)', 'Category (NoRBERT)'])
    .reset_index(drop = True)
)

# Splitting to sentence level

In [4]:
# Split every requirement text into sentences, keeping only sentences with
# more than 8 space-separated tokens.
srs_text_preprocessor = SRSTextPreprocessor(filter_by_length = 8)
uocssr_df['Requirement Sentences'] = uocssr_df['Requirement (EN)'].map(srs_text_preprocessor.fit_transform)

# One row per sentence. `explode` turns an empty sentence list into NaN, so
# dropping NaN removes exactly the requirements that yielded no sentence
# (after exploding, a cell can never be a list, so no `!= []` check is needed).
uocssr_df = uocssr_df.explode('Requirement Sentences')
uocssr_df = uocssr_df.dropna(subset = ['Requirement Sentences'])

# The index is intentionally NOT reset: it is carried into the Hugging Face
# dataset as '__index_level_0__', which a later cell removes by name.
uocssr_df['REQID_expanded'] = range(len(uocssr_df))

# Creating SwaRD

In [5]:
# Convert the frame to a Hugging Face dataset. Every column is cast to `str`
# first, so all features end up as strings (missing values presumably become
# the literal text 'nan' - confirm if downstream code relies on nulls).
uocssr_hf = datasets.Dataset.from_pandas(
    df = uocssr_df.astype(str),
)

# Automatic Requirements Identification (ARI) with ReqSeek

In [6]:
import numpy as np
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer


# Let MirroredStrategy discover the available devices instead of hard-coding
# "GPU:0"/"GPU:1": with no `devices` argument it mirrors across every visible
# GPU and falls back to the CPU when there is none, so the notebook also runs
# on single-GPU machines (such as the Apple-silicon host of the recorded output).
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Local checkpoint directory holding both the ReqSeek classifier and its tokenizer.
    model_name = "../ReqSeek/"
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

Metal device set to: Apple M4 Pro
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


2025-05-28 19:30:11.645091: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-28 19:30:11.645218: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at ../ReqSeek/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [7]:
def preprocess_function(dataset):
    """Tokenize the 'Requirement Sentences' entries of a batch.

    `dataset` is a mapping holding the sentences under 'Requirement Sentences';
    the result is whatever the module-level `tokenizer` returns when called
    with truncation enabled.
    """
    sentences = dataset['Requirement Sentences']
    return tokenizer(sentences, truncation = True)

# Tokenize the whole dataset, 256 sentences per call.
uocssr_hf = uocssr_hf.map(
    function = preprocess_function,
    batched = True,
    batch_size = 256,
)

Map:   0%|          | 0/60137 [00:00<?, ? examples/s]

In [8]:
def forward_pass(batch):
    """Classify one tokenized batch with the module-level `model`.

    Parameters
    ----------
    batch : mapping
        Must provide 'input_ids' and 'attention_mask', each a list of
        (variable-length) integer sequences.

    Returns
    -------
    dict
        {'isReqSysAuxContAux_with_keyword': [label, ...]} with one label name
        (looked up in `model.config.id2label`) per input sequence.
    """
    # Right-pad both inputs to the longest sequence of the batch. Padding uses
    # the Keras default value, so padded attention-mask positions are 0.
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(batch['input_ids'], padding = "post")
    attention_masks = tf.keras.preprocessing.sequence.pad_sequences(batch['attention_mask'], padding = "post")

    # Pure inference: no GradientTape (nothing is differentiated, and recording
    # the forward pass only costs memory); the mask is passed by keyword so it
    # cannot be bound to the wrong model argument.
    output = model(input_ids, attention_mask = attention_masks, training = False)
    predicted_labels = tf.argmax(output.logits, axis = -1).numpy()

    return {"isReqSysAuxContAux_with_keyword": [model.config.id2label[int(lbl)] for lbl in predicted_labels]}

In [9]:
# Run the classifier over the whole dataset, 8 sentences per forward pass.
with strategy.scope():
    uocssr_hf = uocssr_hf.map(
        function = forward_pass,
        batched = True,
        batch_size = 8,
    )



INFO:tensorflow:Assets written to: ram://acf50887-dcb3-4384-8139-aca73513c2e3/assets


INFO:tensorflow:Assets written to: ram://acf50887-dcb3-4384-8139-aca73513c2e3/assets


Map:   0%|          | 0/60137 [00:00<?, ? examples/s]

In [10]:
# Derive two additional label columns from the raw model predictions using the
# project's `mapper` helpers (presumably a keyword-free label set and a binary
# requirement/auxiliary label - confirm against mapper.py).
isReqSrsTxt = mapper.map(uocssr_hf['isReqSysAuxContAux_with_keyword'])
isRequirement = mapper.twoClassMapper(uocssr_hf['isReqSysAuxContAux_with_keyword'])

uocssr_hf = (
    uocssr_hf
    .add_column(name = 'isReqSysAuxContAux', column = isReqSrsTxt)
    .add_column(name = 'isReqAux', column = isRequirement)
)

In [11]:
# Give the spreadsheet columns the short names used in the published datasets.
uocssr_hf = uocssr_hf.rename_columns(
    column_mapping = {
        'Sources Type': 'Type',
        'Sub Category (Source)': 'NF Subclasses',
        'Category (Source)': 'isF/NF',
        'Open/ Closed Source': 'isOpen/Closed',
    }
)

In [12]:
# Drop the pandas index column, the tokenizer outputs and the spreadsheet
# columns that are not part of the released datasets.
uocssr_hf = uocssr_hf.remove_columns([
    '__index_level_0__',
    'input_ids',
    'attention_mask',
    'Date',
    'Comment',
    'Original Language Code',
    'Requirement (DE)',
    'Requirement (Other Language)',
    'Category (Manual)',
    'Sub Category (Manual)',
])
uocssr_hf

Dataset({
    features: ['Project Name', 'Subproject Name', 'Type', 'Requirement (EN)', 'isF/NF', 'NF Subclasses', 'isOpen/Closed', 'REQID', 'Requirement Sentences', 'REQID_expanded', 'isReqSysAuxContAux_with_keyword', 'isReqSysAuxContAux', 'isReqAux'],
    num_rows: 60137
})

In [13]:
# Put the remaining columns into their final order:
# identifiers, provenance, texts, source labels, then ReqSeek labels.
uocssr_hf = uocssr_hf.select_columns(
    column_names = [
        'REQID',
        'REQID_expanded',
        'Project Name',
        'Subproject Name',
        'Type',
        'isOpen/Closed',
        'Requirement (EN)',
        'Requirement Sentences',
        'isF/NF',
        'NF Subclasses',
        'isReqSysAuxContAux_with_keyword',
        'isReqSysAuxContAux',
        'isReqAux',
    ]
)
uocssr_hf

Dataset({
    features: ['REQID', 'REQID_expanded', 'Project Name', 'Subproject Name', 'Type', 'isOpen/Closed', 'Requirement (EN)', 'Requirement Sentences', 'isF/NF', 'NF Subclasses', 'isReqSysAuxContAux_with_keyword', 'isReqSysAuxContAux', 'isReqAux'],
    num_rows: 60137
})

In [14]:
# Removing publicly available datasets with no license

excluded_projects = [
    'RCM Extractor',
    'VHCURES',
    'skysigal',
    'Promise Modis',
    'Generic Business Register',
    'NIST',
]

def filter_out_projects(dataset, excluded_values, column_name = 'Project Name'):
    """Return `dataset` without the examples whose `column_name` value is excluded.

    `dataset` must offer a `filter(predicate)` method; an example is kept when
    `example[column_name]` is not contained in `excluded_values`.
    """
    def is_kept(example):
        return example[column_name] not in excluded_values

    return dataset.filter(is_kept)


# Remove every example that belongs to one of the excluded projects.
uocssr_hf = filter_out_projects(
    dataset = uocssr_hf,
    excluded_values = excluded_projects,
    column_name = 'Project Name',
)

Filter:   0%|          | 0/60137 [00:00<?, ? examples/s]

In [15]:
# SwaRD: only the examples flagged 'O' in 'isOpen/Closed', without the
# full requirement text and the two provenance flags.
sward = (
    uocssr_hf
    .filter(lambda row: row['isOpen/Closed'] == 'O')
    .remove_columns(['Requirement (EN)', 'Type', 'isOpen/Closed'])
)
sward

Filter:   0%|          | 0/59531 [00:00<?, ? examples/s]

Dataset({
    features: ['REQID', 'REQID_expanded', 'Project Name', 'Subproject Name', 'Requirement Sentences', 'isF/NF', 'NF Subclasses', 'isReqSysAuxContAux_with_keyword', 'isReqSysAuxContAux', 'isReqAux'],
    num_rows: 40522
})

In [16]:
# Metadata that is attached to the SwaRD dataset when it is rebuilt in the next cell.
sward_info = datasets.DatasetInfo(
    description = """The SwaRD dataset is an identified collection of requirements identified from multiple publicly available sources using ReqSeek, each governed by its own license terms. While all source datasets are available for research and academic purposes, they are not uniformly licensed, and therefore SwaRD is not distributed under a unified license (see the license description).""",
    # Placeholder citation (kept anonymous).
    citation = "BLINDED",
    # License text is produced by the project's license_attribution helper.
    license = license_attribution.get_SwaRD_license_text(),
    version = "1.0.0",
    # Reuse the feature schema of the filtered dataset.
    features = sward.features
)

In [17]:
# Rebuild the dataset from its own contents so that it carries `sward_info`.
sward = datasets.Dataset.from_dict(
    mapping = sward.to_dict(),
    info = sward_info,
    features = sward.features,
)

In [18]:
# sward.save_to_disk('../datasets/SwaRD')

In [19]:
# UOCSSR: examples of type 'RE' that are flagged 'O' in 'isOpen/Closed'.
# The sentence-level text and the ReqSeek labels are dropped, and the full
# requirement text takes over the 'Requirement Sentences' column name.
uocssr = (
    uocssr_hf
    .filter(lambda row: row['Type'] == 'RE')
    .filter(lambda row: row['isOpen/Closed'] == 'O')
    .remove_columns([
        'Requirement Sentences',
        'Type',
        'isOpen/Closed',
        'isReqSysAuxContAux_with_keyword',
        'isReqSysAuxContAux',
        'isReqAux',
    ])
    .rename_columns({'Requirement (EN)': 'Requirement Sentences'})
)

Filter:   0%|          | 0/59531 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25644 [00:00<?, ? examples/s]

In [20]:
# Metadata that is attached to the UOCSSR dataset when it is rebuilt in the next cell.
# NOTE(review): the description spells the anonymisation placeholder "BLINEDED"
# while the citation uses "BLINDED" - confirm the intended text before publishing.
uocssr_info = datasets.DatasetInfo(
    description = """The BLINEDED dataset is an identified collection of requirements identified from multiple publicly available sources using heuristics, each governed by its own license terms. While all source datasets are available for research and academic purposes, they are not uniformly licensed, and therefore BLINEDED dataset is not distributed under a unified license (see the license description).""",
    # Placeholder citation (kept anonymous).
    citation = "BLINDED",
    # License text is produced by the project's license_attribution helper.
    license = license_attribution.get_BLINDED_DATASET_license_text(),
    version="1.0.0",
    # Reuse the feature schema of the filtered dataset.
    features = uocssr.features,
)

In [21]:
# Rebuild the dataset from its own contents so that it carries `uocssr_info`.
uocssr = datasets.Dataset.from_dict(
    mapping = uocssr.to_dict(),
    info = uocssr_info,
    features = uocssr.features,
)

In [22]:
# Display the feature list and row count of the final UOCSSR dataset.
uocssr

Dataset({
    features: ['REQID', 'REQID_expanded', 'Project Name', 'Subproject Name', 'Requirement Sentences', 'isF/NF', 'NF Subclasses'],
    num_rows: 12080
})

In [23]:
# uocssr.save_to_disk('../datasets/UOCSSR')