In [78]:
import openai
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os
import time
import json

# Set up your OpenAI API key securely using environment variables
# Ensure you set this environment variable before running the script


# Load the datasets
test_data = pd.read_csv('PURE_test.csv')
train_data = pd.read_csv('PURE_train.csv')


### Prompt Templates

In [54]:
# Define prompt templates
PROMPT_TEMPLATES = {
    "In-Context Learning Prompts": (
        "Given a statement, determine whether it is a 'requirement' or 'non-requirement'. "
        "A 'requirement' specifies a condition, specification, or feature that must be fulfilled. "
        "A 'non-requirement' is any statement that does not impose a condition or specification to be met.\n\n"
        "Example 1 (Requirement):\n"
        "\"NPAC SMS shall accept the following data from the NPAC personnel or old Service Provider "
        "upon Subscription Version creation for an Inter-Service Provider port: Local Number Portability Type.\"\n"
        "Classification: Requirement\n\n"
        "Example 2 (Not a Requirement):\n"
        "\"The approach to operation and maintenance is based on remote supervision and monitoring of the wind turbine(s) "
        "but the amount of operational data needed and the need for update of data is very moderate.\"\n"
        "Classification: Non-Requirement"
    ),
    "Few-Shot Exemplar-based Prompt": (
        "Below are a few statements. Your task is to classify each as either a 'Requirement' or 'Non-Requirement'. "
        "A 'Requirement' is a statement that specifies a necessary condition or feature that must be met. "
        "A 'Non-Requirement' does not specify any necessary condition.\n\n"
        "Example 1 (Requirement):\n"
        "\"NPAC SMS shall accept the following data from the NPAC personnel or old Service Provider upon Subscription Version creation for an Inter-Service Provider port.\"\n"
        "Classification: Requirement\n\n"
        "Example 2 (Requirement):\n"
        "\"Visual Input: The conferencing feature requires a high resolution video input device.\"\n"
        "Classification: Requirement\n\n"
        "Example 3 (Non-Requirement):\n"
        "\"The approach to operation and maintenance is based on remote supervision and monitoring of the wind turbine(s).\"\n"
        "Classification: Non-Requirement\n\n"
        "Example 4 (Non-Requirement):\n"
        "\"ESOC Data Disposition system Interface control Document (DDID) to be released sometime 2000.\"\n"
        "Classification: Non-Requirement"
    ),
    "Ultimate Prompt": (
        "### Task: Classify the following text as \"requirement\" or \"non-requirement.\"\n\n"
        "### Definitions:\n"
        "- *Requirement:* A statement that specifies a mandatory functionality, feature, constraint, or operational need. "
        "It typically uses actionable terms like \"shall,\" \"must,\" \"requires,\" or clearly indicates a technical or functional demand.\n"
        "- *Non-Requirement:* A statement that is descriptive, procedural, informational, or otherwise lacks specificity about a mandatory functionality or constraint. "
        "These may reference requirements without being actionable themselves.\n\n"
        "### Persona:\n"
        "You are a highly experienced software engineer with over 25 years of expertise in requirements engineering. "
        "Your role is to carefully distinguish between actionable requirements and supplementary or descriptive information. "
        "You are methodical, objective, and precise in your analysis.\n\n"
        "### Instructions:\n"
        "1. Read the text carefully.\n"
        "2. Compare it to the definitions provided above.\n"
        "3. Follow this decision-making process:\n"
        "   - *Step 1:* Does the text describe a mandatory functionality, feature, or constraint?\n"
        "   - *Step 2:* Does it include actionable terms such as \"shall,\" \"must,\" \"requires,\" or similar?\n"
        "   - *Step 3:* If descriptive, procedural, or lacking specificity, classify as \"non-requirement.\"\n"
        "4. Be neutral and avoid assuming the text belongs to a specific class. Use examples to guide your decision.\n"
        "5. Output your classification in the following JSON format:\n\n"
        "json\n"
        "{\n"
        "  \"reasoning\": \"{Explain step-by-step reasoning here. Highlight whether the text aligns with actionable requirements or descriptive content.}\",\n"
        "  \"label\": \"{Requirement/Non-Requirement}\"\n"
        "}\n\n"
        "### Examples:\n"
        "- *Requirement:* \"NPAC SMS shall accept the following data from the NPAC personnel: Port Type, Ported Telephone Number(s), Due Date, Old Service Provider ID, and Status Change Cause Code.\"\n"
        "- *Requirement:* \"System administrators can create shared folders to allow or disallow staff to run specific reports.\"\n"
        "- *Non-Requirement:* \"The approach to operation and maintenance is based on remote supervision and monitoring of the wind turbine(s).\"\n"
        "- *Non-Requirement:* \"The remaining sections of the document contain the requirements for the system.\"\n\n"
        "### Analyze and Classify:\n"
        "Text: \"{Insert text here}\""
    )
}


### Utility Functions

In [84]:



def get_sentences(data: pd.DataFrame, column_name: str = "Requirement") -> list:
    """
    Extracts sentences from the specified column in the DataFrame.

    Args:
        data (pd.DataFrame): The DataFrame containing the data.
        column_name (str): The column name to extract sentences from.

    Returns:
        list: A list of sentences.
    """
    if column_name not in data.columns:
        raise ValueError(f"Column '{column_name}' not found in the dataset.")

    sentence_list = data[column_name].dropna().astype(str).tolist()
    return sentence_list

def get_true_label(data: pd.DataFrame, column_name: str = "Req/Not Req") -> list:
    """
    Extracts true labels from the specified column in the DataFrame.

    Args:
        data (pd.DataFrame): The DataFrame containing the data.
        column_name (str): The column name to extract labels from.

    Returns:
        list: A list of boolean labels.
    """
    if column_name not in data.columns:
        raise ValueError(f"Column '{column_name}' not found in the dataset.")

    label_list = data[column_name].apply(lambda x: True if str(x).strip().lower() == "req" else False).tolist()
    return label_list

def classify_sentence(sentence: str, prompt_type: str = "zero-shot") -> bool:
    """
    Classifies a single sentence using the specified prompting technique.

    Args:
        sentence (str): The sentence to classify.
        prompt_type (str): The type of prompt to use ('zero-shot' or 'cot').

    Returns:
        bool: The classification result (True for requirement, False otherwise).
    """
    prompt = PROMPT_TEMPLATES.get(prompt_type)
    if not prompt:
        raise ValueError(f"Unsupported prompt_type '{prompt_type}'. Choose from {list(PROMPT_TEMPLATES.keys())}.")

    # Structure the prompt for a single sentence
    user_message = prompt.replace("{Insert text here}", sentence)
    
    # print("Our message is", user_message)
    

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",  # Change to 'gpt-4' if you have access and prefer to use it
            messages=[
                        {            
                         "role": "system",
                         "content": (
                            "You are a highly experienced software engineer with over 25 years of expertise in requirements engineering. "
                            "Your role is to carefully distinguish between actionable requirements and supplementary or descriptive information. "
                            "You are methodical, objective, and precise in your analysis.")
                        },
                        {
                            "role": "user",
                            "content": (user_message)
                        }
                        
            ],
            temperature=0,  # Deterministic output
            max_tokens=500  # Minimal tokens since response is simple
        )
        

        # Extract and parse the response
        response_content = response['choices'][0]['message']['content'].strip()
        
        print("the response content is:", response_content)
        
        result_data = json.loads(response_content)
        
        # Extract and return the label
        label = result_data['label']
        
        if label == "Requirement":
            return True
        elif label == "Non-Requirement":
            return False
        else:
            return "The label is not either Requirement/Non-Requirement"
        

    except openai.error.RateLimitError as e:
        print(f"Rate limit exceeded: {e}. Waiting for 60 seconds.")
        time.sleep(60)  # Wait before retrying
        return classify_sentence(sentence, prompt_type)  # Retry after delay

    except openai.error.InvalidRequestError as e:
        print(f"Invalid request for sentence: \"{sentence}\". Error: {e}")
        return False  # Defaulting to False in case of error

    except Exception as e:
        print(f"Unexpected error for sentence: \"{sentence}\". Error: {e}")
        return False  # Defaulting to False in case of error

def evaluation(y_true: list, y_pred: list) -> dict:
    """
    Evaluates the classification performance by calculating accuracy, precision, recall, and F1 score.

    Args:
        y_true (list): The true labels.
        y_pred (list): The predicted labels.

    Returns:
        dict: A dictionary containing accuracy, precision, recall, and F1 score.
    """
    if not y_true or not y_pred:
        return {"error": "One of the label lists is empty."}
    if len(y_true) != len(y_pred):
        return {"error": "The length of true labels and predicted labels must be the same."}

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    metrics = {
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1_score": round(f1, 4)
    }

    print(f"\nEvaluation Metrics:")
    print(f"Accuracy: {metrics['accuracy']}")
    print(f"Precision: {metrics['precision']}")
    print(f"Recall: {metrics['recall']}")
    print(f"F1 Score: {metrics['f1_score']}")

    return metrics


### In-Context Learning Prompts

In [34]:
try:
    # Select the prompting technique: 'zero-shot' or 'cot'
    prompt_type = "In-Context Learning Prompts"  # Change to 'cot' for Chain-of-Thought prompting

    print(f"Starting classification using {prompt_type} prompting.\n")

    # Extract 50 sentences from the test dataset
    sentences = get_sentences(test_data, column_name="Requirement")[:50]

    print(f"Processing the following 50 sentences:\n")
    for idx, sentence in enumerate(sentences, 1):
        print(f"{idx}. {sentence}")
    print("\nClassifying...\n")

    # Classify each sentence individually
    predictions = []
    for idx, sentence in enumerate(sentences, 1):
        prediction = classify_sentence(sentence, prompt_type=prompt_type)
        predictions.append(prediction)
        print(f"Sentence {idx}: \"{sentence}\" => {'Requirement' if prediction else 'Not Requirement'}")
        time.sleep(1)  # To respect rate limits; adjust as necessary

    # Assuming you have the ground truth labels for the test set
    # For demonstration, I'll extract them similarly
    ground_truth_labels = get_true_label(test_data, column_name="Req/Not Req")[:50]

    # Evaluate the results
    metrics = evaluation(ground_truth_labels, predictions)

except Exception as e:
    print(f"An error occurred: {e}")


Starting classification using In-Context Learning Prompts prompting.

Processing the following 50 sentences:

1. System Initialization performs those functions necessary to transform the hardware consisting of the FCP processors, network elements, and on-board I/O devices into a real time system executing tasks with fault tolerant message exchanges.
2. Whenever a power-on reset occurs, System Initialization shall [SRS194] perform the following functions.
3. As part of System Initialization , the Boot ROM shall [SRS234] be configured to, after completing IBIT, call the manufacturer-supplied VxWorks Board Support Package (BSP) initialization software followed by a call to the FTSS System Initialization software.
4. System Initialization shall [SRS014] initiate the watchdog timer.
5. System Initialization shall [SRS292] enable and reset the processor’s watchdog timer such that, in the absence of a fault, the watchdog timer does not expire and reset the processor..
6. System Initialization

In [32]:
metrics = evaluation(ground_truth_labels, predictions)

### "Few-Shot Exemplar-based Prompt"

In [12]:
# Repeat the process for CoT prompting if desired
try:
    prompt_type = "Few-Shot Exemplar-based Prompt"  # Change to 'zero-shot' for Zero-shot prompting

    print(f"\nStarting classification using {prompt_type} prompting.\n")

    # Extract the same 50 sentences for consistency
    sentences = get_sentences(test_data, column_name="Requirement")[:50]
    print(f"Processing the following 50 sentences:\n")
    for idx, sentence in enumerate(sentences, 1):
        print(f"{idx}. {sentence}")
    print("\nClassifying...\n")

    # Classify each sentence individually
    predictions = []
    for idx, sentence in enumerate(sentences, 1):
        prediction = classify_sentence(sentence, prompt_type=prompt_type)
        predictions.append(prediction)
        print(f"Sentence {idx}: \"{sentence}\" => {'Requirement' if prediction else 'Not Requirement'}")
        time.sleep(1)  # To respect rate limits; adjust as necessary

    # Extract ground truth labels
    ground_truth_labels = get_true_label(test_data, column_name="Req/Not Req")[:50]

    # Evaluate the results
    metrics = evaluation(ground_truth_labels, predictions)

except Exception as e:
    print(f"An error occurred: {e}")


Starting classification using Few-Shot Exemplar-based Prompt prompting.

Processing the following 50 sentences:

1. System Initialization performs those functions necessary to transform the hardware consisting of the FCP processors, network elements, and on-board I/O devices into a real time system executing tasks with fault tolerant message exchanges.
2. Whenever a power-on reset occurs, System Initialization shall [SRS194] perform the following functions.
3. As part of System Initialization , the Boot ROM shall [SRS234] be configured to, after completing IBIT, call the manufacturer-supplied VxWorks Board Support Package (BSP) initialization software followed by a call to the FTSS System Initialization software.
4. System Initialization shall [SRS014] initiate the watchdog timer.
5. System Initialization shall [SRS292] enable and reset the processor’s watchdog timer such that, in the absence of a fault, the watchdog timer does not expire and reset the processor..
6. System Initializa

### Ultimate Prompt

In [86]:
# Repeat the process for CoT prompting if desired
try:
    prompt_type = "Ultimate Prompt"  # Change to 'zero-shot' for Zero-shot prompting

    print(f"\nStarting classification using {prompt_type} prompting.\n")

    # Extract the same 50 sentences for consistency
    sentences = get_sentences(test_data, column_name="Requirement")[:20]
    print(f"Processing the following 50 sentences:\n")
    for idx, sentence in enumerate(sentences, 1):
        print(f"{idx}. {sentence}")
    print("\nClassifying...\n")

    # Classify each sentence individually
    predictions = []
    for idx, sentence in enumerate(sentences, 1):
        prediction = classify_sentence(sentence, prompt_type=prompt_type)
        predictions.append(prediction)
        print(f"Sentence {idx}: \"{sentence}\" => {'Requirement' if prediction else 'Not Requirement'}")
        time.sleep(1)  # To respect rate limits; adjust as necessary

    # Extract ground truth labels
    ground_truth_labels = get_true_label(test_data, column_name="Req/Not Req")[:20]

    # Evaluate the results
    metrics = evaluation(ground_truth_labels, predictions)

except Exception as e:
    print(f"An error occurred: {e}")


Starting classification using Ultimate Prompt prompting.

Processing the following 50 sentences:

1. System Initialization performs those functions necessary to transform the hardware consisting of the FCP processors, network elements, and on-board I/O devices into a real time system executing tasks with fault tolerant message exchanges.
2. Whenever a power-on reset occurs, System Initialization shall [SRS194] perform the following functions.
3. As part of System Initialization , the Boot ROM shall [SRS234] be configured to, after completing IBIT, call the manufacturer-supplied VxWorks Board Support Package (BSP) initialization software followed by a call to the FTSS System Initialization software.
4. System Initialization shall [SRS014] initiate the watchdog timer.
5. System Initialization shall [SRS292] enable and reset the processor’s watchdog timer such that, in the absence of a fault, the watchdog timer does not expire and reset the processor..
6. System Initialization shall [SRS

In [69]:
import json
import openai

# Function to classify the text
def classify_text(text):
    # Define the messages for the chat completion
    messages = [
        {
            "role": "system",
            "content": (
                "You are a highly experienced software engineer with over 25 years of expertise in requirements engineering. "
                "Your role is to carefully distinguish between actionable requirements and supplementary or descriptive information. "
                "You are methodical, objective, and precise in your analysis."
            )
        },
        {
            "role": "user",
            "content": 
            """
            ### Task: Classify the following text as "requirement" or "non-requirement."

            ### Definitions:
            - *Requirement:* A statement that specifies a mandatory functionality, feature, constraint, or operational need. It typically uses actionable terms like "shall," "must," "requires," or clearly indicates a technical or functional demand.
            - *Non-Requirement:* A statement that is descriptive, procedural, informational, or otherwise lacks specificity about a mandatory functionality or constraint. These may reference requirements without being actionable themselves.

            ### Instructions:
            1. Read the text carefully.
            2. Compare it to the definitions provided above.
            3. Follow this decision-making process:
                - *Step 1:* Does the text describe a mandatory functionality, feature, or constraint?
                - *Step 2:* Does it include actionable terms such as "shall," "must," "requires," or similar?
                - *Step 3:* If descriptive, procedural, or lacking specificity, classify as "non-requirement."
            4. Be neutral and avoid assuming the text belongs to a specific class. Use examples to guide your decision.
            5. Output your classification in the following JSON format:

            json
            {{
                "reasoning": "{step-by-step reasoning here. Highlight whether the text aligns with actionable requirements or descriptive content.}",
                "label": "{Requirement/Non-Requirement}"
            }}

            ### Analyze and Classify:
            Text: "|text|"
            
            """
        }
    ]

    print(messages)
    
    messages[1]["content"] = messages[1]["content"].replace("|text|", text)
    
    print("modift", messages)
    
    # Call OpenAI API to get the classification result
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Use a chat-based model
        messages=messages,
        max_tokens=500,
        temperature=0
    )
    
    # print(response)

    # Parse the response to extract the label
    try:
        # Get the content from the response
        content = response['choices'][0]['message']['content']
        # Parse the JSON content from the assistant's response
        result_data = json.loads(content)
        print(result_data)
        
        # Extract and return the label
        label = result_data['label']
        return label
    except (json.JSONDecodeError, KeyError) as e:
        return f"Error parsing response: {e}"

# Test the function with a sample text
sample_text = "The system shall log all user activities for auditing purposes."
result_label = classify_text(sample_text)

# Print only the classification label (Requirement/Non-Requirement)
print(result_label)


[{'role': 'system', 'content': 'You are a highly experienced software engineer with over 25 years of expertise in requirements engineering. Your role is to carefully distinguish between actionable requirements and supplementary or descriptive information. You are methodical, objective, and precise in your analysis.'}, {'role': 'user', 'content': '\n            ### Task: Classify the following text as "requirement" or "non-requirement."\n\n            ### Definitions:\n            - *Requirement:* A statement that specifies a mandatory functionality, feature, constraint, or operational need. It typically uses actionable terms like "shall," "must," "requires," or clearly indicates a technical or functional demand.\n            - *Non-Requirement:* A statement that is descriptive, procedural, informational, or otherwise lacks specificity about a mandatory functionality or constraint. These may reference requirements without being actionable themselves.\n\n            ### Instructions:\n  