In [None]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import collections

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]
VALID_NFR_ABBREVIATIONS = {
    "usability": "US",
    "reliability": "RL",
    "performance": "PE",
    "portability": "PO",
    "security": "SE",
    "other": "OT"
}
ABBREVIATION_TO_FULL = {v: k for k, v in VALID_NFR_ABBREVIATIONS.items()}

data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

print(f"Loaded {len(nfr_data)} non-functional reviews.")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)

# --- 4. Structured Chain-of-Thought Prompt ---
NFR_CLASSIFICATION_PROMPT_TEXT = """
You are a highly skilled software requirements expert, specializing in non-functional requirements (NFRs). Your task is to accurately classify a given user review into one of the following NFR types.

**NFR Categories:**
- Usability (US): How easy the system is to use, learn, and its user interface.
- Reliability (RL): The system's ability to perform consistently without failure, its uptime, and data recovery.
- Performance (PE): The system's speed, responsiveness, efficiency, throughput, and resource consumption.
- Portability (PO): How easily the system can be adapted or moved to different operating environments, platforms, or devices.
- Security (SE): Protection of data from unauthorized access, attacks, ensuring privacy and data integrity.
- Other (OT): Any review that does not fit clearly into the above specific NFR categories.

**Instructions:**
1. Read the 'User Review' carefully.
2. In a brief, step-by-step reasoning, analyze the review. What core concern is the user expressing? Does it relate to ease of use, stability, speed, compatibility, or data protection? Explain your reasoning process.
3. Based on your reasoning, determine the single best NFR category from the 'NFR Categories' list.
4. State your final classification clearly, preceded by "FINAL CLASSIFICATION:".
5. Your final output for classification MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability', 'RL: Reliability', 'OT: Other'). Do NOT include any other text, explanation, or punctuation after "FINAL CLASSIFICATION:".

**User Review:** '''{review_text}'''

**Thinking Process:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends an NFR classification request to the local Ollama model using a specified prompt template.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 256
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

def parse_cot_prediction(raw_response: str) -> str:
    """
    Parses the raw LLM response from a Structured CoT prompt to extract the final classification.
    """
    # Regex to capture the full category name after "FINAL CLASSIFICATION: XX:"
    match = re.search(
        r"FINAL CLASSIFICATION:\s*(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)",
        raw_response,
        re.IGNORECASE | re.DOTALL
    )
    if match:
        return match.group(1).strip().lower()
    
    # Fallback to check for any valid labels in the last few lines if the strict format isn't found
    lines = raw_response.split('\n')
    for line in reversed(lines[-5:]):
        for label_full in VALID_NFR_LABELS:
            if label_full in line.lower():
                return label_full
        for label_abbr, label_full in ABBREVIATION_TO_FULL.items():
            if label_abbr.lower() in line.lower() and label_full in line.lower():
                return label_full
    
    logger.warning(f"Failed to parse classification from raw response: '{raw_response.strip()}'")
    return "Failed Parsing"


# --- 6. Main Evaluation Loop for All Models (Single-Pass Structured CoT) ---
all_models_nfr_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting NFR Classification Evaluation for Model: {current_model_name} (Structured CoT) {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=NFR_CLASSIFICATION_PROMPT_TEXT
        )
        
        if response_data["success"]:
            final_prediction = parse_cot_prediction(response_data["raw_response"])
            predictions.append(final_prediction)
        else:
            predictions.append("Failed")
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} (Structured CoT) completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of Predictions for {current_model_name} (Structured CoT) ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} (Structured CoT) ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_nfr_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name} (Structured CoT).")
        all_models_nfr_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} Evaluation for {current_model_name} (Structured CoT) Complete {'='*20}\n")

print("\n\n========== ALL NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of NFR Accuracies:")
for model, metrics in all_models_nfr_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final NFR Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews.
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████| 1278/1278 [1:54:42<00:00,  5.39s/it]



✅ Classification with llama2 (Structured CoT) completed in 114.70 minutes

--- Sample of Predictions for llama2 (Structured CoT) ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for llama2 (Structured CoT) ---
              precision    recall  f1-score   support

   usability       0.81      0.79      0.80       432
 reliability       0.66      0.81      0.73       587
 performance       1.00      0.12      0.21       121
 portability       0.27      0.12      0.16       119
    security       0.40      0.89      0.56        19
       other       0.00   

Classifying reviews with mistral: 100%|██████████████████████████████████████████| 1278/1278 [2:31:21<00:00,  7.11s/it]



✅ Classification with mistral (Structured CoT) completed in 151.36 minutes

--- Sample of Predictions for mistral (Structured CoT) ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for mistral (Structured CoT) ---
              precision    recall  f1-score   support

   usability       0.79      0.79      0.79       432
 reliability       0.83      0.49      0.62       587
 performance       0.36      0.97      0.53       121
 portability       0.52      0.34      0.41       119
    security       0.32      0.95      0.47        19
       other       0.00

Classifying reviews with llama3:8b: 100%|████████████████████████████████████████| 1278/1278 [4:12:05<00:00, 11.84s/it]



✅ Classification with llama3:8b (Structured CoT) completed in 252.09 minutes

--- Sample of Predictions for llama3:8b (Structured CoT) ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for llama3:8b (Structured CoT) ---
              precision    recall  f1-score   support

   usability       0.76      0.87      0.81       432
 reliability       0.81      0.50      0.62       587
 performance       0.40      0.93      0.56       121
 portability       0.66      0.31      0.42       119
    security       0.24      0.89      0.38        19
       other     

Classifying reviews with gemma:7b:  48%|███████████████████▎                    | 617/1278 [2:19:13<2:38:55, 14.43s/it]

In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import collections

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]
VALID_NFR_ABBREVIATIONS = {
    "usability": "US",
    "reliability": "RL",
    "performance": "PE",
    "portability": "PO",
    "security": "SE",
    "other": "OT"
}
ABBREVIATION_TO_FULL = {v: k for k, v in VALID_NFR_ABBREVIATIONS.items()}

data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

print(f"Loaded {len(nfr_data)} non-functional reviews.")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)

# --- 4. Structured Chain-of-Thought Prompt ---
NFR_CLASSIFICATION_PROMPT_TEXT = """
You are a highly skilled software requirements expert, specializing in non-functional requirements (NFRs). Your task is to accurately classify a given user review into one of the following NFR types.

**NFR Categories:**
- Usability (US): How easy the system is to use, learn, and its user interface.
- Reliability (RL): The system's ability to perform consistently without failure, its uptime, and data recovery.
- Performance (PE): The system's speed, responsiveness, efficiency, throughput, and resource consumption.
- Portability (PO): How easily the system can be adapted or moved to different operating environments, platforms, or devices.
- Security (SE): Protection of data from unauthorized access, attacks, ensuring privacy and data integrity.
- Other (OT): Any review that does not fit clearly into the above specific NFR categories.

**Instructions:**
1. Read the 'User Review' carefully.
2. In a brief, step-by-step reasoning, analyze the review. What core concern is the user expressing? Does it relate to ease of use, stability, speed, compatibility, or data protection? Explain your reasoning process.
3. Based on your reasoning, determine the single best NFR category from the 'NFR Categories' list.
4. State your final classification clearly, preceded by "FINAL CLASSIFICATION:".
5. Your final output for classification MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability', 'RL: Reliability', 'OT: Other'). Do NOT include any other text, explanation, or punctuation after "FINAL CLASSIFICATION:".

**User Review:** '''{review_text}'''

**Thinking Process:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends an NFR classification request to the local Ollama model using a specified prompt template.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 256
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

def parse_cot_prediction(raw_response: str) -> str:
    """
    Parses the raw LLM response from a Structured CoT prompt to extract the final classification.
    """
    # Regex to capture the full category name after "FINAL CLASSIFICATION: XX:"
    match = re.search(
        r"FINAL CLASSIFICATION:\s*(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)",
        raw_response,
        re.IGNORECASE | re.DOTALL
    )
    if match:
        return match.group(1).strip().lower()
    
    # Fallback to check for any valid labels in the last few lines if the strict format isn't found
    lines = raw_response.split('\n')
    for line in reversed(lines[-5:]):
        for label_full in VALID_NFR_LABELS:
            if label_full in line.lower():
                return label_full
        for label_abbr, label_full in ABBREVIATION_TO_FULL.items():
            if label_abbr.lower() in line.lower() and label_full in line.lower():
                return label_full
    
    logger.warning(f"Failed to parse classification from raw response: '{raw_response.strip()}'")
    return "Failed Parsing"


# --- 6. Main Evaluation Loop for All Models (Single-Pass Structured CoT) ---
all_models_nfr_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting NFR Classification Evaluation for Model: {current_model_name} (Structured CoT) {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=NFR_CLASSIFICATION_PROMPT_TEXT
        )
        
        if response_data["success"]:
            final_prediction = parse_cot_prediction(response_data["raw_response"])
            predictions.append(final_prediction)
        else:
            predictions.append("Failed")
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} (Structured CoT) completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of Predictions for {current_model_name} (Structured CoT) ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} (Structured CoT) ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_nfr_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name} (Structured CoT).")
        all_models_nfr_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} Evaluation for {current_model_name} (Structured CoT) Complete {'='*20}\n")

print("\n\n========== ALL NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of NFR Accuracies:")
for model, metrics in all_models_nfr_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final NFR Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews.
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████| 1278/1278 [1:53:58<00:00,  5.35s/it]



✅ Classification with llama2 (Structured CoT) completed in 113.98 minutes

--- Sample of Predictions for llama2 (Structured CoT) ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for llama2 (Structured CoT) ---
              precision    recall  f1-score   support

   usability       0.80      0.78      0.79       432
 reliability       0.66      0.81      0.73       587
 performance       1.00      0.13      0.23       121
 portability       0.27      0.12      0.16       119
    security       0.41      0.89      0.57        19
       other       0.00   

Classifying reviews with mistral: 100%|██████████████████████████████████████████| 1278/1278 [3:09:22<00:00,  8.89s/it]



✅ Classification with mistral (Structured CoT) completed in 189.37 minutes

--- Sample of Predictions for mistral (Structured CoT) ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for mistral (Structured CoT) ---
              precision    recall  f1-score   support

   usability       0.79      0.80      0.80       432
 reliability       0.84      0.48      0.61       587
 performance       0.36      0.98      0.53       121
 portability       0.52      0.35      0.42       119
    security       0.29      0.95      0.44        19
       other       0.00

Classifying reviews with llama3:8b: 100%|████████████████████████████████████████| 1278/1278 [5:44:46<00:00, 16.19s/it]



✅ Classification with llama3:8b (Structured CoT) completed in 344.78 minutes

--- Sample of Predictions for llama3:8b (Structured CoT) ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for llama3:8b (Structured CoT) ---
              precision    recall  f1-score   support

   usability       0.76      0.84      0.80       432
 reliability       0.81      0.51      0.63       587
 performance       0.39      0.93      0.55       121
 portability       0.64      0.32      0.43       119
    security       0.24      0.95      0.39        19
       other     


**Reasoning:** The user review expresses concerns related to the usability of the WhatsApp application. Specifically, the issues include:
- Video and voice calls not working properly.
- Greenish effect on the screen during calls.
- Difficulty in seeing the friend's face clearly.
- Message sending problems.

These issues suggest problems with the user interface and overall ease of use of the application.'
Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████| 1278/1278 [3:23:57<00:00,  9.58s/it]



✅ Classification with gemma:7b (Structured CoT) completed in 203.95 minutes

--- Sample of Predictions for gemma:7b (Structured CoT) ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for gemma:7b (Structured CoT) ---
              precision    recall  f1-score   support

   usability       0.80      0.83      0.81       432
 reliability       0.80      0.66      0.73       587
 performance       0.36      0.81      0.50       121
 portability       0.64      0.19      0.30       118
    security       0.57      0.89      0.69        19
       other       0


FINAL CLASSIFICATION: RL'

FINAL CLASSIFICATION: RL'

FINAL CLASSIFICATION: US'
Classifying reviews with phi3:mini: 100%|████████████████████████████████████████| 1278/1278 [1:20:22<00:00,  3.77s/it]


✅ Classification with phi3:mini (Structured CoT) completed in 80.38 minutes

--- Sample of Predictions for phi3:mini (Structured CoT) ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for phi3:mini (Structured CoT) ---
              precision    recall  f1-score   support

   usability       0.59      0.55      0.57       431
 reliability       0.80      0.37      0.50       581
 performance       0.47      0.80      0.59       121
 portability       0.51      0.32      0.39       117
    security       0.29      0.84      0.43        19
       other      




In [None]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import collections

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]
VALID_NFR_ABBREVIATIONS = {
    "usability": "US",
    "reliability": "RL",
    "performance": "PE",
    "portability": "PO",
    "security": "SE",
    "other": "OT"
}
ABBREVIATION_TO_FULL = {v: k for k, v in VALID_NFR_ABBREVIATIONS.items()}

data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

print(f"Loaded {len(nfr_data)} non-functional reviews.")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)

# --- 4. Structured Chain-of-Thought Prompt ---
NFR_CLASSIFICATION_PROMPT_TEXT = """
You are a highly skilled software requirements expert, specializing in non-functional requirements (NFRs). Your task is to accurately classify a given user review into one of the following NFR types.

**NFR Categories:**
- Usability (US): How easy the system is to use, learn, and its user interface.
- Reliability (RL): The system's ability to perform consistently without failure, its uptime, and data recovery.
- Performance (PE): The system's speed, responsiveness, efficiency, throughput, and resource consumption.
- Portability (PO): How easily the system can be adapted or moved to different operating environments, platforms, or devices.
- Security (SE): Protection of data from unauthorized access, attacks, ensuring privacy and data integrity.
- Other (OT): Any review that does not fit clearly into the above specific NFR categories.

**Instructions:**
1. Read the 'User Review' carefully.
2. In a brief, step-by-step reasoning, analyze the review. What core concern is the user expressing? Does it relate to ease of use, stability, speed, compatibility, or data protection? Explain your reasoning process.
3. Based on your reasoning, determine the single best NFR category from the 'NFR Categories' list.
4. State your final classification clearly, preceded by "FINAL CLASSIFICATION:".
5. Your final output for classification MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability', 'RL: Reliability', 'OT: Other'). Do NOT include any other text, explanation, or punctuation after "FINAL CLASSIFICATION:".

**User Review:** '''{review_text}'''

**Thinking Process:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends an NFR classification request to the local Ollama model using a specified prompt template.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 256
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

def parse_cot_prediction(raw_response: str) -> str:
    """
    Parses the raw LLM response from a Structured CoT prompt to extract the final classification.
    """
    # Regex to capture the full category name after "FINAL CLASSIFICATION: XX:"
    match = re.search(
        r"FINAL CLASSIFICATION:\s*(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)",
        raw_response,
        re.IGNORECASE | re.DOTALL
    )
    if match:
        return match.group(1).strip().lower()
    
    # Fallback to check for any valid labels in the last few lines if the strict format isn't found
    lines = raw_response.split('\n')
    for line in reversed(lines[-5:]):
        for label_full in VALID_NFR_LABELS:
            if label_full in line.lower():
                return label_full
        for label_abbr, label_full in ABBREVIATION_TO_FULL.items():
            if label_abbr.lower() in line.lower() and label_full in line.lower():
                return label_full
    
    logger.warning(f"Failed to parse classification from raw response: '{raw_response.strip()}'")
    return "Failed Parsing"


# --- 6. Main Evaluation Loop for All Models (Single-Pass Structured CoT) ---
all_models_nfr_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting NFR Classification Evaluation for Model: {current_model_name} (Structured CoT) {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=NFR_CLASSIFICATION_PROMPT_TEXT
        )
        
        if response_data["success"]:
            final_prediction = parse_cot_prediction(response_data["raw_response"])
            predictions.append(final_prediction)
        else:
            predictions.append("Failed")
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} (Structured CoT) completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of Predictions for {current_model_name} (Structured CoT) ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} (Structured CoT) ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_nfr_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name} (Structured CoT).")
        all_models_nfr_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} Evaluation for {current_model_name} (Structured CoT) Complete {'='*20}\n")

print("\n\n========== ALL NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of NFR Accuracies:")
for model, metrics in all_models_nfr_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final NFR Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews.
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████| 1278/1278 [2:18:33<00:00,  6.51s/it]



✅ Classification with llama2 (Structured CoT) completed in 138.57 minutes

--- Sample of Predictions for llama2 (Structured CoT) ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for llama2 (Structured CoT) ---
              precision    recall  f1-score   support

   usability       0.80      0.79      0.80       432
 reliability       0.66      0.81      0.73       587
 performance       1.00      0.11      0.19       121
 portability       0.27      0.12      0.16       119
    security       0.41      0.89      0.57        19
       other       0.00   

Classifying reviews with mistral: 100%|██████████████████████████████████████████| 1278/1278 [3:04:42<00:00,  8.67s/it]



✅ Classification with mistral (Structured CoT) completed in 184.70 minutes

--- Sample of Predictions for mistral (Structured CoT) ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for mistral (Structured CoT) ---
              precision    recall  f1-score   support

   usability       0.79      0.80      0.79       432
 reliability       0.83      0.49      0.62       587
 performance       0.36      0.95      0.52       121
 portability       0.49      0.34      0.40       119
    security       0.33      0.95      0.49        19
       other       0.00

Classifying reviews with llama3:8b: 100%|████████████████████████████████████████| 1278/1278 [5:08:37<00:00, 14.49s/it]



✅ Classification with llama3:8b (Structured CoT) completed in 308.63 minutes

--- Sample of Predictions for llama3:8b (Structured CoT) ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for llama3:8b (Structured CoT) ---
              precision    recall  f1-score   support

   usability       0.75      0.85      0.80       432
 reliability       0.82      0.50      0.62       587
 performance       0.40      0.92      0.56       121
 portability       0.63      0.31      0.42       119
    security       0.23      0.95      0.38        19
       other     

Classifying reviews with gemma:7b:  53%|█████████████████████▏                  | 677/1278 [2:25:26<2:40:23, 16.01s/it]

In [None]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import collections

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]
VALID_NFR_ABBREVIATIONS = {
    "usability": "US",
    "reliability": "RL",
    "performance": "PE",
    "portability": "PO",
    "security": "SE",
    "other": "OT"
}
ABBREVIATION_TO_FULL = {v: k for k, v in VALID_NFR_ABBREVIATIONS.items()}

data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

print(f"Loaded {len(nfr_data)} non-functional reviews.")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)

# --- 4. Structured Chain-of-Thought Prompt ---
NFR_CLASSIFICATION_PROMPT_TEXT = """
You are a highly skilled software requirements expert, specializing in non-functional requirements (NFRs). Your task is to accurately classify a given user review into one of the following NFR types.

**NFR Categories:**
- Usability (US): How easy the system is to use, learn, and its user interface.
- Reliability (RL): The system's ability to perform consistently without failure, its uptime, and data recovery.
- Performance (PE): The system's speed, responsiveness, efficiency, throughput, and resource consumption.
- Portability (PO): How easily the system can be adapted or moved to different operating environments, platforms, or devices.
- Security (SE): Protection of data from unauthorized access, attacks, ensuring privacy and data integrity.
- Other (OT): Any review that does not fit clearly into the above specific NFR categories.

**Instructions:**
1. Read the 'User Review' carefully.
2. In a brief, step-by-step reasoning, analyze the review. What core concern is the user expressing? Does it relate to ease of use, stability, speed, compatibility, or data protection? Explain your reasoning process.
3. Based on your reasoning, determine the single best NFR category from the 'NFR Categories' list.
4. State your final classification clearly, preceded by "FINAL CLASSIFICATION:".
5. Your final output for classification MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability', 'RL: Reliability', 'OT: Other'). Do NOT include any other text, explanation, or punctuation after "FINAL CLASSIFICATION:".

**User Review:** '''{review_text}'''

**Thinking Process:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends an NFR classification request to the local Ollama model using a specified prompt template.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 256
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

def parse_cot_prediction(raw_response: str) -> str:
    """
    Parses the raw LLM response from a Structured CoT prompt to extract the final classification.
    """
    # Regex to capture the full category name after "FINAL CLASSIFICATION: XX:"
    match = re.search(
        r"FINAL CLASSIFICATION:\s*(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)",
        raw_response,
        re.IGNORECASE | re.DOTALL
    )
    if match:
        return match.group(1).strip().lower()
    
    # Fallback to check for any valid labels in the last few lines if the strict format isn't found
    lines = raw_response.split('\n')
    for line in reversed(lines[-5:]):
        for label_full in VALID_NFR_LABELS:
            if label_full in line.lower():
                return label_full
        for label_abbr, label_full in ABBREVIATION_TO_FULL.items():
            if label_abbr.lower() in line.lower() and label_full in line.lower():
                return label_full
    
    logger.warning(f"Failed to parse classification from raw response: '{raw_response.strip()}'")
    return "Failed Parsing"


# --- 6. Main Evaluation Loop for All Models (Single-Pass Structured CoT) ---
all_models_nfr_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting NFR Classification Evaluation for Model: {current_model_name} (Structured CoT) {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=NFR_CLASSIFICATION_PROMPT_TEXT
        )
        
        if response_data["success"]:
            final_prediction = parse_cot_prediction(response_data["raw_response"])
            predictions.append(final_prediction)
        else:
            predictions.append("Failed")
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} (Structured CoT) completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of Predictions for {current_model_name} (Structured CoT) ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} (Structured CoT) ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_nfr_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name} (Structured CoT).")
        all_models_nfr_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} Evaluation for {current_model_name} (Structured CoT) Complete {'='*20}\n")

print("\n\n========== ALL NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of NFR Accuracies:")
for model, metrics in all_models_nfr_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final NFR Evaluation End ---")