In [3]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

print(f"Loaded {len(nfr_data)} non-functional reviews.")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. Constraint-Based Prompt Template ---
# This prompt asks the model to identify the NFR and then extract the specific constraint.
constraint_based_prompt_text = """
You are a software requirements expert. Your task is to precisely classify the provided user review into one of the following Non-Functional Requirement (NFR) types and then, importantly, extract the specific constraint mentioned in the review.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Instructions:**
1.  Read the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Constraint-Based NFR ---
all_models_constraint_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting CONSTRAINT-BASED Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=constraint_based_prompt_text
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # This regex captures the full category name from the strict output format
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE
            )
            
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ CONSTRAINT-BASED Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of CONSTRAINT-BASED Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- CONSTRAINT-BASED Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_constraint_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_constraint_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} CONSTRAINT-BASED Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL CONSTRAINT-BASED NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of CONSTRAINT-BASED NFR Accuracies:")
for model, metrics in all_models_constraint_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final CONSTRAINT-BASED Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews.
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████| 1278/1278 [1:04:41<00:00,  3.04s/it]



✅ CONSTRAINT-BASED Classification with llama2 completed in 64.69 minutes

--- Sample of CONSTRAINT-BASED Predictions for llama2 ---
                                              review ground_truth  \
0  without this the video calls could potentially...     security   
1  collects way too much unneeded information abo...     security   
2  why exctly do you need full read access to my ...     security   
3                     more private than fb messenger     security   
4  this app is the best message and chat service,...     security   

        predicted  
0       usability  
1  Failed Parsing  
2       usability  
3       usability  
4       usability  

--- CONSTRAINT-BASED Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.34      1.00      0.51       432
 reliability       0.50      0.00      0.00       585
 performance       0.83      0.08      0.15       121
 portability       0.00      0.00      0.00       118
  

Classifying reviews with mistral: 100%|██████████████████████████████████████████| 1278/1278 [1:18:41<00:00,  3.69s/it]



✅ CONSTRAINT-BASED Classification with mistral completed in 78.69 minutes

--- Sample of CONSTRAINT-BASED Predictions for mistral ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- CONSTRAINT-BASED Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.89      0.60      0.72       383
 reliability       0.81      0.17      0.28       569
 performance       0.16      0.98      0.28       118
 portability       0.70      0.13      0.22       106
    security       0.33      0.89      0.48        19
       other       0.00 

Classifying reviews with llama3:8b: 100%|████████████████████████████████████████| 1278/1278 [1:05:45<00:00,  3.09s/it]



✅ CONSTRAINT-BASED Classification with llama3:8b completed in 65.76 minutes

--- Sample of CONSTRAINT-BASED Predictions for llama3:8b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- CONSTRAINT-BASED Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       1.00      0.01      0.01       432
 reliability       0.77      0.58      0.66       587
 performance       0.66      0.79      0.72       121
 portability       0.74      0.17      0.27       119
    security       0.74      0.74      0.74        19
       other      

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████| 1278/1278 [1:38:45<00:00,  4.64s/it]



✅ CONSTRAINT-BASED Classification with gemma:7b completed in 98.77 minutes

--- Sample of CONSTRAINT-BASED Predictions for gemma:7b ---
                                              review ground_truth  \
0  without this the video calls could potentially...     security   
1  collects way too much unneeded information abo...     security   
2  why exctly do you need full read access to my ...     security   
3                     more private than fb messenger     security   
4  this app is the best message and chat service,...     security   

        predicted  
0  Failed Parsing  
1     performance  
2  Failed Parsing  
3  Failed Parsing  
4  Failed Parsing  

--- CONSTRAINT-BASED Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.93      0.38      0.54       324
 reliability       0.91      0.17      0.29       437
 performance       0.14      1.00      0.25       103
 portability       0.67      0.06      0.11       

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████| 1278/1278 [50:16<00:00,  2.36s/it]


✅ CONSTRAINT-BASED Classification with phi3:mini completed in 50.27 minutes

--- Sample of CONSTRAINT-BASED Predictions for phi3:mini ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- CONSTRAINT-BASED Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       0.93      0.10      0.18       403
 reliability       0.82      0.47      0.60       557
 performance       0.41      0.91      0.56       121
 portability       0.38      0.32      0.35       113
    security       0.40      0.89      0.56        19
       other      




In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

print(f"Loaded {len(nfr_data)} non-functional reviews.")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. Constraint-Based Prompt Template ---
# This prompt asks the model to identify the NFR and then extract the specific constraint.
constraint_based_prompt_text = """
You are a software requirements expert. Your task is to precisely classify the provided user review into one of the following Non-Functional Requirement (NFR) types and then, importantly, extract the specific constraint mentioned in the review.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Instructions:**
1.  Read the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Constraint-Based NFR ---
all_models_constraint_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting CONSTRAINT-BASED Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=constraint_based_prompt_text
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # This regex captures the full category name from the strict output format
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE
            )
            
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ CONSTRAINT-BASED Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of CONSTRAINT-BASED Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- CONSTRAINT-BASED Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_constraint_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_constraint_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} CONSTRAINT-BASED Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL CONSTRAINT-BASED NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of CONSTRAINT-BASED NFR Accuracies:")
for model, metrics in all_models_constraint_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final CONSTRAINT-BASED Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews.
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|█████████████████████████████████████████████| 1278/1278 [57:40<00:00,  2.71s/it]



✅ CONSTRAINT-BASED Classification with llama2 completed in 57.67 minutes

--- Sample of CONSTRAINT-BASED Predictions for llama2 ---
                                              review ground_truth  \
0  without this the video calls could potentially...     security   
1  collects way too much unneeded information abo...     security   
2  why exctly do you need full read access to my ...     security   
3                     more private than fb messenger     security   
4  this app is the best message and chat service,...     security   

        predicted  
0       usability  
1  Failed Parsing  
2       usability  
3       usability  
4       usability  

--- CONSTRAINT-BASED Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.34      1.00      0.51       432
 reliability       1.00      0.00      0.00       585
 performance       0.82      0.07      0.14       121
 portability       0.00      0.00      0.00       119
  

Classifying reviews with mistral: 100%|████████████████████████████████████████████| 1278/1278 [55:32<00:00,  2.61s/it]



✅ CONSTRAINT-BASED Classification with mistral completed in 55.54 minutes

--- Sample of CONSTRAINT-BASED Predictions for mistral ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- CONSTRAINT-BASED Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.88      0.62      0.73       386
 reliability       0.82      0.17      0.28       567
 performance       0.17      0.98      0.29       117
 portability       0.67      0.13      0.22       106
    security       0.33      0.89      0.49        19
       other       0.00 

Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████| 1278/1278 [57:32<00:00,  2.70s/it]



✅ CONSTRAINT-BASED Classification with llama3:8b completed in 57.54 minutes

--- Sample of CONSTRAINT-BASED Predictions for llama3:8b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security     other
4  this app is the best message and chat service,...     security  security

--- CONSTRAINT-BASED Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       1.00      0.01      0.01       432
 reliability       0.77      0.58      0.66       587
 performance       0.67      0.78      0.72       121
 portability       0.74      0.17      0.27       119
    security       0.72      0.68      0.70        19
       other      

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████| 1278/1278 [1:22:57<00:00,  3.89s/it]



✅ CONSTRAINT-BASED Classification with gemma:7b completed in 82.96 minutes

--- Sample of CONSTRAINT-BASED Predictions for gemma:7b ---
                                              review ground_truth  \
0  without this the video calls could potentially...     security   
1  collects way too much unneeded information abo...     security   
2  why exctly do you need full read access to my ...     security   
3                     more private than fb messenger     security   
4  this app is the best message and chat service,...     security   

        predicted  
0  Failed Parsing  
1     performance  
2  Failed Parsing  
3  Failed Parsing  
4  Failed Parsing  

--- CONSTRAINT-BASED Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.94      0.38      0.54       319
 reliability       0.92      0.17      0.29       439
 performance       0.15      1.00      0.25       103
 portability       0.67      0.06      0.11       

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████| 1278/1278 [50:10<00:00,  2.36s/it]


✅ CONSTRAINT-BASED Classification with phi3:mini completed in 50.18 minutes

--- Sample of CONSTRAINT-BASED Predictions for phi3:mini ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- CONSTRAINT-BASED Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       0.93      0.10      0.18       403
 reliability       0.82      0.47      0.60       557
 performance       0.41      0.91      0.56       121
 portability       0.38      0.32      0.35       113
    security       0.40      0.89      0.56        19
       other      




In [2]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

print(f"Loaded {len(nfr_data)} non-functional reviews.")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. Constraint-Based Prompt Template ---
# This prompt asks the model to identify the NFR and then extract the specific constraint.
constraint_based_prompt_text = """
You are a software requirements expert. Your task is to precisely classify the provided user review into one of the following Non-Functional Requirement (NFR) types and then, importantly, extract the specific constraint mentioned in the review.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Instructions:**
1.  Read the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Constraint-Based NFR ---
all_models_constraint_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting CONSTRAINT-BASED Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=constraint_based_prompt_text
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # This regex captures the full category name from the strict output format
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE
            )
            
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ CONSTRAINT-BASED Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of CONSTRAINT-BASED Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- CONSTRAINT-BASED Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_constraint_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_constraint_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} CONSTRAINT-BASED Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL CONSTRAINT-BASED NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of CONSTRAINT-BASED NFR Accuracies:")
for model, metrics in all_models_constraint_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final CONSTRAINT-BASED Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews.
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|█████████████████████████████████████████████| 1278/1278 [56:00<00:00,  2.63s/it]



✅ CONSTRAINT-BASED Classification with llama2 completed in 56.01 minutes

--- Sample of CONSTRAINT-BASED Predictions for llama2 ---
                                              review ground_truth  predicted
0  without this the video calls could potentially...     security  usability
1  collects way too much unneeded information abo...     security  usability
2  why exctly do you need full read access to my ...     security  usability
3                     more private than fb messenger     security  usability
4  this app is the best message and chat service,...     security  usability

--- CONSTRAINT-BASED Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.34      1.00      0.51       432
 reliability       0.50      0.00      0.00       585
 performance       0.83      0.08      0.15       121
 portability       0.00      0.00      0.00       118
    security       0.33      0.06      0.10        17
       other       0.

Classifying reviews with mistral: 100%|████████████████████████████████████████████| 1278/1278 [55:30<00:00,  2.61s/it]



✅ CONSTRAINT-BASED Classification with mistral completed in 55.51 minutes

--- Sample of CONSTRAINT-BASED Predictions for mistral ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- CONSTRAINT-BASED Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.88      0.62      0.73       386
 reliability       0.82      0.17      0.28       567
 performance       0.17      0.98      0.29       117
 portability       0.67      0.13      0.22       106
    security       0.33      0.89      0.49        19
       other       0.00 

Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████| 1278/1278 [57:33<00:00,  2.70s/it]



✅ CONSTRAINT-BASED Classification with llama3:8b completed in 57.55 minutes

--- Sample of CONSTRAINT-BASED Predictions for llama3:8b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security     other
4  this app is the best message and chat service,...     security  security

--- CONSTRAINT-BASED Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       1.00      0.01      0.01       432
 reliability       0.77      0.58      0.66       587
 performance       0.67      0.78      0.72       121
 portability       0.74      0.17      0.27       119
    security       0.72      0.68      0.70        19
       other      

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████| 1278/1278 [1:22:51<00:00,  3.89s/it]



✅ CONSTRAINT-BASED Classification with gemma:7b completed in 82.85 minutes

--- Sample of CONSTRAINT-BASED Predictions for gemma:7b ---
                                              review ground_truth  \
0  without this the video calls could potentially...     security   
1  collects way too much unneeded information abo...     security   
2  why exctly do you need full read access to my ...     security   
3                     more private than fb messenger     security   
4  this app is the best message and chat service,...     security   

        predicted  
0  Failed Parsing  
1     performance  
2  Failed Parsing  
3  Failed Parsing  
4  Failed Parsing  

--- CONSTRAINT-BASED Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.94      0.38      0.54       319
 reliability       0.92      0.17      0.29       439
 performance       0.15      1.00      0.25       103
 portability       0.67      0.06      0.11       

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████| 1278/1278 [50:12<00:00,  2.36s/it]


✅ CONSTRAINT-BASED Classification with phi3:mini completed in 50.20 minutes

--- Sample of CONSTRAINT-BASED Predictions for phi3:mini ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- CONSTRAINT-BASED Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       0.93      0.10      0.18       403
 reliability       0.82      0.47      0.60       557
 performance       0.41      0.91      0.56       121
 portability       0.38      0.32      0.35       113
    security       0.40      0.89      0.56        19
       other      




In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Sampled NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

# --- IMPORTANT: Sample the data for a quick test run (e.g., 30 reviews) ---
# REMEMBER TO REMOVE OR COMMENT OUT THIS LINE FOR THE FULL DATASET RUN!
nfr_data = nfr_data.sample(n=min(30, len(nfr_data)), random_state=42).reset_index(drop=True)

print(f"Loaded {len(nfr_data)} non-functional reviews (sampled for pilot run).")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. Constraint-Based Prompt Template ---
constraint_based_prompt_text = """
You are a software requirements expert. Your task is to precisely classify the provided user review into one of the following Non-Functional Requirement (NFR) types.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Instructions:**
1.  Read the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Constraint-Based NFR ---
all_models_constraint_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting CONSTRAINT-BASED Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=constraint_based_prompt_text
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE
            )
            
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ CONSTRAINT-BASED Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of CONSTRAINT-BASED Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- CONSTRAINT-BASED Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_constraint_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_constraint_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} CONSTRAINT-BASED Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL CONSTRAINT-BASED NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of CONSTRAINT-BASED NFR Accuracies:")
for model, metrics in all_models_constraint_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final CONSTRAINT-BASED Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 30 non-functional reviews (sampled for pilot run).
Sample of loaded NFR data:
                                              review ground_truth
0  it takes about 45 seconds for the page to turn...  performance
1  on my no anyone use whatsapp id , when the id ...  reliability
2  can't find a book unless i know exctly what bo...    usability
3     a paper white view would be a wonderful option    usability
4  bring it back or at least the option to turn i...    usability
----------------------------------------



Classifying reviews with llama2: 100%|█████████████████████████████████████████████████| 30/30 [01:43<00:00,  3.44s/it]



✅ CONSTRAINT-BASED Classification with llama2 completed in 1.72 minutes

--- Sample of CONSTRAINT-BASED Predictions for llama2 ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability    usability
2  can't find a book unless i know exctly what bo...    usability    usability
3     a paper white view would be a wonderful option    usability    usability
4  bring it back or at least the option to turn i...    usability    usability

--- CONSTRAINT-BASED Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.35      1.00      0.51         9
 reliability       1.00      0.08      0.15        12
 performance       1.00      0.75      0.86         4
 portability       0.00      0.00      0.00         5
    security       0.00      0.00      0.00         0
       oth

Classifying reviews with mistral: 100%|████████████████████████████████████████████████| 30/30 [01:35<00:00,  3.18s/it]



✅ CONSTRAINT-BASED Classification with mistral completed in 1.59 minutes

--- Sample of CONSTRAINT-BASED Predictions for mistral ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability  performance
2  can't find a book unless i know exctly what bo...    usability    usability
3     a paper white view would be a wonderful option    usability    usability
4  bring it back or at least the option to turn i...    usability  performance

--- CONSTRAINT-BASED Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       1.00      0.44      0.62         9
 reliability       0.00      0.00      0.00        11
 performance       0.20      1.00      0.33         4
 portability       0.50      0.20      0.29         5
    security       0.00      0.00      0.00         0
       

Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████████| 30/30 [01:47<00:00,  3.57s/it]



✅ CONSTRAINT-BASED Classification with llama3:8b completed in 1.78 minutes

--- Sample of CONSTRAINT-BASED Predictions for llama3:8b ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability        other
2  can't find a book unless i know exctly what bo...    usability  performance
3     a paper white view would be a wonderful option    usability    usability
4  bring it back or at least the option to turn i...    usability  performance

--- CONSTRAINT-BASED Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       1.00      0.33      0.50         9
 reliability       0.71      0.42      0.53        12
 performance       0.31      1.00      0.47         4
 portability       0.50      0.20      0.29         5
    security       0.00      0.00      0.00         0
 

Classifying reviews with gemma:7b: 100%|███████████████████████████████████████████████| 30/30 [02:04<00:00,  4.14s/it]



✅ CONSTRAINT-BASED Classification with gemma:7b completed in 2.07 minutes

--- Sample of CONSTRAINT-BASED Predictions for gemma:7b ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability    usability
2  can't find a book unless i know exctly what bo...    usability  performance
3     a paper white view would be a wonderful option    usability    usability
4  bring it back or at least the option to turn i...    usability  performance

--- CONSTRAINT-BASED Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.75      0.33      0.46         9
 reliability       0.00      0.00      0.00        12
 performance       0.16      1.00      0.28         4
 portability       1.00      0.20      0.33         5
    security       0.00      0.00      0.00         0
    

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████████| 30/30 [01:13<00:00,  2.45s/it]


✅ CONSTRAINT-BASED Classification with phi3:mini completed in 1.23 minutes

--- Sample of CONSTRAINT-BASED Predictions for phi3:mini ---
                                              review ground_truth  \
0  it takes about 45 seconds for the page to turn...  performance   
1  on my no anyone use whatsapp id , when the id ...  reliability   
2  can't find a book unless i know exctly what bo...    usability   
3     a paper white view would be a wonderful option    usability   
4  bring it back or at least the option to turn i...    usability   

        predicted  
0     performance  
1  Failed Parsing  
2       usability  
3           other  
4           other  

--- CONSTRAINT-BASED Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       1.00      0.22      0.36         9
 reliability       0.56      0.45      0.50        11
 performance       0.33      0.75      0.46         4
 portability       1.00      0.20      0.33     




In [None]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

print(f"Loaded {len(nfr_data)} non-functional reviews.")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. Constraint-Based Prompt Template ---
# This prompt asks the model to identify the NFR and then extract the specific constraint.
constraint_based_prompt_text = """
You are a software requirements expert. Your task is to precisely classify the provided user review into one of the following Non-Functional Requirement (NFR) types and then, importantly, extract the specific constraint mentioned in the review.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Instructions:**
1.  Read the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Constraint-Based NFR ---
all_models_constraint_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting CONSTRAINT-BASED Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=constraint_based_prompt_text
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # This regex captures the full category name from the strict output format
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE
            )
            
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ CONSTRAINT-BASED Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of CONSTRAINT-BASED Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- CONSTRAINT-BASED Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_constraint_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_constraint_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} CONSTRAINT-BASED Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL CONSTRAINT-BASED NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of CONSTRAINT-BASED NFR Accuracies:")
for model, metrics in all_models_constraint_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final CONSTRAINT-BASED Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews.
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████| 1278/1278 [2:06:22<00:00,  5.93s/it]



✅ CONSTRAINT-BASED Classification with llama2 completed in 126.38 minutes

--- Sample of CONSTRAINT-BASED Predictions for llama2 ---
                                              review ground_truth  predicted
0  without this the video calls could potentially...     security  usability
1  collects way too much unneeded information abo...     security  usability
2  why exctly do you need full read access to my ...     security  usability
3                     more private than fb messenger     security  usability
4  this app is the best message and chat service,...     security  usability

--- CONSTRAINT-BASED Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.34      1.00      0.51       432
 reliability       0.50      0.00      0.00       586
 performance       0.85      0.09      0.16       121
 portability       0.00      0.00      0.00       119
    security       0.33      0.06      0.10        17
       other       0

Classifying reviews with mistral: 100%|██████████████████████████████████████████| 1278/1278 [2:57:12<00:00,  8.32s/it]



✅ CONSTRAINT-BASED Classification with mistral completed in 177.21 minutes

--- Sample of CONSTRAINT-BASED Predictions for mistral ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- CONSTRAINT-BASED Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.88      0.61      0.72       386
 reliability       0.82      0.17      0.28       570
 performance       0.16      0.98      0.28       117
 portability       0.71      0.14      0.23       108
    security       0.33      0.89      0.48        19
       other       0.00

Classifying reviews with llama3:8b: 100%|████████████████████████████████████████| 1278/1278 [4:50:31<00:00, 13.64s/it]



✅ CONSTRAINT-BASED Classification with llama3:8b completed in 290.53 minutes

--- Sample of CONSTRAINT-BASED Predictions for llama3:8b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security     other
4  this app is the best message and chat service,...     security  security

--- CONSTRAINT-BASED Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       1.00      0.01      0.02       432
 reliability       0.77      0.57      0.66       587
 performance       0.66      0.78      0.71       121
 portability       0.73      0.16      0.26       119
    security       0.72      0.68      0.70        19
       other     

Classifying reviews with gemma:7b:  61%|████████████████████████▌               | 783/1278 [3:09:01<2:21:38, 17.17s/it]

In [None]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

print(f"Loaded {len(nfr_data)} non-functional reviews.")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. Constraint-Based Prompt Template ---
# This prompt asks the model to identify the NFR and then extract the specific constraint.
constraint_based_prompt_text = """
You are a software requirements expert. Your task is to precisely classify the provided user review into one of the following Non-Functional Requirement (NFR) types and then, importantly, extract the specific constraint mentioned in the review.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Instructions:**
1.  Read the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Constraint-Based NFR ---
all_models_constraint_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting CONSTRAINT-BASED Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=constraint_based_prompt_text
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # This regex captures the full category name from the strict output format
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE
            )
            
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ CONSTRAINT-BASED Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of CONSTRAINT-BASED Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- CONSTRAINT-BASED Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_constraint_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_constraint_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} CONSTRAINT-BASED Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL CONSTRAINT-BASED NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of CONSTRAINT-BASED NFR Accuracies:")
for model, metrics in all_models_constraint_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final CONSTRAINT-BASED Evaluation End ---")

In [2]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of top-performing Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "mistral",
    "llama3:8b",
    "gemma:7b"
]

# --- 3. Data Loading and Preparation (Sampled NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

# --- IMPORTANT: Sample the data for a quick test run (30 reviews) ---
# REMEMBER TO REMOVE OR COMMENT OUT THIS LINE FOR THE FULL DATASET RUN!
nfr_data = nfr_data.sample(n=min(30, len(nfr_data)), random_state=42).reset_index(drop=True)

print(f"Loaded {len(nfr_data)} non-functional reviews (sampled for pilot run).")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. Hybrid Prompt Template (Role-Based + Few-Shot + Constraint-Based) ---
FEW_SHOT_EXAMPLES = [
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "PE: Performance"},
    {"review": "keeps crashing when I try to save a new record", "classification": "RL: Reliability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "PO: Portability"}
]

formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"App Review: {ex['review']}\nClassification: {ex['classification']}\n\n"

hybrid_prompt_text = """
You are a highly skilled software requirements expert, specializing in the classification of user feedback. Your task is to precisely classify the provided app review into one of the following Non-Functional Requirement (NFR) types and output the result in a strictly defined format.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Examples:**
{few_shot_examples_text}

**Instructions:**
1.  Read the 'App Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list based on the definitions and examples.
3.  Your final output MUST be ONLY the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability'). Do NOT include any other text, reasoning, or punctuation.

**App Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str, few_shot_examples_text: str) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(
        review_text=review_text,
        few_shot_examples_text=few_shot_examples_text
    )

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Hybrid Prompting ---
all_models_hybrid_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting HYBRID Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=hybrid_prompt_text,
            few_shot_examples_text=formatted_few_shot_text
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE
            )
            
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ HYBRID Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of HYBRID Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- HYBRID Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_hybrid_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_hybrid_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} HYBRID Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL HYBRID NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of HYBRID NFR Accuracies:")
for model, metrics in all_models_hybrid_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final HYBRID Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 30 non-functional reviews (sampled for pilot run).
Sample of loaded NFR data:
                                              review ground_truth
0  it takes about 45 seconds for the page to turn...  performance
1  on my no anyone use whatsapp id , when the id ...  reliability
2  can't find a book unless i know exctly what bo...    usability
3     a paper white view would be a wonderful option    usability
4  bring it back or at least the option to turn i...    usability
----------------------------------------



Classifying reviews with mistral: 100%|████████████████████████████████████████████████| 30/30 [01:33<00:00,  3.10s/it]



✅ HYBRID Classification with mistral completed in 1.55 minutes

--- Sample of HYBRID Predictions for mistral ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability     security
2  can't find a book unless i know exctly what bo...    usability    usability
3     a paper white view would be a wonderful option    usability        other
4  bring it back or at least the option to turn i...    usability     security

--- HYBRID Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.83      0.56      0.67         9
 reliability       0.60      0.25      0.35        12
 performance       0.40      1.00      0.57         4
 portability       1.00      0.20      0.33         5
    security       0.00      0.00      0.00         0
       other       0.00      0.00    

Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████████| 30/30 [01:41<00:00,  3.40s/it]



✅ HYBRID Classification with llama3:8b completed in 1.70 minutes

--- Sample of HYBRID Predictions for llama3:8b ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability     security
2  can't find a book unless i know exctly what bo...    usability        other
3     a paper white view would be a wonderful option    usability        other
4  bring it back or at least the option to turn i...    usability        other

--- HYBRID Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       1.00      0.11      0.20         9
 reliability       0.71      0.42      0.53        12
 performance       0.40      0.50      0.44         4
 portability       1.00      0.20      0.33         5
    security       0.00      0.00      0.00         0
       other       0.00      0.

Classifying reviews with gemma:7b: 100%|███████████████████████████████████████████████| 30/30 [02:01<00:00,  4.04s/it]


✅ HYBRID Classification with gemma:7b completed in 2.02 minutes

--- Sample of HYBRID Predictions for gemma:7b ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability     security
2  can't find a book unless i know exctly what bo...    usability     security
3     a paper white view would be a wonderful option    usability        other
4  bring it back or at least the option to turn i...    usability        other

--- HYBRID Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.00      0.00      0.00         9
 reliability       0.00      0.00      0.00        11
 performance       0.50      0.75      0.60         4
 portability       0.67      0.40      0.50         5
    security       0.00      0.00      0.00         0
       other       0.00      0.00 


