In [6]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
data_raw = pd.concat(sheets.values(), ignore_index=True)

# Rename columns for easier access and consistency
data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Define valid NFR categories (lowercase for standardization)
VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

# Standardize ground_truth labels to lowercase and strip whitespace
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

# Filter out rows where ground_truth is not one of our valid labels
initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

# --- TEMPORARY: Sample the data for a quick test run (e.g., 50 reviews) ---
# REMEMBER TO REMOVE OR COMMENT OUT THIS LINE FOR THE FULL DATASET RUN!
# nfr_data = nfr_data.sample(n=min(50, len(nfr_data)), random_state=42).reset_index(drop=True)
# --- END TEMPORARY SAMPLING ---


print(f"Loaded {len(nfr_data)} non-functional reviews (sampled for test).")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. NFR Definitions & Few-Shot Examples (for the Prompt) ---
# These are the definitions and examples that were used in your prompts.json P6 context
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage.",
    "portability": "Portability requirements describe the ease with which a system can be transferred from one environment to another.",
    "reliability": "Reliability requirements describe the ability of a system to perform its functions under stated conditions for a specified period of time without failure."
}

FEW_SHOT_EXAMPLES = [
    # Security Class Examples (from your prompts.json P5/P6 context)
    {"review": "without this the video calls could potentially be intercepted by hackers", "classification": "Security"},
    {"review": "collects way too much unneeded information about my private life and location without my consent", "classification": "Security"},
    {"review": "why exctly do you need full read access to my text messages on my phone", "classification": "Security"},

    # Portability Class Examples
    {"review": "this app is very great and compatible with my tablet and phone", "classification": "Portability"},
    {"review": "does not work on my new phone", "classification": "Portability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "Portability"},

    # Performance efficiency Class Examples
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "Performance"},
    {"review": "my phone overheats and battery drains very quickly when using this app", "classification": "Performance"},
    {"review": "too much lagging and freezing of the app after new update", "classification": "Performance"},

    # Reliability Class Examples
    {"review": "on my no anyone use whatsapp id , when the id open up and I have to put on the id it still don't open", "classification": "Reliability"},
    {"review": "The app stopped working after the recent update", "classification": "Reliability"},
    {"review": "Keeps crashing when I try to save a new record", "classification": "Reliability"},

    # Usability Class Examples
    {"review": "can't find a book unless i know exctly what book I want to buy", "classification": "Usability"},
    {"review": "this app needs to make it easier to add notes and save it", "classification": "Usability"},
    {"review": "very complicated and user unfriendly interface", "classification": "Usability"}
]

# Format these examples into a single string for the prompt
formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"Requirement: {ex['review']}\nClassification: {ex['classification']}\n\n"

# Define the list of NFR categories as a string for the prompt
all_nfr_labels_str = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other" # Based on your 5 classes + Other
])

# --- 4. The NFR Classification Prompt (P6 from prompts.json) ---
# Extracted directly from your prompts.json under NFR_CLASSIFICATION -> P6
nfr_classification_prompt_text = """
You are a software requirements expert. Your task is to classify user reviews into one of the following Non-Functional Requirement (NFR) types.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Examples:**
{few_shot_examples_text}

**Instructions:**
1.  Analyze the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list that the review most closely aligns with, based on the definitions and examples.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability', 'RL: Reliability', 'OT: Other'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, **prompt_kwargs) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    # Format the prompt using few-shot examples and NFR categories list
    formatted_prompt = nfr_classification_prompt_text.format(
        review_text=review_text,
        nfr_categories_list=all_nfr_labels_str, # This is used in the prompt's header
        few_shot_examples_text=prompt_kwargs.get('few_shot_examples_text', '') # Use prompt_kwargs for few-shot
    )

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0, # Keep low for consistent classification
            "num_predict": 100 # Limit output length
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180) # Increased timeout
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models ---
all_models_nfr_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting NFR Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            nfr_categories_list=all_nfr_labels_str, # Pass the categories list for prompt formatting
            few_shot_examples_text=formatted_few_shot_text # Pass the few-shot examples for prompt formatting
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # --- Regex to capture the exact category name based on P6's strict output format ---
            # P6 expects "AB: Category Name" (e.g., "US: Usability")
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE # Use MULTILINE to match from start/end of line
            )
            
            # Extract the full category name (group 1) and convert to lowercase for comparison
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
            
            # Optional: Print raw output for debugging if needed (comment out for clean runs)
            # if pred == "Failed Parsing" or pred not in VALID_NFR_LABELS:
            #    print(f"\nFailed Parsing/Invalid Pred for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}' -> Parsed: '{pred}'")

        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_nfr_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_nfr_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of NFR Accuracies:")
for model, metrics in all_models_nfr_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final NFR Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews (sampled for test).
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████| 1278/1278 [1:06:27<00:00,  3.12s/it]



✅ Classification with llama2 completed in 66.46 minutes

--- Sample of Predictions for llama2 ---
                                              review ground_truth  predicted
0  without this the video calls could potentially...     security   security
1  collects way too much unneeded information abo...     security  usability
2  why exctly do you need full read access to my ...     security  usability
3                     more private than fb messenger     security  usability
4  this app is the best message and chat service,...     security  usability

--- Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.61      0.95      0.74       431
 reliability       0.72      0.55      0.62       584
 performance       0.56      0.59      0.58       119
 portability       0.86      0.05      0.10       119
    security       0.20      0.19      0.19        16
       other       0.00      0.00      0.00         0

    accuracy     

Classifying reviews with mistral: 100%|██████████████████████████████████████████| 1278/1278 [1:04:34<00:00,  3.03s/it]



✅ Classification with mistral completed in 64.58 minutes

--- Sample of Predictions for mistral ---
                                              review ground_truth  predicted
0  without this the video calls could potentially...     security   security
1  collects way too much unneeded information abo...     security   security
2  why exctly do you need full read access to my ...     security   security
3                     more private than fb messenger     security  usability
4  this app is the best message and chat service,...     security   security

--- Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.83      0.54      0.65       404
 reliability       0.80      0.45      0.57       580
 performance       0.28      0.97      0.43       120
 portability       0.45      0.32      0.37       118
    security       0.87      0.76      0.81        17
       other       0.00      0.00      0.00         0

    accuracy  

Classifying reviews with llama3:8b: 100%|████████████████████████████████████████| 1278/1278 [1:10:10<00:00,  3.29s/it]



✅ Classification with llama3:8b completed in 70.17 minutes

--- Sample of Predictions for llama3:8b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       0.88      0.64      0.74       432
 reliability       0.83      0.54      0.65       587
 performance       0.29      0.95      0.45       121
 portability       0.52      0.34      0.41       119
    security       0.21      0.95      0.35        19
       other       0.00      0.00      0.00         0

    accuracy  

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████| 1278/1278 [1:23:10<00:00,  3.91s/it]



✅ Classification with gemma:7b completed in 83.18 minutes

--- Sample of Predictions for gemma:7b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.86      0.36      0.51       426
 reliability       0.79      0.58      0.67       585
 performance       0.32      0.87      0.47       121
 portability       0.28      0.28      0.28       119
    security       0.80      0.84      0.82        19
       other       0.00      0.00      0.00         0

    accuracy     

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████| 1278/1278 [50:22<00:00,  2.36s/it]


✅ Classification with phi3:mini completed in 50.37 minutes

--- Sample of Predictions for phi3:mini ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       0.83      0.58      0.69       275
 reliability       0.77      0.37      0.50       409
 performance       0.35      0.97      0.51       120
 portability       0.27      0.40      0.32       102
    security       0.57      0.94      0.71        18
       other       0.00      0.00      0.00         0

    accuracy  




In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
data_raw = pd.concat(sheets.values(), ignore_index=True)

# Rename columns for easier access and consistency
data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Define valid NFR categories (lowercase for standardization)
VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

# Standardize ground_truth labels to lowercase and strip whitespace
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

# Filter out rows where ground_truth is not one of our valid labels
initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

# --- TEMPORARY: Sample the data for a quick test run (e.g., 50 reviews) ---
# REMEMBER TO REMOVE OR COMMENT OUT THIS LINE FOR THE FULL DATASET RUN!
# nfr_data = nfr_data.sample(n=min(50, len(nfr_data)), random_state=42).reset_index(drop=True)
# --- END TEMPORARY SAMPLING ---


print(f"Loaded {len(nfr_data)} non-functional reviews (sampled for test).")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. NFR Definitions & Few-Shot Examples (for the Prompt) ---
# These are the definitions and examples that were used in your prompts.json P6 context
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage.",
    "portability": "Portability requirements describe the ease with which a system can be transferred from one environment to another.",
    "reliability": "Reliability requirements describe the ability of a system to perform its functions under stated conditions for a specified period of time without failure."
}

FEW_SHOT_EXAMPLES = [
    # Security Class Examples (from your prompts.json P5/P6 context)
    {"review": "without this the video calls could potentially be intercepted by hackers", "classification": "Security"},
    {"review": "collects way too much unneeded information about my private life and location without my consent", "classification": "Security"},
    {"review": "why exctly do you need full read access to my text messages on my phone", "classification": "Security"},

    # Portability Class Examples
    {"review": "this app is very great and compatible with my tablet and phone", "classification": "Portability"},
    {"review": "does not work on my new phone", "classification": "Portability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "Portability"},

    # Performance efficiency Class Examples
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "Performance"},
    {"review": "my phone overheats and battery drains very quickly when using this app", "classification": "Performance"},
    {"review": "too much lagging and freezing of the app after new update", "classification": "Performance"},

    # Reliability Class Examples
    {"review": "on my no anyone use whatsapp id , when the id open up and I have to put on the id it still don't open", "classification": "Reliability"},
    {"review": "The app stopped working after the recent update", "classification": "Reliability"},
    {"review": "Keeps crashing when I try to save a new record", "classification": "Reliability"},

    # Usability Class Examples
    {"review": "can't find a book unless i know exctly what book I want to buy", "classification": "Usability"},
    {"review": "this app needs to make it easier to add notes and save it", "classification": "Usability"},
    {"review": "very complicated and user unfriendly interface", "classification": "Usability"}
]

# Format these examples into a single string for the prompt
formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"Requirement: {ex['review']}\nClassification: {ex['classification']}\n\n"

# Define the list of NFR categories as a string for the prompt
all_nfr_labels_str = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other" # Based on your 5 classes + Other
])

# --- 4. The NFR Classification Prompt (P6 from prompts.json) ---
# Extracted directly from your prompts.json under NFR_CLASSIFICATION -> P6
nfr_classification_prompt_text = """
You are a software requirements expert. Your task is to classify user reviews into one of the following Non-Functional Requirement (NFR) types.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Examples:**
{few_shot_examples_text}

**Instructions:**
1.  Analyze the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list that the review most closely aligns with, based on the definitions and examples.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability', 'RL: Reliability', 'OT: Other'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, **prompt_kwargs) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    # Format the prompt using few-shot examples and NFR categories list
    formatted_prompt = nfr_classification_prompt_text.format(
        review_text=review_text,
        nfr_categories_list=all_nfr_labels_str, # This is used in the prompt's header
        few_shot_examples_text=prompt_kwargs.get('few_shot_examples_text', '') # Use prompt_kwargs for few-shot
    )

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0, # Keep low for consistent classification
            "num_predict": 100 # Limit output length
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180) # Increased timeout
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models ---
all_models_nfr_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting NFR Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            nfr_categories_list=all_nfr_labels_str, # Pass the categories list for prompt formatting
            few_shot_examples_text=formatted_few_shot_text # Pass the few-shot examples for prompt formatting
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # --- Regex to capture the exact category name based on P6's strict output format ---
            # P6 expects "AB: Category Name" (e.g., "US: Usability")
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE # Use MULTILINE to match from start/end of line
            )
            
            # Extract the full category name (group 1) and convert to lowercase for comparison
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
            
            # Optional: Print raw output for debugging if needed (comment out for clean runs)
            # if pred == "Failed Parsing" or pred not in VALID_NFR_LABELS:
            #    print(f"\nFailed Parsing/Invalid Pred for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}' -> Parsed: '{pred}'")

        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_nfr_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_nfr_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of NFR Accuracies:")
for model, metrics in all_models_nfr_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final NFR Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews (sampled for test).
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|█████████████████████████████████████████████| 1278/1278 [56:44<00:00,  2.66s/it]



✅ Classification with llama2 completed in 56.74 minutes

--- Sample of Predictions for llama2 ---
                                              review ground_truth  predicted
0  without this the video calls could potentially...     security   security
1  collects way too much unneeded information abo...     security  usability
2  why exctly do you need full read access to my ...     security  usability
3                     more private than fb messenger     security  usability
4  this app is the best message and chat service,...     security  usability

--- Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.60      0.94      0.74       430
 reliability       0.71      0.54      0.61       585
 performance       0.55      0.59      0.57       119
 portability       0.86      0.05      0.10       119
    security       0.20      0.18      0.19        17
       other       0.00      0.00      0.00         0

    accuracy     

Classifying reviews with mistral: 100%|████████████████████████████████████████████| 1278/1278 [55:05<00:00,  2.59s/it]



✅ Classification with mistral completed in 55.09 minutes

--- Sample of Predictions for mistral ---
                                              review ground_truth  predicted
0  without this the video calls could potentially...     security   security
1  collects way too much unneeded information abo...     security   security
2  why exctly do you need full read access to my ...     security   security
3                     more private than fb messenger     security  usability
4  this app is the best message and chat service,...     security   security

--- Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.83      0.54      0.65       404
 reliability       0.80      0.45      0.57       580
 performance       0.28      0.97      0.43       120
 portability       0.45      0.32      0.37       118
    security       0.87      0.76      0.81        17
       other       0.00      0.00      0.00         0

    accuracy  

Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████| 1278/1278 [58:20<00:00,  2.74s/it]



✅ Classification with llama3:8b completed in 58.34 minutes

--- Sample of Predictions for llama3:8b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       0.88      0.64      0.74       432
 reliability       0.83      0.54      0.65       587
 performance       0.29      0.95      0.45       121
 portability       0.52      0.34      0.41       119
    security       0.21      0.95      0.35        19
       other       0.00      0.00      0.00         0

    accuracy  

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████| 1278/1278 [1:08:46<00:00,  3.23s/it]



✅ Classification with gemma:7b completed in 68.78 minutes

--- Sample of Predictions for gemma:7b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.86      0.36      0.51       426
 reliability       0.79      0.58      0.67       585
 performance       0.32      0.87      0.47       121
 portability       0.28      0.28      0.28       119
    security       0.80      0.84      0.82        19
       other       0.00      0.00      0.00         0

    accuracy     

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████| 1278/1278 [50:11<00:00,  2.36s/it]


✅ Classification with phi3:mini completed in 50.20 minutes

--- Sample of Predictions for phi3:mini ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       0.83      0.58      0.69       275
 reliability       0.77      0.37      0.50       409
 performance       0.35      0.97      0.51       120
 portability       0.27      0.40      0.32       102
    security       0.57      0.94      0.71        18
       other       0.00      0.00      0.00         0

    accuracy  




In [2]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
data_raw = pd.concat(sheets.values(), ignore_index=True)

# Rename columns for easier access and consistency
data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Define valid NFR categories (lowercase for standardization)
VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

# Standardize ground_truth labels to lowercase and strip whitespace
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

# Filter out rows where ground_truth is not one of our valid labels
initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

# --- TEMPORARY: Sample the data for a quick test run (e.g., 50 reviews) ---
# REMEMBER TO REMOVE OR COMMENT OUT THIS LINE FOR THE FULL DATASET RUN!
# nfr_data = nfr_data.sample(n=min(50, len(nfr_data)), random_state=42).reset_index(drop=True)
# --- END TEMPORARY SAMPLING ---


print(f"Loaded {len(nfr_data)} non-functional reviews (sampled for test).")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. NFR Definitions & Few-Shot Examples (for the Prompt) ---
# These are the definitions and examples that were used in your prompts.json P6 context
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage.",
    "portability": "Portability requirements describe the ease with which a system can be transferred from one environment to another.",
    "reliability": "Reliability requirements describe the ability of a system to perform its functions under stated conditions for a specified period of time without failure."
}

FEW_SHOT_EXAMPLES = [
    # Security Class Examples (from your prompts.json P5/P6 context)
    {"review": "without this the video calls could potentially be intercepted by hackers", "classification": "Security"},
    {"review": "collects way too much unneeded information about my private life and location without my consent", "classification": "Security"},
    {"review": "why exctly do you need full read access to my text messages on my phone", "classification": "Security"},

    # Portability Class Examples
    {"review": "this app is very great and compatible with my tablet and phone", "classification": "Portability"},
    {"review": "does not work on my new phone", "classification": "Portability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "Portability"},

    # Performance efficiency Class Examples
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "Performance"},
    {"review": "my phone overheats and battery drains very quickly when using this app", "classification": "Performance"},
    {"review": "too much lagging and freezing of the app after new update", "classification": "Performance"},

    # Reliability Class Examples
    {"review": "on my no anyone use whatsapp id , when the id open up and I have to put on the id it still don't open", "classification": "Reliability"},
    {"review": "The app stopped working after the recent update", "classification": "Reliability"},
    {"review": "Keeps crashing when I try to save a new record", "classification": "Reliability"},

    # Usability Class Examples
    {"review": "can't find a book unless i know exctly what book I want to buy", "classification": "Usability"},
    {"review": "this app needs to make it easier to add notes and save it", "classification": "Usability"},
    {"review": "very complicated and user unfriendly interface", "classification": "Usability"}
]

# Format these examples into a single string for the prompt
formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"Requirement: {ex['review']}\nClassification: {ex['classification']}\n\n"

# Define the list of NFR categories as a string for the prompt
all_nfr_labels_str = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other" # Based on your 5 classes + Other
])

# --- 4. The NFR Classification Prompt (P6 from prompts.json) ---
# Extracted directly from your prompts.json under NFR_CLASSIFICATION -> P6
nfr_classification_prompt_text = """
You are a software requirements expert. Your task is to classify user reviews into one of the following Non-Functional Requirement (NFR) types.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Examples:**
{few_shot_examples_text}

**Instructions:**
1.  Analyze the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list that the review most closely aligns with, based on the definitions and examples.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability', 'RL: Reliability', 'OT: Other'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, **prompt_kwargs) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    # Format the prompt using few-shot examples and NFR categories list
    formatted_prompt = nfr_classification_prompt_text.format(
        review_text=review_text,
        nfr_categories_list=all_nfr_labels_str, # This is used in the prompt's header
        few_shot_examples_text=prompt_kwargs.get('few_shot_examples_text', '') # Use prompt_kwargs for few-shot
    )

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0, # Keep low for consistent classification
            "num_predict": 100 # Limit output length
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180) # Increased timeout
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models ---
all_models_nfr_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting NFR Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            nfr_categories_list=all_nfr_labels_str, # Pass the categories list for prompt formatting
            few_shot_examples_text=formatted_few_shot_text # Pass the few-shot examples for prompt formatting
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # --- Regex to capture the exact category name based on P6's strict output format ---
            # P6 expects "AB: Category Name" (e.g., "US: Usability")
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE # Use MULTILINE to match from start/end of line
            )
            
            # Extract the full category name (group 1) and convert to lowercase for comparison
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
            
            # Optional: Print raw output for debugging if needed (comment out for clean runs)
            # if pred == "Failed Parsing" or pred not in VALID_NFR_LABELS:
            #    print(f"\nFailed Parsing/Invalid Pred for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}' -> Parsed: '{pred}'")

        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_nfr_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_nfr_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of NFR Accuracies:")
for model, metrics in all_models_nfr_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final NFR Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews (sampled for test).
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|█████████████████████████████████████████████| 1278/1278 [58:19<00:00,  2.74s/it]



✅ Classification with llama2 completed in 58.32 minutes

--- Sample of Predictions for llama2 ---
                                              review ground_truth  predicted
0  without this the video calls could potentially...     security   security
1  collects way too much unneeded information abo...     security  usability
2  why exctly do you need full read access to my ...     security  usability
3                     more private than fb messenger     security  usability
4  this app is the best message and chat service,...     security  usability

--- Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.60      0.94      0.74       430
 reliability       0.71      0.54      0.61       585
 performance       0.55      0.59      0.57       119
 portability       0.86      0.05      0.10       119
    security       0.20      0.18      0.19        17
       other       0.00      0.00      0.00         0

    accuracy     

Classifying reviews with mistral: 100%|████████████████████████████████████████████| 1278/1278 [54:33<00:00,  2.56s/it]



✅ Classification with mistral completed in 54.56 minutes

--- Sample of Predictions for mistral ---
                                              review ground_truth  predicted
0  without this the video calls could potentially...     security   security
1  collects way too much unneeded information abo...     security   security
2  why exctly do you need full read access to my ...     security   security
3                     more private than fb messenger     security  usability
4  this app is the best message and chat service,...     security   security

--- Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.83      0.54      0.65       405
 reliability       0.80      0.45      0.58       580
 performance       0.28      0.97      0.43       120
 portability       0.45      0.32      0.38       118
    security       0.87      0.72      0.79        18
       other       0.00      0.00      0.00         0

    accuracy  

Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████| 1278/1278 [58:05<00:00,  2.73s/it]



✅ Classification with llama3:8b completed in 58.08 minutes

--- Sample of Predictions for llama3:8b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       0.88      0.64      0.74       432
 reliability       0.83      0.54      0.65       587
 performance       0.29      0.95      0.45       121
 portability       0.52      0.34      0.41       119
    security       0.21      0.95      0.35        19
       other       0.00      0.00      0.00         0

    accuracy  

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████| 1278/1278 [3:02:18<00:00,  8.56s/it]



✅ Classification with gemma:7b completed in 182.30 minutes

--- Sample of Predictions for gemma:7b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.86      0.37      0.51       427
 reliability       0.79      0.58      0.67       585
 performance       0.32      0.87      0.47       121
 portability       0.29      0.28      0.28       119
    security       0.80      0.84      0.82        19
       other       0.00      0.00      0.00         0

    accuracy    

Classifying reviews with phi3:mini: 100%|████████████████████████████████████████| 1278/1278 [5:11:16<00:00, 14.61s/it]



✅ Classification with phi3:mini completed in 311.27 minutes

--- Sample of Predictions for phi3:mini ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       0.83      0.58      0.68       275
 reliability       0.77      0.37      0.50       404
 performance       0.35      0.97      0.51       120
 portability       0.27      0.42      0.33       101
    security       0.59      0.94      0.72        18
       other       0.00      0.00      0.00         0

    accuracy 

In [7]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import os # Just in case it's needed for path resolution or similar

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b", # Assuming you pulled llama3:8b
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full Dataset) ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---")
data_file_path = "datasets/BOW_test.txt"

# Load the data, assuming 'review_text,classification' format
fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip' # Ignore malformed lines
)

# Define valid FR categories (lowercase for standardization)
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# --- Standardize labels from the file to match VALID_FR_LABELS ---
# This mapping covers common variations found in your dataset
label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other'
}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)
# --- END NEW ---

# Filter out rows where ground_truth is not one of our valid labels after mapping
initial_len = len(fr_data_raw)
fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)

if len(fr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

# --- IMPORTANT: Removed the sampling line here to process the full dataset ---
# fr_data = fr_data.sample(n=min(20, len(fr_data)), random_state=42).reset_index(drop=True)

print(f"Loaded {len(fr_data)} functional reviews from the full dataset for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. The Consistent Classification Prompt ---
classification_prompt_text = """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Determine which of the three categories (Feature Request, Bug Report, Other) it *most accurately* fits based on the provided definitions.
3.  Your final output MUST be only the category name, without any additional text, explanation, or punctuation.

**App Review Segment:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str) -> dict:
    """
    Sends a classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = classification_prompt_text.format(review_text=review_text)

    data = {
        "model": model_name, # Use the model name passed as argument
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0, # Keep low for consistent classification
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=120)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models ---
all_models_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_with_ollama_model(row['review'], current_model_name)
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # Regex to capture the exact category name after "Classification:"
            match = re.search(
                r"(?:CLASSIFICATION:\s*)?(Feature Request|Bug Report|Other)",
                predicted_raw,
                re.IGNORECASE | re.DOTALL
            )
            
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
            
            # Optional: Print raw output for debugging if needed (comment out for clean runs)
            # if pred == "Failed Parsing":
            #    print(f"\nFailed Parsing for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}'")

        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = fr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_FR_LABELS))
    ]

    print(f"\n--- Sample of Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_FR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report # Store the full report string
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL MODELS EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies:")
# Using accuracy_score here requires it to be imported. Add it to imports:
# from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import accuracy_score # Add this import
for model, metrics in all_models_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final Evaluation End ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---
Loaded 512 functional reviews from the full dataset for testing.
Sample of loaded FR data:
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████████| 512/512 [25:59<00:00,  3.05s/it]



✅ Classification with llama2 completed in 26.00 minutes

--- Sample of Predictions for llama2 ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama2 ---
                 precision    recall  f1-score   support

feature request       0.22      0.60      0.32        50
     bug report       0.72      0.92      0.80       288
          other       0.80      0.02      0.04       174

       accuracy                           0.58       512
      macro avg       0.58      0.51      0.39       512
   weighted avg       0.70      0.58      0.50 

Classifying reviews with mistral: 100%|██████████████████████████████████████████████| 512/512 [24:09<00:00,  2.83s/it]



✅ Classification with mistral completed in 24.15 minutes

--- Sample of Predictions for mistral ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for mistral ---
                 precision    recall  f1-score   support

feature request       0.43      0.58      0.50        50
     bug report       0.87      0.70      0.78       288
          other       0.64      0.78      0.70       174

       accuracy                           0.72       512
      macro avg       0.65      0.69      0.66       512
   weighted avg       0.75      0.72      0.

Classifying reviews with llama3:8b: 100%|████████████████████████████████████████████| 512/512 [26:36<00:00,  3.12s/it]



✅ Classification with llama3:8b completed in 26.61 minutes

--- Sample of Predictions for llama3:8b ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama3:8b ---
                 precision    recall  f1-score   support

feature request       0.63      0.38      0.47        50
     bug report       0.70      0.96      0.81       288
          other       0.92      0.45      0.60       174

       accuracy                           0.73       512
      macro avg       0.75      0.60      0.63       512
   weighted avg       0.77      0.73  

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████████| 512/512 [31:21<00:00,  3.67s/it]



✅ Classification with gemma:7b completed in 31.36 minutes

--- Sample of Predictions for gemma:7b ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for gemma:7b ---
                 precision    recall  f1-score   support

feature request       0.23      0.86      0.37        50
     bug report       0.85      0.82      0.83       288
          other       0.98      0.28      0.44       174

       accuracy                           0.64       512
      macro avg       0.69      0.65      0.55       512
   weighted avg       0.83      0.64     

Classifying reviews with phi3:mini: 100%|████████████████████████████████████████████| 512/512 [19:44<00:00,  2.31s/it]


✅ Classification with phi3:mini completed in 19.74 minutes

--- Sample of Predictions for phi3:mini ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for phi3:mini ---
                 precision    recall  f1-score   support

feature request       0.52      0.58      0.55        50
     bug report       0.73      0.91      0.81       288
          other       0.85      0.48      0.62       174

       accuracy                           0.73       512
      macro avg       0.70      0.66      0.66       512
   weighted avg       0.75      0.73  




In [3]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
data_raw = pd.concat(sheets.values(), ignore_index=True)

# Rename columns for easier access and consistency
data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Define valid NFR categories (lowercase for standardization)
VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

# Standardize ground_truth labels to lowercase and strip whitespace
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

# Filter out rows where ground_truth is not one of our valid labels
initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

# --- TEMPORARY: Sample the data for a quick test run (e.g., 50 reviews) ---
# REMEMBER TO REMOVE OR COMMENT OUT THIS LINE FOR THE FULL DATASET RUN!
nfr_data = nfr_data.sample(n=min(50, len(nfr_data)), random_state=42).reset_index(drop=True)
# --- END TEMPORARY SAMPLING ---


print(f"Loaded {len(nfr_data)} non-functional reviews (sampled for test).")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. NFR Definitions & Few-Shot Examples (for the Prompt) ---
# These are the definitions and examples that were used in your prompts.json P6 context
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage.",
    "portability": "Portability requirements describe the ease with which a system can be transferred from one environment to another.",
    "reliability": "Reliability requirements describe the ability of a system to perform its functions under stated conditions for a specified period of time without failure."
}

FEW_SHOT_EXAMPLES = [
    # Security Class Examples (from your prompts.json P5/P6 context)
    {"review": "without this the video calls could potentially be intercepted by hackers", "classification": "Security"},
    {"review": "collects way too much unneeded information about my private life and location without my consent", "classification": "Security"},
    {"review": "why exctly do you need full read access to my text messages on my phone", "classification": "Security"},

    # Portability Class Examples
    {"review": "this app is very great and compatible with my tablet and phone", "classification": "Portability"},
    {"review": "does not work on my new phone", "classification": "Portability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "Portability"},

    # Performance efficiency Class Examples
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "Performance"},
    {"review": "my phone overheats and battery drains very quickly when using this app", "classification": "Performance"},
    {"review": "too much lagging and freezing of the app after new update", "classification": "Performance"},

    # Reliability Class Examples
    {"review": "on my no anyone use whatsapp id , when the id open up and I have to put on the id it still don't open", "classification": "Reliability"},
    {"review": "The app stopped working after the recent update", "classification": "Reliability"},
    {"review": "Keeps crashing when I try to save a new record", "classification": "Reliability"},

    # Usability Class Examples
    {"review": "can't find a book unless i know exctly what book I want to buy", "classification": "Usability"},
    {"review": "this app needs to make it easier to add notes and save it", "classification": "Usability"},
    {"review": "very complicated and user unfriendly interface", "classification": "Usability"}
]

# Format these examples into a single string for the prompt
formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"Requirement: {ex['review']}\nClassification: {ex['classification']}\n\n"

# Define the list of NFR categories as a string for the prompt
all_nfr_labels_str = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other" # Based on your 5 classes + Other
])

# --- 4. The NFR Classification Prompt (P6 from prompts.json) ---
# Extracted directly from your prompts.json under NFR_CLASSIFICATION -> P6
nfr_classification_prompt_text = """
You are a software requirements expert. Your task is to classify user reviews into one of the following Non-Functional Requirement (NFR) types.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Examples:**
{few_shot_examples_text}

**Instructions:**
1.  Analyze the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list that the review most closely aligns with, based on the definitions and examples.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability', 'RL: Reliability', 'OT: Other'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, **prompt_kwargs) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    # Format the prompt using few-shot examples and NFR categories list
    formatted_prompt = nfr_classification_prompt_text.format(
        review_text=review_text,
        nfr_categories_list=all_nfr_labels_str, # This is used in the prompt's header
        few_shot_examples_text=prompt_kwargs.get('few_shot_examples_text', '') # Use prompt_kwargs for few-shot
    )

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0, # Keep low for consistent classification
            "num_predict": 100 # Limit output length
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180) # Increased timeout
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models ---
all_models_nfr_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting NFR Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            nfr_categories_list=all_nfr_labels_str, # Pass the categories list for prompt formatting
            few_shot_examples_text=formatted_few_shot_text # Pass the few-shot examples for prompt formatting
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # --- Regex to capture the exact category name based on P6's strict output format ---
            # P6 expects "AB: Category Name" (e.g., "US: Usability")
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE # Use MULTILINE to match from start/end of line
            )
            
            # Extract the full category name (group 1) and convert to lowercase for comparison
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
            
            # Optional: Print raw output for debugging if needed (comment out for clean runs)
            # if pred == "Failed Parsing" or pred not in VALID_NFR_LABELS:
            #    print(f"\nFailed Parsing/Invalid Pred for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}' -> Parsed: '{pred}'")

        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_nfr_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_nfr_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of NFR Accuracies:")
for model, metrics in all_models_nfr_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final NFR Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 50 non-functional reviews (sampled for test).
Sample of loaded NFR data:
                                              review ground_truth
0  it takes about 45 seconds for the page to turn...  performance
1  on my no anyone use whatsapp id , when the id ...  reliability
2  can't find a book unless i know exctly what bo...    usability
3     a paper white view would be a wonderful option    usability
4  bring it back or at least the option to turn i...    usability
----------------------------------------



Classifying reviews with llama2: 100%|█████████████████████████████████████████████████| 50/50 [02:40<00:00,  3.20s/it]



✅ Classification with llama2 completed in 2.67 minutes

--- Sample of Predictions for llama2 ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability  reliability
2  can't find a book unless i know exctly what bo...    usability    usability
3     a paper white view would be a wonderful option    usability    usability
4  bring it back or at least the option to turn i...    usability    usability

--- Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.71      0.86      0.77        14
 reliability       0.67      0.70      0.68        23
 performance       0.43      0.75      0.55         4
 portability       1.00      0.11      0.20         9
    security       0.00      0.00      0.00         0
       other       0.00      0.00      0.00         0

    ac

Classifying reviews with mistral: 100%|████████████████████████████████████████████████| 50/50 [02:34<00:00,  3.08s/it]



✅ Classification with mistral completed in 2.57 minutes

--- Sample of Predictions for mistral ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability  reliability
2  can't find a book unless i know exctly what bo...    usability    usability
3     a paper white view would be a wonderful option    usability  portability
4  bring it back or at least the option to turn i...    usability    usability

--- Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       1.00      0.38      0.56        13
 reliability       0.75      0.52      0.62        23
 performance       0.24      1.00      0.38         4
 portability       0.50      0.22      0.31         9
    security       0.00      0.00      0.00         0
       other       0.00      0.00      0.00         0

   

Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████████| 50/50 [02:38<00:00,  3.18s/it]



✅ Classification with llama3:8b completed in 2.65 minutes

--- Sample of Predictions for llama3:8b ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability  reliability
2  can't find a book unless i know exctly what bo...    usability    usability
3     a paper white view would be a wonderful option    usability    usability
4  bring it back or at least the option to turn i...    usability     security

--- Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       1.00      0.79      0.88        14
 reliability       0.83      0.65      0.73        23
 performance       0.29      1.00      0.44         4
 portability       0.60      0.33      0.43         9
    security       0.00      0.00      0.00         0
       other       0.00      0.00      0.00         

Classifying reviews with gemma:7b: 100%|███████████████████████████████████████████████| 50/50 [03:21<00:00,  4.03s/it]



✅ Classification with gemma:7b completed in 3.36 minutes

--- Sample of Predictions for gemma:7b ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability  reliability
2  can't find a book unless i know exctly what bo...    usability    usability
3     a paper white view would be a wonderful option    usability        other
4  bring it back or at least the option to turn i...    usability  reliability

--- Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.80      0.31      0.44        13
 reliability       0.77      0.74      0.76        23
 performance       0.29      1.00      0.44         4
 portability       0.67      0.22      0.33         9
    security       0.00      0.00      0.00         0
       other       0.00      0.00      0.00         0



Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████████| 50/50 [02:01<00:00,  2.44s/it]


✅ Classification with phi3:mini completed in 2.03 minutes

--- Sample of Predictions for phi3:mini ---
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability  reliability
2  can't find a book unless i know exctly what bo...    usability    usability
3     a paper white view would be a wonderful option    usability  portability
4  bring it back or at least the option to turn i...    usability    usability

--- Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       1.00      0.50      0.67        10
 reliability       0.67      0.35      0.46        17
 performance       0.25      1.00      0.40         4
 portability       0.43      0.38      0.40         8
    security       0.00      0.00      0.00         0
       other       0.00      0.00      0.00         




In [4]:
import pandas as pd

# Assuming you have just run the unified NFR classification cell with the 50-review sample.
# The 'nfr_data' DataFrame should still be in memory from that execution.

print("--- Ground Truth Distribution in the 50-Review Sample ---")
print(nfr_data['ground_truth'].value_counts())

print("\n--- Reviews labeled as 'security' in sample (if any) ---")
print(nfr_data[nfr_data['ground_truth'] == 'security'])

print("\n--- Reviews labeled as 'other' in sample (if any) ---")
print(nfr_data[nfr_data['ground_truth'] == 'other'])

--- Ground Truth Distribution in the 50-Review Sample ---
ground_truth
reliability    23
usability      14
portability     9
performance     4
Name: count, dtype: int64

--- Reviews labeled as 'security' in sample (if any) ---
Empty DataFrame
Columns: [review, ground_truth]
Index: []

--- Reviews labeled as 'other' in sample (if any) ---
Empty DataFrame
Columns: [review, ground_truth]
Index: []


In [2]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
data_raw = pd.concat(sheets.values(), ignore_index=True)

# Rename columns for easier access and consistency
data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Define valid NFR categories (lowercase for standardization)
VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

# Standardize ground_truth labels to lowercase and strip whitespace
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

# Filter out rows where ground_truth is not one of our valid labels
initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

print(f"Loaded {len(nfr_data)} non-functional reviews from the full dataset for testing.")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. NFR Definitions & Few-Shot Examples (for the Prompt) ---
# These are the definitions and examples that were used in your prompts.json P6 context
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage.",
    "portability": "Portability requirements describe the ease with which a system can be transferred from one environment to another.",
    "reliability": "Reliability requirements describe the ability of a system to perform its functions under stated conditions for a specified period of time without failure."
}

FEW_SHOT_EXAMPLES = [
    # Security Class Examples (from your prompts.json P5/P6 context)
    {"review": "without this the video calls could potentially be intercepted by hackers", "classification": "Security"},
    {"review": "collects way too much unneeded information about my private life and location without my consent", "classification": "Security"},
    {"review": "why exctly do you need full read access to my text messages on my phone", "classification": "Security"},

    # Portability Class Examples
    {"review": "this app is very great and compatible with my tablet and phone", "classification": "Portability"},
    {"review": "does not work on my new phone", "classification": "Portability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "Portability"},

    # Performance efficiency Class Examples
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "Performance"},
    {"review": "my phone overheats and battery drains very quickly when using this app", "classification": "Performance"},
    {"review": "too much lagging and freezing of the app after new update", "classification": "Performance"},

    # Reliability Class Examples
    {"review": "on my no anyone use whatsapp id , when the id open up and I have to put on the id it still don't open", "classification": "Reliability"},
    {"review": "The app stopped working after the recent update", "classification": "Reliability"},
    {"review": "Keeps crashing when I try to save a new record", "classification": "Reliability"},

    # Usability Class Examples
    {"review": "can't find a book unless i know exctly what book I want to buy", "classification": "Usability"},
    {"review": "this app needs to make it easier to add notes and save it", "classification": "Usability"},
    {"review": "very complicated and user unfriendly interface", "classification": "Usability"}
]

# Format these examples into a single string for the prompt
formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"Requirement: {ex['review']}\nClassification: {ex['classification']}\n\n"

# Define the list of NFR categories as a string for the prompt
all_nfr_labels_str = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other" # Based on your 5 classes + Other
])

# --- 4. The NFR Classification Prompt (P6 from prompts.json) ---
# Extracted directly from your prompts.json under NFR_CLASSIFICATION -> P6
nfr_classification_prompt_text = """
You are a software requirements expert. Your task is to classify user reviews into one of the following Non-Functional Requirement (NFR) types.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Examples:**
{few_shot_examples_text}

**Instructions:**
1.  Analyze the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list that the review most closely aligns with, based on the definitions and examples.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability', 'RL: Reliability', 'OT: Other'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, **prompt_kwargs) -> dict:
    """
    Sends an NFR classification request to the local Ollama model.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    # Format the prompt using few-shot examples and NFR categories list
    formatted_prompt = nfr_classification_prompt_text.format(
        review_text=review_text,
        nfr_categories_list=all_nfr_labels_str, # This is used in the prompt's header
        few_shot_examples_text=formatted_few_shot_text
    )

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0, # Keep low for consistent classification
            "num_predict": 100 # Limit output length
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180) # Increased timeout
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models ---
all_models_nfr_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting NFR Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(nfr_data.iterrows(), total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            row['review'],
            current_model_name,
            nfr_categories_list=all_nfr_labels_str, # Pass the categories list for prompt formatting
            few_shot_examples_text=formatted_few_shot_text # Pass the few-shot examples for prompt formatting
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # --- Regex to capture the exact category name based on P6's strict output format ---
            # P6 expects "AB: Category Name" (e.g., "US: Usability")
            match = re.search(
                r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
                predicted_raw,
                re.IGNORECASE | re.MULTILINE # Use MULTILINE to match from start/end of line
            )
            
            # Extract the full category name (group 1) and convert to lowercase for comparison
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
            
            # Optional: Print raw output for debugging if needed (comment out for clean runs)
            # if pred == "Failed Parsing" or pred not in VALID_NFR_LABELS:
            #    print(f"\nFailed Parsing/Invalid Pred for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}' -> Parsed: '{pred}'")

        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_NFR_LABELS))
    ]

    print(f"\n--- Sample of Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_nfr_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_nfr_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of NFR Accuracies:")
for model, metrics in all_models_nfr_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final NFR Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews from the full dataset for testing.
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2:   5%|██▍                                          | 68/1278 [03:43<1:06:08,  3.28s/it]


KeyboardInterrupt: 

In [None]:
import sys
sys.path.append("src")  # Tells Python where to look for your custom modules

from src.llm_client import LLMClient
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import time
from src.config import Config

In [None]:
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets
data = pd.concat(sheets.values(), ignore_index=True)
data = data[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

In [None]:
data.head()

In [None]:
sampled_data = pd.concat([df.sample(n=5, random_state=42) for df in sheets.values()], ignore_index=True)
data = sampled_data[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

In [None]:
data.head()

In [None]:
# --- THIS IS WHERE YOU CONTROL THE PROMPT ID ---
# To test P1 for NFR Classification:
Config.set_active_prompt_id("NFR_CLASSIFICATION", "P5") 

# To test P2 for NFR Classification, just change the line above:
# Config.set_active_prompt_id("NFR_CLASSIFICATION", "P2") 

# To test P3 for NFR Classification:
# Config.set_active_prompt_id("NFR_CLASSIFICATION", "P3") 

In [None]:
import sys
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import time
import logging

# Set up basic logging for clarity in notebook output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Adjust sys.path to ensure modules in 'src' can be imported
if "src" not in sys.path:
    sys.path.append("src")

from src.llm_client import LLMClient
from src.config import Config

# --- 1. Data Loading and Preparation ---
print("--- Loading and preparing data ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
data = pd.concat(sheets.values(), ignore_index=True)

# --- NEW: Sample the data down to 25 reviews ---
# Why: This reduces the number of LLM calls, making tests much faster.
# random_state ensures you get the same 25 reviews each time for reproducibility.
data = data.sample(n=25, random_state=42).reset_index(drop=True)
# -----------------------------------------------

# Rename columns for easier access and consistency
data = data[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Standardize ground_truth labels to lowercase and strip whitespace
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()
print(f"Loaded {len(data)} reviews.")
print("Sample of loaded data:")
print(data.head())
print("-" * 40)

# --- 2. LLM Client Initialization ---
print("--- Initializing LLM Client ---")
client = LLMClient()
if not client.test_connection():
    print("❌ LLM connection failed at initialization. Please check Ollama server.")
else:
    print("✅ LLM Client initialized and connected.")
print("-" * 40)


# --- 3. Define NFR Definitions (Required for P3 'Definition-Based' prompts) ---
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage."
    # Add more NFR definitions as needed for all your categories (e.g., "availability", "maintainability", etc.)
}

# --- Few-Shot Examples for P5 ---
# Extracted from the first 3 rows of each of your provided NFR class CSVs.
# These will be used to demonstrate the classification task to the LLM.
FEW_SHOT_EXAMPLES = [
    # Security Class Examples
    {"review": "without this the video calls could potentially be intercepted by hackers", "classification": "Security"},
    {"review": "collects way too much unneeded information about my private life and location without my consent", "classification": "Security"},
    {"review": "why exctly do you need full read access to my text messages on my phone", "classification": "Security"},

    # Portability Class Examples
    {"review": "this app is very great and compatible with my tablet and phone", "classification": "Portability"},
    {"review": "does not work on my new phone", "classification": "Portability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "Portability"},

    # Performance efficiency Class Examples
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "Performance"},
    {"review": "my phone overheats and battery drains very quickly when using this app", "classification": "Performance"},
    {"review": "too much lagging and freezing of the app after new update", "classification": "Performance"},

    # Reliability Class Examples
    {"review": "on my no anyone use whatsapp id , when the id open up and I have to put on the id it still don't open", "classification": "Reliability"},
    {"review": "The app stopped working after the recent update", "classification": "Reliability"},
    {"review": "Keeps crashing when I try to save a new record", "classification": "Reliability"},

    # Usability Class Examples
    {"review": "can't find a book unless i know exctly what book I want to buy", "classification": "Usability"},
    {"review": "this app needs to make it easier to add notes and save it", "classification": "Usability"},
    {"review": "very complicated and user unfriendly interface", "classification": "Usability"}
]

# Format these examples into a single string for the prompt
formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"Requirement: {ex['review']}\nClassification: {ex['classification']}\n\n"


# --- 4. Evaluation Function Definition ---
def evaluate_nfr_prompt_strategy(
    prompt_id: str,
    data_df: pd.DataFrame,
    client_instance: LLMClient,
    category_name: str = "NFR_CLASSIFICATION",
    **prompt_kwargs
) -> pd.DataFrame:
    """
    Evaluates the NFR classification performance using a specific prompt ID.

    Args:
        prompt_id (str): The ID of the prompt to activate from prompts.json.
        data_df (pd.DataFrame): The DataFrame containing 'review' and 'ground_truth' columns.
        client_instance (LLMClient): An initialized instance of the LLMClient.
        category_name (str): The category name for the prompt in prompts.json.
        **prompt_kwargs: Additional keyword arguments to pass to the prompt's .format() method.

    Returns:
        pd.DataFrame: A DataFrame with 'review', 'ground_truth', 'predicted' columns,
                      and a performance report for the given prompt.
    """
    
    Config.set_active_prompt_id(category_name, prompt_id)
    
    print(f"\n--- Starting evaluation for Prompt ID: {prompt_id} ---")

    predictions = []
    start_time = time.time()

    for i, row in tqdm(data_df.iterrows(), total=len(data_df), desc=f"Classifying with {prompt_id}"):
        response = client_instance.classify_nfr(row['review'], **prompt_kwargs)
        pred = response.classification if response.success else "Failed"
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {prompt_id} completed in {elapsed/60:.2f} minutes")

    results_df = data_df.copy()
    results_df['predicted'] = predictions
    
    results_df['predicted'] = results_df['predicted'].str.strip().str.lower()

    filtered_results = results_df[results_df['predicted'] != 'failed']
    
    report = classification_report(
        filtered_results['ground_truth'], 
        filtered_results['predicted'], 
        zero_division=0
    )
    
    print(f"\n--- Classification Report for Prompt ID: {prompt_id} ---")
    print(f"\n{report}")
    print(f"--- End Report for Prompt ID: {prompt_id} ---\n")

    return results_df, report

In [None]:
import sys
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import time
import logging

# Set up basic logging for clarity in notebook output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Adjust sys.path to ensure modules in 'src' can be imported
if "src" not in sys.path:
    sys.path.append("src")

from src.llm_client import LLMClient
from src.config import Config

# --- 1. Data Loading and Preparation ---
print("--- Loading and preparing data ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
data = pd.concat(sheets.values(), ignore_index=True)

# --- NEW: Sample the data down to 25 reviews ---
data = data.sample(n=25, random_state=42).reset_index(drop=True)
# -----------------------------------------------

# Rename columns for easier access and consistency
data = data[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Standardize ground_truth labels to lowercase and strip whitespace
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()
print(f"Loaded {len(data)} reviews.")
print("Sample of loaded data:")
print(data.head())
print("-" * 40)

# --- 2. LLM Client Initialization ---
print("--- Initializing LLM Client ---")
client = LLMClient()
if not client.test_connection():
    print("❌ LLM connection failed at initialization. Please check Ollama server.")
else:
    print("✅ LLM Client initialized and connected.")
print("-" * 40)


# --- 3. Define NFR Definitions & Few-Shot Examples (Required for P3 and P5) ---
# NFR Definitions
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage.",
    "portability": "Portability requirements describe the ease with which a system can be transferred from one environment to another.",
    "reliability": "Reliability requirements describe the ability of a system to perform its functions under stated conditions for a specified period of time without failure."
}

# Few-Shot Examples for P5
FEW_SHOT_EXAMPLES = [
    # Security Class Examples
    {"review": "without this the video calls could potentially be intercepted by hackers", "classification": "Security"},
    {"review": "collects way too much unneeded information about my private life and location without my consent", "classification": "Security"},
    {"review": "why exctly do you need full read access to my text messages on my phone", "classification": "Security"},

    # Portability Class Examples
    {"review": "this app is very great and compatible with my tablet and phone", "classification": "Portability"},
    {"review": "does not work on my new phone", "classification": "Portability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "Portability"},

    # Performance efficiency Class Examples
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "Performance"},
    {"review": "my phone overheats and battery drains very quickly when using this app", "classification": "Performance"},
    {"review": "too much lagging and freezing of the app after new update", "classification": "Performance"},

    # Reliability Class Examples
    {"review": "on my no anyone use whatsapp id , when the id open up and I have to put on the id it still don't open", "classification": "Reliability"},
    {"review": "The app stopped working after the recent update", "classification": "Reliability"},
    {"review": "Keeps crashing when I try to save a new record", "classification": "Reliability"},

    # Usability Class Examples
    {"review": "can't find a book unless i know exctly what book I want to buy", "classification": "Usability"},
    {"review": "this app needs to make it easier to add notes and save it", "classification": "Usability"},
    {"review": "very complicated and user unfriendly interface", "classification": "Usability"}
]

# Format these examples into a single string for the prompt
formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"Requirement: {ex['review']}\nClassification: {ex['classification']}\n\n"

# --- Define the list of NFR categories as a string for prompts P1, P4, P5 ---
# Moved this definition here so it's accessible by all evaluation calls
all_nfr_labels_str = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other" # Based on your 5 classes + Other
])
# Why: This variable is used by multiple prompts (like P1, P4, P5) to tell the LLM which categories to choose from.
# Defining it here ensures it's always available before any prompt evaluation calls.
print("-" * 40)


# --- 4. Evaluation Function Definition ---
def evaluate_nfr_prompt_strategy(
    prompt_id: str,
    data_df: pd.DataFrame,
    client_instance: LLMClient,
    category_name: str = "NFR_CLASSIFICATION",
    **prompt_kwargs
) -> pd.DataFrame:
    """
    Evaluates the NFR classification performance using a specific prompt ID.

    Args:
        prompt_id (str): The ID of the prompt to activate from prompts.json.
        data_df (pd.DataFrame): The DataFrame containing 'review' and 'ground_truth' columns.
        client_instance (LLMClient): An initialized instance of the LLMClient.
        category_name (str): The category name for the prompt in prompts.json.
        **prompt_kwargs: Additional keyword arguments to pass to the prompt's .format() method.

    Returns:
        pd.DataFrame: A DataFrame with 'review', 'ground_truth', 'predicted' columns,
                      and a performance report for the given prompt.
    """
    
    Config.set_active_prompt_id(category_name, prompt_id)
    
    print(f"\n--- Starting evaluation for Prompt ID: {prompt_id} ---")

    predictions = []
    start_time = time.time()

    for i, row in tqdm(data_df.iterrows(), total=len(data_df), desc=f"Classifying with {prompt_id}"):
        response = client_instance.classify_nfr(row['review'], **prompt_kwargs)
        pred = response.classification if response.success else "Failed"
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {prompt_id} completed in {elapsed/60:.2f} minutes")

    results_df = data_df.copy()
    results_df['predicted'] = predictions
    
    results_df['predicted'] = results_df['predicted'].str.strip().str.lower()

    filtered_results = results_df[results_df['predicted'] != 'failed']
    
    report = classification_report(
        filtered_results['ground_truth'], 
        filtered_results['predicted'], 
        zero_division=0
    )
    
    print(f"\n--- Classification Report for Prompt ID: {prompt_id} ---\n")
    print(f"{report}\n")
    print(f"--- End Report for Prompt ID: {prompt_id} ---\n")

    return results_df, report

# --- 5. Sequential Evaluation of Prompts --- 
'''
# Evaluation for P1 (Original, now corrected to your 5 classes)
print("\n========== EVALUATING PROMPT P1 (Multi-Class Selection) ==========")
results_p1_df, report_p1 = evaluate_nfr_prompt_strategy(
    prompt_id="P1",
    data_df=data,
    client_instance=client,
    # P1 now uses a hardcoded list of categories in its text,
    # so no 'nfr_categories_list' needed here for formatting,
    # but the prompt text in prompts.json must be accurate.
)
print("First 5 predictions for P1:")
print(results_p1_df.head())


# Evaluation for P2 (El-Hajjami's Zero-Shot Prompt)
# This prompt is designed for BINARY classification (e.g., Is this requirement 'security'? Yes/No)
# It requires target_classification_category.
nfr_category_for_p2_test = "security" # You can change this to any NFR category from your dataset

print("\n========== EVALUATING PROMPT P2 ==========")
results_p2_df, report_p2 = evaluate_nfr_prompt_strategy(
    prompt_id="P2",
    data_df=data,
    client_instance=client,
    target_classification_category=nfr_category_for_p2_test
)
print("First 5 predictions for P2:")
print(results_p2_df.head())


# Evaluation for P3 (Combined Alhoshan Patterns Prompt)
# This prompt is highly flexible and needs specific values for its placeholders.
# Example below uses 'usability' with a 'Definition-Based' 'is about' pattern.
print("\n========== EVALUATING PROMPT P3 (Usability - Is About Definition) ==========")
results_p3_df, report_p3 = evaluate_nfr_prompt_strategy(
    prompt_id="P3",
    data_df=data,
    client_instance=client,
    prefix_definition=f"{NFR_DEFINITIONS['usability']}. Therefore, ",
    classification_statement_or_question="this requirement is about",
    candidate_label="usability"
)
print("First 5 predictions for P3:")
print(results_p3_df.head())

'''
# Evaluation for P4 (Multi-Class Selection Prompt - placeholder version)
# This prompt uses `nfr_categories_list` as a placeholder to dynamically insert the list of categories.
all_nfr_labels_str_for_p4 = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other"
])

print("\n========== EVALUATING PROMPT P4 (Multi-Class Selection - Dynamic List) ==========")
results_p4_df, report_p4 = evaluate_nfr_prompt_strategy(
    prompt_id="P4",
    data_df=data,
    client_instance=client,
    nfr_categories_list=all_nfr_labels_str_for_p4 # Pass the list of categories as a string
)
print("First 5 predictions for P4:")
print(results_p4_df.head())


# Evaluation for P5 (Few-Shot Multi-Class Selection)
# This prompt uses both few-shot_examples_text and nfr_categories_list.
# The `formatted_few_shot_text` variable is defined earlier in the notebook.

print("\n========== EVALUATING PROMPT P5 (Few-Shot Multi-Class Selection) ==========")
results_p5_df, report_p5 = evaluate_nfr_prompt_strategy(
    prompt_id="P5",
    data_df=data,
    client_instance=client,
    nfr_categories_list=all_nfr_labels_str_for_p4, # Use the same list of categories
    few_shot_examples_text=formatted_few_shot_text # Pass the prepared few-shot examples
)
print("First 5 predictions for P5:")
print(results_p5_df.head())


print("\n--- All evaluations completed ---")

In [None]:
results_p5_df

In [None]:
import sys
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import time
import logging

# Set up basic logging for clarity in notebook output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Adjust sys.path to ensure modules in 'src' can be imported
if "src" not in sys.path:
    sys.path.append("src")

from src.llm_client import LLMClient
from src.config import Config

# --- 1. Data Loading and Preparation ---
print("--- Loading and preparing data ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
data = pd.concat(sheets.values(), ignore_index=True)

# --- NEW: Sample the data down to 25 reviews ---
data = data.sample(n=25, random_state=42).reset_index(drop=True)
# -----------------------------------------------

# Rename columns for easier access and consistency
data = data[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Standardize ground_truth labels to lowercase and strip whitespace
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()
print(f"Loaded {len(data)} reviews.")
print("Sample of loaded data:")
print(data.head())
print("-" * 40)

# --- 2. LLM Client Initialization ---
print("--- Initializing LLM Client ---")
client = LLMClient()
if not client.test_connection():
    print("❌ LLM connection failed at initialization. Please check Ollama server.")
else:
    print("✅ LLM Client initialized and connected.")
print("-" * 40)


# --- 3. Define NFR Definitions & Few-Shot Examples (Required for P5) ---
# NFR Definitions
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage.",
    "portability": "Portability requirements describe the ease with which a system can be transferred from one environment to another.",
    "reliability": "Reliability requirements describe the ability of a system to perform its functions under stated conditions for a specified period of time without failure."
}

# Few-Shot Examples for P5
FEW_SHOT_EXAMPLES = [
    # Security Class Examples
    {"review": "without this the video calls could potentially be intercepted by hackers", "classification": "Security"},
    {"review": "collects way too much unneeded information about my private life and location without my consent", "classification": "Security"},
    {"review": "why exctly do you need full read access to my text messages on my phone", "classification": "Security"},

    # Portability Class Examples
    {"review": "this app is very great and compatible with my tablet and phone", "classification": "Portability"},
    {"review": "does not work on my new phone", "classification": "Portability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "Portability"},

    # Performance efficiency Class Examples
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "Performance"},
    {"review": "my phone overheats and battery drains very quickly when using this app", "classification": "Performance"},
    {"review": "too much lagging and freezing of the app after new update", "classification": "Performance"},

    # Reliability Class Examples
    {"review": "on my no anyone use whatsapp id , when the id open up and I have to put on the id it still don't open", "classification": "Reliability"},
    {"review": "The app stopped working after the recent update", "classification": "Reliability"},
    {"review": "Keeps crashing when I try to save a new record", "classification": "Reliability"},

    # Usability Class Examples
    {"review": "can't find a book unless i know exctly what book I want to buy", "classification": "Usability"},
    {"review": "this app needs to make it easier to add notes and save it", "classification": "Usability"},
    {"review": "very complicated and user unfriendly interface", "classification": "Usability"}
]

# Format these examples into a single string for the prompt
formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"Requirement: {ex['review']}\nClassification: {ex['classification']}\n\n"

# --- Define the list of NFR categories as a string for prompts P1, P4, P5 ---
all_nfr_labels_str = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other" # Your 5 classes + 'Other'
])
print("-" * 40)


# --- 4. Evaluation Function Definition ---
def evaluate_nfr_prompt_strategy(
    prompt_id: str,
    data_df: pd.DataFrame,
    client_instance: LLMClient,
    category_name: str = "NFR_CLASSIFICATION",
    **prompt_kwargs
) -> pd.DataFrame:
    """
    Evaluates the NFR classification performance using a specific prompt ID.

    Args:
        prompt_id (str): The ID of the prompt to activate from prompts.json.
        data_df (pd.DataFrame): The DataFrame containing 'review' and 'ground_truth' columns.
        client_instance (LLMClient): An initialized instance of the LLMClient.
        category_name (str): The category name for the prompt in prompts.json.
        **prompt_kwargs: Additional keyword arguments to pass to the prompt's .format() method.

    Returns:
        pd.DataFrame: A DataFrame with 'review', 'ground_truth', 'predicted' columns,
                      and a performance report for the given prompt.
    """
    
    Config.set_active_prompt_id(category_name, prompt_id)
    
    print(f"\n--- Starting evaluation for Prompt ID: {prompt_id} ---")

    predictions = []
    start_time = time.time()

    for i, row in tqdm(data_df.iterrows(), total=len(data_df), desc=f"Classifying with {prompt_id}"):
        response = client_instance.classify_nfr(row['review'], **prompt_kwargs)
        pred = response.classification if response.success else "Failed"
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {prompt_id} completed in {elapsed/60:.2f} minutes")

    results_df = data_df.copy()
    results_df['predicted'] = predictions
    
    results_df['predicted'] = results_df['predicted'].str.strip().str.lower()

    filtered_results = results_df[results_df['predicted'] != 'failed']
    
    report = classification_report(
        filtered_results['ground_truth'], 
        filtered_results['predicted'], 
        zero_division=0
    )
    
    print(f"\n--- Classification Report for Prompt ID: {prompt_id} ---\n")
    print(f"{report}\n")
    print(f"--- End Report for Prompt ID: {prompt_id} ---\n")

    return results_df, report

In [None]:
# --- 5. Evaluation for P5 (Few-Shot Multi-Class Selection) ---
# This is the ONLY evaluation block that will run in this cell.
print("\n========== EVALUATING PROMPT P5 (Few-Shot Multi-Class Selection) ==========")
results_p5_df, report_p5 = evaluate_nfr_prompt_strategy(
    prompt_id="P5",
    data_df=data,
    client_instance=client,
    nfr_categories_list=all_nfr_labels_str, # Use the defined list of categories
    few_shot_examples_text=formatted_few_shot_text # Pass the prepared few-shot examples
)
print("First 5 predictions for P5:")
print(results_p5_df.head())

print("\n--- All evaluations completed ---")

In [None]:
results_p5_df

In [None]:
# --- Evaluation for P6 (Few-Shot Multi-Class Strict Selection with Definitions) ---
# This prompt uses hardcoded definitions and a strict output list in its text.
# It still uses 'formatted_few_shot_text' and 'all_nfr_labels_str' for general context and parsing.

print("\n========== EVALUATING PROMPT P6 (Few-Shot Multi-Class Strict Selection with Definitions) ==========")
results_p6_df, report_p6 = evaluate_nfr_prompt_strategy(
    prompt_id="P6",
    data_df=data,
    client_instance=client,
    # Pass all_nfr_labels_str as it's still needed by the prompt's instructions and for parsing
    nfr_categories_list=all_nfr_labels_str,
    few_shot_examples_text=formatted_few_shot_text
)
print("First 5 predictions for P6:")
print(results_p6_df.head())


print("\n--- All evaluations completed ---") # This line would be the final one in your notebook cell

In [None]:
results_p6_df

In [None]:
import sys
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import time
import logging
import os # For os.getenv

# --- IMPORTANT: Set your Gemini API Key as an environment variable or directly ---
# If you are pasting your key directly into src/config.py, you do NOT need this line here.
# If you prefer to use an environment variable, uncomment and set it:
# os.environ["GEMINI_API_KEY"] = "YOUR_ACTUAL_GEMINI_API_KEY_HERE"
# -------------------------------------------------------------------------------

# Set up basic logging for clarity in notebook output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logging.getLogger('src.llm_client').setLevel(logging.DEBUG) # Enable DEBUG for llm_client

# Adjust sys.path to ensure modules in 'src' can be imported
if "src" not in sys.path:
    sys.path.append("src")

from src.llm_client import LLMClient
from src.config import Config

# --- 1. Data Loading and Preparation ---
print("--- Loading and preparing data ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
data = pd.concat(sheets.values(), ignore_index=True)

# --- NEW: Sample the data down to 25 reviews ---
data = data.sample(n=25, random_state=42).reset_index(drop=True)
# -----------------------------------------------

# Rename columns for easier access and consistency
data = data[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Standardize ground_truth labels to lowercase and strip whitespace
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()
print(f"Loaded {len(data)} reviews.")
print("Sample of loaded data:")
print(data.head())
print("-" * 40)

# --- 2. LLM Client Initialization ---
print("--- Initializing LLM Client ---")
# Ensure Config.py has DEFAULT_LLM_MODEL set to a Gemini 1.5-Flash model (e.g., "models/gemini-1.5-flash-latest")
# and Config.GEMINI_API_KEY is set (either directly in config or via environment variable as setup in config).
client = LLMClient()
if not client.test_connection():
    print("❌ LLM connection failed at initialization. Please check API key, model name in config, and network.")
else:
    print("✅ LLM Client initialized and connected.")
print("-" * 40)


# --- 3. Define NFR Definitions & Few-Shot Examples (Required for P5) ---
# NFR Definitions
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage.",
    "portability": "Portability requirements describe the ease with which a system can be transferred from one environment to another.",
    "reliability": "Reliability requirements describe the ability of a system to perform its functions under stated conditions for a specified period of time without failure."
}

# Few-Shot Examples for P5
FEW_SHOT_EXAMPLES = [
    # Security Class Examples
    {"review": "without this the video calls could potentially be intercepted by hackers", "classification": "Security"},
    {"review": "collects way too much unneeded information about my private life and location without my consent", "classification": "Security"},
    {"review": "why exctly do you need full read access to my text messages on my phone", "classification": "Security"},

    # Portability Class Examples
    {"review": "this app is very great and compatible with my tablet and phone", "classification": "Portability"},
    {"review": "does not work on my new phone", "classification": "Portability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "Portability"},

    # Performance efficiency Class Examples
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "Performance"},
    {"review": "my phone overheats and battery drains very quickly when using this app", "classification": "Performance"},
    {"review": "too much lagging and freezing of the app after new update", "classification": "Performance"},

    # Reliability Class Examples
    {"review": "on my no anyone use whatsapp id , when the id open up and I have to put on the id it still don't open", "classification": "Reliability"},
    {"review": "The app stopped working after the recent update", "classification": "Reliability"},
    {"review": "Keeps crashing when I try to save a new record", "classification": "Reliability"},

    # Usability Class Examples
    {"review": "can't find a book unless i know exctly what book I want to buy", "classification": "Usability"},
    {"review": "this app needs to make it easier to add notes and save it", "classification": "Usability"},
    {"review": "very complicated and user unfriendly interface", "classification": "Usability"}
]

# Format these examples into a single string for the prompt
formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"Requirement: {ex['review']}\nClassification: {ex['classification']}\n\n"

# --- Define the list of NFR categories as a string for prompts P1, P4, P5, P6 ---
all_nfr_labels_str = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other" # Your 5 classes + 'Other'
])
print("-" * 40)


# --- 4. Evaluation Function Definition ---
def evaluate_nfr_prompt_strategy(
    prompt_id: str,
    data_df: pd.DataFrame,
    client_instance: LLMClient,
    category_name: str = "NFR_CLASSIFICATION",
    **prompt_kwargs
) -> pd.DataFrame:
    """
    Evaluates the NFR classification performance using a specific prompt ID.

    Args:
        prompt_id (str): The ID of the prompt to activate from prompts.json.
        data_df (pd.DataFrame): The DataFrame containing 'review' and 'ground_truth' columns.
        client_instance (LLMClient): An initialized instance of the LLMClient.
        category_name (str): The category name for the prompt in prompts.json.
        **prompt_kwargs: Additional keyword arguments to pass to the prompt's .format() method.

    Returns:
        pd.DataFrame: A DataFrame with 'review', 'ground_truth', 'predicted' columns,
                      and a performance report for the given prompt.
    """
    
    Config.set_active_prompt_id(category_name, prompt_id)
    
    print(f"\n--- Starting evaluation for Prompt ID: {prompt_id} ---")

    predictions = []
    start_time = time.time()

    for i, row in tqdm(data_df.iterrows(), total=len(data_df), desc=f"Classifying with {prompt_id}"):
        response = client_instance.classify_nfr(row['review'], **prompt_kwargs)
        pred = response.classification if response.success else "Failed"
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {prompt_id} completed in {elapsed/60:.2f} minutes")

    results_df = data_df.copy()
    results_df['predicted'] = predictions
    
    results_df['predicted'] = results_df['predicted'].str.strip().str.lower()

    filtered_results = results_df[results_df['predicted'] != 'failed']
    
    report = classification_report(
        filtered_results['ground_truth'], 
        filtered_results['predicted'], 
        zero_division=0
    )
    
    print(f"\n--- Classification Report for Prompt ID: {prompt_id} ---\n")
    print(f"{report}\n")
    print(f"--- End Report for Prompt ID: {prompt_id} ---\n")

    return results_df, report


In [None]:
# --- 5. Evaluation for P5 (Few-Shot Multi-Class Strict Selection) ---
# This is the ONLY evaluation block that will run in this cell.
print("\n========== EVALUATING PROMPT P5 (Few-Shot Multi-Class Strict Selection) ==========")
results_p5_df, report_p5 = evaluate_nfr_prompt_strategy(
    prompt_id="P5",
    data_df=data,
    client_instance=client,
    nfr_categories_list=all_nfr_labels_str,
    few_shot_examples_text=formatted_few_shot_text
)
print("First 5 predictions for P5:")
print(results_p5_df.head())

print("\n--- All evaluations completed ---")

In [None]:
results_p5_df

In [1]:
import sys
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import time
import logging
import os

# --- IMPORTANT: Ensure your Gemini API Key is set in src/config.py directly ---
# No need for os.environ here if you've put it directly in config.py
# -------------------------------------------------------------------------------

# Set up basic logging for clarity in notebook output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Why: Enables detailed logging from llm_client, useful for debugging LLM responses.
logging.getLogger('src.llm_client').setLevel(logging.DEBUG) 

# Adjust sys.path to ensure modules in 'src' can be imported
# Why: Allows Python to find your custom modules like llm_client and config.
if "src" not in sys.path:
    sys.path.append("src")

from src.llm_client import LLMClient
from src.config import Config

# --- 1. Data Loading and Preparation ---
print("--- Loading and preparing data ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
# Why: Merges data from all sheets of your Excel file into one DataFrame.
data = pd.concat(sheets.values(), ignore_index=True)

# --- NEW: Sample the data down to 25 reviews ---
# Why: Reduces the dataset size for faster testing and iteration.
data = data.sample(n=25, random_state=42).reset_index(drop=True)
# -----------------------------------------------

# Rename columns for easier access and consistency
# Why: Standardizes column names for easier manipulation in the code.
data = data[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Standardize ground_truth labels to lowercase and strip whitespace
# Why: Ensures consistency for comparison with LLM predictions,
# as LLMs might output varying cases or with extra spaces.
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()
print(f"Loaded {len(data)} reviews.")
print("Sample of loaded data:")
print(data.head())
print("-" * 40)

# --- 2. LLM Client Initialization ---
print("--- Initializing LLM Client ---")
# Why: Creates an instance of the LLMClient to interact with the chosen LLM (Gemini).
# It also performs a connection test to ensure the API is reachable and configured correctly.
client = LLMClient()
if not client.test_connection():
    print("❌ LLM connection failed at initialization. Please check API key in config, model name in config, and network.")
    # Consider raising an exception here to stop execution if connection is critical for further steps.
else:
    print("✅ LLM Client initialized and connected.")
print("-" * 40)


# --- 3. Define NFR Definitions & Few-Shot Examples (Required for P5/P6) ---
# Why: These provide explicit definitions and in-context learning examples for the LLM.
# They are crucial for guiding multi-class NFR classification.
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage.",
    "portability": "Portability requirements describe the ease with which a system can be transferred from one environment to another.",
    "reliability": "Reliability requirements describe the ability of a system to perform its functions under stated conditions for a specified period of time without failure."
}

FEW_SHOT_EXAMPLES = [
    # Security Class Examples
    {"review": "without this the video calls could potentially be intercepted by hackers", "classification": "Security"},
    {"review": "collects way too much unneeded information about my private life and location without my consent", "classification": "Security"},
    {"review": "why exctly do you need full read access to my text messages on my phone", "classification": "Security"},

    # Portability Class Examples
    {"review": "this app is very great and compatible with my tablet and phone", "classification": "Portability"},
    {"review": "does not work on my new phone", "classification": "Portability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "Portability"},

    # Performance efficiency Class Examples
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "Performance"},
    {"review": "my phone overheats and battery drains very quickly when using this app", "classification": "Performance"},
    {"review": "too much lagging and freezing of the app after new update", "classification": "Performance"},

    # Reliability Class Examples
    {"review": "on my no anyone use whatsapp id , when the id open up and I have to put on the id it still don't open", "classification": "Reliability"},
    {"review": "The app stopped working after the recent update", "classification": "Reliability"},
    {"review": "Keeps crashing when I try to save a new record", "classification": "Reliability"},

    # Usability Class Examples
    {"review": "can't find a book unless i know exctly what book I want to buy", "classification": "Usability"},
    {"review": "this app needs to make it easier to add notes and save it", "classification": "Usability"},
    {"review": "very complicated and user unfriendly interface", "classification": "Usability"}
]

# Format these examples into a single string for the prompt
# Why: Prompts expect a single string for few-shot examples.
formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"Requirement: {ex['review']}\nClassification: {ex['classification']}\n\n"

# --- Define the list of NFR categories as a string for prompts P1, P4, P5, P6 ---
# Why: This string is used within prompts to list the target categories for the LLM.
all_nfr_labels_str = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other" # Your 5 classes + 'Other'
])
print("-" * 40)


# --- 4. Evaluation Function Definition ---
# Why: Encapsulates the entire evaluation process (set prompt, classify, report)
# for easy reusability and clarity.
def evaluate_nfr_prompt_strategy(
    prompt_id: str,
    data_df: pd.DataFrame,
    client_instance: LLMClient,
    category_name: str = "NFR_CLASSIFICATION",
    **prompt_kwargs
) -> pd.DataFrame:
    """
    Evaluates the NFR classification performance using a specific prompt ID.

    Args:
        prompt_id (str): The ID of the prompt to activate from prompts.json.
        data_df (pd.DataFrame): The DataFrame containing 'review' and 'ground_truth' columns.
        client_instance (LLMClient): An initialized instance of the LLMClient.
        category_name (str): The category name for the prompt in prompts.json.
        **prompt_kwargs: Additional keyword arguments to pass to the prompt's .format() method.

    Returns:
        pd.DataFrame: A DataFrame with 'review', 'ground_truth', 'predicted' columns,
                      and a performance report for the given prompt.
    """
    
    # Why: Activates the specified prompt from prompts.json for this evaluation run.
    Config.set_active_prompt_id(category_name, prompt_id)
    
    print(f"\n--- Starting evaluation for Prompt ID: {prompt_id} ---")

    predictions = []
    start_time = time.time()

    # Why: Iterates through each review, calls the LLM, and collects predictions.
    for i, row in tqdm(data_df.iterrows(), total=len(data_df), desc=f"Classifying with {prompt_id}"):
        # Calls the LLMClient's classify_nfr method, passing review text and any extra prompt arguments.
        response = client_instance.classify_nfr(row['review'], **prompt_kwargs)
        pred = response.classification if response.success else "Failed"
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {prompt_id} completed in {elapsed/60:.2f} minutes")

    # Prepare results DataFrame
    # Why: Creates a new DataFrame to store predictions and ensures consistency in labeling.
    results_df = data_df.copy()
    results_df['predicted'] = predictions
    results_df['predicted'] = results_df['predicted'].str.strip().str.lower()

    # Evaluate and report performance
    # Why: Filters out failed predictions and generates a standard classification report
    # (Precision, Recall, F1-score) for quantitative evaluation.
    filtered_results = results_df[results_df['predicted'] != 'failed']
    
    report = classification_report(
        filtered_results['ground_truth'], 
        filtered_results['predicted'], 
        zero_division=0 # Prevents warnings for classes with no true or predicted samples.
    )
    
    print(f"\n--- Classification Report for Prompt ID: {prompt_id} ---\n")
    print(f"{report}\n")
    print(f"--- End Report for Prompt ID: {prompt_id} ---\n")

    return results_df, report

# --- 5. Evaluation for P5 (Few-Shot Multi-Class Strict Selection) ---
# This is the ONLY evaluation block that will run in this cell.
print("\n========== EVALUATING PROMPT P5 (Few-Shot Multi-Class Strict Selection) ==========")
results_p5_df, report_p5 = evaluate_nfr_prompt_strategy(
    prompt_id="P5",
    data_df=data,
    client_instance=client,
    nfr_categories_list=all_nfr_labels_str, # Passes the list of categories for prompt formatting
    few_shot_examples_text=formatted_few_shot_text # Passes the prepared few-shot examples for prompt formatting
)
print("First 5 predictions for P5:")
print(results_p5_df.head())

print("\n--- All evaluations completed ---")

--- Loading and preparing data ---


2025-07-17 02:05:39,426 - src.llm_client - INFO - Initialized Gemini model: models/gemini-1.5-flash-latest
2025-07-17 02:05:39,426 - src.llm_client - INFO - Initialized LLM client with https://generativelanguage.googleapis.com, model: models/gemini-1.5-flash-latest
2025-07-17 02:05:39,426 - src.llm_client - INFO - Testing Gemini API connection...


Loaded 25 reviews.
Sample of loaded data:
                                              review ground_truth
0  it takes about 45 seconds for the page to turn...  performance
1  on my no anyone use whatsapp id , when the id ...  reliability
2  can't find a book unless i know exctly what bo...    usability
3     a paper white view would be a wonderful option    usability
4  bring it back or at least the option to turn i...    usability
----------------------------------------
--- Initializing LLM Client ---


2025-07-17 02:05:39,953 - src.llm_client - INFO - Successfully connected to Gemini API using model 'models/gemini-1.5-flash-latest'.
2025-07-17 02:05:39,953 - src.config - INFO - Loaded all prompts from prompts.json
2025-07-17 02:05:39,960 - src.config - INFO - Active prompt for 'NFR_CLASSIFICATION' set to 'P5'.


✅ LLM Client initialized and connected.
----------------------------------------
----------------------------------------


--- Starting evaluation for Prompt ID: P5 ---


Classifying with P5:   0%|                                                                      | 0/25 [00:00<?, ?it/s]2025-07-17 02:05:39,968 - src.llm_client - DEBUG - Classifying NFR for requirement: it takes about 45 seconds for the page to turn or bookmark a page, rather than it being automatic...
2025-07-17 02:05:39,969 - src.llm_client - DEBUG - Making Gemini request (attempt 1)
2025-07-17 02:05:40,328 - src.llm_client - DEBUG - Gemini response received: 11 characters
2025-07-17 02:05:40,331 - src.llm_client - DEBUG - NFR Classification result: Performance - Performance...
Classifying with P5:   4%|██▍                                                           | 1/25 [00:00<00:08,  2.74it/s]2025-07-17 02:05:40,331 - src.llm_client - DEBUG - Classifying NFR for requirement: on my no anyone use whatsapp id , when the id is already available on my  phone , don't know how is ...
2025-07-17 02:05:40,336 - src.llm_client - DEBUG - Making Gemini request (attempt 1)
2025-07-17 02:05:40,7


✅ Classification with P5 completed in 0.13 minutes

--- Classification Report for Prompt ID: P5 ---

              precision    recall  f1-score   support

       other       0.00      0.00      0.00         0
 performance       0.33      1.00      0.50         2
 portability       1.00      0.33      0.50         3
 reliability       0.00      0.00      0.00         6
    security       0.00      0.00      0.00         0
   usability       0.75      0.75      0.75         4

    accuracy                           0.40        15
   macro avg       0.35      0.35      0.29        15
weighted avg       0.44      0.40      0.37        15


--- End Report for Prompt ID: P5 ---

First 5 predictions for P5:
                                              review ground_truth    predicted
0  it takes about 45 seconds for the page to turn...  performance  performance
1  on my no anyone use whatsapp id , when the id ...  reliability     security
2  can't find a book unless i know exctly what bo..




In [None]:
client = LLMClient()
predictions = []
start_time = time.time()

for i, row in tqdm(data.iterrows(), total=len(data), desc="Classifying Reviews"):
    response = client.classify_nfr(row['review'])
    pred = response.classification if response.success else "Failed"
    predictions.append(pred)

data['predicted'] = predictions
elapsed = time.time() - start_time
print(f"\n✅ Done in {elapsed/60:.2f} minutes")

In [None]:
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()
data['predicted'] = data['predicted'].str.strip().str.lower()

In [None]:
data.head()

In [None]:
# Filter failed responses
filtered = data[data['predicted'] != 'Failed']

# Evaluate
print(classification_report(filtered['ground_truth'], filtered['predicted']))

In [None]:
from sklearn.metrics import classification_report
import time
from tqdm import tqdm
import pandas as pd
import logging

# Set up basic logging if it's not already, for clarity in notebook output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

def evaluate_nfr_prompt_strategy(
    prompt_id: str,
    data_df: pd.DataFrame,
    client_instance: LLMClient,
    category_name: str = "NFR_CLASSIFICATION"
) -> pd.DataFrame:
    """
    Evaluates the NFR classification performance using a specific prompt ID.

    Args:
        prompt_id (str): The ID of the prompt to activate from prompts.json.
        data_df (pd.DataFrame): The DataFrame containing 'review' and 'ground_truth' columns.
        client_instance (LLMClient): An initialized instance of the LLMClient.
        category_name (str): The category name for the prompt in prompts.json.

    Returns:
        pd.DataFrame: A DataFrame with 'review', 'ground_truth', 'predicted' columns,
                      and a performance report for the given prompt.
    """
    
    # 1. Set the active prompt in Config
    # Why: This tells the Config class which specific prompt text to return when llm_client asks for it.
    # It ensures that all subsequent classify_nfr calls use this chosen prompt.
    Config.set_active_prompt_id(category_name, prompt_id)
    
    logger.info(f"Starting evaluation for Prompt ID: {prompt_id}")

    predictions = []
    # Why: Record start time to measure how long the classification takes.
    start_time = time.time() 

    # 2. Run the classification loop
    # Why: Iterate through each review in your dataset and get the LLM's classification.
    # tqdm provides a progress bar, useful for long runs.
    for i, row in tqdm(data_df.iterrows(), total=len(data_df), desc=f"Classifying with {prompt_id}"):
        # The classify_nfr method now implicitly uses the prompt set via Config.set_active_prompt_id
        response = client_instance.classify_nfr(row['review'])
        pred = response.classification if response.success else "Failed"
        predictions.append(pred)

    # Why: Calculate elapsed time.
    elapsed = time.time() - start_time
    logger.info(f"Classification with {prompt_id} completed in {elapsed/60:.2f} minutes")

    # 3. Prepare data for evaluation
    # Why: Create a copy to avoid modifying the original DataFrame directly.
    results_df = data_df.copy()
    results_df['predicted'] = predictions
    
    # Why: Standardize case and remove whitespace for accurate comparison.
    results_df['ground_truth'] = results_df['ground_truth'].str.strip().str.lower()
    results_df['predicted'] = results_df['predicted'].str.strip().str.lower()

    # 4. Evaluate and report performance
    # Why: Filter out any 'Failed' predictions to ensure metrics are calculated only on valid responses.
    filtered_results = results_df[results_df['predicted'] != 'failed']
    
    # Why: Generate a detailed classification report (precision, recall, f1-score, support).
    # zero_division=0 prevents warnings when a class has no true or predicted samples.
    report = classification_report(
        filtered_results['ground_truth'], 
        filtered_results['predicted'], 
        zero_division=0
    )
    
    logger.info(f"\n--- Classification Report for Prompt ID: {prompt_id} ---")
    logger.info(f"\n{report}")
    logger.info(f"--- End Report for Prompt ID: {prompt_id} ---\n")

    return results_df, report

In [None]:
import sys
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import time
import logging

# Set up basic logging for clarity in notebook output
# This ensures you see INFO messages from config and llm_client
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Adjust sys.path to ensure modules in 'src' can be imported
# Why: This tells Python where to find your 'config.py' and 'llm_client.py' files.
# It's essential if your notebook is not in the same directory as the 'src' folder.
if "src" not in sys.path:
    sys.path.append("src")

from src.llm_client import LLMClient
from src.config import Config

# --- 1. Data Loading and Preparation ---
print("--- Loading and preparing data ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
# The `sheets.values()` extracts all DataFrames from the dictionary returned by pd.read_excel
data = pd.concat(sheets.values(), ignore_index=True)

# Rename columns for easier access and consistency
data = data[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Standardize ground_truth labels to lowercase and strip whitespace
# Why: Ensures consistent comparison with predicted labels from the LLM,
# which might output in varying cases or with extra spaces.
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()
print(f"Loaded {len(data)} reviews.")
print("Sample of loaded data:")
print(data.head())
print("-" * 40)

# --- 2. LLM Client Initialization ---
# Why: Create an instance of your LLMClient. This client will be used for all classification calls.
# It also attempts to connect to your Ollama server.
print("--- Initializing LLM Client ---")
client = LLMClient()
if not client.test_connection():
    print("❌ LLM connection failed at initialization. Please check Ollama server.")
    # You might want to exit or raise an error here if connection is critical
else:
    print("✅ LLM Client initialized and connected.")
print("-" * 40)


# --- 3. Define NFR Definitions (Required for P3 'Definition-Based' prompts) ---
# Why: P3 (Combined Alhoshan Patterns) might use definitions. This dictionary stores them.
# You will need to expand this dictionary with all your specific NFR definitions based on ISO/IEC 25010
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage."
    # Add more NFR definitions as needed for all your categories (e.g., "availability", "maintainability", etc.)
}


# --- 4. Evaluation Function Definition ---
# Why: This function encapsulates the process of setting a prompt, running classification,
# and reporting results. This makes it easy to repeat for different prompts.
def evaluate_nfr_prompt_strategy(
    prompt_id: str,
    data_df: pd.DataFrame,
    client_instance: LLMClient,
    category_name: str = "NFR_CLASSIFICATION",
    **prompt_kwargs # This will capture extra arguments like target_classification_category or candidate_label
) -> pd.DataFrame:
    """
    Evaluates the NFR classification performance using a specific prompt ID.

    Args:
        prompt_id (str): The ID of the prompt to activate from prompts.json.
        data_df (pd.DataFrame): The DataFrame containing 'review' and 'ground_truth' columns.
        client_instance (LLMClient): An initialized instance of the LLMClient.
        category_name (str): The category name for the prompt in prompts.json.
        **prompt_kwargs: Additional keyword arguments to pass to the prompt's .format() method.
                         E.g., `target_classification_category` for P2, or `prefix_definition`,
                         `classification_statement_or_question`, `candidate_label` for P3.

    Returns:
        pd.DataFrame: A DataFrame with 'review', 'ground_truth', 'predicted' columns,
                      and a performance report for the given prompt.
    """
    
    # Set the active prompt in Config
    # Why: This is how you tell the system which specific prompt from prompts.json to use for this run.
    # The llm_client will then fetch this active prompt.
    Config.set_active_prompt_id(category_name, prompt_id)
    
    print(f"\n--- Starting evaluation for Prompt ID: {prompt_id} ---")

    predictions = []
    start_time = time.time() # Record start time for performance measurement

    # Run the classification loop
    # Why: tqdm provides a progress bar, which is helpful for long classification runs.
    for i, row in tqdm(data_df.iterrows(), total=len(data_df), desc=f"Classifying with {prompt_id}"):
        # The classify_nfr method now implicitly uses the prompt set via Config.set_active_prompt_id
        # and receives all extra arguments from prompt_kwargs.
        response = client_instance.classify_nfr(row['review'], **prompt_kwargs)
        pred = response.classification if response.success else "Failed"
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {prompt_id} completed in {elapsed/60:.2f} minutes")

    # Prepare results DataFrame
    results_df = data_df.copy() # Create a copy to avoid modifying the original 'data'
    results_df['predicted'] = predictions
    
    # Standardize predicted labels for accurate comparison
    results_df['predicted'] = results_df['predicted'].str.strip().str.lower()

    # Evaluate and report performance
    # Why: Filter out any 'Failed' predictions to ensure metrics are calculated only on valid responses.
    filtered_results = results_df[results_df['predicted'] != 'failed']
    
    # Generate a detailed classification report
    # zero_division=0 prevents warnings when a class has no true or predicted samples (common with small datasets).
    report = classification_report(
        filtered_results['ground_truth'], 
        filtered_results['predicted'], 
        zero_division=0
    )
    
    print(f"\n--- Classification Report for Prompt ID: {prompt_id} ---")
    print(f"\n{report}")
    print(f"--- End Report for Prompt ID: {prompt_id} ---\n")

    return results_df, report

# --- 5. Sequential Evaluation of Prompts --- 
# Now, we run the evaluation for P2 and P3. 
# Remember to ensure these prompt_id's exist in your prompts.json under NFR_CLASSIFICATION. 
# You MUST add 'P2' and 'P3' to your prompts.json as discussed.

# Evaluation for P2 (El-Hajjami's Zero-Shot Prompt)
# This prompt is designed for BINARY classification (e.g., Is this requirement 'security'? Yes/No)
# You need to specify *which* NFR category you want it to classify for each run.
# For multi-class NFR classification, you'd typically run this prompt multiple times,
# once for each NFR category, and then aggregate the "yes" responses.
# For simplicity in this example, we'll pick one category.

nfr_category_for_p2_test = "security" # <--- IMPORTANT: CHOOSE AN NFR CATEGORY HERE FOR P2 TEST

print("\n========== EVALUATING PROMPT P2 ==========")
results_p2_df, report_p2 = evaluate_nfr_prompt_strategy(
    prompt_id="P2",
    data_df=data,
    client_instance=client,
    # --- NEW ARGUMENT REQUIRED FOR P2 ---
    target_classification_category=nfr_category_for_p2_test 
    # ------------------------------------
)
print("First 5 predictions for P2:")
print(results_p2_df.head())


# Evaluation for P3 (Combined Alhoshan Patterns Prompt)
# This prompt is highly flexible and needs specific values for its placeholders,
# depending on which pattern (Assertion/Definition/Q&A, is-about/belongs-to) you're simulating.
# Example below uses 'usability' with a 'Definition-Based' 'is about' pattern.

print("\n========== EVALUATING PROMPT P3 (Usability - Is About Definition) ==========")
results_p3_df, report_p3 = evaluate_nfr_prompt_strategy(
    prompt_id="P3",
    data_df=data,
    client_instance=client,
    # --- NEW ARGUMENTS REQUIRED FOR P3 (example: "is about" assertion for Usability with definition) ---
    prefix_definition=f"{NFR_DEFINITIONS['usability']}. Therefore, ", # This will be empty string "" for Assertion-Based
    classification_statement_or_question="this requirement is about", # "this requirement belongs to", "Is this requirement about", etc.
    candidate_label="usability" # The specific NFR label to test against
    # --------------------------------------------------------------------------------------------------
)
print("First 5 predictions for P3:")
print(results_p3_df.head())

print("\n--- All evaluations completed ---")

In [None]:
# --- 5. Sequential Evaluation of Prompts --- 

# Evaluation for P2 (El-Hajjami's Zero-Shot Prompt)
nfr_category_for_p2_test = "security" # You can change this to any NFR category from your dataset

print("\n========== EVALUATING PROMPT P2 ==========")
results_p2_df, report_p2 = evaluate_nfr_prompt_strategy(
    prompt_id="P2",
    data_df=data,
    client_instance=client,
    target_classification_category=nfr_category_for_p2_test
)
print("First 5 predictions for P2:")
print(results_p2_df.head())


# Evaluation for P3 (Combined Alhoshan Patterns Prompt)
print("\n========== EVALUATING PROMPT P3 (Usability - Is About Definition) ==========")
results_p3_df, report_p3 = evaluate_nfr_prompt_strategy(
    prompt_id="P3",
    data_df=data,
    client_instance=client,
    prefix_definition=f"{NFR_DEFINITIONS['usability']}. Therefore, ",
    classification_statement_or_question="this requirement is about",
    candidate_label="usability"
)
print("First 5 predictions for P3:")
print(results_p3_df.head())

print("\n--- All evaluations completed ---")

In [None]:
# --- Evaluation for P4 (Multi-Class Selection Prompt) ---
# This prompt requires a comma-separated list of all NFR categories.
all_nfr_labels_str = ", ".join([label.capitalize() for label in Config.VALID_NFR_LABELS if label != "other"])
# Why: Creates a string like "Security, Usability, Performance" from your Config.VALID_NFR_LABELS
# It capitalizes them for better LLM input and excludes "other" for the primary list if desired,
# as "Other" is handled as a fallback in the prompt itself.

print("\n========== EVALUATING PROMPT P4 (Multi-Class Selection) ==========")
results_p4_df, report_p4 = evaluate_nfr_prompt_strategy(
    prompt_id="P4",
    data_df=data,
    client_instance=client,
    # --- NEW ARGUMENT REQUIRED FOR P4 ---
    nfr_categories_list=all_nfr_labels_str
    # ------------------------------------
)
print("First 5 predictions for P4:")
print(results_p4_df.head())

print("\n--- All evaluations completed ---") # This line would be the final one in your cell

In [None]:
# --- Evaluation for P5 (Few-Shot Multi-Class Selection Prompt) ---
# This prompt uses the pre-defined FEW_SHOT_EXAMPLES.
# The nfr_categories_list is also needed to tell the LLM what categories to choose from.

# You already have `all_nfr_labels_str` defined from the P4 evaluation block,
# which correctly lists your 5 categories (Usability, Reliability, Performance, Portability, Security).

print("\n========== EVALUATING PROMPT P5 (Few-Shot Multi-Class Selection) ==========")
results_p5_df, report_p5 = evaluate_nfr_prompt_strategy(
    prompt_id="P5",
    data_df=data,
    client_instance=client,
    nfr_categories_list=all_nfr_labels_str, # Pass the list of categories to choose from
    few_shot_examples_text=formatted_few_shot_text # Pass the prepared few-shot examples
)
print("First 5 predictions for P5:")
print(results_p5_df.head())

print("\n--- All evaluations completed ---") # This line would be the final one in your cell

In [None]:
print("All predictions for P5:")
print(results_p5_df)

In [1]:
import sys
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import time
import logging

# Set up basic logging for clarity in notebook output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Adjust sys.path to ensure modules in 'src' can be imported
if "src" not in sys.path:
    sys.path.append("src")

from src.llm_client import LLMClient
from src.config import Config

# --- 1. Data Loading and Preparation ---
print("--- Loading and preparing data ---")
excel_path = "datasets/NFR.xlsx"
sheets = pd.read_excel(excel_path, sheet_name=None)

# Combine all sheets into a single DataFrame
data = pd.concat(sheets.values(), ignore_index=True)

# --- NEW: Sample the data down to 25 reviews ---
data = data.sample(n=25, random_state=42).reset_index(drop=True)
# -----------------------------------------------

# Rename columns for easier access and consistency
data = data[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# Standardize ground_truth labels to lowercase and strip whitespace
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()
print(f"Loaded {len(data)} reviews.")
print("Sample of loaded data:")
print(data.head())
print("-" * 40)

# --- 2. LLM Client Initialization ---
print("--- Initializing LLM Client ---")
client = LLMClient()
if not client.test_connection():
    print("❌ LLM connection failed at initialization. Please check Ollama server.")
else:
    print("✅ LLM Client initialized and connected.")
print("-" * 40)


# --- 3. Define NFR Definitions & Few-Shot Examples (Required for P3 and P5) ---
# NFR Definitions
NFR_DEFINITIONS = {
    "usability": "Usability requirements define how easy a system is to use and learn for its intended users.",
    "security": "Security requirements describe how the system must protect information and data from unauthorized access or modification.",
    "performance": "Performance requirements describe how well a system performs its functions, often related to speed, efficiency, and resource usage.",
    "portability": "Portability requirements describe the ease with which a system can be transferred from one environment to another.",
    "reliability": "Reliability requirements describe the ability of a system to perform its functions under stated conditions for a specified period of time without failure."
}

# Few-Shot Examples for P5
FEW_SHOT_EXAMPLES = [
    # Security Class Examples
    {"review": "without this the video calls could potentially be intercepted by hackers", "classification": "Security"},
    {"review": "collects way too much unneeded information about my private life and location without my consent", "classification": "Security"},
    {"review": "why exctly do you need full read access to my text messages on my phone", "classification": "Security"},

    # Portability Class Examples
    {"review": "this app is very great and compatible with my tablet and phone", "classification": "Portability"},
    {"review": "does not work on my new phone", "classification": "Portability"},
    {"review": "I want to be able to use the app in my tablet not just on my phone", "classification": "Portability"},

    # Performance efficiency Class Examples
    {"review": "it takes about 45 seconds for the page to turn after I clicked a button", "classification": "Performance"},
    {"review": "my phone overheats and battery drains very quickly when using this app", "classification": "Performance"},
    {"review": "too much lagging and freezing of the app after new update", "classification": "Performance"},

    # Reliability Class Examples
    {"review": "on my no anyone use whatsapp id , when the id open up and I have to put on the id it still don't open", "classification": "Reliability"},
    {"review": "The app stopped working after the recent update", "classification": "Reliability"},
    {"review": "Keeps crashing when I try to save a new record", "classification": "Reliability"},

    # Usability Class Examples
    {"review": "can't find a book unless i know exctly what book I want to buy", "classification": "Usability"},
    {"review": "this app needs to make it easier to add notes and save it", "classification": "Usability"},
    {"review": "very complicated and user unfriendly interface", "classification": "Usability"}
]

# Format these examples into a single string for the prompt
formatted_few_shot_text = ""
for ex in FEW_SHOT_EXAMPLES:
    formatted_few_shot_text += f"Requirement: {ex['review']}\nClassification: {ex['classification']}\n\n"

# --- Define the list of NFR categories as a string for prompts P1, P4, P5 ---
# Moved this definition here so it's accessible by all evaluation calls
all_nfr_labels_str = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other" # Based on your 5 classes + Other
])
# Why: This variable is used by multiple prompts (like P1, P4, P5) to tell the LLM which categories to choose from.
# Defining it here ensures it's always available before any prompt evaluation calls.
print("-" * 40)


# --- 4. Evaluation Function Definition ---
def evaluate_nfr_prompt_strategy(
    prompt_id: str,
    data_df: pd.DataFrame,
    client_instance: LLMClient,
    category_name: str = "NFR_CLASSIFICATION",
    **prompt_kwargs
) -> pd.DataFrame:
    """
    Evaluates the NFR classification performance using a specific prompt ID.

    Args:
        prompt_id (str): The ID of the prompt to activate from prompts.json.
        data_df (pd.DataFrame): The DataFrame containing 'review' and 'ground_truth' columns.
        client_instance (LLMClient): An initialized instance of the LLMClient.
        category_name (str): The category name for the prompt in prompts.json.
        **prompt_kwargs: Additional keyword arguments to pass to the prompt's .format() method.

    Returns:
        pd.DataFrame: A DataFrame with 'review', 'ground_truth', 'predicted' columns,
                      and a performance report for the given prompt.
    """
    
    Config.set_active_prompt_id(category_name, prompt_id)
    
    print(f"\n--- Starting evaluation for Prompt ID: {prompt_id} ---")

    predictions = []
    start_time = time.time()

    for i, row in tqdm(data_df.iterrows(), total=len(data_df), desc=f"Classifying with {prompt_id}"):
        response = client_instance.classify_nfr(row['review'], **prompt_kwargs)
        pred = response.classification if response.success else "Failed"
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {prompt_id} completed in {elapsed/60:.2f} minutes")

    results_df = data_df.copy()
    results_df['predicted'] = predictions
    
    results_df['predicted'] = results_df['predicted'].str.strip().str.lower()

    filtered_results = results_df[results_df['predicted'] != 'failed']
    
    report = classification_report(
        filtered_results['ground_truth'], 
        filtered_results['predicted'], 
        zero_division=0
    )
    
    print(f"\n--- Classification Report for Prompt ID: {prompt_id} ---\n")
    print(f"{report}\n")
    print(f"--- End Report for Prompt ID: {prompt_id} ---\n")

    return results_df, report

# --- 5. Sequential Evaluation of Prompts --- 
'''
# Evaluation for P1 (Original, now corrected to your 5 classes)
print("\n========== EVALUATING PROMPT P1 (Multi-Class Selection) ==========")
results_p1_df, report_p1 = evaluate_nfr_prompt_strategy(
    prompt_id="P1",
    data_df=data,
    client_instance=client,
    # P1 now uses a hardcoded list of categories in its text,
    # so no 'nfr_categories_list' needed here for formatting,
    # but the prompt text in prompts.json must be accurate.
)
print("First 5 predictions for P1:")
print(results_p1_df.head())


# Evaluation for P2 (El-Hajjami's Zero-Shot Prompt)
# This prompt is designed for BINARY classification (e.g., Is this requirement 'security'? Yes/No)
# It requires target_classification_category.
nfr_category_for_p2_test = "security" # You can change this to any NFR category from your dataset

print("\n========== EVALUATING PROMPT P2 ==========")
results_p2_df, report_p2 = evaluate_nfr_prompt_strategy(
    prompt_id="P2",
    data_df=data,
    client_instance=client,
    target_classification_category=nfr_category_for_p2_test
)
print("First 5 predictions for P2:")
print(results_p2_df.head())


# Evaluation for P3 (Combined Alhoshan Patterns Prompt)
# This prompt is highly flexible and needs specific values for its placeholders.
# Example below uses 'usability' with a 'Definition-Based' 'is about' pattern.
print("\n========== EVALUATING PROMPT P3 (Usability - Is About Definition) ==========")
results_p3_df, report_p3 = evaluate_nfr_prompt_strategy(
    prompt_id="P3",
    data_df=data,
    client_instance=client,
    prefix_definition=f"{NFR_DEFINITIONS['usability']}. Therefore, ",
    classification_statement_or_question="this requirement is about",
    candidate_label="usability"
)
print("First 5 predictions for P3:")
print(results_p3_df.head())

'''
# Evaluation for P4 (Multi-Class Selection Prompt - placeholder version)
# This prompt uses `nfr_categories_list` as a placeholder to dynamically insert the list of categories.
all_nfr_labels_str_for_p4 = ", ".join([
    "Usability", "Reliability", "Performance", "Portability", "Security", "Other"
])

print("\n========== EVALUATING PROMPT P4 (Multi-Class Selection - Dynamic List) ==========")
results_p4_df, report_p4 = evaluate_nfr_prompt_strategy(
    prompt_id="P4",
    data_df=data,
    client_instance=client,
    nfr_categories_list=all_nfr_labels_str_for_p4 # Pass the list of categories as a string
)
print("First 5 predictions for P4:")
print(results_p4_df.head())


# Evaluation for P5 (Few-Shot Multi-Class Selection)
# This prompt uses both few-shot_examples_text and nfr_categories_list.
# The `formatted_few_shot_text` variable is defined earlier in the notebook.

print("\n========== EVALUATING PROMPT P5 (Few-Shot Multi-Class Selection) ==========")
results_p5_df, report_p5 = evaluate_nfr_prompt_strategy(
    prompt_id="P5",
    data_df=data,
    client_instance=client,
    nfr_categories_list=all_nfr_labels_str_for_p4, # Use the same list of categories
    few_shot_examples_text=formatted_few_shot_text # Pass the prepared few-shot examples
)
print("First 5 predictions for P5:")
print(results_p5_df.head())


print("\n--- All evaluations completed ---")

--- Loading and preparing data ---
Loaded 25 reviews.
Sample of loaded data:
                                              review ground_truth
0  it takes about 45 seconds for the page to turn...  performance
1  on my no anyone use whatsapp id , when the id ...  reliability
2  can't find a book unless i know exctly what bo...    usability
3     a paper white view would be a wonderful option    usability
4  bring it back or at least the option to turn i...    usability
----------------------------------------
--- Initializing LLM Client ---


2025-07-27 03:28:25,683 - src.llm_client - INFO - Initialized Gemini model: models/gemini-1.5-flash-latest
2025-07-27 03:28:25,690 - src.llm_client - INFO - Initialized LLM client with https://generativelanguage.googleapis.com, model: models/gemini-1.5-flash-latest, provider: gemini
2025-07-27 03:28:25,690 - src.llm_client - INFO - Testing Gemini API connection...
2025-07-27 03:28:26,270 - src.llm_client - INFO - Successfully connected to Gemini API using model 'models/gemini-1.5-flash-latest'.
2025-07-27 03:28:26,275 - src.config - INFO - Loaded all prompts from prompts.json
2025-07-27 03:28:26,276 - src.config - INFO - Active prompt for 'NFR_CLASSIFICATION' set to 'P4'.


✅ LLM Client initialized and connected.
----------------------------------------
----------------------------------------


--- Starting evaluation for Prompt ID: P4 ---


Classifying with P4:  60%|████████████████████████████████████▌                        | 15/25 [00:07<00:04,  2.04it/s]2025-07-27 03:28:33,654 - src.llm_client - ERROR - Gemini request failed: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 29
}
]
2025-07-27 03:28:33,731 - src.llm_client - ERROR - Gemini request failed: 429 You exceeded your current quota, please check your plan and billing d


✅ Classification with P4 completed in 0.16 minutes

--- Classification Report for Prompt ID: P4 ---

               precision    recall  f1-score   support

functionality       0.00      0.00      0.00         0
        other       0.00      0.00      0.00         0
  performance       0.33      1.00      0.50         2
  portability       1.00      0.33      0.50         3
  reliability       0.00      0.00      0.00         6
    usability       1.00      1.00      1.00         4

     accuracy                           0.47        15
    macro avg       0.39      0.39      0.33        15
 weighted avg       0.51      0.47      0.43        15


--- End Report for Prompt ID: P4 ---

First 5 predictions for P4:
                                              review ground_truth  \
0  it takes about 45 seconds for the page to turn...  performance   
1  on my no anyone use whatsapp id , when the id ...  reliability   
2  can't find a book unless i know exctly what bo...    usability   
3 

Classifying with P5:   0%|                                                                      | 0/25 [00:00<?, ?it/s]2025-07-27 03:28:35,828 - src.llm_client - ERROR - Gemini request failed: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 26
}
]
2025-07-27 03:28:35,893 - src.llm_client - ERROR - Gemini request failed: 429 You exceeded your current quota, please check your plan and billing d


✅ Classification with P5 completed in 0.08 minutes


ValueError: max() iterable argument is empty

In [None]:
client = LLMClient()

predictions = []
for i, row in data.iterrows():
    response = client.classify_nfr(row['review'])
    pred = response.classification if response.success else "Failed"
    predictions.append(pred)

data['predicted'] = predictions

# Filter failed responses
filtered = data[data['predicted'] != 'Failed']

# Evaluate
print(classification_report(filtered['ground_truth'], filtered['predicted']))