In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
# INFO-level records with timestamp, logger name and level; this file logs
# through its own named logger.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
# Base URL of the Ollama server; API paths are appended to it when a request is built.
OLLAMA_BASE_URL = "http://localhost:11434"

# Ollama model tags to evaluate, in the order they will be run.
# Swap in other locally available tags (e.g. custom GPU-optimized builds) as needed.
OLLAMA_MODELS_TO_TEST = ["llama2", "mistral", "llama3:8b", "gemma:7b", "phi3:mini"]

# --- 3. Data Loading and Preparation (Full NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"

# sheet_name=None makes pandas return every worksheet as a {name: DataFrame}
# mapping; the sheets are then stacked into one frame with a fresh index.
sheets = pd.read_excel(excel_path, sheet_name=None)
data_raw = pd.concat(sheets.values(), ignore_index=True)

# Keep only the review text and its label, under shorter column names.
data = data_raw[['User Review Sentence', 'NFR class']]
data = data.rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

# The only labels accepted as ground truth (lower-case, matching the
# normalisation applied just below).
VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

# Trim surrounding whitespace and lower-case the labels before comparing them
# against VALID_NFR_LABELS.
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data.loc[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

# Report how many rows were discarded because their label was not recognised.
if initial_len > len(nfr_data):
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

print(f"Loaded {len(nfr_data)} non-functional reviews.")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. The ZERO-SHOT Prompt ---
# Instruction-only template: category list, definitions and the required
# output format, with no worked examples. `{review_text}` is its single
# placeholder and is filled in with str.format().
zero_shot_prompt_text = """
You are a software requirements expert. Your task is to classify user reviews into one of the following Non-Functional Requirement (NFR) types.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Instructions:**
1.  Analyze the 'User Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list that the review most closely aligns with, based on the definitions.
3.  Your final output MUST be only the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability', 'RL: Reliability', 'OT: Other'). Do NOT include any other text or reasoning.

**User Review:** '''{review_text}'''

**Classification:**
"""

# The category names as one comma-separated string. The template above does
# not reference it (its only placeholder is `{review_text}`).
all_nfr_labels_str = ", ".join(
    ("Usability", "Reliability", "Performance", "Portability", "Security", "Other")
)

# --- 5. LLM Interaction Function (Re-using from your code) ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Send one NFR classification request to the local Ollama server.

    The review is substituted into ``prompt_template`` and posted to
    ``{OLLAMA_BASE_URL}/api/generate`` as a single non-streaming request with
    temperature 0.0 and at most 100 predicted tokens.

    Args:
        review_text: The user review to classify. Non-string values (for
            example NaN from an empty spreadsheet cell) are tolerated.
        model_name: Ollama model tag to run, e.g. ``"llama3:8b"``.
        prompt_template: Template containing a ``{review_text}`` placeholder.

    Returns:
        A dict with two keys:
        ``"success"`` (bool) - True when the server answered with a decodable
        JSON body; ``"raw_response"`` (str) - the model's text on success
        (empty string when the reply carries no text), otherwise a short
        description of the error.

    Errors raised while sending the request or decoding the reply are logged
    and reported through the returned dict instead of being raised.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}

    formatted_prompt = prompt_template.format(review_text=review_text)

    # Named `payload` (not `data`) so it cannot be confused with the
    # module-level DataFrame of the same name.
    payload = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=180)
        response.raise_for_status()
        result = response.json()
        # A missing or null "response" field becomes "" so callers can always
        # treat raw_response as a string.
        return {"success": True, "raw_response": result.get("response") or ""}
    except requests.exceptions.ConnectionError:
        logger.error("Failed to connect to Ollama server at %s. Is Ollama running?", OLLAMA_BASE_URL)
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        # str() first: slicing a non-string review here would raise from
        # inside the handler and abort the whole evaluation run.
        logger.error(
            "Ollama request timed out for review: '%s...' with model %s",
            str(review_text)[:50],
            model_name,
        )
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        # raise_for_status() is the only source of HTTPError here, so
        # `response` is always bound at this point.
        logger.error("HTTP error occurred: %s - %s with model %s", http_err, response.text, model_name)
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        # Last-resort boundary (e.g. a body that is not valid JSON): report
        # the failure for this review and let the caller continue.
        logger.error("An unexpected error occurred during Ollama call: %s with model %s", e, model_name)
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for ZERO-SHOT ---
# Sentinels stored in the 'predicted' column when no label was obtained.
# Neither is a member of VALID_NFR_LABELS, so an `isin` check is enough to
# separate them from real predictions.
REQUEST_FAILED_LABEL = "Failed"          # the Ollama call itself failed
PARSE_FAILED_LABEL = "Failed Parsing"    # reply did not match the required format

# Compiled once instead of on every review. With MULTILINE, any single line of
# the reply that consists exactly of "<abbr>: <Category>" is accepted.
NFR_RESPONSE_PATTERN = re.compile(
    r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
    re.IGNORECASE | re.MULTILINE,
)

# Per-model results. 'accuracy' and 'report' are computed on usable
# predictions only; 'accuracy_all' counts every failure as a wrong answer so
# models with different failure rates can be compared fairly.
all_models_zero_shot_results = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting ZERO-SHOT Classification Evaluation for Model: {current_model_name} {'='*20}")

    predictions = []
    start_time = time.time()

    for review in tqdm(nfr_data['review'], total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            review,
            current_model_name,
            prompt_template=zero_shot_prompt_text  # Use the zero-shot prompt
        )

        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            match = NFR_RESPONSE_PATTERN.search(predicted_raw)
            pred = match.group(1).strip().lower() if match else PARSE_FAILED_LABEL
        else:
            pred = REQUEST_FAILED_LABEL
            # str() guards against non-string review cells (e.g. NaN).
            logger.warning(
                "Classification failed for review: '%s...' with model %s",
                str(review)[:50],
                current_model_name,
            )

        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ ZERO-SHOT Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    # Only rows whose prediction is a real label can enter the sklearn report.
    is_valid_prediction = results_df_current_model['predicted'].isin(VALID_NFR_LABELS)
    filtered_results = results_df_current_model[is_valid_prediction]

    # Make the size of the excluded part explicit: the report below is
    # computed on a model-specific subset, which is otherwise invisible.
    n_total = len(results_df_current_model)
    n_valid = len(filtered_results)
    n_request_failed = int((results_df_current_model['predicted'] == REQUEST_FAILED_LABEL).sum())
    n_parse_failed = int((results_df_current_model['predicted'] == PARSE_FAILED_LABEL).sum())
    coverage = n_valid / n_total if n_total else 0.0

    # A sentinel never equals a ground-truth label, so this is accuracy over
    # every review with failures counted as misclassifications.
    is_correct = results_df_current_model['predicted'] == results_df_current_model['ground_truth']
    accuracy_all = float(is_correct.mean()) if n_total else 0.0

    print(f"\n--- Sample of ZERO-SHOT Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(
        f"\nUsable predictions: {n_valid}/{n_total} ({coverage:.1%}) | "
        f"request failures: {n_request_failed} | unparseable replies: {n_parse_failed}"
    )

    print(f"\n--- ZERO-SHOT Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        accuracy = accuracy_score(filtered_results['ground_truth'], filtered_results['predicted'])
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        report = "No valid predictions."
        accuracy = 0.0

    all_models_zero_shot_results[current_model_name] = {
        'accuracy': accuracy,                      # on usable predictions only
        'report': report,
        'accuracy_all': accuracy_all,              # failures counted as wrong
        'coverage': coverage,                      # share of usable predictions
        'n_request_failed': n_request_failed,
        'n_parse_failed': n_parse_failed,
        'predictions': results_df_current_model,   # kept so a long run is not lost
    }

    print(f"\n{'='*20} ZERO-SHOT Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL ZERO-SHOT MODELS EVALUATION COMPLETE ==========\n")
print("Summary of ZERO-SHOT Accuracies:")
for model, metrics in all_models_zero_shot_results.items():
    print(
        f"{model}: Accuracy = {metrics['accuracy']:.2f} "
        f"(usable predictions only; coverage = {metrics['coverage']:.1%}, "
        f"accuracy over all reviews = {metrics['accuracy_all']:.2f})"
    )

print("\n--- Final ZERO-SHOT Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews.
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████| 1278/1278 [2:29:37<00:00,  7.02s/it]



✅ ZERO-SHOT Classification with llama2 completed in 149.62 minutes

--- Sample of ZERO-SHOT Predictions for llama2 ---
                                              review ground_truth    predicted
0  without this the video calls could potentially...     security     security
1  collects way too much unneeded information abo...     security  performance
2  why exctly do you need full read access to my ...     security    usability
3                     more private than fb messenger     security  performance
4  this app is the best message and chat service,...     security    usability

--- ZERO-SHOT Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.43      0.99      0.60       431
 reliability       0.93      0.09      0.16       585
 performance       0.37      0.66      0.47       121
 portability       0.00      0.00      0.00       118
    security       0.89      0.42      0.57        19
       other       0.00      

Classifying reviews with mistral: 100%|██████████████████████████████████████████| 1278/1278 [4:06:36<00:00, 11.58s/it]



✅ ZERO-SHOT Classification with mistral completed in 246.61 minutes

--- Sample of ZERO-SHOT Predictions for mistral ---
                                              review ground_truth  \
0  without this the video calls could potentially...     security   
1  collects way too much unneeded information abo...     security   
2  why exctly do you need full read access to my ...     security   
3                     more private than fb messenger     security   
4  this app is the best message and chat service,...     security   

        predicted  
0        security  
1        security  
2        security  
3        security  
4  Failed Parsing  

--- ZERO-SHOT Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.85      0.53      0.65       379
 reliability       0.81      0.38      0.51       532
 performance       0.28      0.95      0.44       121
 portability       0.58      0.24      0.34       109
    security       

Classifying reviews with llama3:8b: 100%|████████████████████████████████████████| 1278/1278 [1:30:10<00:00,  4.23s/it]



✅ ZERO-SHOT Classification with llama3:8b completed in 90.17 minutes

--- Sample of ZERO-SHOT Predictions for llama3:8b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- ZERO-SHOT Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       0.87      0.68      0.76       432
 reliability       0.85      0.41      0.55       587
 performance       0.24      0.97      0.38       121
 portability       0.39      0.23      0.29       119
    security       0.20      0.95      0.34        19
       other       0.00      0.00      

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████| 1278/1278 [1:23:39<00:00,  3.93s/it]



✅ ZERO-SHOT Classification with gemma:7b completed in 83.66 minutes

--- Sample of ZERO-SHOT Predictions for gemma:7b ---
                                              review ground_truth    predicted
0  without this the video calls could potentially...     security     security
1  collects way too much unneeded information abo...     security  portability
2  why exctly do you need full read access to my ...     security     security
3                     more private than fb messenger     security     security
4  this app is the best message and chat service,...     security     security

--- ZERO-SHOT Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.87      0.50      0.64       430
 reliability       0.87      0.33      0.48       582
 performance       0.19      0.98      0.31       121
 portability       0.48      0.19      0.27       118
    security       0.65      0.89      0.76        19
       other       0.00 

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████| 1278/1278 [50:40<00:00,  2.38s/it]


✅ ZERO-SHOT Classification with phi3:mini completed in 50.67 minutes

--- Sample of ZERO-SHOT Predictions for phi3:mini ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- ZERO-SHOT Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       0.85      0.50      0.63       296
 reliability       0.80      0.36      0.50       475
 performance       0.43      0.96      0.59       119
 portability       0.19      0.45      0.27       109
    security       0.37      0.95      0.53        19
       other       0.00      0.00      


