In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
# Timestamped INFO-level logging so failed requests can be traced per review and model.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
# Base URL of the local Ollama HTTP server.
OLLAMA_BASE_URL = "http://localhost:11434"

# Ollama model tags to evaluate.
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini",
]

# Number of reviews to sample for a quick pilot run (e.g. 30).
# Leave as None to evaluate the full dataset.
PILOT_SAMPLE_SIZE = None

# --- 3. Data Loading and Preparation (NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
# sheet_name=None loads every worksheet; the sheets are then stacked into one frame.
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

# Rows without review text cannot be classified: they would be sent to the model as
# the literal text "nan" and would break the string slicing used in log messages.
has_review = data['review'].notna()
if not has_review.all():
    print(f"Warning: Removed {int((~has_review).sum())} rows with missing review text.")
data = data[has_review].copy()
data['review'] = data['review'].astype(str)

# Normalise labels so they compare equal to the lowercase entries of VALID_NFR_LABELS.
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

# Optional pilot sampling, driven by PILOT_SAMPLE_SIZE instead of (un)commenting code,
# so the message below always reflects what was actually loaded.
if PILOT_SAMPLE_SIZE is not None:
    nfr_data = nfr_data.sample(
        n=min(PILOT_SAMPLE_SIZE, len(nfr_data)), random_state=42
    ).reset_index(drop=True)
    run_scope = "sampled for pilot run"
else:
    run_scope = "full dataset"

print(f"Loaded {len(nfr_data)} non-functional reviews ({run_scope}).")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. Hybrid Prompt Template (Role-Based + Few-Shot + Constraint-Based) ---
# Worked examples shown to the model. Each label already uses the
# "<abbreviation>: <category name>" output format the prompt asks for.
FEW_SHOT_EXAMPLES = [
    {
        "review": "it takes about 45 seconds for the page to turn after I clicked a button",
        "classification": "PE: Performance",
    },
    {
        "review": "keeps crashing when I try to save a new record",
        "classification": "RL: Reliability",
    },
    {
        "review": "I want to be able to use the app in my tablet not just on my phone",
        "classification": "PO: Portability",
    },
]

# Render each example as an "App Review / Classification" pair followed by a blank line.
formatted_few_shot_text = "".join(
    f"App Review: {example['review']}\nClassification: {example['classification']}\n\n"
    for example in FEW_SHOT_EXAMPLES
)

# str.format() template with two placeholders:
# {few_shot_examples_text} and {review_text}.
hybrid_prompt_text = """
You are a highly skilled software requirements expert, specializing in the classification of user feedback. Your task is to precisely classify the provided app review into one of the following Non-Functional Requirement (NFR) types and output the result in a strictly defined format.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Examples:**
{few_shot_examples_text}

**Instructions:**
1.  Read the 'App Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list based on the definitions and examples.
3.  Your final output MUST be ONLY the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability'). Do NOT include any other text, reasoning, or punctuation.

**App Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str, few_shot_examples_text: str) -> dict:
    """
    Send one NFR classification request to the local Ollama server.

    Args:
        review_text: The app review to classify. Non-string values (e.g. NaN read
            from a spreadsheet) are coerced with str() so logging cannot fail on them.
        model_name: Ollama model tag to query (e.g. "mistral").
        prompt_template: str.format() template containing the placeholders
            {review_text} and {few_shot_examples_text}.
        few_shot_examples_text: Pre-rendered few-shot examples inserted into the prompt.

    Returns:
        A dict with two keys:
            "success" (bool): True when the HTTP call and JSON decoding succeeded.
            "raw_response" (str): the model's text on success, otherwise a short
                description of the error.
        Request failures are reported through this dict rather than raised.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}

    # str.format() would stringify the value anyway; doing it up front keeps the
    # slicing in the timeout log message below safe for non-string input.
    review_text = str(review_text)

    formatted_prompt = prompt_template.format(
        review_text=review_text,
        few_shot_examples_text=few_shot_examples_text
    )

    payload = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error("Failed to connect to Ollama server at %s. Is Ollama running?", OLLAMA_BASE_URL)
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error("Ollama request timed out for review: '%s...' with model %s", review_text[:50], model_name)
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        # Take the body from the exception's own response object instead of relying
        # on the local variable having been bound.
        error_body = http_err.response.text if http_err.response is not None else ""
        logger.error("HTTP error occurred: %s - %s with model %s", http_err, error_body, model_name)
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        # logger.exception keeps the traceback, which a plain error() call would drop.
        logger.exception("An unexpected error occurred during Ollama call: %s with model %s", e, model_name)
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Hybrid Prompting ---
# Per-model metrics, keyed by model name.
all_models_hybrid_results = {}

# Compiled once instead of once per review. Matches a line consisting solely of
# "<abbreviation>: <category name>" (case-insensitive) and captures the category name.
NFR_RESPONSE_PATTERN = re.compile(
    r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
    re.IGNORECASE | re.MULTILINE
)

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting HYBRID Classification Evaluation for Model: {current_model_name} {'='*20}")

    predictions = []
    start_time = time.time()

    for review in tqdm(nfr_data['review'], total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            review,
            current_model_name,
            prompt_template=hybrid_prompt_text,
            few_shot_examples_text=formatted_few_shot_text
        )

        if response_data["success"]:
            match = NFR_RESPONSE_PATTERN.search(response_data["raw_response"].strip())
            # "Failed Parsing" marks a reply that did not follow the required format.
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            # "Failed" marks a request that never produced a model reply.
            pred = "Failed"
            # str() guards the slice against non-string review values.
            logger.warning(f"Classification failed for review: '{str(review)[:50]}...' with model {current_model_name}")

        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ HYBRID Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    # The sentinels "Failed" / "Failed Parsing" are not valid labels, so the
    # membership test alone removes them from the scored subset.
    filtered_results = results_df_current_model[
        results_df_current_model['predicted'].isin(VALID_NFR_LABELS)
    ]

    # Coverage figures: without them, metrics of different models are computed on
    # different (silently reduced) subsets and cannot be compared.
    n_total = len(results_df_current_model)
    n_scored = len(filtered_results)
    n_excluded = n_total - n_scored
    coverage = n_scored / n_total if n_total else 0.0
    n_correct = int((filtered_results['ground_truth'] == filtered_results['predicted']).sum())
    # Accuracy over every review, counting failed/unparseable predictions as errors.
    accuracy_all_reviews = n_correct / n_total if n_total else 0.0

    print(f"\n--- Sample of HYBRID Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- HYBRID Classification Report for {current_model_name} ---")
    print(
        f"Scored {n_scored}/{n_total} reviews; {n_excluded} excluded "
        f"(failed request or unparseable response)."
    )
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_hybrid_results[current_model_name] = {
            # Accuracy on the scored subset only (same meaning as before).
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report,
            'coverage': coverage,
            'n_excluded': n_excluded,
            'accuracy_all_reviews': accuracy_all_reviews
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_hybrid_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions.",
            'coverage': coverage,
            'n_excluded': n_excluded,
            'accuracy_all_reviews': accuracy_all_reviews
        }

    print(f"\n{'='*20} HYBRID Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL HYBRID NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of HYBRID NFR Accuracies:")
for model, metrics in all_models_hybrid_results.items():
    print(
        f"{model}: Accuracy = {metrics['accuracy']:.2f} "
        f"(scored {metrics['coverage']:.1%} of reviews; "
        f"accuracy counting failures as errors = {metrics['accuracy_all_reviews']:.2f})"
    )

print("\n--- Final HYBRID Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews (sampled for pilot run).
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████| 1278/1278 [1:08:45<00:00,  3.23s/it]



✅ HYBRID Classification with llama2 completed in 68.76 minutes

--- Sample of HYBRID Predictions for llama2 ---
                                              review ground_truth  predicted
0  without this the video calls could potentially...     security   security
1  collects way too much unneeded information abo...     security      other
2  why exctly do you need full read access to my ...     security      other
3                     more private than fb messenger     security      other
4  this app is the best message and chat service,...     security  usability

--- HYBRID Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.54      0.77      0.64       431
 reliability       0.76      0.33      0.46       587
 performance       0.66      0.45      0.53       121
 portability       0.00      0.00      0.00       119
    security       0.14      0.21      0.17        19
       other       0.00      0.00      0.00        

Classifying reviews with mistral: 100%|██████████████████████████████████████████| 1278/1278 [1:02:09<00:00,  2.92s/it]



✅ HYBRID Classification with mistral completed in 62.16 minutes

--- Sample of HYBRID Predictions for mistral ---
                                              review ground_truth  \
0  without this the video calls could potentially...     security   
1  collects way too much unneeded information abo...     security   
2  why exctly do you need full read access to my ...     security   
3                     more private than fb messenger     security   
4  this app is the best message and chat service,...     security   

        predicted  
0        security  
1        security  
2        security  
3        security  
4  Failed Parsing  

--- HYBRID Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.76      0.64      0.70       410
 reliability       0.81      0.33      0.47       537
 performance       0.34      0.91      0.50       117
 portability       0.68      0.21      0.32       108
    security       0.11      

Classifying reviews with llama3:8b: 100%|████████████████████████████████████████| 1278/1278 [1:06:00<00:00,  3.10s/it]



✅ HYBRID Classification with llama3:8b completed in 66.02 minutes

--- Sample of HYBRID Predictions for llama3:8b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- HYBRID Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       0.81      0.11      0.20       432
 reliability       0.81      0.41      0.54       587
 performance       0.47      0.73      0.57       121
 portability       0.66      0.23      0.34       119
    security       0.07      0.89      0.12        19
       other       0.00      0.00      0.00     

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████| 1278/1278 [1:20:41<00:00,  3.79s/it]



✅ HYBRID Classification with gemma:7b completed in 80.69 minutes

--- Sample of HYBRID Predictions for gemma:7b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- HYBRID Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.76      0.09      0.17       399
 reliability       0.79      0.21      0.33       582
 performance       0.58      0.72      0.64       121
 portability       0.43      0.29      0.35       119
    security       0.04      0.95      0.07        19
       other       0.00      0.00      0.00        

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████| 1278/1278 [49:28<00:00,  2.32s/it]


✅ HYBRID Classification with phi3:mini completed in 49.47 minutes

--- Sample of HYBRID Predictions for phi3:mini ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- HYBRID Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       0.90      0.15      0.26       414
 reliability       0.80      0.49      0.61       580
 performance       0.59      0.82      0.69       120
 portability       0.64      0.32      0.43       118
    security       0.18      0.95      0.31        19
       other       0.00      0.00      0.00     




In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
# Timestamped INFO-level logging so failed requests can be traced per review and model.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
# Base URL of the local Ollama HTTP server.
OLLAMA_BASE_URL = "http://localhost:11434"

# Ollama model tags to evaluate.
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini",
]

# Number of reviews to sample for a quick pilot run (e.g. 30).
# Leave as None to evaluate the full dataset.
PILOT_SAMPLE_SIZE = None

# --- 3. Data Loading and Preparation (NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
# sheet_name=None loads every worksheet; the sheets are then stacked into one frame.
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

# Rows without review text cannot be classified: they would be sent to the model as
# the literal text "nan" and would break the string slicing used in log messages.
has_review = data['review'].notna()
if not has_review.all():
    print(f"Warning: Removed {int((~has_review).sum())} rows with missing review text.")
data = data[has_review].copy()
data['review'] = data['review'].astype(str)

# Normalise labels so they compare equal to the lowercase entries of VALID_NFR_LABELS.
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

# Optional pilot sampling, driven by PILOT_SAMPLE_SIZE instead of (un)commenting code,
# so the message below always reflects what was actually loaded.
if PILOT_SAMPLE_SIZE is not None:
    nfr_data = nfr_data.sample(
        n=min(PILOT_SAMPLE_SIZE, len(nfr_data)), random_state=42
    ).reset_index(drop=True)
    run_scope = "sampled for pilot run"
else:
    run_scope = "full dataset"

print(f"Loaded {len(nfr_data)} non-functional reviews ({run_scope}).")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. Hybrid Prompt Template (Role-Based + Few-Shot + Constraint-Based) ---
# Worked examples shown to the model. Each label already uses the
# "<abbreviation>: <category name>" output format the prompt asks for.
FEW_SHOT_EXAMPLES = [
    {
        "review": "it takes about 45 seconds for the page to turn after I clicked a button",
        "classification": "PE: Performance",
    },
    {
        "review": "keeps crashing when I try to save a new record",
        "classification": "RL: Reliability",
    },
    {
        "review": "I want to be able to use the app in my tablet not just on my phone",
        "classification": "PO: Portability",
    },
]

# Render each example as an "App Review / Classification" pair followed by a blank line.
formatted_few_shot_text = "".join(
    f"App Review: {example['review']}\nClassification: {example['classification']}\n\n"
    for example in FEW_SHOT_EXAMPLES
)

# str.format() template with two placeholders:
# {few_shot_examples_text} and {review_text}.
hybrid_prompt_text = """
You are a highly skilled software requirements expert, specializing in the classification of user feedback. Your task is to precisely classify the provided app review into one of the following Non-Functional Requirement (NFR) types and output the result in a strictly defined format.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Examples:**
{few_shot_examples_text}

**Instructions:**
1.  Read the 'App Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list based on the definitions and examples.
3.  Your final output MUST be ONLY the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability'). Do NOT include any other text, reasoning, or punctuation.

**App Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str, few_shot_examples_text: str) -> dict:
    """
    Send one NFR classification request to the local Ollama server.

    Args:
        review_text: The app review to classify. Non-string values (e.g. NaN read
            from a spreadsheet) are coerced with str() so logging cannot fail on them.
        model_name: Ollama model tag to query (e.g. "mistral").
        prompt_template: str.format() template containing the placeholders
            {review_text} and {few_shot_examples_text}.
        few_shot_examples_text: Pre-rendered few-shot examples inserted into the prompt.

    Returns:
        A dict with two keys:
            "success" (bool): True when the HTTP call and JSON decoding succeeded.
            "raw_response" (str): the model's text on success, otherwise a short
                description of the error.
        Request failures are reported through this dict rather than raised.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}

    # str.format() would stringify the value anyway; doing it up front keeps the
    # slicing in the timeout log message below safe for non-string input.
    review_text = str(review_text)

    formatted_prompt = prompt_template.format(
        review_text=review_text,
        few_shot_examples_text=few_shot_examples_text
    )

    payload = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=180)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error("Failed to connect to Ollama server at %s. Is Ollama running?", OLLAMA_BASE_URL)
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error("Ollama request timed out for review: '%s...' with model %s", review_text[:50], model_name)
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        # Take the body from the exception's own response object instead of relying
        # on the local variable having been bound.
        error_body = http_err.response.text if http_err.response is not None else ""
        logger.error("HTTP error occurred: %s - %s with model %s", http_err, error_body, model_name)
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        # logger.exception keeps the traceback, which a plain error() call would drop.
        logger.exception("An unexpected error occurred during Ollama call: %s with model %s", e, model_name)
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Hybrid Prompting ---
# Per-model metrics, keyed by model name.
all_models_hybrid_results = {}

# Compiled once instead of once per review. Matches a line consisting solely of
# "<abbreviation>: <category name>" (case-insensitive) and captures the category name.
NFR_RESPONSE_PATTERN = re.compile(
    r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
    re.IGNORECASE | re.MULTILINE
)

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting HYBRID Classification Evaluation for Model: {current_model_name} {'='*20}")

    predictions = []
    start_time = time.time()

    for review in tqdm(nfr_data['review'], total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            review,
            current_model_name,
            prompt_template=hybrid_prompt_text,
            few_shot_examples_text=formatted_few_shot_text
        )

        if response_data["success"]:
            match = NFR_RESPONSE_PATTERN.search(response_data["raw_response"].strip())
            # "Failed Parsing" marks a reply that did not follow the required format.
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            # "Failed" marks a request that never produced a model reply.
            pred = "Failed"
            # str() guards the slice against non-string review values.
            logger.warning(f"Classification failed for review: '{str(review)[:50]}...' with model {current_model_name}")

        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ HYBRID Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    # The sentinels "Failed" / "Failed Parsing" are not valid labels, so the
    # membership test alone removes them from the scored subset.
    filtered_results = results_df_current_model[
        results_df_current_model['predicted'].isin(VALID_NFR_LABELS)
    ]

    # Coverage figures: without them, metrics of different models are computed on
    # different (silently reduced) subsets and cannot be compared.
    n_total = len(results_df_current_model)
    n_scored = len(filtered_results)
    n_excluded = n_total - n_scored
    coverage = n_scored / n_total if n_total else 0.0
    n_correct = int((filtered_results['ground_truth'] == filtered_results['predicted']).sum())
    # Accuracy over every review, counting failed/unparseable predictions as errors.
    accuracy_all_reviews = n_correct / n_total if n_total else 0.0

    print(f"\n--- Sample of HYBRID Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- HYBRID Classification Report for {current_model_name} ---")
    print(
        f"Scored {n_scored}/{n_total} reviews; {n_excluded} excluded "
        f"(failed request or unparseable response)."
    )
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_hybrid_results[current_model_name] = {
            # Accuracy on the scored subset only (same meaning as before).
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report,
            'coverage': coverage,
            'n_excluded': n_excluded,
            'accuracy_all_reviews': accuracy_all_reviews
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_hybrid_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions.",
            'coverage': coverage,
            'n_excluded': n_excluded,
            'accuracy_all_reviews': accuracy_all_reviews
        }

    print(f"\n{'='*20} HYBRID Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL HYBRID NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of HYBRID NFR Accuracies:")
for model, metrics in all_models_hybrid_results.items():
    print(
        f"{model}: Accuracy = {metrics['accuracy']:.2f} "
        f"(scored {metrics['coverage']:.1%} of reviews; "
        f"accuracy counting failures as errors = {metrics['accuracy_all_reviews']:.2f})"
    )

print("\n--- Final HYBRID Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews (sampled for pilot run).
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|█████████████████████████████████████████████| 1278/1278 [56:57<00:00,  2.67s/it]



✅ HYBRID Classification with llama2 completed in 56.96 minutes

--- Sample of HYBRID Predictions for llama2 ---
                                              review ground_truth  predicted
0  without this the video calls could potentially...     security   security
1  collects way too much unneeded information abo...     security      other
2  why exctly do you need full read access to my ...     security      other
3                     more private than fb messenger     security      other
4  this app is the best message and chat service,...     security  usability

--- HYBRID Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.54      0.78      0.64       432
 reliability       0.75      0.32      0.45       587
 performance       0.65      0.45      0.53       121
 portability       0.00      0.00      0.00       119
    security       0.15      0.21      0.18        19
       other       0.00      0.00      0.00        

Classifying reviews with mistral: 100%|████████████████████████████████████████████| 1278/1278 [53:14<00:00,  2.50s/it]



✅ HYBRID Classification with mistral completed in 53.24 minutes

--- Sample of HYBRID Predictions for mistral ---
                                              review ground_truth  \
0  without this the video calls could potentially...     security   
1  collects way too much unneeded information abo...     security   
2  why exctly do you need full read access to my ...     security   
3                     more private than fb messenger     security   
4  this app is the best message and chat service,...     security   

        predicted  
0        security  
1        security  
2        security  
3        security  
4  Failed Parsing  

--- HYBRID Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.76      0.64      0.69       409
 reliability       0.81      0.33      0.47       538
 performance       0.34      0.91      0.50       117
 portability       0.70      0.21      0.33       108
    security       0.11      

Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████| 1278/1278 [56:24<00:00,  2.65s/it]



✅ HYBRID Classification with llama3:8b completed in 56.41 minutes

--- Sample of HYBRID Predictions for llama3:8b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- HYBRID Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       0.82      0.11      0.20       432
 reliability       0.82      0.40      0.54       587
 performance       0.45      0.74      0.56       121
 portability       0.66      0.23      0.34       119
    security       0.07      0.89      0.12        19
       other       0.00      0.00      0.00     

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████| 1278/1278 [1:06:06<00:00,  3.10s/it]



✅ HYBRID Classification with gemma:7b completed in 66.10 minutes

--- Sample of HYBRID Predictions for gemma:7b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- HYBRID Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.76      0.09      0.17       399
 reliability       0.79      0.21      0.33       582
 performance       0.58      0.72      0.64       121
 portability       0.43      0.29      0.35       119
    security       0.04      0.95      0.07        19
       other       0.00      0.00      0.00        

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████| 1278/1278 [49:19<00:00,  2.32s/it]


✅ HYBRID Classification with phi3:mini completed in 49.32 minutes

--- Sample of HYBRID Predictions for phi3:mini ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- HYBRID Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       0.90      0.15      0.26       414
 reliability       0.80      0.49      0.61       580
 performance       0.59      0.82      0.69       120
 portability       0.64      0.32      0.43       118
    security       0.18      0.95      0.31        19
       other       0.00      0.00      0.00     




In [2]:

import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
# Timestamped INFO-level logging so failed requests can be traced per review and model.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
# Base URL of the local Ollama HTTP server.
OLLAMA_BASE_URL = "http://localhost:11434"

# Ollama model tags to evaluate.
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini",
]

# Number of reviews to sample for a quick pilot run (e.g. 30).
# Leave as None to evaluate the full dataset.
PILOT_SAMPLE_SIZE = None

# --- 3. Data Loading and Preparation (NFR.xlsx Dataset) ---
print("--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---")
excel_path = "datasets/NFR.xlsx"
# sheet_name=None loads every worksheet; the sheets are then stacked into one frame.
sheets = pd.read_excel(excel_path, sheet_name=None)

data_raw = pd.concat(sheets.values(), ignore_index=True)

data = data_raw[['User Review Sentence', 'NFR class']].rename(
    columns={'User Review Sentence': 'review', 'NFR class': 'ground_truth'}
)

VALID_NFR_LABELS = ["usability", "reliability", "performance", "portability", "security", "other"]

# Rows without review text cannot be classified: they would be sent to the model as
# the literal text "nan" and would break the string slicing used in log messages.
has_review = data['review'].notna()
if not has_review.all():
    print(f"Warning: Removed {int((~has_review).sum())} rows with missing review text.")
data = data[has_review].copy()
data['review'] = data['review'].astype(str)

# Normalise labels so they compare equal to the lowercase entries of VALID_NFR_LABELS.
data['ground_truth'] = data['ground_truth'].str.strip().str.lower()

initial_len = len(data)
nfr_data = data[data['ground_truth'].isin(VALID_NFR_LABELS)].reset_index(drop=True)

if len(nfr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(nfr_data)} rows with unknown or invalid 'ground_truth' labels.")

# Optional pilot sampling, driven by PILOT_SAMPLE_SIZE instead of (un)commenting code,
# so the message below always reflects what was actually loaded.
if PILOT_SAMPLE_SIZE is not None:
    nfr_data = nfr_data.sample(
        n=min(PILOT_SAMPLE_SIZE, len(nfr_data)), random_state=42
    ).reset_index(drop=True)
    run_scope = "sampled for pilot run"
else:
    run_scope = "full dataset"

print(f"Loaded {len(nfr_data)} non-functional reviews ({run_scope}).")
print("Sample of loaded NFR data:")
print(nfr_data.head())
print("-" * 40)


# --- 4. Hybrid Prompt Template (Role-Based + Few-Shot + Constraint-Based) ---
# Worked examples shown to the model. Each label already uses the
# "<abbreviation>: <category name>" output format the prompt asks for.
FEW_SHOT_EXAMPLES = [
    {
        "review": "it takes about 45 seconds for the page to turn after I clicked a button",
        "classification": "PE: Performance",
    },
    {
        "review": "keeps crashing when I try to save a new record",
        "classification": "RL: Reliability",
    },
    {
        "review": "I want to be able to use the app in my tablet not just on my phone",
        "classification": "PO: Portability",
    },
]

# Render each example as an "App Review / Classification" pair followed by a blank line.
formatted_few_shot_text = "".join(
    f"App Review: {example['review']}\nClassification: {example['classification']}\n\n"
    for example in FEW_SHOT_EXAMPLES
)

# str.format() template with two placeholders:
# {few_shot_examples_text} and {review_text}.
hybrid_prompt_text = """
You are a highly skilled software requirements expert, specializing in the classification of user feedback. Your task is to precisely classify the provided app review into one of the following Non-Functional Requirement (NFR) types and output the result in a strictly defined format.

**NFR Categories:**
Usability (US), Reliability (RL), Performance (PE), Portability (PO), Security (SE), Other (OT).

**Definitions:**
- Usability (US): Ease of use, learnability, user interface.
- Reliability (RL): Dependability, availability, fault tolerance, recovery.
- Performance (PE): Speed, efficiency, response time, resource consumption.
- Portability (PO): Adaptability to different environments, ease of transfer.
- Security (SE): Protection of data, access control, privacy.
- Other (OT): Does not fit clearly into the above categories.

**Examples:**
{few_shot_examples_text}

**Instructions:**
1.  Read the 'App Review' carefully.
2.  Determine the single best NFR category from the 'NFR Categories' list based on the definitions and examples.
3.  Your final output MUST be ONLY the two-letter abbreviation for the category, followed by a colon and the full category name (e.g., 'US: Usability'). Do NOT include any other text, reasoning, or punctuation.

**App Review:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_nfr_with_ollama_model(review_text: str, model_name: str, prompt_template: str, few_shot_examples_text: str) -> dict:
    """
    Ask a local Ollama model to classify a single app review.

    The prompt is produced by filling the ``{review_text}`` and
    ``{few_shot_examples_text}`` placeholders of ``prompt_template`` and is
    POSTed, non-streaming, to ``/api/generate`` under ``OLLAMA_BASE_URL`` with
    temperature 0.0 and at most 100 predicted tokens.

    Args:
        review_text: The review sentence to classify.
        model_name: Name of the Ollama model to query.
        prompt_template: ``str.format`` template for the prompt.
        few_shot_examples_text: Pre-rendered examples inserted into the template.

    Returns:
        A dict with two keys: ``"success"`` (bool) and ``"raw_response"``.
        On success ``"raw_response"`` is the ``"response"`` field of the JSON
        reply (empty string if absent); on failure it is a short error
        description. Errors raised while sending or decoding the request are
        logged and reported this way rather than propagated.
    """
    def failure(message: str) -> dict:
        # Uniform shape for every error outcome.
        return {"success": False, "raw_response": message}

    endpoint = f"{OLLAMA_BASE_URL}/api/generate"
    request_headers = {"Content-Type": "application/json"}

    # Template formatting happens before the try block, so a malformed
    # template surfaces to the caller instead of being reported as a failure.
    prompt = prompt_template.format(
        few_shot_examples_text=few_shot_examples_text,
        review_text=review_text,
    )

    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.0, "num_predict": 100},
    }

    try:
        reply = requests.post(endpoint, headers=request_headers, data=json.dumps(payload), timeout=180)
        reply.raise_for_status()
        body = reply.json()
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return failure("Connection Error: Ollama server not reachable.")
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return failure("Timeout Error: Ollama request took too long.")
    except requests.exceptions.HTTPError as http_err:
        # `reply` is always bound here: HTTPError is raised by raise_for_status().
        logger.error(f"HTTP error occurred: {http_err} - {reply.text} with model {model_name}")
        return failure(f"HTTP Error: {http_err}")
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return failure(f"Unexpected Error: {e}")
    else:
        try:
            return {"success": True, "raw_response": body.get("response", "")}
        except Exception as e:
            # Same catch-all as above, for a reply body that is not a JSON object.
            logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
            return failure(f"Unexpected Error: {e}")

# --- 6. Main Evaluation Loop for Hybrid Prompting ---
# Per-model evaluation results, keyed by model name. Each value holds:
#   'accuracy'          - accuracy over rows with a parsable prediction only
#   'report'            - sklearn classification report text for those rows
#   'n_total'           - number of reviews sent to the model
#   'n_evaluated'       - number of rows with a parsable prediction
#   'accuracy_all_rows' - accuracy over ALL rows (failed/unparsable = wrong)
all_models_hybrid_results = {}

# The prompt asks for a line such as "SE: Security". The pattern is compiled
# once because it is reused for every review of every model.
_NFR_RESPONSE_PATTERN = re.compile(
    r"^(?:US|RL|PE|PO|SE|OT):\s*(Usability|Reliability|Performance|Portability|Security|Other)$",
    re.IGNORECASE | re.MULTILINE
)

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting HYBRID Classification Evaluation for Model: {current_model_name} {'='*20}")

    predictions = []
    start_time = time.time()

    # Only the review text is needed, so iterate the column directly.
    for review in tqdm(nfr_data['review'], total=len(nfr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_nfr_with_ollama_model(
            review,
            current_model_name,
            prompt_template=hybrid_prompt_text,
            few_shot_examples_text=formatted_few_shot_text
        )

        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            match = _NFR_RESPONSE_PATTERN.search(predicted_raw)
            # Lower-case the category name so it is comparable with ground_truth.
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{str(review)[:50]}...' with model {current_model_name}")

        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ HYBRID Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = nfr_data.copy()
    results_df_current_model['predicted'] = predictions

    # The sentinels "Failed" and "Failed Parsing" are not valid labels, so the
    # membership test alone removes them (the former `!= 'failed'` comparison
    # never matched either sentinel).
    filtered_results = results_df_current_model[
        results_df_current_model['predicted'].isin(VALID_NFR_LABELS)
    ]

    n_total = len(results_df_current_model)
    n_evaluated = len(filtered_results)
    n_excluded = n_total - n_evaluated

    # Accuracy over every row, counting failed/unparsable predictions as wrong.
    # Unlike the filtered accuracy, this is computed on the same rows for all
    # models and is therefore directly comparable between them.
    if n_total:
        accuracy_all_rows = float(
            (results_df_current_model['predicted'] == results_df_current_model['ground_truth']).mean()
        )
    else:
        accuracy_all_rows = 0.0

    print(f"\n--- Sample of HYBRID Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- HYBRID Classification Report for {current_model_name} ---")
    if n_excluded:
        # Make the silent row drop visible: the report covers a subset only.
        print(
            f"Note: {n_excluded} of {n_total} predictions were failed requests or "
            f"unparsable responses and are excluded from this report."
        )
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_NFR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_hybrid_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report,
            'n_total': n_total,
            'n_evaluated': n_evaluated,
            'accuracy_all_rows': accuracy_all_rows
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_hybrid_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions.",
            'n_total': n_total,
            'n_evaluated': n_evaluated,
            'accuracy_all_rows': accuracy_all_rows
        }
    print(f"Accuracy over all {n_total} rows (failed/unparsable counted as wrong): {accuracy_all_rows:.2f}")

    print(f"\n{'='*20} HYBRID Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL HYBRID NFR MODELS EVALUATION COMPLETE ==========\n")
print("Summary of HYBRID NFR Accuracies:")
for model, metrics in all_models_hybrid_results.items():
    print(
        f"{model}: Accuracy = {metrics['accuracy']:.2f} "
        f"(on {metrics['n_evaluated']}/{metrics['n_total']} parsed predictions; "
        f"all rows = {metrics['accuracy_all_rows']:.2f})"
    )

print("\n--- Final HYBRID Evaluation End ---")

--- Loading and preparing Non-Functional Requirements data from NFR.xlsx ---
Loaded 1278 non-functional reviews (sampled for pilot run).
Sample of loaded NFR data:
                                              review ground_truth
0  without this the video calls could potentially...     security
1  collects way too much unneeded information abo...     security
2  why exctly do you need full read access to my ...     security
3                     more private than fb messenger     security
4  this app is the best message and chat service,...     security
----------------------------------------



Classifying reviews with llama2: 100%|█████████████████████████████████████████████| 1278/1278 [56:53<00:00,  2.67s/it]



✅ HYBRID Classification with llama2 completed in 56.89 minutes

--- Sample of HYBRID Predictions for llama2 ---
                                              review ground_truth  predicted
0  without this the video calls could potentially...     security   security
1  collects way too much unneeded information abo...     security      other
2  why exctly do you need full read access to my ...     security      other
3                     more private than fb messenger     security      other
4  this app is the best message and chat service,...     security  usability

--- HYBRID Classification Report for llama2 ---
              precision    recall  f1-score   support

   usability       0.54      0.78      0.64       432
 reliability       0.75      0.32      0.45       587
 performance       0.67      0.46      0.55       121
 portability       0.00      0.00      0.00       119
    security       0.15      0.21      0.17        19
       other       0.00      0.00      0.00        

Classifying reviews with mistral: 100%|████████████████████████████████████████████| 1278/1278 [55:06<00:00,  2.59s/it]



✅ HYBRID Classification with mistral completed in 55.11 minutes

--- Sample of HYBRID Predictions for mistral ---
                                              review ground_truth  \
0  without this the video calls could potentially...     security   
1  collects way too much unneeded information abo...     security   
2  why exctly do you need full read access to my ...     security   
3                     more private than fb messenger     security   
4  this app is the best message and chat service,...     security   

        predicted  
0        security  
1        security  
2        security  
3        security  
4  Failed Parsing  

--- HYBRID Classification Report for mistral ---
              precision    recall  f1-score   support

   usability       0.76      0.64      0.70       409
 reliability       0.81      0.33      0.47       538
 performance       0.34      0.91      0.50       117
 portability       0.70      0.21      0.32       109
    security       0.11      

Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████| 1278/1278 [56:24<00:00,  2.65s/it]



✅ HYBRID Classification with llama3:8b completed in 56.41 minutes

--- Sample of HYBRID Predictions for llama3:8b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- HYBRID Classification Report for llama3:8b ---
              precision    recall  f1-score   support

   usability       0.81      0.11      0.20       432
 reliability       0.81      0.41      0.54       587
 performance       0.47      0.73      0.57       121
 portability       0.66      0.23      0.34       119
    security       0.07      0.89      0.12        19
       other       0.00      0.00      0.00     

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████| 1278/1278 [1:06:15<00:00,  3.11s/it]



✅ HYBRID Classification with gemma:7b completed in 66.26 minutes

--- Sample of HYBRID Predictions for gemma:7b ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- HYBRID Classification Report for gemma:7b ---
              precision    recall  f1-score   support

   usability       0.76      0.09      0.17       399
 reliability       0.79      0.21      0.33       582
 performance       0.58      0.72      0.64       121
 portability       0.43      0.29      0.35       119
    security       0.04      0.95      0.07        19
       other       0.00      0.00      0.00        

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████| 1278/1278 [49:06<00:00,  2.31s/it]


✅ HYBRID Classification with phi3:mini completed in 49.11 minutes

--- Sample of HYBRID Predictions for phi3:mini ---
                                              review ground_truth predicted
0  without this the video calls could potentially...     security  security
1  collects way too much unneeded information abo...     security  security
2  why exctly do you need full read access to my ...     security  security
3                     more private than fb messenger     security  security
4  this app is the best message and chat service,...     security  security

--- HYBRID Classification Report for phi3:mini ---
              precision    recall  f1-score   support

   usability       0.90      0.15      0.26       414
 reliability       0.80      0.49      0.61       580
 performance       0.59      0.82      0.69       120
 portability       0.64      0.32      0.43       118
    security       0.18      0.95      0.31        19
       other       0.00      0.00      0.00     


