In [2]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import os

# --- 1. Logging Setup ---
# Timestamped INFO-level records on the root logger; this cell logs through `logger`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
# Base URL of the Ollama HTTP server the requests are sent to.
OLLAMA_BASE_URL = "http://localhost:11434"

# Ollama model tags to evaluate, in the order they are run.
# Assumes every tag has already been pulled into the local Ollama instance.
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini",
]

# --- 3. Data Loading and Preparation (Full Dataset) ---
data_file_path = "datasets/BOW_test.txt"
# Announce the file that is actually read (the old banner named a different file).
print(f"--- Loading and preparing Functional Requirements data from {data_file_path} (Full Dataset) ---")

# Load the data, assuming one 'review_text,classification' record per line, no header.
# NOTE(review): the printed sample rows still carry their surrounding single quotes, so the
# default quotechar ('"') is not unwrapping the review text. A review that contains a comma
# would then be split into extra fields and silently dropped by on_bad_lines='skip'.
# Confirm against the raw file before switching to quotechar="'".
fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip',  # Ignore malformed lines
)

# Canonical label set (lowercase); predictions are compared against these exact strings.
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# --- Standardize labels from the file to match VALID_FR_LABELS ---
# Values not listed here are left unchanged by replace() and removed by the filter below.
label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other',
}
fr_data_raw['ground_truth'] = (
    fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)
)

# Keep only rows whose ground truth is a valid label after mapping.
initial_len = len(fr_data_raw)
has_valid_label = fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)
fr_data = fr_data_raw[has_valid_label].reset_index(drop=True)

removed_rows = initial_len - len(fr_data)
if removed_rows > 0:
    print(f"Warning: Removed {removed_rows} rows with unknown or invalid 'ground_truth' labels after mapping.")

print(f"Loaded {len(fr_data)} functional reviews from the full dataset for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. Define Prompting Strategies (currently a single zero-shot prompt) ---
# Maps a strategy name to its prompt template. Each template must contain the
# `{review_text}` placeholder, which is filled in with str.format() before the
# prompt is sent to the model. The zero-shot template asks the model to answer
# with one bare label and does not request any reasoning or a
# "FINAL CLASSIFICATION:" marker.

PROMPT_STRATEGIES = {
    "Zero_Shot_Prompt": """
You are an expert in software requirements analysis for user feedback. Classify the given app review into exactly ONE of:
- Feature Request
- Bug Report
- Other

DEFINITIONS (concise):
- Feature Request: Asks for a new capability or an enhancement to something that currently works.
- Bug Report: Describes an error, crash, failure, or behavior not working as intended.
- Other: Vague praise/complaint, questions, off-topic, or not enough info to decide.

TIE-BREAK RULES:
- If any error/failure is stated or implied → Bug Report.
- If any explicit request/suggestion for new/improved functionality and no error → Feature Request.
- Otherwise → Other.

OUTPUT FORMAT:
Return ONLY one of these exact strings with no extra text or punctuation:
Feature Request | Bug Report | Other

App Review Segment: '''{review_text}'''
Answer:
"""
}


# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Send one classification request to the local Ollama server.

    The review is substituted into ``prompt_template`` (which must contain a
    ``{review_text}`` placeholder) and posted to ``/api/generate`` with
    streaming disabled.

    Args:
        review_text: The review to classify. Non-string values are tolerated
            and rendered with their ``str()`` form.
        model_name: Ollama model tag, e.g. ``"llama3:8b"``.
        prompt_template: Template with a ``{review_text}`` placeholder.

    Returns:
        ``{"success": True, "raw_response": <model text>}`` on success, or
        ``{"success": False, "raw_response": <error description>}`` on any
        request failure. ``raw_response`` is always a string. Request errors
        are logged and never raised to the caller.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}

    payload = {
        "model": model_name,
        "prompt": prompt_template.format(review_text=review_text),
        "stream": False,
        "options": {
            "temperature": 0.0,  # Deterministic decoding for consistent classification
            "num_predict": 256,  # Upper bound on generated tokens per answer
        },
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=120)
        response.raise_for_status()
        result = response.json()
        # A missing or null "response" field becomes "" so callers can always call .strip().
        return {"success": True, "raw_response": result.get("response") or ""}
    except requests.exceptions.ConnectionError:
        logger.error("Failed to connect to Ollama server at %s. Is Ollama running?", OLLAMA_BASE_URL)
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        # str() first: slicing a non-string review (e.g. NaN from pandas) would raise
        # inside this handler and abort the whole evaluation run.
        logger.error(
            "Ollama request timed out for review: '%s...' with model %s",
            str(review_text)[:50],
            model_name,
        )
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        # `response` is always bound here: HTTPError is raised by raise_for_status().
        logger.error("HTTP error occurred: %s - %s with model %s", http_err, response.text, model_name)
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        # Last-resort boundary (e.g. invalid JSON body) so one bad reply cannot stop the run.
        logger.error("An unexpected error occurred during Ollama call: %s with model %s", e, model_name)
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models and Prompt Strategies ---

# Sentinels stored when no label could be obtained. Neither is a member of
# VALID_FR_LABELS, so such rows are excluded from the metrics below.
PREDICTION_FAILED = "Failed"
PREDICTION_UNPARSED = "Failed Parsing"

# Patterns are compiled once, outside the per-review loop.
_LABEL_ALTERNATION = "|".join(re.escape(label) for label in VALID_FR_LABELS)
_FINAL_LABEL_RE = re.compile(
    r"FINAL CLASSIFICATION:\s*(" + _LABEL_ALTERNATION + r")",
    re.IGNORECASE,
)
_ANY_LABEL_RE = re.compile(r"\b(" + _LABEL_ALTERNATION + r")\b", re.IGNORECASE)


def _parse_predicted_label(raw_output):
    """
    Extract a label from raw model output.

    Resolution order:
      1. An explicit "FINAL CLASSIFICATION: <label>" marker, for prompts that
         request one (the zero-shot prompt does not, so it relies on step 2).
      2. Otherwise scan the lines bottom-up and take the first label mentioned
         on the last line that mentions any label.

    Labels are matched as whole words, so "another" or "otherwise" are not
    mistaken for "other". When one line names several labels, the one written
    first on that line wins rather than the one listed first in VALID_FR_LABELS.

    Returns the lowercase label, or None when no label is found.
    """
    marked = _FINAL_LABEL_RE.search(raw_output)
    if marked:
        return marked.group(1).lower()

    for line in reversed(raw_output.split('\n')):  # Check from bottom up
        mentioned = _ANY_LABEL_RE.search(line)
        if mentioned:
            return mentioned.group(1).lower()
    return None


all_evaluation_results = {}

for prompt_name, prompt_template in PROMPT_STRATEGIES.items():
    print(f"\n\n{'='*15} Evaluating with Prompt Strategy: {prompt_name} {'='*15}")
    all_evaluation_results[prompt_name] = {}

    for current_model_name in OLLAMA_MODELS_TO_TEST:
        print(f"\n{'='*10} Starting Classification for Model: {current_model_name} {'='*10}")

        predictions = []
        start_time = time.time()

        for review in tqdm(fr_data['review'], total=len(fr_data), desc=f"Classifying reviews with {current_model_name} ({prompt_name})"):
            # str() keeps slicing and logging safe if pandas parsed an empty review as NaN.
            review_text = str(review)
            response_data = classify_with_ollama_model(review_text, current_model_name, prompt_template)

            if response_data["success"]:
                predicted_raw = (response_data["raw_response"] or "").strip()
                pred = _parse_predicted_label(predicted_raw)
                if pred is None:
                    pred = PREDICTION_UNPARSED
                    logger.warning("\nFailed Parsing for: '%s'\nRaw LLM Output: '%s'", review_text, predicted_raw)
            else:
                pred = PREDICTION_FAILED
                logger.warning(
                    "Classification failed for review: '%s...' with model %s using prompt %s",
                    review_text[:50],
                    current_model_name,
                    prompt_name,
                )

            predictions.append(pred)

        elapsed = time.time() - start_time
        print(f"\n✅ Classification with {current_model_name} using {prompt_name} completed in {elapsed/60:.2f} minutes")

        # --- 7. Prepare Results and Generate Classification Report for current model and prompt ---
        results_df_current_run = fr_data.copy()
        results_df_current_run['predicted'] = predictions

        # Only rows with a valid predicted label are scored; both failure sentinels drop out here.
        filtered_results = results_df_current_run[
            results_df_current_run['predicted'].isin(VALID_FR_LABELS)
        ]
        n_total = len(results_df_current_run)
        n_evaluated = len(filtered_results)

        print(f"\n--- Sample of Predictions for {current_model_name} ({prompt_name}) ---")
        print(results_df_current_run.head())

        print(f"\n--- Classification Report for {current_model_name} ({prompt_name}) ---")
        if n_evaluated < n_total:
            # Make the exclusion visible: metrics of different runs may cover different subsets.
            print(f"Warning: {n_total - n_evaluated} of {n_total} reviews had no usable prediction and are excluded from the metrics below.")

        if not filtered_results.empty:
            report = classification_report(
                filtered_results['ground_truth'],
                filtered_results['predicted'],
                labels=VALID_FR_LABELS,
                zero_division=0
            )
            print(report)
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
                'report': report,  # Store the full report string
                'n_evaluated': n_evaluated,
                'n_total': n_total,
            }
        else:
            print(f"No valid predictions to generate a classification report for {current_model_name} ({prompt_name}).")
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': 0.0,
                'report': "No valid predictions.",
                'n_evaluated': 0,
                'n_total': n_total,
            }

        print(f"\n{'='*10} Evaluation for {current_model_name} ({prompt_name}) Complete {'='*10}\n")

print("\n\n========== ALL EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies across all Prompt Strategies and Models:")

for prompt_name, models_data in all_evaluation_results.items():
    print(f"\n--- Prompt Strategy: {prompt_name} ---")
    for model, metrics in models_data.items():
        print(f"  {model}: Accuracy = {metrics['accuracy']:.2f} (scored {metrics['n_evaluated']}/{metrics['n_total']} reviews)")

print("\n--- Final Evaluation End ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---
Loaded 512 functional reviews from the full dataset for testing.
Sample of loaded FR data:
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------





Classifying reviews with llama2 (Zero_Shot_Prompt): 100%|██████████████████████████| 512/512 [1:06:02<00:00,  7.74s/it]



✅ Classification with llama2 using Zero_Shot_Prompt completed in 66.04 minutes

--- Sample of Predictions for llama2 (Zero_Shot_Prompt) ---
                                              review ground_truth  \
0                'this version crashes all the time'   bug report   
1                    'it take a lot time in loading'   bug report   
2                               'pages freeze often'   bug report   
3  'still having problems uploading sometimes tho...   bug report   
4  'it wont load any of my notifications when i c...   bug report   

         predicted  
0  feature request  
1  feature request  
2  feature request  
3  feature request  
4  feature request  

--- Classification Report for llama2 (Zero_Shot_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.10      0.98      0.18        50
     bug report       0.00      0.00      0.00       288
          other       0.75      0.10      0.18       174

       accuracy            

Classifying reviews with mistral (Zero_Shot_Prompt): 100%|█████████████████████████| 512/512 [2:02:22<00:00, 14.34s/it]



✅ Classification with mistral using Zero_Shot_Prompt completed in 122.38 minutes

--- Sample of Predictions for mistral (Zero_Shot_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report       other
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for mistral (Zero_Shot_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.27      0.70      0.38        50
     bug report       0.92      0.49      0.63       288
          other       0.61      0.79      0.69       174

       accuracy                           0.61       512
      macro avg       0.60      0.66   

Classifying reviews with llama3:8b (Zero_Shot_Prompt): 100%|███████████████████████| 512/512 [2:13:22<00:00, 15.63s/it]



✅ Classification with llama3:8b using Zero_Shot_Prompt completed in 133.38 minutes

--- Sample of Predictions for llama3:8b (Zero_Shot_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report       other
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama3:8b (Zero_Shot_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.78      0.36      0.49        50
     bug report       0.79      0.91      0.84       288
          other       0.77      0.70      0.73       174

       accuracy                           0.78       512
      macro avg       0.78      0

Classifying reviews with gemma:7b (Zero_Shot_Prompt): 100%|████████████████████████| 512/512 [3:40:16<00:00, 25.81s/it]



✅ Classification with gemma:7b using Zero_Shot_Prompt completed in 220.28 minutes

--- Sample of Predictions for gemma:7b (Zero_Shot_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for gemma:7b (Zero_Shot_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.33      0.70      0.45        50
     bug report       0.86      0.79      0.82       288
          other       0.78      0.64      0.70       174

       accuracy                           0.73       512
      macro avg       0.66      0.71

Classifying reviews with phi3:mini (Zero_Shot_Prompt): 100%|███████████████████████| 512/512 [2:38:07<00:00, 18.53s/it]


✅ Classification with phi3:mini using Zero_Shot_Prompt completed in 158.13 minutes

--- Sample of Predictions for phi3:mini (Zero_Shot_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for phi3:mini (Zero_Shot_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.57      0.08      0.14        50
     bug report       0.69      0.95      0.80       288
          other       0.80      0.51      0.62       174

       accuracy                           0.71       512
      macro avg       0.69      0




In [None]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import os

# --- 1. Logging Setup ---
# Timestamped INFO-level records on the root logger; this cell logs through `logger`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
# Base URL of the Ollama HTTP server the requests are sent to.
OLLAMA_BASE_URL = "http://localhost:11434"

# Ollama model tags to evaluate, in the order they are run.
# Assumes every tag has already been pulled into the local Ollama instance.
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini",
]

# --- 3. Data Loading and Preparation (Full Dataset) ---
data_file_path = "datasets/BOW_test.txt"
# Announce the file that is actually read (the old banner named a different file).
print(f"--- Loading and preparing Functional Requirements data from {data_file_path} (Full Dataset) ---")

# Load the data, assuming one 'review_text,classification' record per line, no header.
# NOTE(review): the printed sample rows still carry their surrounding single quotes, so the
# default quotechar ('"') is not unwrapping the review text. A review that contains a comma
# would then be split into extra fields and silently dropped by on_bad_lines='skip'.
# Confirm against the raw file before switching to quotechar="'".
fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip',  # Ignore malformed lines
)

# Canonical label set (lowercase); predictions are compared against these exact strings.
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# --- Standardize labels from the file to match VALID_FR_LABELS ---
# Values not listed here are left unchanged by replace() and removed by the filter below.
label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other',
}
fr_data_raw['ground_truth'] = (
    fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)
)

# Keep only rows whose ground truth is a valid label after mapping.
initial_len = len(fr_data_raw)
has_valid_label = fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)
fr_data = fr_data_raw[has_valid_label].reset_index(drop=True)

removed_rows = initial_len - len(fr_data)
if removed_rows > 0:
    print(f"Warning: Removed {removed_rows} rows with unknown or invalid 'ground_truth' labels after mapping.")

print(f"Loaded {len(fr_data)} functional reviews from the full dataset for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. Define Prompting Strategies (currently a single zero-shot prompt) ---
# Maps a strategy name to its prompt template. Each template must contain the
# `{review_text}` placeholder, which is filled in with str.format() before the
# prompt is sent to the model. The zero-shot template asks the model to answer
# with one bare label and does not request any reasoning or a
# "FINAL CLASSIFICATION:" marker.

PROMPT_STRATEGIES = {
    "Zero_Shot_Prompt": """
You are an expert in software requirements analysis for user feedback. Classify the given app review into exactly ONE of:
- Feature Request
- Bug Report
- Other

DEFINITIONS (concise):
- Feature Request: Asks for a new capability or an enhancement to something that currently works.
- Bug Report: Describes an error, crash, failure, or behavior not working as intended.
- Other: Vague praise/complaint, questions, off-topic, or not enough info to decide.

TIE-BREAK RULES:
- If any error/failure is stated or implied → Bug Report.
- If any explicit request/suggestion for new/improved functionality and no error → Feature Request.
- Otherwise → Other.

OUTPUT FORMAT:
Return ONLY one of these exact strings with no extra text or punctuation:
Feature Request | Bug Report | Other

App Review Segment: '''{review_text}'''
Answer:
"""
}


# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Send one classification request to the local Ollama server.

    The review is substituted into ``prompt_template`` (which must contain a
    ``{review_text}`` placeholder) and posted to ``/api/generate`` with
    streaming disabled.

    Args:
        review_text: The review to classify. Non-string values are tolerated
            and rendered with their ``str()`` form.
        model_name: Ollama model tag, e.g. ``"llama3:8b"``.
        prompt_template: Template with a ``{review_text}`` placeholder.

    Returns:
        ``{"success": True, "raw_response": <model text>}`` on success, or
        ``{"success": False, "raw_response": <error description>}`` on any
        request failure. ``raw_response`` is always a string. Request errors
        are logged and never raised to the caller.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}

    payload = {
        "model": model_name,
        "prompt": prompt_template.format(review_text=review_text),
        "stream": False,
        "options": {
            "temperature": 0.0,  # Deterministic decoding for consistent classification
            "num_predict": 256,  # Upper bound on generated tokens per answer
        },
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=120)
        response.raise_for_status()
        result = response.json()
        # A missing or null "response" field becomes "" so callers can always call .strip().
        return {"success": True, "raw_response": result.get("response") or ""}
    except requests.exceptions.ConnectionError:
        logger.error("Failed to connect to Ollama server at %s. Is Ollama running?", OLLAMA_BASE_URL)
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        # str() first: slicing a non-string review (e.g. NaN from pandas) would raise
        # inside this handler and abort the whole evaluation run.
        logger.error(
            "Ollama request timed out for review: '%s...' with model %s",
            str(review_text)[:50],
            model_name,
        )
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        # `response` is always bound here: HTTPError is raised by raise_for_status().
        logger.error("HTTP error occurred: %s - %s with model %s", http_err, response.text, model_name)
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        # Last-resort boundary (e.g. invalid JSON body) so one bad reply cannot stop the run.
        logger.error("An unexpected error occurred during Ollama call: %s with model %s", e, model_name)
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models and Prompt Strategies ---

# Sentinels stored when no label could be obtained. Neither is a member of
# VALID_FR_LABELS, so such rows are excluded from the metrics below.
PREDICTION_FAILED = "Failed"
PREDICTION_UNPARSED = "Failed Parsing"

# Patterns are compiled once, outside the per-review loop.
_LABEL_ALTERNATION = "|".join(re.escape(label) for label in VALID_FR_LABELS)
_FINAL_LABEL_RE = re.compile(
    r"FINAL CLASSIFICATION:\s*(" + _LABEL_ALTERNATION + r")",
    re.IGNORECASE,
)
_ANY_LABEL_RE = re.compile(r"\b(" + _LABEL_ALTERNATION + r")\b", re.IGNORECASE)


def _parse_predicted_label(raw_output):
    """
    Extract a label from raw model output.

    Resolution order:
      1. An explicit "FINAL CLASSIFICATION: <label>" marker, for prompts that
         request one (the zero-shot prompt does not, so it relies on step 2).
      2. Otherwise scan the lines bottom-up and take the first label mentioned
         on the last line that mentions any label.

    Labels are matched as whole words, so "another" or "otherwise" are not
    mistaken for "other". When one line names several labels, the one written
    first on that line wins rather than the one listed first in VALID_FR_LABELS.

    Returns the lowercase label, or None when no label is found.
    """
    marked = _FINAL_LABEL_RE.search(raw_output)
    if marked:
        return marked.group(1).lower()

    for line in reversed(raw_output.split('\n')):  # Check from bottom up
        mentioned = _ANY_LABEL_RE.search(line)
        if mentioned:
            return mentioned.group(1).lower()
    return None


all_evaluation_results = {}

for prompt_name, prompt_template in PROMPT_STRATEGIES.items():
    print(f"\n\n{'='*15} Evaluating with Prompt Strategy: {prompt_name} {'='*15}")
    all_evaluation_results[prompt_name] = {}

    for current_model_name in OLLAMA_MODELS_TO_TEST:
        print(f"\n{'='*10} Starting Classification for Model: {current_model_name} {'='*10}")

        predictions = []
        start_time = time.time()

        for review in tqdm(fr_data['review'], total=len(fr_data), desc=f"Classifying reviews with {current_model_name} ({prompt_name})"):
            # str() keeps slicing and logging safe if pandas parsed an empty review as NaN.
            review_text = str(review)
            response_data = classify_with_ollama_model(review_text, current_model_name, prompt_template)

            if response_data["success"]:
                predicted_raw = (response_data["raw_response"] or "").strip()
                pred = _parse_predicted_label(predicted_raw)
                if pred is None:
                    pred = PREDICTION_UNPARSED
                    logger.warning("\nFailed Parsing for: '%s'\nRaw LLM Output: '%s'", review_text, predicted_raw)
            else:
                pred = PREDICTION_FAILED
                logger.warning(
                    "Classification failed for review: '%s...' with model %s using prompt %s",
                    review_text[:50],
                    current_model_name,
                    prompt_name,
                )

            predictions.append(pred)

        elapsed = time.time() - start_time
        print(f"\n✅ Classification with {current_model_name} using {prompt_name} completed in {elapsed/60:.2f} minutes")

        # --- 7. Prepare Results and Generate Classification Report for current model and prompt ---
        results_df_current_run = fr_data.copy()
        results_df_current_run['predicted'] = predictions

        # Only rows with a valid predicted label are scored; both failure sentinels drop out here.
        filtered_results = results_df_current_run[
            results_df_current_run['predicted'].isin(VALID_FR_LABELS)
        ]
        n_total = len(results_df_current_run)
        n_evaluated = len(filtered_results)

        print(f"\n--- Sample of Predictions for {current_model_name} ({prompt_name}) ---")
        print(results_df_current_run.head())

        print(f"\n--- Classification Report for {current_model_name} ({prompt_name}) ---")
        if n_evaluated < n_total:
            # Make the exclusion visible: metrics of different runs may cover different subsets.
            print(f"Warning: {n_total - n_evaluated} of {n_total} reviews had no usable prediction and are excluded from the metrics below.")

        if not filtered_results.empty:
            report = classification_report(
                filtered_results['ground_truth'],
                filtered_results['predicted'],
                labels=VALID_FR_LABELS,
                zero_division=0
            )
            print(report)
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
                'report': report,  # Store the full report string
                'n_evaluated': n_evaluated,
                'n_total': n_total,
            }
        else:
            print(f"No valid predictions to generate a classification report for {current_model_name} ({prompt_name}).")
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': 0.0,
                'report': "No valid predictions.",
                'n_evaluated': 0,
                'n_total': n_total,
            }

        print(f"\n{'='*10} Evaluation for {current_model_name} ({prompt_name}) Complete {'='*10}\n")

print("\n\n========== ALL EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies across all Prompt Strategies and Models:")

for prompt_name, models_data in all_evaluation_results.items():
    print(f"\n--- Prompt Strategy: {prompt_name} ---")
    for model, metrics in models_data.items():
        print(f"  {model}: Accuracy = {metrics['accuracy']:.2f} (scored {metrics['n_evaluated']}/{metrics['n_total']} reviews)")

print("\n--- Final Evaluation End ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---
Loaded 512 functional reviews from the full dataset for testing.
Sample of loaded FR data:
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------





Classifying reviews with llama2 (Zero_Shot_Prompt):  90%|███████████████████████▎  | 460/512 [1:32:43<12:28, 14.39s/it]2025-09-28 14:44:24,662 - __main__ - ERROR - Ollama request timed out for review: ''With YouTube playing while screens locked'...' with model llama2
Classifying reviews with llama2 (Zero_Shot_Prompt):  90%|███████████████████████▍  | 461/512 [1:34:47<40:12, 47.30s/it]2025-09-28 14:46:29,871 - __main__ - ERROR - Ollama request timed out for review: ''With iOS allowing content blocking in Safari and ...' with model llama2
Classifying reviews with llama2 (Zero_Shot_Prompt): 100%|██████████████████████████| 512/512 [1:53:03<00:00, 13.25s/it]



✅ Classification with llama2 using Zero_Shot_Prompt completed in 113.06 minutes

--- Sample of Predictions for llama2 (Zero_Shot_Prompt) ---
                                              review ground_truth  \
0                'this version crashes all the time'   bug report   
1                    'it take a lot time in loading'   bug report   
2                               'pages freeze often'   bug report   
3  'still having problems uploading sometimes tho...   bug report   
4  'it wont load any of my notifications when i c...   bug report   

         predicted  
0  feature request  
1  feature request  
2  feature request  
3  feature request  
4  feature request  

--- Classification Report for llama2 (Zero_Shot_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.10      0.96      0.18        50
     bug report       0.00      0.00      0.00       288
          other       0.71      0.12      0.20       172

       accuracy           

Classifying reviews with mistral (Zero_Shot_Prompt):  17%|████▎                     | 85/512 [35:44<4:21:10, 36.70s/it]2025-09-28 15:40:39,398 - __main__ - ERROR - Ollama request timed out for review: ''will not load this sucks try again '...' with model mistral
Classifying reviews with mistral (Zero_Shot_Prompt): 100%|█████████████████████████| 512/512 [2:18:10<00:00, 16.19s/it]



✅ Classification with mistral using Zero_Shot_Prompt completed in 138.18 minutes

--- Sample of Predictions for mistral (Zero_Shot_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report       other
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for mistral (Zero_Shot_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.26      0.70      0.38        50
     bug report       0.91      0.48      0.63       287
          other       0.61      0.79      0.69       174

       accuracy                           0.61       511
      macro avg       0.59      0.66   

Classifying reviews with llama3:8b (Zero_Shot_Prompt): 100%|███████████████████████| 512/512 [2:53:53<00:00, 20.38s/it]



✅ Classification with llama3:8b using Zero_Shot_Prompt completed in 173.89 minutes

--- Sample of Predictions for llama3:8b (Zero_Shot_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report       other
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama3:8b (Zero_Shot_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.78      0.36      0.49        50
     bug report       0.79      0.91      0.84       288
          other       0.77      0.70      0.73       174

       accuracy                           0.78       512
      macro avg       0.78      0

Classifying reviews with gemma:7b (Zero_Shot_Prompt):  11%|██▊                      | 58/512 [30:04<4:14:48, 33.68s/it]2025-09-28 21:17:10,306 - __main__ - ERROR - Ollama request timed out for review: ''keeps force closing all the time'...' with model gemma:7b
Classifying reviews with gemma:7b (Zero_Shot_Prompt):  56%|████████████▎         | 286/512 [3:07:25<1:57:52, 31.30s/it]

In [None]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import os

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
# Base URL of the local Ollama HTTP server that serves the models below.
OLLAMA_BASE_URL = "http://localhost:11434"

# Ollama models to evaluate; each one is run against every prompt strategy.
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",  # Assuming you pulled llama3:8b
    "gemma:7b",
    "phi3:mini",
]

# --- 3. Data Loading and Preparation (Full Dataset) ---
data_file_path = "datasets/BOW_test.txt"
# Report the path that is actually loaded so the banner cannot drift from the code.
print(f"--- Loading and preparing Functional Requirements data from {data_file_path} (Full Dataset) ---")

# Load the data, assuming one 'review_text,classification' record per line.
# NOTE(review): on_bad_lines='skip' drops malformed lines silently — presumably
# this includes reviews that contain extra commas; verify how many rows are lost.
fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip',  # Ignore malformed lines
)

# Define valid FR categories (lowercase for standardization)
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# --- Standardize labels from the file to match VALID_FR_LABELS ---
# Keys are the lower-cased, stripped spellings expected in the file; values are
# the canonical labels used for both predictions and metrics.
label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other',
}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)

# Filter out rows where ground_truth is not one of our valid labels after mapping
initial_len = len(fr_data_raw)
fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)

if len(fr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

print(f"Loaded {len(fr_data)} functional reviews from the full dataset for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. Define Prompting Strategies (currently a single zero-shot prompt) ---
# Maps a strategy name to a prompt template. Each template must contain a
# `{review_text}` placeholder: it is filled in with str.format() before the
# prompt is sent, so any other literal braces added to a template would have
# to be doubled ("{{" / "}}").

PROMPT_STRATEGIES = {
    "Zero_Shot_Prompt": """
You are an expert in software requirements analysis for user feedback. Classify the given app review into exactly ONE of:
- Feature Request
- Bug Report
- Other

DEFINITIONS (concise):
- Feature Request: Asks for a new capability or an enhancement to something that currently works.
- Bug Report: Describes an error, crash, failure, or behavior not working as intended.
- Other: Vague praise/complaint, questions, off-topic, or not enough info to decide.

TIE-BREAK RULES:
- If any error/failure is stated or implied → Bug Report.
- If any explicit request/suggestion for new/improved functionality and no error → Feature Request.
- Otherwise → Other.

OUTPUT FORMAT:
Return ONLY one of these exact strings with no extra text or punctuation:
Feature Request | Bug Report | Other

App Review Segment: '''{review_text}'''
Answer:
"""
}


# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(
    review_text: str,
    model_name: str,
    prompt_template: str,
    *,
    timeout: float = 120,
    num_predict: int = 256,
) -> dict:
    """
    Send one classification request to the local Ollama server.

    Args:
        review_text: The app review to classify; substituted into the
            ``{review_text}`` placeholder of ``prompt_template``.
        model_name: Name of the Ollama model to query (e.g. ``"llama3:8b"``).
        prompt_template: Prompt text containing a ``{review_text}`` placeholder.
        timeout: Seconds to wait for the HTTP request before giving up.
        num_predict: Upper bound on generated tokens, passed in Ollama's
            ``options``. Lower it for label-only prompts to shorten each call.

    Returns:
        A dict with two keys: ``"success"`` (bool) and ``"raw_response"`` (str).
        On success ``raw_response`` is the model's generated text; on failure
        it is a short description of the error. This function never raises for
        request failures — they are logged and reported through the dict.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    formatted_prompt = prompt_template.format(review_text=review_text)

    payload = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,  # Deterministic output for consistent classification
            "num_predict": num_predict,
        },
    }

    try:
        # `json=` serializes the payload and sets the JSON Content-Type header.
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error("Failed to connect to Ollama server at %s. Is Ollama running?", OLLAMA_BASE_URL)
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        # str() keeps the handler from raising TypeError when the review is not
        # a string (e.g. a NaN read from the data file).
        logger.error(
            "Ollama request timed out for review: '%s...' with model %s",
            str(review_text)[:50],
            model_name,
        )
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error("HTTP error occurred: %s - %s with model %s", http_err, response.text, model_name)
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        # Last-resort boundary (e.g. a non-JSON body): one bad request must not
        # abort an evaluation run that takes hours.
        logger.error("An unexpected error occurred during Ollama call: %s with model %s", e, model_name)
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models and Prompt Strategies ---

# Explicit marker some prompts (e.g. chain-of-thought) ask the model to emit.
# The zero-shot prompt does not request it, but honoring it is harmless.
_FINAL_LABEL_RE = re.compile(
    r"FINAL CLASSIFICATION:\s*(feature request|bug report|other)",
    re.IGNORECASE,
)

# One whole-word pattern per valid label, kept in VALID_FR_LABELS order.
# Word boundaries stop "other" from matching inside "another" / "otherwise".
_LABEL_PATTERNS = [
    (label, re.compile(rf"\b{re.escape(label)}\b", re.IGNORECASE))
    for label in VALID_FR_LABELS
]


def _parse_prediction(raw_response):
    """Return the label found in the raw model output, or None if there is none."""
    text = raw_response.strip()

    marker = _FINAL_LABEL_RE.search(text)
    if marker:
        return marker.group(1).strip().lower()

    # Fallback: scan from the last line upwards, since a model that adds extra
    # text usually states its answer at the end. Within a line, the first label
    # in VALID_FR_LABELS order wins.
    for line in reversed(text.split('\n')):
        for label, pattern in _LABEL_PATTERNS:
            if pattern.search(line):
                return label
    return None


# Nested results: prompt strategy name -> model name -> metrics dict.
all_evaluation_results = {}

for prompt_name, prompt_template in PROMPT_STRATEGIES.items():
    print(f"\n\n{'='*15} Evaluating with Prompt Strategy: {prompt_name} {'='*15}")
    all_evaluation_results[prompt_name] = {}

    for current_model_name in OLLAMA_MODELS_TO_TEST:
        print(f"\n{'='*10} Starting Classification for Model: {current_model_name} {'='*10}")

        predictions = []
        start_time = time.time()

        for review in tqdm(fr_data['review'], total=len(fr_data), desc=f"Classifying reviews with {current_model_name} ({prompt_name})"):
            response_data = classify_with_ollama_model(review, current_model_name, prompt_template)

            if response_data["success"]:
                predicted_raw = response_data["raw_response"].strip()
                pred = _parse_prediction(predicted_raw)
                if pred is None:
                    # Sentinel value; excluded from the metrics further down.
                    pred = "Failed Parsing"
                    logger.warning("\nFailed Parsing for: '%s'\nRaw LLM Output: '%s'", review, predicted_raw)
            else:
                # Sentinel value; excluded from the metrics further down.
                pred = "Failed"
                logger.warning(
                    "Classification failed for review: '%s...' with model %s using prompt %s",
                    str(review)[:50],  # str() guards against non-string reviews
                    current_model_name,
                    prompt_name,
                )

            predictions.append(pred)

        elapsed = time.time() - start_time
        print(f"\n✅ Classification with {current_model_name} using {prompt_name} completed in {elapsed/60:.2f} minutes")

        # --- 7. Prepare Results and Generate Classification Report for current model and prompt ---
        results_df_current_run = fr_data.copy()
        results_df_current_run['predicted'] = predictions

        # Keep only rows with a real label; this drops both the "Failed" and
        # "Failed Parsing" sentinels.
        filtered_results = results_df_current_run[
            results_df_current_run['predicted'].isin(VALID_FR_LABELS)
        ]

        # Make the exclusions visible: metrics computed on a subset are not
        # directly comparable between models that failed on different rows.
        n_excluded = len(results_df_current_run) - len(filtered_results)
        if n_excluded:
            print(
                f"Warning: {n_excluded} of {len(results_df_current_run)} reviews had failed or "
                f"unparsable predictions and are excluded from the metrics below."
            )

        print(f"\n--- Sample of Predictions for {current_model_name} ({prompt_name}) ---")
        print(results_df_current_run.head())

        print(f"\n--- Classification Report for {current_model_name} ({prompt_name}) ---")
        if not filtered_results.empty:
            report = classification_report(
                filtered_results['ground_truth'],
                filtered_results['predicted'],
                labels=VALID_FR_LABELS,
                zero_division=0
            )
            print(report)
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
                'report': report,  # Store the full report string
                'n_excluded': n_excluded,
            }
        else:
            print(f"No valid predictions to generate a classification report for {current_model_name} ({prompt_name}).")
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': 0.0,
                'report': "No valid predictions.",
                'n_excluded': n_excluded,
            }

        print(f"\n{'='*10} Evaluation for {current_model_name} ({prompt_name}) Complete {'='*10}\n")

# Final roll-up: one accuracy line per model, grouped by prompt strategy.
print("\n\n========== ALL EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies across all Prompt Strategies and Models:")

for strategy in all_evaluation_results:
    print(f"\n--- Prompt Strategy: {strategy} ---")
    per_model = all_evaluation_results[strategy]
    for model_id in per_model:
        accuracy = per_model[model_id]['accuracy']
        print(f"  {model_id}: Accuracy = {accuracy:.2f}")

print("\n--- Final Evaluation End ---")