In [2]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import os

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b", # Assuming you pulled llama3:8b
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full Dataset) ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---")
data_file_path = "datasets/BOW_test.txt"

# Load the data, assuming 'review_text,classification' format
fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip' # Ignore malformed lines
)

# Define valid FR categories (lowercase for standardization)
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# --- Standardize labels from the file to match VALID_FR_LABELS ---
label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other'
}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)

# Filter out rows where ground_truth is not one of our valid labels after mapping
initial_len = len(fr_data_raw)
fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)

if len(fr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

print(f"Loaded {len(fr_data)} functional reviews from the full dataset for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. Define Prompting Strategies (Now only includes Chain-of-Thought) ---

PROMPT_STRATEGIES = {
    "Chain_of_Thought_Prompt": """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Think step-by-step about which category (Feature Request, Bug Report, Other) the review segment best fits, based on the definitions. Explain your reasoning.
3.  After your step-by-step thinking process, state your final classification clearly, preceded by "FINAL CLASSIFICATION:".
4.  Your final output for classification MUST be only the category name, without any additional text, explanation, or punctuation after "FINAL CLASSIFICATION:".

**App Review Segment:** '''{review_text}'''

**Thinking Process:**
"""
}

# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends a classification request to the local Ollama model using a specified prompt template.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0, # Keep low for consistent classification
            "num_predict": 256 # Increase num_predict for CoT to allow for more output
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=120)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models and Prompt Strategies ---
all_evaluation_results = {}

for prompt_name, prompt_template in PROMPT_STRATEGIES.items():
    print(f"\n\n{'='*15} Evaluating with Prompt Strategy: {prompt_name} {'='*15}")
    all_evaluation_results[prompt_name] = {}

    for current_model_name in OLLAMA_MODELS_TO_TEST:
        print(f"\n{'='*10} Starting Classification for Model: {current_model_name} {'='*10}")
        
        predictions = []
        start_time = time.time()

        for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {current_model_name} ({prompt_name})"):
            response_data = classify_with_ollama_model(row['review'], current_model_name, prompt_template)
            
            if response_data["success"]:
                predicted_raw = response_data["raw_response"].strip()
                
                # Regex specifically for Chain-of-Thought, looking for "FINAL CLASSIFICATION:"
                # This makes parsing more robust when the LLM provides reasoning first.
                match = re.search(
                    r"FINAL CLASSIFICATION:\s*(feature request|bug report|other)",
                    predicted_raw,
                    re.IGNORECASE | re.DOTALL
                )
                
                if match:
                    pred = match.group(1).strip().lower()
                else:
                    # Fallback: If "FINAL CLASSIFICATION:" isn't found, try to find any valid label
                    # in the last few lines, as models might sometimes deviate.
                    # This is less ideal for CoT but helps catch cases where the model
                    # doesn't strictly follow the "FINAL CLASSIFICATION" format.
                    lines = predicted_raw.split('\n')
                    found_valid_label = False
                    for line in reversed(lines): # Check from bottom up
                        for label in VALID_FR_LABELS:
                            if label in line.lower():
                                pred = label
                                found_valid_label = True
                                break
                        if found_valid_label:
                            break
                    if not found_valid_label:
                        pred = "Failed Parsing"
                        logger.warning(f"\nFailed Parsing for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}'")
            else:
                pred = "Failed"
                logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name} using prompt {prompt_name}")
                
            predictions.append(pred)

        elapsed = time.time() - start_time
        print(f"\n✅ Classification with {current_model_name} using {prompt_name} completed in {elapsed/60:.2f} minutes")

        # --- 7. Prepare Results and Generate Classification Report for current model and prompt ---
        results_df_current_run = fr_data.copy()
        results_df_current_run['predicted'] = predictions

        filtered_results = results_df_current_run[
            (results_df_current_run['predicted'] != 'failed') &
            (results_df_current_run['predicted'].isin(VALID_FR_LABELS))
        ]

        print(f"\n--- Sample of Predictions for {current_model_name} ({prompt_name}) ---")
        print(results_df_current_run.head())

        print(f"\n--- Classification Report for {current_model_name} ({prompt_name}) ---")
        if not filtered_results.empty:
            report = classification_report(
                filtered_results['ground_truth'],
                filtered_results['predicted'],
                labels=VALID_FR_LABELS,
                zero_division=0
            )
            print(report)
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
                'report': report # Store the full report string
            }
        else:
            print(f"No valid predictions to generate a classification report for {current_model_name} ({prompt_name}).")
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': 0.0,
                'report': "No valid predictions."
            }
        
        print(f"\n{'='*10} Evaluation for {current_model_name} ({prompt_name}) Complete {'='*10}\n")

print("\n\n========== ALL EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies across all Prompt Strategies and Models:")

for prompt_name, models_data in all_evaluation_results.items():
    print(f"\n--- Prompt Strategy: {prompt_name} ---")
    for model, metrics in models_data.items():
        print(f"  {model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final Evaluation End ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---
Loaded 512 functional reviews from the full dataset for testing.
Sample of loaded FR data:
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------





Classifying reviews with llama2 (Chain_of_Thought_Prompt): 100%|█████████████████████| 512/512 [26:04<00:00,  3.06s/it]



✅ Classification with llama2 using Chain_of_Thought_Prompt completed in 26.07 minutes

--- Sample of Predictions for llama2 (Chain_of_Thought_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama2 (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.22      0.20      0.21        50
     bug report       0.63      0.99      0.77       288
          other       0.93      0.08      0.15       174

       accuracy                           0.60       512
      macro avg       

Classifying reviews with mistral (Chain_of_Thought_Prompt): 100%|██████████████████| 512/512 [1:02:38<00:00,  7.34s/it]



✅ Classification with mistral using Chain_of_Thought_Prompt completed in 62.64 minutes

--- Sample of Predictions for mistral (Chain_of_Thought_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report       other
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for mistral (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.37      0.78      0.50        50
     bug report       0.83      0.80      0.82       288
          other       0.76      0.57      0.65       174

       accuracy                           0.72       512
      macro avg    

Classifying reviews with llama3:8b (Chain_of_Thought_Prompt): 100%|████████████████| 512/512 [1:48:48<00:00, 12.75s/it]



✅ Classification with llama3:8b using Chain_of_Thought_Prompt completed in 108.81 minutes

--- Sample of Predictions for llama3:8b (Chain_of_Thought_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama3:8b (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.55      0.58      0.56        50
     bug report       0.80      0.87      0.84       288
          other       0.77      0.65      0.71       174

       accuracy                           0.77       512
      macro 

Classifying reviews with gemma:7b (Chain_of_Thought_Prompt): 100%|█████████████████| 512/512 [1:22:26<00:00,  9.66s/it]



✅ Classification with gemma:7b using Chain_of_Thought_Prompt completed in 82.45 minutes

--- Sample of Predictions for gemma:7b (Chain_of_Thought_Prompt) ---
                                              review ground_truth  \
0                'this version crashes all the time'   bug report   
1                    'it take a lot time in loading'   bug report   
2                               'pages freeze often'   bug report   
3  'still having problems uploading sometimes tho...   bug report   
4  'it wont load any of my notifications when i c...   bug report   

         predicted  
0       bug report  
1  feature request  
2       bug report  
3       bug report  
4       bug report  

--- Classification Report for gemma:7b (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.20      0.86      0.32        50
     bug report       0.85      0.78      0.81       288
          other       0.90      0.16      0.26       174



Failed Parsing for: ''Why am I getting ads that pop up''
Raw LLM Output: '1. The review segment mentions receiving unwanted advertisements, which implies a negative experience with the app's current state of affairs regarding in-app purchases or monetization methods through ads. This does not suggest that there is an error within the functionality itself but rather indicates dissatisfaction and possibly misalignment between user expectations (e.g., no advertisements) and what they are currently experien0bting, which could be a feature of the app or its settings/preferences as designed by developers.

2. Since there is an indication that something within the current functionality isn't meeting users’ desires for their experience (i.e., not wanting ads), and no explicit mention of bugs like crashes, incorrect behavior due to a feature malfunctioning or errors in code execution related directly to these pop-up advertisements being presented as an issue with existing features working incor


✅ Classification with phi3:mini using Chain_of_Thought_Prompt completed in 39.99 minutes

--- Sample of Predictions for phi3:mini (Chain_of_Thought_Prompt) ---
                                              review ground_truth  \
0                'this version crashes all the time'   bug report   
1                    'it take a lot time in loading'   bug report   
2                               'pages freeze often'   bug report   
3  'still having problems uploading sometimes tho...   bug report   
4  'it wont load any of my notifications when i c...   bug report   

         predicted  
0       bug report  
1  feature request  
2       bug report  
3       bug report  
4       bug report  

--- Classification Report for phi3:mini (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.38      0.68      0.49        50
     bug report       0.82      0.83      0.82       288
          other       0.81      0.60      0.69       17




In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import os

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b", # Assuming you pulled llama3:8b
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full Dataset) ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---")
data_file_path = "datasets/BOW_test.txt"

# Load the data, assuming 'review_text,classification' format
fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip' # Ignore malformed lines
)

# Define valid FR categories (lowercase for standardization)
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# --- Standardize labels from the file to match VALID_FR_LABELS ---
label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other'
}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)

# Filter out rows where ground_truth is not one of our valid labels after mapping
initial_len = len(fr_data_raw)
fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)

if len(fr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

print(f"Loaded {len(fr_data)} functional reviews from the full dataset for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. Define Prompting Strategies (Now only includes Chain-of-Thought) ---

PROMPT_STRATEGIES = {
    "Chain_of_Thought_Prompt": """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Think step-by-step about which category (Feature Request, Bug Report, Other) the review segment best fits, based on the definitions. Explain your reasoning.
3.  After your step-by-step thinking process, state your final classification clearly, preceded by "FINAL CLASSIFICATION:".
4.  Your final output for classification MUST be only the category name, without any additional text, explanation, or punctuation after "FINAL CLASSIFICATION:".

**App Review Segment:** '''{review_text}'''

**Thinking Process:**
"""
}

# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends a classification request to the local Ollama model using a specified prompt template.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0, # Keep low for consistent classification
            "num_predict": 256 # Increase num_predict for CoT to allow for more output
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=120)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models and Prompt Strategies ---
all_evaluation_results = {}

for prompt_name, prompt_template in PROMPT_STRATEGIES.items():
    print(f"\n\n{'='*15} Evaluating with Prompt Strategy: {prompt_name} {'='*15}")
    all_evaluation_results[prompt_name] = {}

    for current_model_name in OLLAMA_MODELS_TO_TEST:
        print(f"\n{'='*10} Starting Classification for Model: {current_model_name} {'='*10}")
        
        predictions = []
        start_time = time.time()

        for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {current_model_name} ({prompt_name})"):
            response_data = classify_with_ollama_model(row['review'], current_model_name, prompt_template)
            
            if response_data["success"]:
                predicted_raw = response_data["raw_response"].strip()
                
                # Regex specifically for Chain-of-Thought, looking for "FINAL CLASSIFICATION:"
                # This makes parsing more robust when the LLM provides reasoning first.
                match = re.search(
                    r"FINAL CLASSIFICATION:\s*(feature request|bug report|other)",
                    predicted_raw,
                    re.IGNORECASE | re.DOTALL
                )
                
                if match:
                    pred = match.group(1).strip().lower()
                else:
                    # Fallback: If "FINAL CLASSIFICATION:" isn't found, try to find any valid label
                    # in the last few lines, as models might sometimes deviate.
                    # This is less ideal for CoT but helps catch cases where the model
                    # doesn't strictly follow the "FINAL CLASSIFICATION" format.
                    lines = predicted_raw.split('\n')
                    found_valid_label = False
                    for line in reversed(lines): # Check from bottom up
                        for label in VALID_FR_LABELS:
                            if label in line.lower():
                                pred = label
                                found_valid_label = True
                                break
                        if found_valid_label:
                            break
                    if not found_valid_label:
                        pred = "Failed Parsing"
                        logger.warning(f"\nFailed Parsing for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}'")
            else:
                pred = "Failed"
                logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name} using prompt {prompt_name}")
                
            predictions.append(pred)

        elapsed = time.time() - start_time
        print(f"\n✅ Classification with {current_model_name} using {prompt_name} completed in {elapsed/60:.2f} minutes")

        # --- 7. Prepare Results and Generate Classification Report for current model and prompt ---
        results_df_current_run = fr_data.copy()
        results_df_current_run['predicted'] = predictions

        filtered_results = results_df_current_run[
            (results_df_current_run['predicted'] != 'failed') &
            (results_df_current_run['predicted'].isin(VALID_FR_LABELS))
        ]

        print(f"\n--- Sample of Predictions for {current_model_name} ({prompt_name}) ---")
        print(results_df_current_run.head())

        print(f"\n--- Classification Report for {current_model_name} ({prompt_name}) ---")
        if not filtered_results.empty:
            report = classification_report(
                filtered_results['ground_truth'],
                filtered_results['predicted'],
                labels=VALID_FR_LABELS,
                zero_division=0
            )
            print(report)
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
                'report': report # Store the full report string
            }
        else:
            print(f"No valid predictions to generate a classification report for {current_model_name} ({prompt_name}).")
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': 0.0,
                'report': "No valid predictions."
            }
        
        print(f"\n{'='*10} Evaluation for {current_model_name} ({prompt_name}) Complete {'='*10}\n")

print("\n\n========== ALL EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies across all Prompt Strategies and Models:")

for prompt_name, models_data in all_evaluation_results.items():
    print(f"\n--- Prompt Strategy: {prompt_name} ---")
    for model, metrics in models_data.items():
        print(f"  {model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final Evaluation End ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---
Loaded 512 functional reviews from the full dataset for testing.
Sample of loaded FR data:
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------





Classifying reviews with llama2 (Chain_of_Thought_Prompt): 100%|█████████████████████| 512/512 [27:12<00:00,  3.19s/it]



✅ Classification with llama2 using Chain_of_Thought_Prompt completed in 27.21 minutes

--- Sample of Predictions for llama2 (Chain_of_Thought_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama2 (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.21      0.20      0.21        50
     bug report       0.63      0.99      0.77       288
          other       0.93      0.08      0.15       174

       accuracy                           0.60       512
      macro avg       

Classifying reviews with mistral (Chain_of_Thought_Prompt): 100%|██████████████████| 512/512 [1:28:58<00:00, 10.43s/it]



✅ Classification with mistral using Chain_of_Thought_Prompt completed in 88.98 minutes

--- Sample of Predictions for mistral (Chain_of_Thought_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report       other
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for mistral (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.38      0.80      0.52        50
     bug report       0.83      0.77      0.80       288
          other       0.75      0.60      0.67       174

       accuracy                           0.72       512
      macro avg    

Classifying reviews with llama3:8b (Chain_of_Thought_Prompt): 100%|████████████████| 512/512 [2:13:52<00:00, 15.69s/it]



✅ Classification with llama3:8b using Chain_of_Thought_Prompt completed in 133.87 minutes

--- Sample of Predictions for llama3:8b (Chain_of_Thought_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report       other
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama3:8b (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.48      0.58      0.52        50
     bug report       0.79      0.82      0.81       288
          other       0.72      0.63      0.67       174

       accuracy                           0.73       512
      macro 

Classifying reviews with gemma:7b (Chain_of_Thought_Prompt):  44%|██████▋        | 227/512 [1:04:31<1:19:36, 16.76s/it]2025-09-19 06:40:30,011 - __main__ - ERROR - HTTP error occurred: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate - {"error":"llama runner process has terminated: cudaMalloc failed: out of memory"} with model gemma:7b
Classifying reviews with gemma:7b (Chain_of_Thought_Prompt): 100%|█████████████████| 512/512 [2:04:16<00:00, 14.56s/it]



✅ Classification with gemma:7b using Chain_of_Thought_Prompt completed in 124.28 minutes

--- Sample of Predictions for gemma:7b (Chain_of_Thought_Prompt) ---
                                              review ground_truth  \
0                'this version crashes all the time'   bug report   
1                    'it take a lot time in loading'   bug report   
2                               'pages freeze often'   bug report   
3  'still having problems uploading sometimes tho...   bug report   
4  'it wont load any of my notifications when i c...   bug report   

         predicted  
0       bug report  
1  feature request  
2       bug report  
3       bug report  
4       bug report  

--- Classification Report for gemma:7b (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.19      0.86      0.31        50
     bug report       0.85      0.77      0.81       287
          other       0.92      0.13      0.22       174


Failed Parsing for: ''It feels bad reseting 2000 custom words''
Raw LLM Output: '1. The review segment mentions a negative experience with the app's functionality, specifically regarding saving and retrieving data (custom words). This indicates that there is an issue related to how personal information or user-generated content within the application should be handled by the software system. It does not suggest adding new features but rather improving existing ones for better performance in terms of preserving custom settings without loss upon resetting, which implies a flaw with data handling mechanisms that could lead users' work to disappear unexpectedly when they perform certain actions (like reset).

2. The feedback is pointing out an unintended behavior where the app does not retain user-generated content after performing what seems like routine maintenance or cleanup tasks, which should be a standard feature without causing data loss for personalized settings that users have cre


✅ Classification with phi3:mini using Chain_of_Thought_Prompt completed in 74.64 minutes

--- Sample of Predictions for phi3:mini (Chain_of_Thought_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for phi3:mini (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.37      0.60      0.45        50
     bug report       0.80      0.82      0.81       288
          other       0.80      0.62      0.70       173

       accuracy                           0.73       511
      macro a




In [None]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import os

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b", # Assuming you pulled llama3:8b
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full Dataset) ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---")
data_file_path = "datasets/BOW_test.txt"

# Load the data, assuming 'review_text,classification' format
fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip' # Ignore malformed lines
)

# Define valid FR categories (lowercase for standardization)
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# --- Standardize labels from the file to match VALID_FR_LABELS ---
label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other'
}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)

# Filter out rows where ground_truth is not one of our valid labels after mapping
initial_len = len(fr_data_raw)
fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)

if len(fr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

print(f"Loaded {len(fr_data)} functional reviews from the full dataset for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. Define Prompting Strategies (Now only includes Chain-of-Thought) ---

PROMPT_STRATEGIES = {
    "Chain_of_Thought_Prompt": """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Think step-by-step about which category (Feature Request, Bug Report, Other) the review segment best fits, based on the definitions. Explain your reasoning.
3.  After your step-by-step thinking process, state your final classification clearly, preceded by "FINAL CLASSIFICATION:".
4.  Your final output for classification MUST be only the category name, without any additional text, explanation, or punctuation after "FINAL CLASSIFICATION:".

**App Review Segment:** '''{review_text}'''

**Thinking Process:**
"""
}

# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends a classification request to the local Ollama model using a specified prompt template.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0, # Keep low for consistent classification
            "num_predict": 256 # Increase num_predict for CoT to allow for more output
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=120)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models and Prompt Strategies ---
all_evaluation_results = {}

for prompt_name, prompt_template in PROMPT_STRATEGIES.items():
    print(f"\n\n{'='*15} Evaluating with Prompt Strategy: {prompt_name} {'='*15}")
    all_evaluation_results[prompt_name] = {}

    for current_model_name in OLLAMA_MODELS_TO_TEST:
        print(f"\n{'='*10} Starting Classification for Model: {current_model_name} {'='*10}")
        
        predictions = []
        start_time = time.time()

        for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {current_model_name} ({prompt_name})"):
            response_data = classify_with_ollama_model(row['review'], current_model_name, prompt_template)
            
            if response_data["success"]:
                predicted_raw = response_data["raw_response"].strip()
                
                # Regex specifically for Chain-of-Thought, looking for "FINAL CLASSIFICATION:"
                # This makes parsing more robust when the LLM provides reasoning first.
                match = re.search(
                    r"FINAL CLASSIFICATION:\s*(feature request|bug report|other)",
                    predicted_raw,
                    re.IGNORECASE | re.DOTALL
                )
                
                if match:
                    pred = match.group(1).strip().lower()
                else:
                    # Fallback: If "FINAL CLASSIFICATION:" isn't found, try to find any valid label
                    # in the last few lines, as models might sometimes deviate.
                    # This is less ideal for CoT but helps catch cases where the model
                    # doesn't strictly follow the "FINAL CLASSIFICATION" format.
                    lines = predicted_raw.split('\n')
                    found_valid_label = False
                    for line in reversed(lines): # Check from bottom up
                        for label in VALID_FR_LABELS:
                            if label in line.lower():
                                pred = label
                                found_valid_label = True
                                break
                        if found_valid_label:
                            break
                    if not found_valid_label:
                        pred = "Failed Parsing"
                        logger.warning(f"\nFailed Parsing for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}'")
            else:
                pred = "Failed"
                logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name} using prompt {prompt_name}")
                
            predictions.append(pred)

        elapsed = time.time() - start_time
        print(f"\n✅ Classification with {current_model_name} using {prompt_name} completed in {elapsed/60:.2f} minutes")

        # --- 7. Prepare Results and Generate Classification Report for current model and prompt ---
        results_df_current_run = fr_data.copy()
        results_df_current_run['predicted'] = predictions

        filtered_results = results_df_current_run[
            (results_df_current_run['predicted'] != 'failed') &
            (results_df_current_run['predicted'].isin(VALID_FR_LABELS))
        ]

        print(f"\n--- Sample of Predictions for {current_model_name} ({prompt_name}) ---")
        print(results_df_current_run.head())

        print(f"\n--- Classification Report for {current_model_name} ({prompt_name}) ---")
        if not filtered_results.empty:
            report = classification_report(
                filtered_results['ground_truth'],
                filtered_results['predicted'],
                labels=VALID_FR_LABELS,
                zero_division=0
            )
            print(report)
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
                'report': report # Store the full report string
            }
        else:
            print(f"No valid predictions to generate a classification report for {current_model_name} ({prompt_name}).")
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': 0.0,
                'report': "No valid predictions."
            }
        
        print(f"\n{'='*10} Evaluation for {current_model_name} ({prompt_name}) Complete {'='*10}\n")

print("\n\n========== ALL EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies across all Prompt Strategies and Models:")

for prompt_name, models_data in all_evaluation_results.items():
    print(f"\n--- Prompt Strategy: {prompt_name} ---")
    for model, metrics in models_data.items():
        print(f"  {model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final Evaluation End ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---
Loaded 512 functional reviews from the full dataset for testing.
Sample of loaded FR data:
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------





Classifying reviews with llama2 (Chain_of_Thought_Prompt): 100%|█████████████████████| 512/512 [35:20<00:00,  4.14s/it]



✅ Classification with llama2 using Chain_of_Thought_Prompt completed in 35.34 minutes

--- Sample of Predictions for llama2 (Chain_of_Thought_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama2 (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.21      0.20      0.21        50
     bug report       0.64      0.99      0.78       288
          other       0.94      0.10      0.18       174

       accuracy                           0.61       512
      macro avg       

Classifying reviews with mistral (Chain_of_Thought_Prompt):  17%|███▎               | 88/512 [16:56<1:25:13, 12.06s/it]2025-09-19 09:48:51,740 - __main__ - ERROR - Ollama request timed out for review: ''it worked perfectly fine for me until yesturday w...' with model mistral
Classifying reviews with mistral (Chain_of_Thought_Prompt): 100%|██████████████████| 512/512 [1:35:01<00:00, 11.14s/it]



✅ Classification with mistral using Chain_of_Thought_Prompt completed in 95.02 minutes

--- Sample of Predictions for mistral (Chain_of_Thought_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report       other
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for mistral (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.39      0.84      0.54        50
     bug report       0.83      0.76      0.79       287
          other       0.72      0.58      0.64       174

       accuracy                           0.70       511
      macro avg    

Classifying reviews with llama3:8b (Chain_of_Thought_Prompt): 100%|████████████████| 512/512 [2:14:54<00:00, 15.81s/it]



✅ Classification with llama3:8b using Chain_of_Thought_Prompt completed in 134.90 minutes

--- Sample of Predictions for llama3:8b (Chain_of_Thought_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama3:8b (Chain_of_Thought_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.48      0.54      0.51        50
     bug report       0.81      0.85      0.83       288
          other       0.76      0.67      0.71       174

       accuracy                           0.76       512
      macro 

Classifying reviews with gemma:7b (Chain_of_Thought_Prompt):  39%|█████▊         | 198/512 [1:08:59<2:11:38, 25.15s/it]

In [None]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import os

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b", # Assuming you pulled llama3:8b
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full Dataset) ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---")
data_file_path = "datasets/BOW_test.txt"

# Load the data, assuming 'review_text,classification' format
fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip' # Ignore malformed lines
)

# Define valid FR categories (lowercase for standardization)
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# --- Standardize labels from the file to match VALID_FR_LABELS ---
label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other'
}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)

# Filter out rows where ground_truth is not one of our valid labels after mapping
initial_len = len(fr_data_raw)
fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)

if len(fr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

print(f"Loaded {len(fr_data)} functional reviews from the full dataset for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. Define Prompting Strategies (Now only includes Chain-of-Thought) ---

PROMPT_STRATEGIES = {
    "Chain_of_Thought_Prompt": """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Think step-by-step about which category (Feature Request, Bug Report, Other) the review segment best fits, based on the definitions. Explain your reasoning.
3.  After your step-by-step thinking process, state your final classification clearly, preceded by "FINAL CLASSIFICATION:".
4.  Your final output for classification MUST be only the category name, without any additional text, explanation, or punctuation after "FINAL CLASSIFICATION:".

**App Review Segment:** '''{review_text}'''

**Thinking Process:**
"""
}

# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """
    Sends a classification request to the local Ollama model using a specified prompt template.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0, # Keep low for consistent classification
            "num_predict": 256 # Increase num_predict for CoT to allow for more output
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=120)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for All Models and Prompt Strategies ---
all_evaluation_results = {}

for prompt_name, prompt_template in PROMPT_STRATEGIES.items():
    print(f"\n\n{'='*15} Evaluating with Prompt Strategy: {prompt_name} {'='*15}")
    all_evaluation_results[prompt_name] = {}

    for current_model_name in OLLAMA_MODELS_TO_TEST:
        print(f"\n{'='*10} Starting Classification for Model: {current_model_name} {'='*10}")
        
        predictions = []
        start_time = time.time()

        for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {current_model_name} ({prompt_name})"):
            response_data = classify_with_ollama_model(row['review'], current_model_name, prompt_template)
            
            if response_data["success"]:
                predicted_raw = response_data["raw_response"].strip()
                
                # Regex specifically for Chain-of-Thought, looking for "FINAL CLASSIFICATION:"
                # This makes parsing more robust when the LLM provides reasoning first.
                match = re.search(
                    r"FINAL CLASSIFICATION:\s*(feature request|bug report|other)",
                    predicted_raw,
                    re.IGNORECASE | re.DOTALL
                )
                
                if match:
                    pred = match.group(1).strip().lower()
                else:
                    # Fallback: If "FINAL CLASSIFICATION:" isn't found, try to find any valid label
                    # in the last few lines, as models might sometimes deviate.
                    # This is less ideal for CoT but helps catch cases where the model
                    # doesn't strictly follow the "FINAL CLASSIFICATION" format.
                    lines = predicted_raw.split('\n')
                    found_valid_label = False
                    for line in reversed(lines): # Check from bottom up
                        for label in VALID_FR_LABELS:
                            if label in line.lower():
                                pred = label
                                found_valid_label = True
                                break
                        if found_valid_label:
                            break
                    if not found_valid_label:
                        pred = "Failed Parsing"
                        logger.warning(f"\nFailed Parsing for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}'")
            else:
                pred = "Failed"
                logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name} using prompt {prompt_name}")
                
            predictions.append(pred)

        elapsed = time.time() - start_time
        print(f"\n✅ Classification with {current_model_name} using {prompt_name} completed in {elapsed/60:.2f} minutes")

        # --- 7. Prepare Results and Generate Classification Report for current model and prompt ---
        results_df_current_run = fr_data.copy()
        results_df_current_run['predicted'] = predictions

        filtered_results = results_df_current_run[
            (results_df_current_run['predicted'] != 'failed') &
            (results_df_current_run['predicted'].isin(VALID_FR_LABELS))
        ]

        print(f"\n--- Sample of Predictions for {current_model_name} ({prompt_name}) ---")
        print(results_df_current_run.head())

        print(f"\n--- Classification Report for {current_model_name} ({prompt_name}) ---")
        if not filtered_results.empty:
            report = classification_report(
                filtered_results['ground_truth'],
                filtered_results['predicted'],
                labels=VALID_FR_LABELS,
                zero_division=0
            )
            print(report)
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
                'report': report # Store the full report string
            }
        else:
            print(f"No valid predictions to generate a classification report for {current_model_name} ({prompt_name}).")
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': 0.0,
                'report': "No valid predictions."
            }
        
        print(f"\n{'='*10} Evaluation for {current_model_name} ({prompt_name}) Complete {'='*10}\n")

print("\n\n========== ALL EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies across all Prompt Strategies and Models:")

for prompt_name, models_data in all_evaluation_results.items():
    print(f"\n--- Prompt Strategy: {prompt_name} ---")
    for model, metrics in models_data.items():
        print(f"  {model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final Evaluation End ---")