In [1]:
import sys
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import time
import logging
from src.llm_client import LLMClient
from src.config import Config

# Configure root logging for the notebook output.
# force=True replaces handlers that are already attached to the root logger
# (for example by an imported module). Without it basicConfig() silently does
# nothing in that situation and the format below is never applied.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)
# Keep the LLM client logger at INFO to avoid its debug messages
# (logging.WARNING gives even less verbosity).
logging.getLogger('src.llm_client').setLevel(logging.INFO)


# --- 1. Data Loading and Preparation for Functional Requirements ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---")

# The sample file has no header row; each line is read as "<review>,<label>".
# Lines pandas considers malformed are skipped instead of raising.
fr_data = pd.read_csv(
    "datasets/BOW_test_sample.txt",
    names=['review', 'ground_truth'],
    header=None,
    sep=',',
    on_bad_lines='skip',
)

# Normalise the labels: trim surrounding whitespace, then lowercase.
fr_data = fr_data.assign(
    ground_truth=fr_data['ground_truth'].str.strip().str.lower()
)


--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---


In [2]:
# Sanity check: number of reviews currently held in fr_data.
len(fr_data)

20

In [3]:
# Class balance: how many reviews carry each ground-truth label.
print(fr_data['ground_truth'].value_counts())

ground_truth
featurerequest    7
other             7
bugreport         6
Name: count, dtype: int64


In [4]:
# Canonical label set (lowercase, no spaces) used for filtering and reporting.
VALID_FR_LABELS = ["featurerequest", "bugreport", "other"]

# Keep only the rows whose ground truth is one of the known labels.
initial_len = len(fr_data)
fr_data = (
    fr_data.loc[fr_data['ground_truth'].isin(VALID_FR_LABELS)]
    .reset_index(drop=True)
)
if initial_len > len(fr_data):
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown 'ground_truth' labels.")

# Short summary of what survived the filter.
print(
    f"Loaded {len(fr_data)} functional reviews.",
    "Sample of loaded FR data:",
    fr_data.head(),
    "-" * 40,
    sep="\n",
)

Loaded 20 functional reviews.
Sample of loaded FR data:
                                              review ground_truth
0  'the current fb app is not good at all for tab...    bugreport
1  'the problem is with the way items displaypics...    bugreport
2             'not to mention it force closes often'    bugreport
3  'also every time i open it asks me if i want t...    bugreport
4                     'now i cannot view any photos'    bugreport
----------------------------------------


In [5]:
# --- 2. LLM Client Initialization ---
print("--- Initializing LLM Client ---")
client = LLMClient()
if not client.test_connection():
    print("❌ LLM connection failed at initialization. Please check API key in config, model name, and network.")
    # The FC1 evaluation further down sends every request through `client`.
    # Stop here rather than let it run and record nothing but failures.
    raise RuntimeError("LLM connection test failed; fix the configuration and re-run this cell.")
print("✅ LLM Client initialized and connected.")
print("-" * 40)

# --- 3. Define FR Definitions & Few-Shot Examples (for FC2, FC3) ---
# Category definitions, intended for the FC3 prompt.
FR_DEFINITIONS = {
    "feature request": "A suggestion for new functionality, enhancements, or improvements to existing features.",
    "bug report": "Describes an error, fault, or flaw in the system that causes it to produce an incorrect or unexpected result, or to behave in unintended ways.",
    "other": "The review does not clearly fall into Feature Request or Bug Report.",
}

# Few-shot examples, intended for the FC2 prompt.
# These are placeholders: swap in real, diverse examples from the dataset
# when good ones are available.
FR_FEW_SHOT_EXAMPLES = [
    {"review": review, "classification": label}
    for review, label in (
        ("I wish there was a dark mode option in the settings.", "Feature Request"),
        ("The app crashes every time I try to upload a photo.", "Bug Report"),
        ("Great app, keep up the good work!", "Other"),
        ("Please add a way to export data to CSV.", "Feature Request"),
        ("The search filter does not work correctly for dates.", "Bug Report"),
        ("Can you make the 'save' button more prominent?", "Feature Request"),
    )
]

# Render the examples as one prompt-ready string, one blank line between them.
formatted_fr_few_shot_text = "".join(
    f"Review: {ex['review']}\nClassification: {ex['classification']}\n\n"
    for ex in FR_FEW_SHOT_EXAMPLES
)

# Comma-separated category list for prompts.
# NOTE(review): capitalize() turns "featurerequest" into "Featurerequest", which
# differs from the "Feature Request" spelling used in the examples above —
# confirm which spelling the prompts are expected to show.
all_fr_labels_str = ", ".join(map(str.capitalize, VALID_FR_LABELS))
print("Defined FR definitions and few-shot examples.", "-" * 40, sep="\n")

--- Initializing LLM Client ---


INFO:src.llm_client:Initialized Gemini model: models/gemini-1.5-flash-latest
INFO:src.llm_client:Initialized LLM client with https://generativelanguage.googleapis.com, model: models/gemini-1.5-flash-latest, provider: gemini
INFO:src.llm_client:Testing Gemini API connection...
INFO:src.llm_client:Successfully connected to Gemini API using model 'models/gemini-1.5-flash-latest'.


✅ LLM Client initialized and connected.
----------------------------------------
Defined FR definitions and few-shot examples.
----------------------------------------


In [6]:
# --- 4. Evaluation Function Definition (Generalized for FR) ---
def evaluate_classification_prompt_strategy(
    prompt_id: str,
    data_df: pd.DataFrame,
    client_instance: LLMClient,
    category_name: str = "FUNCTIONAL_REQUIREMENTS",
    **prompt_kwargs
) -> "tuple[pd.DataFrame, str]":
    """
    Classify every review in ``data_df`` with one prompt and print the scores.

    The prompt ``prompt_id`` is activated for ``category_name`` through
    ``Config.set_active_prompt_id``. Each review is then sent to
    ``client_instance.classify_nfr`` together with ``prompt_kwargs``.

    Args:
        prompt_id (str): The ID of the prompt to activate from prompts.json.
        data_df (pd.DataFrame): Frame with 'review' and 'ground_truth' columns.
            'ground_truth' is expected to use the spellings in VALID_FR_LABELS.
        client_instance (LLMClient): An initialized instance of the LLMClient.
        category_name (str): The category name under which the prompt is
            registered (e.g., "FUNCTIONAL_REQUIREMENTS").
        **prompt_kwargs: Extra keyword arguments forwarded to ``classify_nfr``.

    Returns:
        tuple[pd.DataFrame, str]: ``(results_df, report)``. ``results_df`` is a
        copy of ``data_df`` with an added, normalised 'predicted' column.
        ``report`` is the text of the classification report, or a short message
        when no prediction could be scored.

    Notes:
        * Predictions are lowercased and matched to VALID_FR_LABELS ignoring
          internal whitespace, so "Feature Request" and "featurerequest" both
          resolve to whichever spelling VALID_FR_LABELS uses.
        * Responses the client reports as unsuccessful, and non-string
          classifications, are stored as "failed".
        * Only rows whose prediction is a valid label are scored; the number of
          excluded rows is printed so the report's support can be interpreted.
    """

    Config.set_active_prompt_id(category_name, prompt_id)

    print(f"\n--- Starting evaluation for Prompt ID: {prompt_id} ---")

    # Whitespace-insensitive lookup: "featurerequest" -> canonical label.
    canonical_by_key = {
        "".join(label.lower().split()): label for label in VALID_FR_LABELS
    }

    def _to_canonical(raw_label) -> str:
        """Return the canonical label for raw_label, or its cleaned text."""
        if not isinstance(raw_label, str):
            return "failed"
        cleaned = raw_label.strip().lower()
        return canonical_by_key.get("".join(cleaned.split()), cleaned)

    predictions = []
    start_time = time.time()

    for review_text in tqdm(data_df['review'], total=len(data_df), desc=f"Classifying with {prompt_id}"):
        # classify_nfr is used generically here: the active prompt decides
        # which kind of classification is performed.
        response = client_instance.classify_nfr(review_text, **prompt_kwargs)
        predictions.append(response.classification if response.success else "Failed")

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {prompt_id} completed in {elapsed/60:.2f} minutes")

    results_df = data_df.copy()
    results_df['predicted'] = [_to_canonical(pred) for pred in predictions]

    # "failed" and any out-of-vocabulary answer are not valid labels, so a
    # single membership test removes both from the scored subset.
    is_scorable = results_df['predicted'].isin(VALID_FR_LABELS)
    filtered_results = results_df[is_scorable]

    excluded_count = len(results_df) - len(filtered_results)
    if excluded_count:
        print(
            f"Warning: {excluded_count} of {len(results_df)} predictions failed or were not "
            f"one of {VALID_FR_LABELS}; they are excluded from the report."
        )

    if filtered_results.empty:
        report = "No valid predictions to generate a classification report."
    else:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_FR_LABELS,  # list every label, even those never predicted
            zero_division=0
        )

    print(f"\n--- Classification Report for Prompt ID: {prompt_id} ---\n")
    print(f"{report}\n")
    print(f"--- End Report for Prompt ID: {prompt_id} ---\n")

    return results_df, report

print("Logic setup complete. Ready to run evaluations.")

Logic setup complete. Ready to run evaluations.


In [7]:
# --- 5. Evaluation for FC1 (Direct Multi-Class User Review Classification) ---
print("\n========== EVALUATING PROMPT FC1 (Direct Multi-Class) ==========")

# FC1 needs no extra prompt kwargs, so only the core arguments are supplied.
# To evaluate a subset (for instance to stay under an API request quota),
# pass fr_data.head(15) instead of fr_data.
results_fc1_df, report_fc1 = evaluate_classification_prompt_strategy(
    "FC1",
    fr_data,
    client,
    category_name="FUNCTIONAL_REQUIREMENTS",
)
print("First 5 predictions for FC1:", results_fc1_df.head(), sep="\n")

INFO:src.config:Loaded all prompts from prompts.json
INFO:src.config:Active prompt for 'FUNCTIONAL_REQUIREMENTS' set to 'FC1'.




--- Starting evaluation for Prompt ID: FC1 ---


Classifying with FC1:  75%|█████████████████████████████████████████████               | 15/20 [00:08<00:02,  1.84it/s]ERROR:src.llm_client:Gemini request failed: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 15
}
]
ERROR:src.llm_client:Gemini request failed: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https:/


✅ Classification with FC1 completed in 0.16 minutes

--- Classification Report for Prompt ID: FC1 ---

                precision    recall  f1-score   support

featurerequest       0.00      0.00      0.00       1.0
     bugreport       0.00      0.00      0.00       1.0
         other       0.00      0.00      0.00       0.0

      accuracy                           0.00       2.0
     macro avg       0.00      0.00      0.00       2.0
  weighted avg       0.00      0.00      0.00       2.0


--- End Report for Prompt ID: FC1 ---

First 5 predictions for FC1:
                                              review ground_truth  \
0  'the current fb app is not good at all for tab...    bugreport   
1  'the problem is with the way items displaypics...    bugreport   
2             'not to mention it force closes often'    bugreport   
3  'also every time i open it asks me if i want t...    bugreport   
4                     'now i cannot view any photos'    bugreport   

         predicte




In [8]:
# Display the FC1 results table (review, ground truth, predicted label).
results_fc1_df

Unnamed: 0,review,ground_truth,predicted
0,'the current fb app is not good at all for tab...,bugreport,usability
1,'the problem is with the way items displaypics...,bugreport,look and feel
2,'not to mention it force closes often',bugreport,fault tolerance
3,'also every time i open it asks me if i want t...,bugreport,usability
4,'now i cannot view any photos',bugreport,other
5,'loading takes time even on 3g con it totaly s...,bugreport,performance
6,'it should have multi tabs ',featurerequest,usability
7,'for me to be able to post a status i need to ...,featurerequest,operability
8,'i use my phone almost exclusivly to log into...,featurerequest,usability
9,'cant turn off location tracking',featurerequest,security


In [12]:
import pandas as pd
import requests
import json
import logging
import re
import os # Included for completeness, though not strictly used for LLM interaction in this independent cell

# --- Basic Logging Setup ---
# force=True makes the format apply even when the root logger already has
# handlers (e.g. from an earlier cell); a plain basicConfig() call would be a
# silent no-op in that case.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

# --- LLM Configuration (Hardcoded for this independent test) ---
# IMPORTANT: This configuration is LOCAL to this cell and does NOT affect src/config.py
LLAMA2_MODEL_NAME = "llama2"
OLLAMA_BASE_URL = "http://localhost:11434" # Default Ollama API endpoint

# --- Custom Input Data (Feature Requests) ---
# Five hand-written reviews used as a smoke test. The frame has only a
# 'review' column; no ground-truth labels are attached here.
custom_feature_request_data = pd.DataFrame({
    'review': [
        "I'd love to see a dark mode option in the settings for better night viewing.",
        "Could you add a feature to export all my saved articles to PDF?",
        "Please implement a search filter to sort results by publication date.",
        "It would be very convenient if I could set custom notification sounds for different contacts.",
        "A 'mark all as read' button for messages would greatly improve efficiency."
    ]
})

# --- Custom Chain-of-Thought Prompt with STRICT Output Format ---
# {review_text} is the only placeholder; classify_with_llama2_cot fills it with
# str.format(). The prompt asks for step-by-step reasoning followed by a final
# line of the form "CLASSIFICATION: <category>", which the parsing regex in the
# loop below relies on.
cot_prompt_text = """
You are an expert in software requirements analysis. Your task is to classify user reviews.
The allowed functional requirement categories are: Feature Request, Bug Report, Other.
Do NOT use non-functional requirement categories like Usability, Performance, Security, etc.

For the following user review, first, explain your reasoning step-by-step why you are classifying it as a 'Feature Request', 'Bug Report', or 'Other'.
Finally, on a new line, provide the classification in the EXACT format: "CLASSIFICATION: [Category Name]". For example, "CLASSIFICATION: Feature Request".

User Review: {review_text}

Reasoning:
1. Analyze the user's statement for intent (new functionality, problem, or general comment).
2. Determine if it explicitly asks for something new or improved (Feature Request).
3. Determine if it describes a fault, error, or unintended behavior (Bug Report).
4. If neither, classify as Other.
"""

# Banner naming the model used for this run.
print(f"\n========== Testing Custom Inputs with Llama 2 ({LLAMA2_MODEL_NAME}) and Chain of Thought ==========")

# --- Llama 2 (Ollama) Interaction Logic ---
def classify_with_llama2_cot(review_text: str, timeout: float = 120.0) -> dict:
    """
    Send one review to the local Ollama server using the chain-of-thought prompt.

    The review is inserted into ``cot_prompt_text`` and posted to
    ``{OLLAMA_BASE_URL}/api/generate`` for the model ``LLAMA2_MODEL_NAME`` with
    streaming disabled, so the whole answer arrives in one response.

    Args:
        review_text (str): The review to classify.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled server would block the cell forever.

    Returns:
        dict: ``{"success": bool, "raw_response": str, "classification": None}``.
        On success ``raw_response`` is the model's text (reasoning included).
        On failure it holds a short error description. ``classification`` is
        always None here; parsing is left to the caller.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}

    # Format the CoT prompt with the current review text
    formatted_prompt = cot_prompt_text.format(review_text=review_text)

    data = {
        "model": LLAMA2_MODEL_NAME,
        "prompt": formatted_prompt,
        "stream": False # We want the full response at once
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=timeout)
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
        result = response.json()
        return {"success": True, "raw_response": result.get("response", ""), "classification": None}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable.", "classification": None}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: {review_text[:50]}...")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long.", "classification": None}
    except requests.exceptions.HTTPError as http_err:
        # `response` is bound here: HTTPError is raised by raise_for_status().
        logger.error(f"HTTP error occurred: {http_err} - {response.text}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}", "classification": None}
    except Exception as e:
        # Last-resort guard (e.g. a body that is not valid JSON) so one bad
        # response does not abort the whole evaluation loop.
        logger.error(f"An unexpected error occurred during Ollama call: {e}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}", "classification": None}

# --- Run Classification for Custom Inputs ---
# One entry per review: the parsed label, "Failed Parsing" when no label line
# was found, or "Failed" when the request itself did not succeed.
cot_predictions = []
for i, row in custom_feature_request_data.iterrows():
    print(f"\n--- Review {i+1}: {row['review']} ---")
    response_data = classify_with_llama2_cot(row['review'])

    if not response_data["success"]:
        print(f"LLM Call Failed: {response_data['raw_response']}")
        cot_predictions.append("Failed")
        continue

    # Show the full model output so the reasoning can be inspected.
    print("\nLLM Raw Output (with Reasoning):\n", response_data["raw_response"])

    # The prompt demands a "CLASSIFICATION: <category>" line; take the first one.
    match = re.search(
        r"CLASSIFICATION:\s*(Feature Request|Bug Report|Other)",
        response_data["raw_response"],
        re.IGNORECASE | re.DOTALL,
    )
    if match is None:
        pred_class = "Failed Parsing"
    else:
        pred_class = match.group(1).strip().lower()
    cot_predictions.append(pred_class)
    print(f"Parsed Classification: {pred_class}")

print("\n--- Custom Test Complete ---")

# Summary of everything that was parsed above.
print("\nFinal Parsed Classifications:", cot_predictions)

# Optional: To see how it would perform in a report, assuming ground truth is correct
# from sklearn.metrics import classification_report
#
# # Filter out failed predictions for reporting
# results_df_temp = custom_feature_request_data.copy()
# results_df_temp['predicted'] = cot_predictions
#
# # Assign ground truth for this specific test case (all are 'feature request')
# results_df_temp['ground_truth'] = ['feature request'] * len(custom_feature_request_data)
#
# filtered_results = results_df_temp[results_df_temp['predicted'] != 'failed']
# print("\n--- Classification Report for Custom Test ---")
# print(classification_report(
#     filtered_results['ground_truth'],
#     filtered_results['predicted'],
#     labels=["feature request", "bug report", "other"],
#     zero_division=0
# ))



--- Review 1: I'd love to see a dark mode option in the settings for better night viewing. ---

LLM Raw Output (with Reasoning):
 CLASSIFICATION: Feature Request

Step 1: Analyze the user's statement for intent
The user explicitly states their desire for a new functionality, specifically a dark mode option in the settings for better night viewing. This indicates that they are making a request for something new or improved, which falls under the category of Feature Request.

Step 2: Determine if it explicitly asks for something new or improved
Yes, the user is asking for a new feature related to the dark mode option in the settings. They want the option to be available for better night viewing, indicating that they would like to see this functionality added or improved.

Step 3: Determine if it describes a fault, error, or unintended behavior (Bug Report)
No, the user's statement does not describe any fault, error, or unintended behavior. It is purely a request for new functionality, 

In [13]:
import pandas as pd
# Assuming classify_with_llama2_cot and cot_prompt_text are defined from the previous cell

# --- Custom Input Data (Bug Reports) ---
# Five hand-written bug reports; every row carries the same expected label.
custom_bug_report_data = pd.DataFrame(
    {
        'review': [
            "The app crashes every time I try to open a PDF document.",
            "When I click 'save', the changes are not actually reflected, and the old data remains.",
            "The search bar freezes and becomes unresponsive after typing more than 10 characters.",
            "Users are unable to log in from Android devices; it constantly shows an 'invalid credentials' error.",
            "The push notifications are not delivering consistently, sometimes they arrive hours late.",
        ],
        'ground_truth': ["bug report"] * 5,
    }
)

print(f"\n========== Testing Custom Inputs with Llama 2 (llama2) and Chain of Thought - Bug Reports ==========")

# --- Run Classification for Custom Bug Report Inputs ---
# One entry per review: the parsed label, "Failed Parsing" when no label line
# was found, or "Failed" when the request itself did not succeed.
bug_report_predictions = []
for i, row in custom_bug_report_data.iterrows():
    print(f"\n--- Review {i+1}: {row['review']} ---")
    # classify_with_llama2_cot comes from the previous cell.
    response_data = classify_with_llama2_cot(row['review'])

    if not response_data["success"]:
        print(f"LLM Call Failed: {response_data['raw_response']}")
        bug_report_predictions.append("Failed")
        continue

    print("\nLLM Raw Output (with Reasoning):\n", response_data["raw_response"])

    # Same parsing rule as for the feature-request run.
    match = re.search(
        r"CLASSIFICATION:\s*(Feature Request|Bug Report|Other)",
        response_data["raw_response"],
        re.IGNORECASE | re.DOTALL,
    )
    if match is None:
        pred_class = "Failed Parsing"
    else:
        pred_class = match.group(1).strip().lower()
    bug_report_predictions.append(pred_class)
    print(f"Parsed Classification: {pred_class}")

print("\n--- Custom Bug Report Test Complete ---")

# Summary of everything that was parsed above.
print("\nFinal Parsed Classifications (Bug Reports):", bug_report_predictions)

# Classification report for this specific test set
from sklearn.metrics import classification_report

results_df_bug_report = custom_bug_report_data.copy()
results_df_bug_report['predicted'] = bug_report_predictions

# Score only predictions that are real labels. The loop records the sentinels
# "Failed" and "Failed Parsing" (capitalised), so comparing against the
# lowercase string 'failed' would never exclude anything; a membership test
# against the label list removes both sentinels reliably.
report_labels = ["feature request", "bug report", "other"]
filtered_results_bug_report = results_df_bug_report[
    results_df_bug_report['predicted'].isin(report_labels)
]

print("\n--- Classification Report for Custom Bug Report Test ---")
if filtered_results_bug_report.empty:
    # Nothing to score (every call failed or could not be parsed).
    print("No valid predictions to generate a classification report.")
else:
    print(classification_report(
        filtered_results_bug_report['ground_truth'],
        filtered_results_bug_report['predicted'],
        labels=report_labels, # Ensure all possible labels are listed
        zero_division=0
    ))



--- Review 1: The app crashes every time I try to open a PDF document. ---

LLM Raw Output (with Reasoning):
 CLASSIFICATION: Feature Request

Reasoning:
Step 1: The user's statement is focused on a problem they are experiencing with the app, specifically that it crashes every time they try to open a PDF document. This suggests that the user is asking for improved functionality or a new feature to address this issue.

Step 2: The user's statement does not explicitly ask for something new or improved. It simply describes a problem they are experiencing with the app, which indicates a Feature Request classification.

Step 3: The user's statement does not describe any fault, error, or unintended behavior that would warrant a Bug Report classification.

Step 4: Since the user's statement is focused on a problem they are experiencing with the app and does not request any new functionality or improved features, it is classified as a Feature Request.
Parsed Classification: feature request



In [5]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report
import time
from tqdm import tqdm # Import tqdm

# --- 1. Logging Setup (for this self-contained cell) ---
# force=True makes the format apply even when the root logger already has
# handlers (e.g. from an earlier cell); a plain basicConfig() call would be a
# silent no-op in that case.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration (Hardcoded for this independent test) ---
LLAMA2_MODEL_NAME = "llama2"  # model name sent to Ollama
OLLAMA_BASE_URL = "http://localhost:11434"

# --- 3. Data Loading and Preparation ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---")
# NOTE(review): despite the name, this points at a comma-separated text file.
excel_path = "datasets/BOW_test_sample.txt"

# Header-less file read as "<review>,<label>"; malformed lines are skipped.
fr_data_raw = pd.read_csv(
    excel_path,
    names=['review', 'ground_truth'],
    header=None,
    sep=',',
    on_bad_lines='skip',
)

# Label spellings used for prompts and reports (lowercase, with spaces).
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# The file uses run-together spellings; translate them to the ones above.
label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other',  # identity entry, listed for completeness
}
fr_data_raw = fr_data_raw.assign(
    ground_truth=fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)
)

# Drop every row whose label is still unknown after the mapping.
initial_len = len(fr_data_raw)
fr_data = (
    fr_data_raw.loc[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)]
    .reset_index(drop=True)
)

if initial_len > len(fr_data):
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

# Cap the test set at 20 rows; the fixed seed keeps the draw repeatable and
# min() avoids asking for more rows than exist.
fr_data = (
    fr_data.sample(n=min(20, len(fr_data)), random_state=42)
    .reset_index(drop=True)
)

print(
    f"Loaded and sampled {len(fr_data)} functional reviews for testing.",
    "Sample of loaded FR data:",
    fr_data.head(),
    "-" * 40,
    sep="\n",
)

# --- 4. The Classification Prompt ---
# Zero-shot prompt: category definitions followed by instructions.
# {review_text} is the only placeholder and is filled with str.format() in
# classify_with_llama2. The model is told to answer with the bare category
# name; the evaluation loop still parses the answer with a regex in case extra
# text is returned.
classification_prompt_text = """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Determine which of the three categories (Feature Request, Bug Report, Other) it *most accurately* fits based on the provided definitions.
3.  Your final output MUST be only the category name, without any additional text, explanation, or punctuation.

**App Review Segment:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function (adapted for this cell) ---
def classify_with_llama2(review_text: str) -> dict:
    """
    Ask the local Ollama server to classify a single review.

    The review is inserted into ``classification_prompt_text`` and posted to
    ``{OLLAMA_BASE_URL}/api/generate`` for the model ``LLAMA2_MODEL_NAME``
    (non-streaming, options temperature=0.0 and num_predict=100, 120 s timeout).

    Returns:
        dict: ``{"success": True, "raw_response": <model text>}`` on success,
        otherwise ``{"success": False, "raw_response": <error description>}``.
        Errors are logged and reported through the dict, never raised.
    """
    def _failure(message: str) -> dict:
        # Uniform shape for every error path.
        return {"success": False, "raw_response": message}

    endpoint = f"{OLLAMA_BASE_URL}/api/generate"
    request_headers = {"Content-Type": "application/json"}
    payload = {
        "model": LLAMA2_MODEL_NAME,
        "prompt": classification_prompt_text.format(review_text=review_text),
        "stream": False,  # one complete response instead of chunks
        "options": {
            "temperature": 0.0,  # low temperature for consistent answers
            "num_predict": 100,  # cap the output length
        },
    }

    try:
        response = requests.post(
            endpoint,
            headers=request_headers,
            data=json.dumps(payload),
            timeout=120,
        )
        response.raise_for_status()  # 4xx / 5xx become HTTPError
        body = response.json()
        return {"success": True, "raw_response": body.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return _failure("Connection Error: Ollama server not reachable.")
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: {review_text[:50]}...")
        return _failure("Timeout Error: Ollama request took too long.")
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text}")
        return _failure(f"HTTP Error: {http_err}")
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e}")
        return _failure(f"Unexpected Error: {e}")

# --- 6. Evaluation Loop ---
print("\n========== Starting Classification Evaluation ==========")

# Label patterns, compiled once. Word boundaries are essential: without them,
# and with IGNORECASE, "other" also matches inside "another", "others" or
# "bother", which would turn ordinary prose into an "other" prediction.
_LABEL_ALTERNATIVES = r"(Feature\s+Request|Bug\s+Report|Other)"
_PREFIXED_LABEL_RE = re.compile(r"CLASSIFICATION:\W*" + _LABEL_ALTERNATIVES + r"\b", re.IGNORECASE)
_BARE_LABEL_RE = re.compile(r"\b" + _LABEL_ALTERNATIVES + r"\b", re.IGNORECASE)


def _parse_fr_label(raw_text: str) -> str:
    """Extract the predicted label from raw model output.

    An explicit "Classification: <label>" statement wins; otherwise the first
    stand-alone label in the text is used. Returns "Failed Parsing" when no
    label is present. The result is lowercase with single spaces.
    """
    match = _PREFIXED_LABEL_RE.search(raw_text) or _BARE_LABEL_RE.search(raw_text)
    if match is None:
        return "Failed Parsing"
    return " ".join(match.group(1).lower().split())


predictions = []
start_time = time.time()

for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {LLAMA2_MODEL_NAME}"):
    response_data = classify_with_llama2(row['review'])

    if response_data["success"]:
        predicted_raw = response_data["raw_response"].strip()
        pred = _parse_fr_label(predicted_raw)

        # Optional: Print raw output for debugging if needed (comment out for clean runs)
        # print(f"Raw LLM Output for '{row['review'][:50]}...': '{predicted_raw}' -> Parsed: '{pred}'")

    else:
        pred = "Failed"
        logger.warning(f"Classification failed for review: {row['review'][:50]}... Error: {response_data['raw_response']}")

    predictions.append(pred)

elapsed = time.time() - start_time
print(f"\n✅ Classification completed in {elapsed/60:.2f} minutes")

# --- 7. Prepare Results and Generate Classification Report ---
results_df = fr_data.copy()
results_df['predicted'] = predictions

# Score only predictions that are valid labels. The failure sentinels written
# by the loop ("Failed", "Failed Parsing") are capitalised, so a comparison with
# the lowercase string 'failed' would never match; the membership test below is
# what actually removes them.
is_scorable = results_df['predicted'].isin(VALID_FR_LABELS)
filtered_results = results_df[is_scorable]

print("\n--- Sample of Predictions ---")
print(results_df.head())

print(f"\n--- Classification Report for {LLAMA2_MODEL_NAME} ---")

# Make the exclusion visible: the report's support only counts scored rows.
excluded_count = len(results_df) - len(filtered_results)
if excluded_count:
    print(
        f"Note: {excluded_count} of {len(results_df)} predictions failed or could not be "
        f"parsed and are excluded from the report."
    )

if not filtered_results.empty:
    report = classification_report(
        filtered_results['ground_truth'],
        filtered_results['predicted'],
        labels=VALID_FR_LABELS, # Explicitly list labels to ensure all are shown, even if no predictions
        zero_division=0
    )
    print(report)
else:
    print("No valid predictions to generate a classification report.")

print("\n--- Evaluation Complete ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---
Loaded and sampled 20 functional reviews for testing.
Sample of loaded FR data:
                                              review     ground_truth
0  'the current fb app is not good at all for tab...       bug report
1  'I often find myself accidentally deleting words'            other
2                    'It also takes a lot of memory'            other
3  'the problem is with the way items displaypics...       bug report
4  'i use my phone almost  exclusivly to log into...  feature request
----------------------------------------



Classifying reviews with llama2: 100%|█████████████████████████████████████████████████| 20/20 [01:03<00:00,  3.19s/it]


✅ Classification completed in 1.06 minutes

--- Sample of Predictions ---
                                              review     ground_truth  \
0  'the current fb app is not good at all for tab...       bug report   
1  'I often find myself accidentally deleting words'            other   
2                    'It also takes a lot of memory'            other   
3  'the problem is with the way items displaypics...       bug report   
4  'i use my phone almost  exclusivly to log into...  feature request   

    predicted  
0  bug report  
1  bug report  
2  bug report  
3  bug report  
4  bug report  

--- Classification Report for llama2 ---
                 precision    recall  f1-score   support

feature request       0.60      0.43      0.50         7
     bug report       0.33      0.83      0.48         6
          other       0.00      0.00      0.00         7

       accuracy                           0.40        20
      macro avg       0.31      0.42      0.33        20
   w




In [6]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report
import time
from tqdm import tqdm # Import tqdm

# --- 1. Logging: INFO-level messages with timestamp, logger name and level ---
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- 2. LLM configuration for this standalone cell ---
# The constant keeps its historical name; it holds whichever Ollama model tag is under test.
LLAMA2_MODEL_NAME = "mistral"
OLLAMA_BASE_URL = "http://localhost:11434"

# --- 3. Load and prepare the labelled reviews ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---")
excel_path = "datasets/BOW_test_sample.txt"  # comma-separated text file, despite the variable name

# Two header-less columns: review text, then its label. Lines pandas cannot parse are skipped.
fr_data_raw = pd.read_csv(excel_path, sep=',', header=None, names=['review', 'ground_truth'], on_bad_lines='skip')

# Canonical (lowercase, spaced) category names used for both ground truth and predictions.
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# The file spells labels without a space; translate them to the canonical form.
label_mapping = dict(bugreport='bug report', featurerequest='feature request', other='other')
normalized_labels = fr_data_raw['ground_truth'].str.strip().str.lower()
fr_data_raw['ground_truth'] = normalized_labels.replace(label_mapping)

# Keep only rows whose label is one of the canonical categories.
initial_len = len(fr_data_raw)
has_valid_label = fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)
fr_data = fr_data_raw[has_valid_label].reset_index(drop=True)

if initial_len > len(fr_data):
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

# Cap the evaluation set at 20 rows (fewer if the file is smaller); fixed seed for repeatability.
sample_size = min(20, len(fr_data))
fr_data = fr_data.sample(n=sample_size, random_state=42).reset_index(drop=True)

print(f"Loaded and sampled {len(fr_data)} functional reviews for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. The Classification Prompt ---
# Template for a single classification request. `{review_text}` is the only
# placeholder; classify_with_llama2 fills it in with str.format. The template
# asks the model to answer with the bare category name, which the evaluation
# loop then extracts from the response with a regex.
classification_prompt_text = """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Determine which of the three categories (Feature Request, Bug Report, Other) it *most accurately* fits based on the provided definitions.
3.  Your final output MUST be only the category name, without any additional text, explanation, or punctuation.

**App Review Segment:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function (adapted for this cell) ---
def classify_with_llama2(review_text: str) -> dict:
    """
    Ask the configured local Ollama model to classify one app review.

    Despite the historical function name, the model queried is whatever
    ``LLAMA2_MODEL_NAME`` is set to. The request is a non-streaming POST to
    ``{OLLAMA_BASE_URL}/api/generate`` with temperature 0.0 and at most 100
    generated tokens.

    Args:
        review_text: Review segment substituted into
            ``classification_prompt_text``. Non-string values (for example a
            NaN that pandas produces for an empty field) are converted with
            ``str()`` so that log truncation in the error paths cannot fail.

    Returns:
        dict with two keys:
            ``success`` (bool): True when the HTTP call succeeded and the
                body was decoded as JSON.
            ``raw_response`` (str): the model's text on success (empty string
                if the reply has no usable ``response`` field), otherwise a
                short description of the error.

    Errors raised while sending the request or decoding the reply are logged
    and reported through the returned dict instead of being propagated.
    """
    # Coerce once so slicing for log messages is always valid.
    review_text = str(review_text)

    url = f"{OLLAMA_BASE_URL}/api/generate"

    payload = {
        "model": LLAMA2_MODEL_NAME,
        "prompt": classification_prompt_text.format(review_text=review_text),
        "stream": False,  # one complete JSON document instead of a chunk stream
        "options": {
            "temperature": 0.0,  # deterministic output for repeatable classification
            "num_predict": 100,  # cap generation length; only a category name is expected
        },
    }

    try:
        # `json=` serialises the payload and sets the Content-Type header.
        response = requests.post(url, json=payload, timeout=120)
        response.raise_for_status()  # turn 4xx/5xx into HTTPError
        result = response.json()
        # Guard against a null "response" field so callers can always call .strip().
        return {"success": True, "raw_response": str(result.get("response") or "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: {review_text[:50]}...")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        # Last-resort guard (e.g. a body that is not valid JSON) so one bad reply does not abort the run.
        logger.error(f"An unexpected error occurred during Ollama call: {e}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Evaluation Loop ---
print("\n========== Starting Classification Evaluation ==========")

# Whole-word, case-insensitive match for one of the three categories.
# The \b anchors stop "other" from being found inside words such as "another"
# or "otherwise"; \s+ tolerates a line break or repeated spaces between the two
# words, and the optional trailing "s" keeps plurals ("Bug Reports") matching.
label_pattern = re.compile(r"\b(feature\s+request|bug\s+report|other)s?\b", re.IGNORECASE)

predictions = []
start_time = time.time()

for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {LLAMA2_MODEL_NAME}"):
    response_data = classify_with_llama2(row['review'])

    if response_data["success"]:
        predicted_raw = response_data["raw_response"].strip()
        match = label_pattern.search(predicted_raw)

        # Lower-case and collapse whitespace so the value equals an entry of VALID_FR_LABELS.
        pred = " ".join(match.group(1).lower().split()) if match else "Failed Parsing"
        logger.debug("Raw LLM output %r parsed as %r", predicted_raw, pred)
    else:
        pred = "Failed"
        # str() keeps the log line safe when the review is not a string (e.g. NaN).
        logger.warning(f"Classification failed for review: {str(row['review'])[:50]}... Error: {response_data['raw_response']}")

    predictions.append(pred)

elapsed = time.time() - start_time
print(f"\n✅ Classification completed in {elapsed/60:.2f} minutes")

# --- 7. Prepare Results and Generate Classification Report ---
results_df = fr_data.copy()
results_df['predicted'] = predictions

# The sentinels "Failed" and "Failed Parsing" are not members of VALID_FR_LABELS,
# so the membership test alone removes every unusable prediction.
filtered_results = results_df[results_df['predicted'].isin(VALID_FR_LABELS)]
excluded_count = len(results_df) - len(filtered_results)

print("\n--- Sample of Predictions ---")
print(results_df.head())

print(f"\n--- Classification Report for {LLAMA2_MODEL_NAME} ---")
if excluded_count:
    # Make the exclusion visible: the metrics below are computed on fewer rows than were classified.
    print(f"Warning: {excluded_count} of {len(results_df)} predictions failed or could not be parsed and are excluded from the report.")
if not filtered_results.empty:
    report = classification_report(
        filtered_results['ground_truth'],
        filtered_results['predicted'],
        labels=VALID_FR_LABELS,  # list every category so classes with no predictions still appear
        zero_division=0
    )
    print(report)
else:
    print("No valid predictions to generate a classification report.")

print("\n--- Evaluation Complete ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---
Loaded and sampled 20 functional reviews for testing.
Sample of loaded FR data:
                                              review     ground_truth
0  'the current fb app is not good at all for tab...       bug report
1  'I often find myself accidentally deleting words'            other
2                    'It also takes a lot of memory'            other
3  'the problem is with the way items displaypics...       bug report
4  'i use my phone almost  exclusivly to log into...  feature request
----------------------------------------



Classifying reviews with mistral: 100%|████████████████████████████████████████████████| 20/20 [00:56<00:00,  2.84s/it]


✅ Classification completed in 0.95 minutes

--- Sample of Predictions ---
                                              review     ground_truth  \
0  'the current fb app is not good at all for tab...       bug report   
1  'I often find myself accidentally deleting words'            other   
2                    'It also takes a lot of memory'            other   
3  'the problem is with the way items displaypics...       bug report   
4  'i use my phone almost  exclusivly to log into...  feature request   

         predicted  
0            other  
1            other  
2            other  
3            other  
4  feature request  

--- Classification Report for mistral ---
                 precision    recall  f1-score   support

feature request       1.00      0.71      0.83         7
     bug report       0.60      0.50      0.55         6
          other       0.60      0.86      0.71         7

       accuracy                           0.70        20
      macro avg       0.73    




In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report
import time
from tqdm import tqdm # Import tqdm

# --- 1. Logging: INFO-level messages with timestamp, logger name and level ---
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- 2. LLM configuration for this standalone cell ---
# The constant keeps its historical name; it holds whichever Ollama model tag is under test.
LLAMA2_MODEL_NAME = "llama3:8b"
OLLAMA_BASE_URL = "http://localhost:11434"

# --- 3. Load and prepare the labelled reviews ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---")
excel_path = "datasets/BOW_test_sample.txt"  # comma-separated text file, despite the variable name

# Two header-less columns: review text, then its label. Lines pandas cannot parse are skipped.
fr_data_raw = pd.read_csv(excel_path, sep=',', header=None, names=['review', 'ground_truth'], on_bad_lines='skip')

# Canonical (lowercase, spaced) category names used for both ground truth and predictions.
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# The file spells labels without a space; translate them to the canonical form.
label_mapping = dict(bugreport='bug report', featurerequest='feature request', other='other')
normalized_labels = fr_data_raw['ground_truth'].str.strip().str.lower()
fr_data_raw['ground_truth'] = normalized_labels.replace(label_mapping)

# Keep only rows whose label is one of the canonical categories.
initial_len = len(fr_data_raw)
has_valid_label = fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)
fr_data = fr_data_raw[has_valid_label].reset_index(drop=True)

if initial_len > len(fr_data):
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

# Cap the evaluation set at 20 rows (fewer if the file is smaller); fixed seed for repeatability.
sample_size = min(20, len(fr_data))
fr_data = fr_data.sample(n=sample_size, random_state=42).reset_index(drop=True)

print(f"Loaded and sampled {len(fr_data)} functional reviews for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. The Classification Prompt ---
# Template for a single classification request. `{review_text}` is the only
# placeholder; classify_with_llama2 fills it in with str.format. The template
# asks the model to answer with the bare category name, which the evaluation
# loop then extracts from the response with a regex.
classification_prompt_text = """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Determine which of the three categories (Feature Request, Bug Report, Other) it *most accurately* fits based on the provided definitions.
3.  Your final output MUST be only the category name, without any additional text, explanation, or punctuation.

**App Review Segment:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function (adapted for this cell) ---
def classify_with_llama2(review_text: str) -> dict:
    """
    Ask the configured local Ollama model to classify one app review.

    Despite the historical function name, the model queried is whatever
    ``LLAMA2_MODEL_NAME`` is set to. The request is a non-streaming POST to
    ``{OLLAMA_BASE_URL}/api/generate`` with temperature 0.0 and at most 100
    generated tokens.

    Args:
        review_text: Review segment substituted into
            ``classification_prompt_text``. Non-string values (for example a
            NaN that pandas produces for an empty field) are converted with
            ``str()`` so that log truncation in the error paths cannot fail.

    Returns:
        dict with two keys:
            ``success`` (bool): True when the HTTP call succeeded and the
                body was decoded as JSON.
            ``raw_response`` (str): the model's text on success (empty string
                if the reply has no usable ``response`` field), otherwise a
                short description of the error.

    Errors raised while sending the request or decoding the reply are logged
    and reported through the returned dict instead of being propagated.
    """
    # Coerce once so slicing for log messages is always valid.
    review_text = str(review_text)

    url = f"{OLLAMA_BASE_URL}/api/generate"

    payload = {
        "model": LLAMA2_MODEL_NAME,
        "prompt": classification_prompt_text.format(review_text=review_text),
        "stream": False,  # one complete JSON document instead of a chunk stream
        "options": {
            "temperature": 0.0,  # deterministic output for repeatable classification
            "num_predict": 100,  # cap generation length; only a category name is expected
        },
    }

    try:
        # `json=` serialises the payload and sets the Content-Type header.
        response = requests.post(url, json=payload, timeout=120)
        response.raise_for_status()  # turn 4xx/5xx into HTTPError
        result = response.json()
        # Guard against a null "response" field so callers can always call .strip().
        return {"success": True, "raw_response": str(result.get("response") or "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: {review_text[:50]}...")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        # Last-resort guard (e.g. a body that is not valid JSON) so one bad reply does not abort the run.
        logger.error(f"An unexpected error occurred during Ollama call: {e}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Evaluation Loop ---
print("\n========== Starting Classification Evaluation ==========")

# Whole-word, case-insensitive match for one of the three categories.
# The \b anchors stop "other" from being found inside words such as "another"
# or "otherwise"; \s+ tolerates a line break or repeated spaces between the two
# words, and the optional trailing "s" keeps plurals ("Bug Reports") matching.
label_pattern = re.compile(r"\b(feature\s+request|bug\s+report|other)s?\b", re.IGNORECASE)

predictions = []
start_time = time.time()

for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {LLAMA2_MODEL_NAME}"):
    response_data = classify_with_llama2(row['review'])

    if response_data["success"]:
        predicted_raw = response_data["raw_response"].strip()
        match = label_pattern.search(predicted_raw)

        # Lower-case and collapse whitespace so the value equals an entry of VALID_FR_LABELS.
        pred = " ".join(match.group(1).lower().split()) if match else "Failed Parsing"
        logger.debug("Raw LLM output %r parsed as %r", predicted_raw, pred)
    else:
        pred = "Failed"
        # str() keeps the log line safe when the review is not a string (e.g. NaN).
        logger.warning(f"Classification failed for review: {str(row['review'])[:50]}... Error: {response_data['raw_response']}")

    predictions.append(pred)

elapsed = time.time() - start_time
print(f"\n✅ Classification completed in {elapsed/60:.2f} minutes")

# --- 7. Prepare Results and Generate Classification Report ---
results_df = fr_data.copy()
results_df['predicted'] = predictions

# The sentinels "Failed" and "Failed Parsing" are not members of VALID_FR_LABELS,
# so the membership test alone removes every unusable prediction.
filtered_results = results_df[results_df['predicted'].isin(VALID_FR_LABELS)]
excluded_count = len(results_df) - len(filtered_results)

print("\n--- Sample of Predictions ---")
print(results_df.head())

print(f"\n--- Classification Report for {LLAMA2_MODEL_NAME} ---")
if excluded_count:
    # Make the exclusion visible: the metrics below are computed on fewer rows than were classified.
    print(f"Warning: {excluded_count} of {len(results_df)} predictions failed or could not be parsed and are excluded from the report.")
if not filtered_results.empty:
    report = classification_report(
        filtered_results['ground_truth'],
        filtered_results['predicted'],
        labels=VALID_FR_LABELS,  # list every category so classes with no predictions still appear
        zero_division=0
    )
    print(report)
else:
    print("No valid predictions to generate a classification report.")

print("\n--- Evaluation Complete ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---
Loaded and sampled 20 functional reviews for testing.
Sample of loaded FR data:
                                              review     ground_truth
0  'the current fb app is not good at all for tab...       bug report
1  'I often find myself accidentally deleting words'            other
2                    'It also takes a lot of memory'            other
3  'the problem is with the way items displaypics...       bug report
4  'i use my phone almost  exclusivly to log into...  feature request
----------------------------------------



Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████████| 20/20 [01:06<00:00,  3.31s/it]


✅ Classification completed in 1.10 minutes

--- Sample of Predictions ---
                                              review     ground_truth  \
0  'the current fb app is not good at all for tab...       bug report   
1  'I often find myself accidentally deleting words'            other   
2                    'It also takes a lot of memory'            other   
3  'the problem is with the way items displaypics...       bug report   
4  'i use my phone almost  exclusivly to log into...  feature request   

    predicted  
0  bug report  
1  bug report  
2  bug report  
3  bug report  
4  bug report  

--- Classification Report for llama3:8b ---
                 precision    recall  f1-score   support

feature request       1.00      0.14      0.25         7
     bug report       0.32      1.00      0.48         6
          other       0.00      0.00      0.00         7

       accuracy                           0.35        20
      macro avg       0.44      0.38      0.24        20
 




In [2]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report
import time
from tqdm import tqdm # Import tqdm

# --- 1. Logging: INFO-level messages with timestamp, logger name and level ---
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- 2. LLM configuration for this standalone cell ---
# The constant keeps its historical name; it holds whichever Ollama model tag is under test.
LLAMA2_MODEL_NAME = "gemma:7b"
OLLAMA_BASE_URL = "http://localhost:11434"

# --- 3. Load and prepare the labelled reviews ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---")
excel_path = "datasets/BOW_test_sample.txt"  # comma-separated text file, despite the variable name

# Two header-less columns: review text, then its label. Lines pandas cannot parse are skipped.
fr_data_raw = pd.read_csv(excel_path, sep=',', header=None, names=['review', 'ground_truth'], on_bad_lines='skip')

# Canonical (lowercase, spaced) category names used for both ground truth and predictions.
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# The file spells labels without a space; translate them to the canonical form.
label_mapping = dict(bugreport='bug report', featurerequest='feature request', other='other')
normalized_labels = fr_data_raw['ground_truth'].str.strip().str.lower()
fr_data_raw['ground_truth'] = normalized_labels.replace(label_mapping)

# Keep only rows whose label is one of the canonical categories.
initial_len = len(fr_data_raw)
has_valid_label = fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)
fr_data = fr_data_raw[has_valid_label].reset_index(drop=True)

if initial_len > len(fr_data):
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

# Cap the evaluation set at 20 rows (fewer if the file is smaller); fixed seed for repeatability.
sample_size = min(20, len(fr_data))
fr_data = fr_data.sample(n=sample_size, random_state=42).reset_index(drop=True)

print(f"Loaded and sampled {len(fr_data)} functional reviews for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. The Classification Prompt ---
# Template for a single classification request. `{review_text}` is the only
# placeholder; classify_with_llama2 fills it in with str.format. The template
# asks the model to answer with the bare category name, which the evaluation
# loop then extracts from the response with a regex.
classification_prompt_text = """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Determine which of the three categories (Feature Request, Bug Report, Other) it *most accurately* fits based on the provided definitions.
3.  Your final output MUST be only the category name, without any additional text, explanation, or punctuation.

**App Review Segment:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function (adapted for this cell) ---
def classify_with_llama2(review_text: str) -> dict:
    """
    Ask the configured local Ollama model to classify one app review.

    Despite the historical function name, the model queried is whatever
    ``LLAMA2_MODEL_NAME`` is set to. The request is a non-streaming POST to
    ``{OLLAMA_BASE_URL}/api/generate`` with temperature 0.0 and at most 100
    generated tokens.

    Args:
        review_text: Review segment substituted into
            ``classification_prompt_text``. Non-string values (for example a
            NaN that pandas produces for an empty field) are converted with
            ``str()`` so that log truncation in the error paths cannot fail.

    Returns:
        dict with two keys:
            ``success`` (bool): True when the HTTP call succeeded and the
                body was decoded as JSON.
            ``raw_response`` (str): the model's text on success (empty string
                if the reply has no usable ``response`` field), otherwise a
                short description of the error.

    Errors raised while sending the request or decoding the reply are logged
    and reported through the returned dict instead of being propagated.
    """
    # Coerce once so slicing for log messages is always valid.
    review_text = str(review_text)

    url = f"{OLLAMA_BASE_URL}/api/generate"

    payload = {
        "model": LLAMA2_MODEL_NAME,
        "prompt": classification_prompt_text.format(review_text=review_text),
        "stream": False,  # one complete JSON document instead of a chunk stream
        "options": {
            "temperature": 0.0,  # deterministic output for repeatable classification
            "num_predict": 100,  # cap generation length; only a category name is expected
        },
    }

    try:
        # `json=` serialises the payload and sets the Content-Type header.
        response = requests.post(url, json=payload, timeout=120)
        response.raise_for_status()  # turn 4xx/5xx into HTTPError
        result = response.json()
        # Guard against a null "response" field so callers can always call .strip().
        return {"success": True, "raw_response": str(result.get("response") or "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: {review_text[:50]}...")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        # Last-resort guard (e.g. a body that is not valid JSON) so one bad reply does not abort the run.
        logger.error(f"An unexpected error occurred during Ollama call: {e}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Evaluation Loop ---
print("\n========== Starting Classification Evaluation ==========")

# Whole-word, case-insensitive match for one of the three categories.
# The \b anchors stop "other" from being found inside words such as "another"
# or "otherwise"; \s+ tolerates a line break or repeated spaces between the two
# words, and the optional trailing "s" keeps plurals ("Bug Reports") matching.
label_pattern = re.compile(r"\b(feature\s+request|bug\s+report|other)s?\b", re.IGNORECASE)

predictions = []
start_time = time.time()

for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {LLAMA2_MODEL_NAME}"):
    response_data = classify_with_llama2(row['review'])

    if response_data["success"]:
        predicted_raw = response_data["raw_response"].strip()
        match = label_pattern.search(predicted_raw)

        # Lower-case and collapse whitespace so the value equals an entry of VALID_FR_LABELS.
        pred = " ".join(match.group(1).lower().split()) if match else "Failed Parsing"
        logger.debug("Raw LLM output %r parsed as %r", predicted_raw, pred)
    else:
        pred = "Failed"
        # str() keeps the log line safe when the review is not a string (e.g. NaN).
        logger.warning(f"Classification failed for review: {str(row['review'])[:50]}... Error: {response_data['raw_response']}")

    predictions.append(pred)

elapsed = time.time() - start_time
print(f"\n✅ Classification completed in {elapsed/60:.2f} minutes")

# --- 7. Prepare Results and Generate Classification Report ---
results_df = fr_data.copy()
results_df['predicted'] = predictions

# The sentinels "Failed" and "Failed Parsing" are not members of VALID_FR_LABELS,
# so the membership test alone removes every unusable prediction.
filtered_results = results_df[results_df['predicted'].isin(VALID_FR_LABELS)]
excluded_count = len(results_df) - len(filtered_results)

print("\n--- Sample of Predictions ---")
print(results_df.head())

print(f"\n--- Classification Report for {LLAMA2_MODEL_NAME} ---")
if excluded_count:
    # Make the exclusion visible: the metrics below are computed on fewer rows than were classified.
    print(f"Warning: {excluded_count} of {len(results_df)} predictions failed or could not be parsed and are excluded from the report.")
if not filtered_results.empty:
    report = classification_report(
        filtered_results['ground_truth'],
        filtered_results['predicted'],
        labels=VALID_FR_LABELS,  # list every category so classes with no predictions still appear
        zero_division=0
    )
    print(report)
else:
    print("No valid predictions to generate a classification report.")

print("\n--- Evaluation Complete ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---
Loaded and sampled 20 functional reviews for testing.
Sample of loaded FR data:
                                              review     ground_truth
0  'the current fb app is not good at all for tab...       bug report
1  'I often find myself accidentally deleting words'            other
2                    'It also takes a lot of memory'            other
3  'the problem is with the way items displaypics...       bug report
4  'i use my phone almost  exclusivly to log into...  feature request
----------------------------------------



Classifying reviews with gemma:7b: 100%|███████████████████████████████████████████████| 20/20 [01:18<00:00,  3.95s/it]


✅ Classification completed in 1.32 minutes

--- Sample of Predictions ---
                                              review     ground_truth  \
0  'the current fb app is not good at all for tab...       bug report   
1  'I often find myself accidentally deleting words'            other   
2                    'It also takes a lot of memory'            other   
3  'the problem is with the way items displaypics...       bug report   
4  'i use my phone almost  exclusivly to log into...  feature request   

         predicted  
0  feature request  
1       bug report  
2       bug report  
3  feature request  
4  feature request  

--- Classification Report for gemma:7b ---
                 precision    recall  f1-score   support

feature request       0.42      0.71      0.53         7
     bug report       0.38      0.50      0.43         6
          other       0.00      0.00      0.00         7

       accuracy                           0.40        20
      macro avg       0.26   




In [3]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report
import time
from tqdm import tqdm # Import tqdm

# --- 1. Logging: INFO-level messages with timestamp, logger name and level ---
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- 2. LLM configuration for this standalone cell ---
# The constant keeps its historical name; it holds whichever Ollama model tag is under test.
LLAMA2_MODEL_NAME = "phi3:mini"
OLLAMA_BASE_URL = "http://localhost:11434"

# --- 3. Load and prepare the labelled reviews ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---")
excel_path = "datasets/BOW_test_sample.txt"  # comma-separated text file, despite the variable name

# Two header-less columns: review text, then its label. Lines pandas cannot parse are skipped.
fr_data_raw = pd.read_csv(excel_path, sep=',', header=None, names=['review', 'ground_truth'], on_bad_lines='skip')

# Canonical (lowercase, spaced) category names used for both ground truth and predictions.
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# The file spells labels without a space; translate them to the canonical form.
label_mapping = dict(bugreport='bug report', featurerequest='feature request', other='other')
normalized_labels = fr_data_raw['ground_truth'].str.strip().str.lower()
fr_data_raw['ground_truth'] = normalized_labels.replace(label_mapping)

# Keep only rows whose label is one of the canonical categories.
initial_len = len(fr_data_raw)
has_valid_label = fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)
fr_data = fr_data_raw[has_valid_label].reset_index(drop=True)

if initial_len > len(fr_data):
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

# Cap the evaluation set at 20 rows (fewer if the file is smaller); fixed seed for repeatability.
sample_size = min(20, len(fr_data))
fr_data = fr_data.sample(n=sample_size, random_state=42).reset_index(drop=True)

print(f"Loaded and sampled {len(fr_data)} functional reviews for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. The Classification Prompt ---
# Template for a single classification request. `{review_text}` is the only
# placeholder; classify_with_llama2 fills it in with str.format. The template
# asks the model to answer with the bare category name.
classification_prompt_text = """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Determine which of the three categories (Feature Request, Bug Report, Other) it *most accurately* fits based on the provided definitions.
3.  Your final output MUST be only the category name, without any additional text, explanation, or punctuation.

**App Review Segment:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function (adapted for this cell) ---
def classify_with_llama2(review_text: str) -> dict:
    """
    Ask the local Ollama server to classify a single review.

    The review is inserted into ``classification_prompt_text`` and posted to
    the ``/api/generate`` endpoint as a non-streaming request for the model
    named by ``LLAMA2_MODEL_NAME``.

    Args:
        review_text: Review segment to classify.

    Returns:
        A dict with two keys. ``success`` is True when the HTTP call and the
        JSON decoding both succeeded; ``raw_response`` then holds the reply's
        ``"response"`` field (empty string if the field is absent). When the
        request fails, ``success`` is False and ``raw_response`` is a short
        error description; the failure is logged instead of being raised.
    """
    endpoint = f"{OLLAMA_BASE_URL}/api/generate"
    request_headers = {"Content-Type": "application/json"}
    prompt = classification_prompt_text.format(review_text=review_text)

    # temperature 0.0 keeps answers consistent between runs; num_predict caps
    # the reply length so the model cannot ramble.
    payload = {
        "model": LLAMA2_MODEL_NAME,
        "prompt": prompt,
        "stream": False,  # one complete reply instead of a token stream
        "options": {"temperature": 0.0, "num_predict": 100},
    }

    try:
        reply = requests.post(
            endpoint,
            headers=request_headers,
            data=json.dumps(payload),
            timeout=120,
        )
        # 4xx/5xx statuses are turned into HTTPError and handled below.
        reply.raise_for_status()
        return {"success": True, "raw_response": reply.json().get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return dict(success=False, raw_response="Connection Error: Ollama server not reachable.")
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: {review_text[:50]}...")
        return dict(success=False, raw_response="Timeout Error: Ollama request took too long.")
    except requests.exceptions.HTTPError as http_error:
        logger.error(f"HTTP error occurred: {http_error} - {reply.text}")
        return dict(success=False, raw_response=f"HTTP Error: {http_error}")
    except Exception as exc:
        # Catch-all (e.g. a reply body that is not valid JSON) so one bad
        # request cannot abort the whole evaluation run.
        logger.error(f"An unexpected error occurred during Ollama call: {exc}")
        return dict(success=False, raw_response=f"Unexpected Error: {exc}")

# --- 6. Evaluation Loop ---
print("\n========== Starting Classification Evaluation ==========")

# Matches one of the three category names, optionally preceded by a
# "Classification:" label. The \b anchors require whole words, so "other" is
# no longer picked up from inside words such as "another" or "otherwise" when
# the model wraps its answer in extra text.
category_pattern = re.compile(
    r"(?:CLASSIFICATION:\s*)?\b(Feature Request|Bug Report|Other)\b",
    re.IGNORECASE,
)

predictions = []
start_time = time.time()

for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {LLAMA2_MODEL_NAME}"):
    response_data = classify_with_llama2(row['review'])

    if response_data["success"]:
        predicted_raw = response_data["raw_response"].strip()
        match = category_pattern.search(predicted_raw)

        # Lower-case the label to match the lower-case ground-truth labels.
        # Replies naming no category get a sentinel that is filtered out below.
        pred = match.group(1).lower() if match else "Failed Parsing"

        # Optional: Print raw output for debugging if needed (comment out for clean runs)
        # print(f"Raw LLM Output for '{row['review'][:50]}...': '{predicted_raw}' -> Parsed: '{pred}'")
    else:
        # The request itself failed; record a sentinel and keep going.
        pred = "Failed"
        logger.warning(f"Classification failed for review: {row['review'][:50]}... Error: {response_data['raw_response']}")

    predictions.append(pred)

elapsed = time.time() - start_time
print(f"\n✅ Classification completed in {elapsed/60:.2f} minutes")

# --- 7. Prepare Results and Generate Classification Report ---
results_df = fr_data.copy()
results_df['predicted'] = predictions

# Keep only rows whose prediction is a recognised label. The "Failed" and
# "Failed Parsing" sentinels are not valid labels, so this one membership test
# removes them (the former `!= 'failed'` comparison never matched anything
# because the sentinels are capitalised).
filtered_results = results_df[results_df['predicted'].isin(VALID_FR_LABELS)]
excluded_count = len(results_df) - len(filtered_results)

print("\n--- Sample of Predictions ---")
print(results_df.head())

print(f"\n--- Classification Report for {LLAMA2_MODEL_NAME} ---")
if excluded_count:
    # Make the exclusion visible: the metrics below cover parsed rows only.
    print(f"Warning: {excluded_count} of {len(results_df)} predictions were request or parsing failures and are excluded from the report.")
if not filtered_results.empty:
    report = classification_report(
        filtered_results['ground_truth'],
        filtered_results['predicted'],
        labels=VALID_FR_LABELS, # Explicitly list labels to ensure all are shown, even if no predictions
        zero_division=0
    )
    print(report)
else:
    print("No valid predictions to generate a classification report.")

print("\n--- Evaluation Complete ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt ---
Loaded and sampled 20 functional reviews for testing.
Sample of loaded FR data:
                                              review     ground_truth
0  'the current fb app is not good at all for tab...       bug report
1  'I often find myself accidentally deleting words'            other
2                    'It also takes a lot of memory'            other
3  'the problem is with the way items displaypics...       bug report
4  'i use my phone almost  exclusivly to log into...  feature request
----------------------------------------



Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████████| 20/20 [00:49<00:00,  2.47s/it]


✅ Classification completed in 0.82 minutes

--- Sample of Predictions ---
                                              review     ground_truth  \
0  'the current fb app is not good at all for tab...       bug report   
1  'I often find myself accidentally deleting words'            other   
2                    'It also takes a lot of memory'            other   
3  'the problem is with the way items displaypics...       bug report   
4  'i use my phone almost  exclusivly to log into...  feature request   

    predicted  
0       other  
1  bug report  
2  bug report  
3  bug report  
4  bug report  

--- Classification Report for phi3:mini ---
                 precision    recall  f1-score   support

feature request       1.00      0.43      0.60         7
     bug report       0.33      0.83      0.48         6
          other       0.50      0.14      0.22         7

       accuracy                           0.45        20
      macro avg       0.61      0.47      0.43        20
 




In [5]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import os # Just in case it's needed for path resolution or similar

# --- 1. Logging Setup ---
# `logger` is the handle the rest of this cell logs through.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
# Base URL of the Ollama server that the classification requests are sent to.
OLLAMA_BASE_URL = "http://localhost:11434"

# Ollama model tags to evaluate, in evaluation order. Each one is assumed to
# have been pulled locally already (e.g. llama3:8b).
OLLAMA_MODELS_TO_TEST = ["llama2", "mistral", "llama3:8b", "gemma:7b", "phi3:mini"]

# --- 3. Data Loading and Preparation (Full Dataset) ---
print("--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---")
data_file_path = "datasets/BOW_test_sample.txt"

# The file has no header row; each line is assumed to be "<review>,<label>".
# Lines that do not parse into that shape are skipped instead of aborting.
# NOTE(review): reviews appear to be wrapped in single quotes in the file; with
# the default quote character a comma inside a review would presumably make
# the line "bad" and drop it silently - confirm against the raw file.
read_options = dict(
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip',
)
fr_data_raw = pd.read_csv(data_file_path, **read_options)

# Canonical label names (lower-case, space-separated) used for filtering and
# for the classification report.
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# Translate the label spellings used in the file into the canonical names.
label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other',
}
normalized_labels = fr_data_raw['ground_truth'].str.strip().str.lower()
fr_data_raw['ground_truth'] = normalized_labels.replace(label_mapping)

# Drop rows whose label is still not canonical after mapping, and report how
# many were lost.
initial_len = len(fr_data_raw)
has_valid_label = fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)
fr_data = fr_data_raw[has_valid_label].reset_index(drop=True)

if initial_len > len(fr_data):
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

# No subsampling in this cell: every row that survived the filter is evaluated.

print(f"Loaded {len(fr_data)} functional reviews from the full dataset for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. The Consistent Classification Prompt ---
# Prompt template sent unchanged to every model, once per review.
# `{review_text}` is the only placeholder; classify_with_ollama_model fills it
# in with str.format. The template asks for exactly one of 'Feature Request',
# 'Bug Report' or 'Other'; the evaluation loop searches the reply for those
# three names, so renaming a category here requires updating that regex too.
classification_prompt_text = """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Determine which of the three categories (Feature Request, Bug Report, Other) it *most accurately* fits based on the provided definitions.
3.  Your final output MUST be only the category name, without any additional text, explanation, or punctuation.

**App Review Segment:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str) -> dict:
    """
    Ask one model on the local Ollama server to classify a single review.

    The review is inserted into ``classification_prompt_text`` and posted to
    the ``/api/generate`` endpoint as a non-streaming request.

    Args:
        review_text: Review segment to classify.
        model_name: Ollama model tag to query (e.g. ``"mistral"``).

    Returns:
        A dict with two keys. ``success`` is True when the HTTP call and the
        JSON decoding both succeeded; ``raw_response`` then holds the reply's
        ``"response"`` field (empty string if the field is absent). When the
        request fails, ``success`` is False and ``raw_response`` is a short
        error description; the failure is logged instead of being raised.
    """
    endpoint = f"{OLLAMA_BASE_URL}/api/generate"
    request_headers = {"Content-Type": "application/json"}
    prompt = classification_prompt_text.format(review_text=review_text)

    # temperature 0.0 keeps answers consistent between runs; num_predict caps
    # the reply length.
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,  # one complete reply instead of a token stream
        "options": {"temperature": 0.0, "num_predict": 100},
    }

    try:
        reply = requests.post(
            endpoint,
            headers=request_headers,
            data=json.dumps(payload),
            timeout=120,
        )
        # 4xx/5xx statuses are turned into HTTPError and handled below.
        reply.raise_for_status()
        return {"success": True, "raw_response": reply.json().get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return dict(success=False, raw_response="Connection Error: Ollama server not reachable.")
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return dict(success=False, raw_response="Timeout Error: Ollama request took too long.")
    except requests.exceptions.HTTPError as http_error:
        logger.error(f"HTTP error occurred: {http_error} - {reply.text} with model {model_name}")
        return dict(success=False, raw_response=f"HTTP Error: {http_error}")
    except Exception as exc:
        # Catch-all (e.g. a reply body that is not valid JSON) so one bad
        # request cannot abort the whole evaluation run.
        logger.error(f"An unexpected error occurred during Ollama call: {exc} with model {model_name}")
        return dict(success=False, raw_response=f"Unexpected Error: {exc}")

# --- 6. Main Evaluation Loop for All Models ---
# Per-model summary keyed by model name: accuracy, report text and the number
# of reviews that had to be left out of the metrics.
all_models_results = {}

# Matches one of the three category names, optionally preceded by a
# "Classification:" label. The \b anchors require whole words, so "other" is
# no longer picked up from inside words such as "another" or "otherwise" when
# a model wraps its answer in extra text. Compiled once for all models.
category_pattern = re.compile(
    r"(?:CLASSIFICATION:\s*)?\b(Feature Request|Bug Report|Other)\b",
    re.IGNORECASE,
)

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting Classification Evaluation for Model: {current_model_name} {'='*20}")

    predictions = []
    start_time = time.time()

    for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_with_ollama_model(row['review'], current_model_name)

        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            match = category_pattern.search(predicted_raw)

            # Lower-case the label so it is comparable with VALID_FR_LABELS.
            # Replies naming no category get a sentinel that is filtered out below.
            pred = match.group(1).lower() if match else "Failed Parsing"

            # Optional: Print raw output for debugging if needed (comment out for clean runs)
            # if pred == "Failed Parsing":
            #    print(f"\nFailed Parsing for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}'")
        else:
            # The request itself failed; record a sentinel and keep going.
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")

        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = fr_data.copy()
    results_df_current_model['predicted'] = predictions

    # Keep only rows whose prediction is a recognised label. The "Failed" and
    # "Failed Parsing" sentinels are not in VALID_FR_LABELS, so this one
    # membership test removes them (the former `!= 'failed'` comparison never
    # matched anything because the sentinels are capitalised).
    filtered_results = results_df_current_model[
        results_df_current_model['predicted'].isin(VALID_FR_LABELS)
    ]
    excluded_count = len(results_df_current_model) - len(filtered_results)

    print(f"\n--- Sample of Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} ---")
    if excluded_count:
        # Make the exclusion visible: the metrics below cover parsed rows only.
        print(f"Warning: {excluded_count} of {len(results_df_current_model)} predictions were request or parsing failures and are excluded from the report.")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_FR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report, # Store the full report string
            'excluded': excluded_count
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions.",
            'excluded': excluded_count
        }

    print(f"\n{'='*20} Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL MODELS EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies:")
# accuracy_score is imported with the other names at the top of this cell.
# Accuracy is computed over successfully parsed predictions only.
for model, metrics in all_models_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final Evaluation End ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---
Loaded 20 functional reviews from the full dataset for testing.
Sample of loaded FR data:
                                              review ground_truth
0  'the current fb app is not good at all for tab...   bug report
1  'the problem is with the way items displaypics...   bug report
2             'not to mention it force closes often'   bug report
3  'also every time i open it asks me if i want t...   bug report
4                     'now i cannot view any photos'   bug report
----------------------------------------



Classifying reviews with llama2: 100%|█████████████████████████████████████████████████| 20/20 [00:55<00:00,  2.77s/it]



✅ Classification with llama2 completed in 0.93 minutes

--- Sample of Predictions for llama2 ---
                                              review ground_truth  \
0  'the current fb app is not good at all for tab...   bug report   
1  'the problem is with the way items displaypics...   bug report   
2             'not to mention it force closes often'   bug report   
3  'also every time i open it asks me if i want t...   bug report   
4                     'now i cannot view any photos'   bug report   

         predicted  
0       bug report  
1       bug report  
2       bug report  
3  feature request  
4       bug report  

--- Classification Report for llama2 ---
                 precision    recall  f1-score   support

feature request       0.60      0.43      0.50         7
     bug report       0.33      0.83      0.48         6
          other       0.00      0.00      0.00         7

       accuracy                           0.40        20
      macro avg       0.31      

Classifying reviews with mistral: 100%|████████████████████████████████████████████████| 20/20 [00:52<00:00,  2.64s/it]



✅ Classification with mistral completed in 0.88 minutes

--- Sample of Predictions for mistral ---
                                              review ground_truth   predicted
0  'the current fb app is not good at all for tab...   bug report       other
1  'the problem is with the way items displaypics...   bug report       other
2             'not to mention it force closes often'   bug report  bug report
3  'also every time i open it asks me if i want t...   bug report       other
4                     'now i cannot view any photos'   bug report  bug report

--- Classification Report for mistral ---
                 precision    recall  f1-score   support

feature request       1.00      0.71      0.83         7
     bug report       0.60      0.50      0.55         6
          other       0.60      0.86      0.71         7

       accuracy                           0.70        20
      macro avg       0.73      0.69      0.69        20
   weighted avg       0.74      0.70      0.7

Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████████| 20/20 [01:02<00:00,  3.14s/it]



✅ Classification with llama3:8b completed in 1.05 minutes

--- Sample of Predictions for llama3:8b ---
                                              review ground_truth   predicted
0  'the current fb app is not good at all for tab...   bug report  bug report
1  'the problem is with the way items displaypics...   bug report  bug report
2             'not to mention it force closes often'   bug report  bug report
3  'also every time i open it asks me if i want t...   bug report  bug report
4                     'now i cannot view any photos'   bug report  bug report

--- Classification Report for llama3:8b ---
                 precision    recall  f1-score   support

feature request       1.00      0.14      0.25         7
     bug report       0.32      1.00      0.48         6
          other       0.00      0.00      0.00         7

       accuracy                           0.35        20
      macro avg       0.44      0.38      0.24        20
   weighted avg       0.44      0.35   

Classifying reviews with gemma:7b: 100%|███████████████████████████████████████████████| 20/20 [01:17<00:00,  3.86s/it]



✅ Classification with gemma:7b completed in 1.29 minutes

--- Sample of Predictions for gemma:7b ---
                                              review ground_truth  \
0  'the current fb app is not good at all for tab...   bug report   
1  'the problem is with the way items displaypics...   bug report   
2             'not to mention it force closes often'   bug report   
3  'also every time i open it asks me if i want t...   bug report   
4                     'now i cannot view any photos'   bug report   

         predicted  
0  feature request  
1  feature request  
2       bug report  
3  feature request  
4       bug report  

--- Classification Report for gemma:7b ---
                 precision    recall  f1-score   support

feature request       0.42      0.71      0.53         7
     bug report       0.38      0.50      0.43         6
          other       0.00      0.00      0.00         7

       accuracy                           0.40        20
      macro avg       0.26

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████████| 20/20 [00:49<00:00,  2.46s/it]


✅ Classification with phi3:mini completed in 0.82 minutes

--- Sample of Predictions for phi3:mini ---
                                              review ground_truth   predicted
0  'the current fb app is not good at all for tab...   bug report       other
1  'the problem is with the way items displaypics...   bug report  bug report
2             'not to mention it force closes often'   bug report  bug report
3  'also every time i open it asks me if i want t...   bug report  bug report
4                     'now i cannot view any photos'   bug report  bug report

--- Classification Report for phi3:mini ---
                 precision    recall  f1-score   support

feature request       1.00      0.43      0.60         7
     bug report       0.33      0.83      0.48         6
          other       0.50      0.14      0.22         7

       accuracy                           0.45        20
      macro avg       0.61      0.47      0.43        20
   weighted avg       0.62      0.45   




In [6]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
import os # Just in case it's needed for path resolution or similar

# --- 1. Logging Setup ---
# `logger` is the handle the rest of this cell logs through.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
# Base URL of the Ollama server that the classification requests are sent to.
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b", # Assuming you pulled llama3:8b
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation (Full Dataset) ---
data_file_path = "datasets/BOW_test.txt"
# Build the banner from data_file_path so it always names the file that is
# actually read (it previously hard-coded a different file name).
print(f"--- Loading and preparing Functional Requirements data from {data_file_path} (Full Dataset) ---")

# Load the data, assuming 'review_text,classification' format with no header.
# NOTE(review): reviews appear to be wrapped in single quotes in the file; with
# the default quote character a comma inside a review would presumably make
# the line "bad" and drop it silently - confirm against the raw file.
fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip' # Ignore malformed lines
)

# Define valid FR categories (lowercase for standardization)
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# --- Standardize labels from the file to match VALID_FR_LABELS ---
# Translate the label spellings used in the file into the canonical names.
label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other'
}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)

# Filter out rows where ground_truth is not one of our valid labels after mapping
initial_len = len(fr_data_raw)
fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)

if len(fr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown or invalid 'ground_truth' labels after mapping.")

# No subsampling in this cell: every row that survived the filter is evaluated.

print(f"Loaded {len(fr_data)} functional reviews from the full dataset for testing.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 4. The Consistent Classification Prompt ---
# Prompt template sent unchanged to every model, once per review.
# `{review_text}` is the only placeholder; classify_with_ollama_model fills it
# in with str.format. The template asks for exactly one of 'Feature Request',
# 'Bug Report' or 'Other'; the evaluation loop searches the reply for those
# three names, so renaming a category here requires updating that regex too.
classification_prompt_text = """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a **NEW** functionality, an **enhancement**, or an **improvement** to existing features that are **NOT currently broken or causing an error**. It describes something the user *wants the app to do* that it doesn't do yet, or a way to make an existing, working feature better.
* **Bug Report**: This category is for user feedback that describes an **ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR** in the app. It highlights something that is **BROKEN**, not working as designed, or causing an incorrect/unexpected result.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Determine which of the three categories (Feature Request, Bug Report, Other) it *most accurately* fits based on the provided definitions.
3.  Your final output MUST be only the category name, without any additional text, explanation, or punctuation.

**App Review Segment:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str) -> dict:
    """
    Ask one model on the local Ollama server to classify a single review.

    The review is inserted into ``classification_prompt_text`` and posted to
    the ``/api/generate`` endpoint as a non-streaming request.

    Args:
        review_text: Review segment to classify.
        model_name: Ollama model tag to query (e.g. ``"mistral"``).

    Returns:
        A dict with two keys. ``success`` is True when the HTTP call and the
        JSON decoding both succeeded; ``raw_response`` then holds the reply's
        ``"response"`` field (empty string if the field is absent). When the
        request fails, ``success`` is False and ``raw_response`` is a short
        error description; the failure is logged instead of being raised.
    """
    endpoint = f"{OLLAMA_BASE_URL}/api/generate"
    request_headers = {"Content-Type": "application/json"}
    prompt = classification_prompt_text.format(review_text=review_text)

    # temperature 0.0 keeps answers consistent between runs; num_predict caps
    # the reply length.
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,  # one complete reply instead of a token stream
        "options": {"temperature": 0.0, "num_predict": 100},
    }

    try:
        reply = requests.post(
            endpoint,
            headers=request_headers,
            data=json.dumps(payload),
            timeout=120,
        )
        # 4xx/5xx statuses are turned into HTTPError and handled below.
        reply.raise_for_status()
        return {"success": True, "raw_response": reply.json().get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return dict(success=False, raw_response="Connection Error: Ollama server not reachable.")
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return dict(success=False, raw_response="Timeout Error: Ollama request took too long.")
    except requests.exceptions.HTTPError as http_error:
        logger.error(f"HTTP error occurred: {http_error} - {reply.text} with model {model_name}")
        return dict(success=False, raw_response=f"HTTP Error: {http_error}")
    except Exception as exc:
        # Catch-all (e.g. a reply body that is not valid JSON) so one bad
        # request cannot abort the whole evaluation run.
        logger.error(f"An unexpected error occurred during Ollama call: {exc} with model {model_name}")
        return dict(success=False, raw_response=f"Unexpected Error: {exc}")

# --- 6. Main Evaluation Loop for All Models ---
# Per-model summary keyed by model name: accuracy, report text and the number
# of reviews that had to be left out of the metrics.
all_models_results = {}

# Matches one of the three category names, optionally preceded by a
# "Classification:" label. The \b anchors require whole words, so "other" is
# no longer picked up from inside words such as "another" or "otherwise" when
# a model wraps its answer in extra text. Compiled once for all models.
category_pattern = re.compile(
    r"(?:CLASSIFICATION:\s*)?\b(Feature Request|Bug Report|Other)\b",
    re.IGNORECASE,
)

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting Classification Evaluation for Model: {current_model_name} {'='*20}")

    predictions = []
    start_time = time.time()

    for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_with_ollama_model(row['review'], current_model_name)

        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            match = category_pattern.search(predicted_raw)

            # Lower-case the label so it is comparable with VALID_FR_LABELS.
            # Replies naming no category get a sentinel that is filtered out below.
            pred = match.group(1).lower() if match else "Failed Parsing"

            # Optional: Print raw output for debugging if needed (comment out for clean runs)
            # if pred == "Failed Parsing":
            #    print(f"\nFailed Parsing for: '{row['review']}'\nRaw LLM Output: '{predicted_raw}'")
        else:
            # The request itself failed; record a sentinel and keep going.
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")

        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report for current model ---
    results_df_current_model = fr_data.copy()
    results_df_current_model['predicted'] = predictions

    # Keep only rows whose prediction is a recognised label. The "Failed" and
    # "Failed Parsing" sentinels are not in VALID_FR_LABELS, so this one
    # membership test removes them (the former `!= 'failed'` comparison never
    # matched anything because the sentinels are capitalised).
    filtered_results = results_df_current_model[
        results_df_current_model['predicted'].isin(VALID_FR_LABELS)
    ]
    excluded_count = len(results_df_current_model) - len(filtered_results)

    print(f"\n--- Sample of Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- Classification Report for {current_model_name} ---")
    if excluded_count:
        # Make the exclusion visible: the metrics below cover parsed rows only.
        print(f"Warning: {excluded_count} of {len(results_df_current_model)} predictions were request or parsing failures and are excluded from the report.")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_FR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_results[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report, # Store the full report string
            'excluded': excluded_count
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_results[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions.",
            'excluded': excluded_count
        }

    print(f"\n{'='*20} Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL MODELS EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies:")
# accuracy_score is imported with the other names at the top of this cell.
# Accuracy is computed over successfully parsed predictions only.
for model, metrics in all_models_results.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final Evaluation End ---")

--- Loading and preparing Functional Requirements data from BOW_test_sample.txt (Full Dataset) ---
Loaded 512 functional reviews from the full dataset for testing.
Sample of loaded FR data:
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------



Classifying reviews with llama2:   1%|▎                                                | 3/512 [00:16<46:36,  5.49s/it]


KeyboardInterrupt: 