In [None]:
import os

# Report only WHETHER the key is configured. Printing the value itself would store
# the secret in the notebook's saved cell output, where it is easily shared or committed.
print("GEMINI_API_KEY is set." if os.getenv("GEMINI_API_KEY") else "GEMINI_API_KEY is NOT set.")

In [None]:
import sys
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import time
import logging

# Notebook-wide logging: INFO and above, each record showing time, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Pin the LLM client's logger to INFO so its DEBUG records are not emitted;
# use logging.WARNING instead for even quieter notebook output.
llm_client_logger = logging.getLogger('src.llm_client')
llm_client_logger.setLevel(logging.INFO)

# Make sure the 'src' directory is on the import path (added at most once, so
# re-running this cell does not pile up duplicate entries).
if "src" not in sys.path:
    sys.path.append("src")

from src.llm_client import LLMClient
from src.config import Config

# --- 1. Data Loading and Preparation for Functional Requirements ---
# Single source of truth for the dataset location, so the log line below always
# names the file that is actually read.
FR_DATA_PATH = "datasets/BOW_test_sample.txt"

print(f"--- Loading and preparing Functional Requirements data from {FR_DATA_PATH} ---")
# The file is expected to hold one 'review_text,classification' pair per line, no header.
# NOTE(review): a review containing an unquoted comma yields more than two fields;
# verify such lines are quoted in the file, otherwise they may be skipped or mis-split.
fr_data = pd.read_csv(
    FR_DATA_PATH,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip'  # Drop malformed lines instead of aborting the whole load
)

# Normalise labels so the comparison with VALID_FR_LABELS ignores case and padding
fr_data['ground_truth'] = fr_data['ground_truth'].str.strip().str.lower()

# The closed set of FR categories used for filtering here and for scoring later
VALID_FR_LABELS = ["feature request", "bug report", "other"]

# Keep only rows whose label is one of the known categories, and say how many were dropped
initial_len = len(fr_data)
fr_data = fr_data[fr_data['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)
if len(fr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(fr_data)} rows with unknown 'ground_truth' labels.")

# A row without review text cannot be classified; drop it here rather than passing
# a NaN to the LLM client during evaluation.
labelled_len = len(fr_data)
fr_data = fr_data[fr_data['review'].notna()].reset_index(drop=True)
if len(fr_data) < labelled_len:
    print(f"Warning: Removed {labelled_len - len(fr_data)} rows with missing review text.")

print(f"Loaded {len(fr_data)} functional reviews.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)

# --- 2. LLM Client Initialization ---
print("--- Initializing LLM Client ---")
client = LLMClient()
if not client.test_connection():
    print("❌ LLM connection failed at initialization. Please check API key in config, model name, and network.")
    # Every evaluation cell below calls the LLM once per review. Stop here instead of
    # letting a later "Run All" spend the whole dataset producing only failed predictions.
    raise RuntimeError(
        "LLM connection test failed; fix the API key, model name, or network before running the evaluations."
    )
print("✅ LLM Client initialized and connected.")
print("-" * 40)

# --- 3. Define FR Definitions & Few-Shot Examples (for FC2, FC3) ---
# Plain-language definition of each category, keyed by its lowercase label (for FC3).
FR_DEFINITIONS = {
    "feature request":
        "A suggestion for new functionality, enhancements, or improvements to existing features.",
    "bug report":
        "Describes an error, fault, or flaw in the system that causes it to produce an incorrect or unexpected result, or to behave in unintended ways.",
    "other":
        "The review does not clearly fall into Feature Request or Bug Report.",
}

# Few-shot demonstrations for FC2 as (review text, expected label) pairs.
# These are PLACEHOLDERS: swap in real, diverse examples that fit the categories.
# NOTE(review): if examples are taken from the evaluation file, exclude those rows
# from the evaluated data so the model is not scored on reviews it was shown.
_FEW_SHOT_PAIRS = [
    ("I wish there was a dark mode option in the settings.", "Feature Request"),
    ("The app crashes every time I try to upload a photo.", "Bug Report"),
    ("Great app, keep up the good work!", "Other"),
    ("Please add a way to export data to CSV.", "Feature Request"),
    ("The search filter does not work correctly for dates.", "Bug Report"),
    ("Can you make the 'save' button more prominent?", "Feature Request"),
]
FR_FEW_SHOT_EXAMPLES = [
    {"review": review_text, "classification": label}
    for review_text, label in _FEW_SHOT_PAIRS
]

# Render every example as a "Review: ... / Classification: ..." stanza followed by a
# blank line, concatenated into the single string that is passed to the prompt.
formatted_fr_few_shot_text = "".join(
    f"Review: {ex['review']}\nClassification: {ex['classification']}\n\n"
    for ex in FR_FEW_SHOT_EXAMPLES
)

# Comma-separated category list for use in prompts. Title-case every word so the
# labels read exactly like the ones in FR_FEW_SHOT_EXAMPLES ("Feature Request",
# "Bug Report", "Other"); str.capitalize() upper-cases only the first letter and
# produced "Feature request" / "Bug report".
all_fr_labels_str = ", ".join(label.title() for label in VALID_FR_LABELS)
print("Defined FR definitions and few-shot examples.")
print("-" * 40)


# --- 4. Evaluation Function Definition (Generalized for FR) ---
def evaluate_classification_prompt_strategy(
    prompt_id: str,
    data_df: pd.DataFrame,
    client_instance: LLMClient,
    category_name: str = "FUNCTIONAL_REQUIREMENTS",
    **prompt_kwargs
) -> "tuple[pd.DataFrame, str]":
    """
    Classify every review with one prompt and report how well it matches the ground truth.

    The prompt is activated globally through ``Config.set_active_prompt_id`` before
    the first call, then each review is sent to ``client_instance.classify_nfr``.
    Predictions are lower-cased and stripped before comparison. Rows whose call
    failed, or whose prediction is not in ``VALID_FR_LABELS``, stay in the returned
    DataFrame but are excluded from the report; their counts are printed.

    Args:
        prompt_id (str): The ID of the prompt to activate from prompts.json.
        data_df (pd.DataFrame): Must contain 'review' and 'ground_truth' columns.
            It is not modified; a copy is returned.
        client_instance (LLMClient): An initialized instance of the LLMClient.
        category_name (str): The category name for the prompt in prompts.json
            (e.g., "FUNCTIONAL_REQUIREMENTS").
        **prompt_kwargs: Forwarded unchanged to ``classify_nfr`` on every call
            (per the original notes, used to fill placeholders in the prompt template).

    Returns:
        tuple[pd.DataFrame, str]: ``(results_df, report)`` where ``results_df`` is
        ``data_df`` plus a normalised 'predicted' column ("failed" marks an
        unsuccessful call), and ``report`` is the text of sklearn's
        ``classification_report`` over the scored rows, or a short message when no
        row could be scored.
    """

    Config.set_active_prompt_id(category_name, prompt_id)

    print(f"\n--- Starting evaluation for Prompt ID: {prompt_id} ---")

    predictions = []
    start_time = time.time()

    for review_text in tqdm(data_df['review'], total=len(data_df), desc=f"Classifying with {prompt_id}"):
        # classify_nfr is reused for FR reviews: the active prompt, not the method
        # name, decides which label set the model answers with.
        response = client_instance.classify_nfr(review_text, **prompt_kwargs)
        predictions.append(response.classification if response.success else "Failed")

    elapsed = time.time() - start_time
    print(f"\n✅ Classification with {prompt_id} completed in {elapsed/60:.2f} minutes")

    results_df = data_df.copy()
    # Build the column as an object-dtype Series aligned to the frame's index so the
    # .str accessor is usable even when there are no rows at all.
    results_df['predicted'] = (
        pd.Series(predictions, index=results_df.index, dtype=object)
        .str.strip()
        .str.lower()
    )

    # Only rows with a successful, in-vocabulary prediction are scored.
    failed_mask = results_df['predicted'] == 'failed'
    valid_mask = results_df['predicted'].isin(VALID_FR_LABELS)
    scored_mask = ~failed_mask & valid_mask
    filtered_results = results_df[scored_mask]

    # Make the exclusions visible: metrics computed on a subset would otherwise
    # look better than the prompt really performs on the full dataset.
    n_failed = int(failed_mask.sum())
    n_invalid = int((~failed_mask & ~valid_mask).sum())
    if n_failed or n_invalid:
        print(
            f"Warning: excluded {n_failed} failed and {n_invalid} out-of-vocabulary predictions "
            f"from the report ({len(filtered_results)}/{len(results_df)} rows scored)."
        )

    if filtered_results.empty:
        # Nothing to score; say so explicitly instead of handing sklearn empty input.
        report = "No valid predictions to evaluate."
    else:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_FR_LABELS,  # List every category so each one gets a row even with no predictions
            zero_division=0
        )

    print(f"\n--- Classification Report for Prompt ID: {prompt_id} ---\n")
    print(f"{report}\n")
    print(f"--- End Report for Prompt ID: {prompt_id} ---\n")

    return results_df, report

print("Logic setup complete. Ready to run evaluations.")

In [None]:
# --- 5. Evaluation for FC1 (Direct Multi-Class User Review Classification) ---
print("\n========== EVALUATING PROMPT FC1 (Direct Multi-Class) ==========")
# NOTE(review): this cell is labelled FC1 but activates prompt ID "P1", whereas the
# FC2 and FC3 cells use IDs that match their labels — confirm "P1" is the intended
# ID in prompts.json.
# FC1 takes no extra prompt kwargs. For a quick smoke test, pass fr_data.head(15)
# instead of the full frame.
results_fc1_df, report_fc1 = evaluate_classification_prompt_strategy(
    "P1",
    fr_data,
    client,
    category_name="FUNCTIONAL_REQUIREMENTS",
)
print("First 5 predictions for FC1:")
print(results_fc1_df.head())

In [None]:
# Render the FC1 results frame as this cell's output for manual inspection.
results_fc1_df

In [None]:
# --- 6. Evaluation for FC2 (Few-Shot User Review Classification) ---
print("\n========== EVALUATING PROMPT FC2 (Few-Shot) ==========")
# The pre-formatted few-shot examples travel as an extra keyword argument, which the
# evaluation function forwards to the classifier call for every review.
results_fc2_df, report_fc2 = evaluate_classification_prompt_strategy(
    "FC2",
    fr_data,
    client,
    category_name="FUNCTIONAL_REQUIREMENTS",
    few_shot_examples_text=formatted_fr_few_shot_text,
)
print("First 5 predictions for FC2:")
print(results_fc2_df.head())

In [None]:
# Render the FC2 results frame as this cell's output for manual inspection.
results_fc2_df

In [None]:
# --- 7. Evaluation for FC3 (Definition-Based User Review Classification) ---
print("\n========== EVALUATING PROMPT FC3 (Definition-Based) ==========")
# No extra prompt kwargs are passed: per the original note, the category
# definitions are hardcoded in the FC3 prompt itself.
results_fc3_df, report_fc3 = evaluate_classification_prompt_strategy(
    "FC3",
    fr_data,
    client,
    category_name="FUNCTIONAL_REQUIREMENTS",
)
print("First 5 predictions for FC3:")
print(results_fc3_df.head())

print("\n--- All Functional Requirement evaluations completed ---")