In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
# You can use your custom GPU-optimized models here if you've created them.
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation ---
print("--- Loading and preparing Functional Requirements data from BOW_test.txt ---")
data_file_path = "datasets/BOW_test.txt"

fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip'
)

VALID_FR_LABELS = ["feature request", "bug report", "other"]

label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other'
}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)

fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)

print(f"Loaded {len(fr_data)} functional reviews from the full dataset.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)


# --- 4. The Few-Shot Prompt Template and Examples ---
# Examples are based on the BOW_test_sample.txt provided for reference
FEW_SHOT_EXAMPLES_FR = [
    {"review": "the current fb app is not good at all for tabs with big screen", "classification": "Bug Report"},
    {"review": "the problem is with the way items displaypics are smalllarge empty space etc the display generally not attractive", "classification": "Bug Report"},
    {"review": "not to mention it force closes often", "classification": "Bug Report"},
    {"review": "it should have multi tabs", "classification": "Feature Request"},
    {"review": "i use my phone almost exclusivly to log into fb and not being able to delete or edit comments is unacceptable", "classification": "Feature Request"},
    {"review": "cant turn off location tracking", "classification": "Feature Request"},
    {"review": "my biggest pet peeve is that i cant like", "classification": "Feature Request"},
    {"review": "But why does it take up 27 MB of RAM on my Galaxy Nexus", "classification": "Other"},
    {"review": "I often find myself accidentally deleting words", "classification": "Other"}
]

formatted_few_shot_text_fr = ""
for ex in FEW_SHOT_EXAMPLES_FR:
    formatted_few_shot_text_fr += f"App Review Segment: {ex['review']}\nClassification: {ex['classification']}\n\n"

few_shot_prompt_text_fr = """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a NEW functionality, an enhancement, or an improvement to existing features that are NOT currently broken or causing an error.
* **Bug Report**: This category is for user feedback that describes an ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR in the app. It highlights something that is BROKEN or not working as designed.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**EXAMPLES:**
{few_shot_examples}

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Based on the definitions and examples, determine which of the three categories it most accurately fits.
3.  Your final output MUST be only the category name (e.g., 'Feature Request'), without any additional text, explanation, or punctuation.

**App Review Segment:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str, prompt_template: str, **kwargs) -> dict:
    """
    Sends a classification request to the local Ollama model using a specified prompt.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text, **kwargs)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=120)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Few-Shot ---
all_models_results_fr = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting FEW-SHOT Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=few_shot_prompt_text_fr,
            few_shot_examples=formatted_few_shot_text_fr
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # Regex to capture the exact category name
            match = re.search(
                r"(feature request|bug report|other)",
                predicted_raw,
                re.IGNORECASE | re.DOTALL
            )
            
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ FEW-SHOT Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = fr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_FR_LABELS))
    ]

    print(f"\n--- Sample of FEW-SHOT Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- FEW-SHOT Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_FR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_results_fr[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_results_fr[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} FEW-SHOT Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL FEW-SHOT MODELS EVALUATION COMPLETE ==========\n")
print("Summary of FEW-SHOT Accuracies:")
for model, metrics in all_models_results_fr.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final FEW-SHOT Evaluation End ---")

--- Loading and preparing Functional Requirements data from BOW_test.txt ---
Loaded 512 functional reviews from the full dataset.
Sample of loaded FR data:
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████████| 512/512 [29:17<00:00,  3.43s/it]



✅ FEW-SHOT Classification with llama2 completed in 29.30 minutes

--- Sample of FEW-SHOT Predictions for llama2 ---
                                              review ground_truth  \
0                'this version crashes all the time'   bug report   
1                    'it take a lot time in loading'   bug report   
2                               'pages freeze often'   bug report   
3  'still having problems uploading sometimes tho...   bug report   
4  'it wont load any of my notifications when i c...   bug report   

         predicted  
0       bug report  
1            other  
2       bug report  
3  feature request  
4       bug report  

--- FEW-SHOT Classification Report for llama2 ---
                 precision    recall  f1-score   support

feature request       0.16      0.86      0.28        50
     bug report       0.86      0.62      0.72       288
          other       0.66      0.17      0.27       174

       accuracy                           0.49       512
    

Classifying reviews with mistral: 100%|████████████████████████████████████████████| 512/512 [1:35:03<00:00, 11.14s/it]



✅ FEW-SHOT Classification with mistral completed in 95.06 minutes

--- Sample of FEW-SHOT Predictions for mistral ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for mistral ---
                 precision    recall  f1-score   support

feature request       0.51      0.72      0.60        50
     bug report       0.83      0.82      0.82       288
          other       0.74      0.68      0.71       174

       accuracy                           0.76       512
      macro avg       0.70      0.74      0.71       512
   weighted avg  

Classifying reviews with llama3:8b: 100%|██████████████████████████████████████████| 512/512 [1:48:45<00:00, 12.74s/it]



✅ FEW-SHOT Classification with llama3:8b completed in 108.75 minutes

--- Sample of FEW-SHOT Predictions for llama3:8b ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report       other
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for llama3:8b ---
                 precision    recall  f1-score   support

feature request       0.49      0.72      0.59        50
     bug report       0.84      0.88      0.86       288
          other       0.82      0.65      0.72       174

       accuracy                           0.79       512
      macro avg       0.72      0.75      0.72       512
   weighte

Classifying reviews with gemma:7b: 100%|███████████████████████████████████████████| 512/512 [2:04:24<00:00, 14.58s/it]



✅ FEW-SHOT Classification with gemma:7b completed in 124.40 minutes

--- Sample of FEW-SHOT Predictions for gemma:7b ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for gemma:7b ---
                 precision    recall  f1-score   support

feature request       0.26      0.90      0.40        50
     bug report       0.84      0.84      0.84       288
          other       0.94      0.27      0.42       173

       accuracy                           0.66       511
      macro avg       0.68      0.67      0.56       511
   weighted a

Classifying reviews with phi3:mini: 100%|██████████████████████████████████████████| 512/512 [1:00:19<00:00,  7.07s/it]


✅ FEW-SHOT Classification with phi3:mini completed in 60.33 minutes

--- Sample of FEW-SHOT Predictions for phi3:mini ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for phi3:mini ---
                 precision    recall  f1-score   support

feature request       0.53      0.52      0.53        50
     bug report       0.80      0.86      0.83       288
          other       0.73      0.64      0.68       174

       accuracy                           0.75       512
      macro avg       0.68      0.67      0.68       512
   weighted




In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
# You can use your custom GPU-optimized models here if you've created them.
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation ---
print("--- Loading and preparing Functional Requirements data from BOW_test.txt ---")
data_file_path = "datasets/BOW_test.txt"

fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip'
)

VALID_FR_LABELS = ["feature request", "bug report", "other"]

label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other'
}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)

fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)

print(f"Loaded {len(fr_data)} functional reviews from the full dataset.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)


# --- 4. The Few-Shot Prompt Template and Examples ---
# Examples are based on the BOW_test_sample.txt provided for reference
FEW_SHOT_EXAMPLES_FR = [
    {"review": "the current fb app is not good at all for tabs with big screen", "classification": "Bug Report"},
    {"review": "the problem is with the way items displaypics are smalllarge empty space etc the display generally not attractive", "classification": "Bug Report"},
    {"review": "not to mention it force closes often", "classification": "Bug Report"},
    {"review": "it should have multi tabs", "classification": "Feature Request"},
    {"review": "i use my phone almost exclusivly to log into fb and not being able to delete or edit comments is unacceptable", "classification": "Feature Request"},
    {"review": "cant turn off location tracking", "classification": "Feature Request"},
    {"review": "my biggest pet peeve is that i cant like", "classification": "Feature Request"},
    {"review": "But why does it take up 27 MB of RAM on my Galaxy Nexus", "classification": "Other"},
    {"review": "I often find myself accidentally deleting words", "classification": "Other"}
]

formatted_few_shot_text_fr = ""
for ex in FEW_SHOT_EXAMPLES_FR:
    formatted_few_shot_text_fr += f"App Review Segment: {ex['review']}\nClassification: {ex['classification']}\n\n"

few_shot_prompt_text_fr = """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a NEW functionality, an enhancement, or an improvement to existing features that are NOT currently broken or causing an error.
* **Bug Report**: This category is for user feedback that describes an ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR in the app. It highlights something that is BROKEN or not working as designed.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**EXAMPLES:**
{few_shot_examples}

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Based on the definitions and examples, determine which of the three categories it most accurately fits.
3.  Your final output MUST be only the category name (e.g., 'Feature Request'), without any additional text, explanation, or punctuation.

**App Review Segment:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str, prompt_template: str, **kwargs) -> dict:
    """
    Sends a classification request to the local Ollama model using a specified prompt.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text, **kwargs)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=120)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Few-Shot ---
all_models_results_fr = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting FEW-SHOT Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=few_shot_prompt_text_fr,
            few_shot_examples=formatted_few_shot_text_fr
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # Regex to capture the exact category name
            match = re.search(
                r"(feature request|bug report|other)",
                predicted_raw,
                re.IGNORECASE | re.DOTALL
            )
            
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ FEW-SHOT Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = fr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_FR_LABELS))
    ]

    print(f"\n--- Sample of FEW-SHOT Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- FEW-SHOT Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_FR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_results_fr[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_results_fr[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} FEW-SHOT Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL FEW-SHOT MODELS EVALUATION COMPLETE ==========\n")
print("Summary of FEW-SHOT Accuracies:")
for model, metrics in all_models_results_fr.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final FEW-SHOT Evaluation End ---")

--- Loading and preparing Functional Requirements data from BOW_test.txt ---
Loaded 512 functional reviews from the full dataset.
Sample of loaded FR data:
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████████| 512/512 [21:33<00:00,  2.53s/it]



✅ FEW-SHOT Classification with llama2 completed in 21.55 minutes

--- Sample of FEW-SHOT Predictions for llama2 ---
                                              review ground_truth  \
0                'this version crashes all the time'   bug report   
1                    'it take a lot time in loading'   bug report   
2                               'pages freeze often'   bug report   
3  'still having problems uploading sometimes tho...   bug report   
4  'it wont load any of my notifications when i c...   bug report   

         predicted  
0       bug report  
1            other  
2       bug report  
3  feature request  
4       bug report  

--- FEW-SHOT Classification Report for llama2 ---
                 precision    recall  f1-score   support

feature request       0.17      0.88      0.28        50
     bug report       0.87      0.62      0.73       288
          other       0.67      0.17      0.27       174

       accuracy                           0.50       512
    

Classifying reviews with mistral: 100%|██████████████████████████████████████████████| 512/512 [20:03<00:00,  2.35s/it]



✅ FEW-SHOT Classification with mistral completed in 20.05 minutes

--- Sample of FEW-SHOT Predictions for mistral ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for mistral ---
                 precision    recall  f1-score   support

feature request       0.51      0.72      0.60        50
     bug report       0.83      0.82      0.82       288
          other       0.75      0.68      0.71       174

       accuracy                           0.76       512
      macro avg       0.69      0.74      0.71       512
   weighted avg  

Classifying reviews with llama3:8b: 100%|████████████████████████████████████████████| 512/512 [21:36<00:00,  2.53s/it]



✅ FEW-SHOT Classification with llama3:8b completed in 21.61 minutes

--- Sample of FEW-SHOT Predictions for llama3:8b ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report       other
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for llama3:8b ---
                 precision    recall  f1-score   support

feature request       0.49      0.72      0.59        50
     bug report       0.84      0.88      0.86       288
          other       0.82      0.65      0.72       174

       accuracy                           0.79       512
      macro avg       0.72      0.75      0.72       512
   weighted

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████████| 512/512 [24:24<00:00,  2.86s/it]



✅ FEW-SHOT Classification with gemma:7b completed in 24.41 minutes

--- Sample of FEW-SHOT Predictions for gemma:7b ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for gemma:7b ---
                 precision    recall  f1-score   support

feature request       0.26      0.90      0.41        50
     bug report       0.84      0.84      0.84       288
          other       0.94      0.27      0.42       173

       accuracy                           0.66       511
      macro avg       0.68      0.67      0.56       511
   weighted av

Classifying reviews with phi3:mini: 100%|████████████████████████████████████████████| 512/512 [19:45<00:00,  2.32s/it]


✅ FEW-SHOT Classification with phi3:mini completed in 19.76 minutes

--- Sample of FEW-SHOT Predictions for phi3:mini ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for phi3:mini ---
                 precision    recall  f1-score   support

feature request       0.53      0.52      0.53        50
     bug report       0.80      0.86      0.83       288
          other       0.73      0.64      0.68       174

       accuracy                           0.75       512
      macro avg       0.68      0.67      0.68       512
   weighted




In [1]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm

# --- 1. Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
OLLAMA_BASE_URL = "http://localhost:11434"

# Define the list of Ollama models to test
# You can use your custom GPU-optimized models here if you've created them.
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini"
]

# --- 3. Data Loading and Preparation ---
print("--- Loading and preparing Functional Requirements data from BOW_test.txt ---")
data_file_path = "datasets/BOW_test.txt"

fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='skip'
)

VALID_FR_LABELS = ["feature request", "bug report", "other"]

label_mapping = {
    'bugreport': 'bug report',
    'featurerequest': 'feature request',
    'other': 'other'
}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)

fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)

print(f"Loaded {len(fr_data)} functional reviews from the full dataset.")
print("Sample of loaded FR data:")
print(fr_data.head())
print("-" * 40)


# --- 4. The Few-Shot Prompt Template and Examples ---
# Examples are based on the BOW_test_sample.txt provided for reference
FEW_SHOT_EXAMPLES_FR = [
    {"review": "the current fb app is not good at all for tabs with big screen", "classification": "Bug Report"},
    {"review": "the problem is with the way items displaypics are smalllarge empty space etc the display generally not attractive", "classification": "Bug Report"},
    {"review": "not to mention it force closes often", "classification": "Bug Report"},
    {"review": "it should have multi tabs", "classification": "Feature Request"},
    {"review": "i use my phone almost exclusivly to log into fb and not being able to delete or edit comments is unacceptable", "classification": "Feature Request"},
    {"review": "cant turn off location tracking", "classification": "Feature Request"},
    {"review": "my biggest pet peeve is that i cant like", "classification": "Feature Request"},
    {"review": "But why does it take up 27 MB of RAM on my Galaxy Nexus", "classification": "Other"},
    {"review": "I often find myself accidentally deleting words", "classification": "Other"}
]

formatted_few_shot_text_fr = ""
for ex in FEW_SHOT_EXAMPLES_FR:
    formatted_few_shot_text_fr += f"App Review Segment: {ex['review']}\nClassification: {ex['classification']}\n\n"

few_shot_prompt_text_fr = """
You are an expert in software requirements analysis, specializing in user feedback. Your task is to precisely classify the provided app review segment into one of the following functional requirement categories: 'Feature Request', 'Bug Report', or 'Other'.

**DEFINITIONS:**
* **Feature Request**: This category is for user feedback that clearly suggests a NEW functionality, an enhancement, or an improvement to existing features that are NOT currently broken or causing an error.
* **Bug Report**: This category is for user feedback that describes an ERROR, FAULT, FLAW, or UNINTENDED BEHAVIOR in the app. It highlights something that is BROKEN or not working as designed.
* **Other**: This category is for general feedback, compliments, complaints that are not specific enough to be a bug or feature, questions, or irrelevant comments.

**EXAMPLES:**
{few_shot_examples}

**INSTRUCTIONS:**
1.  Read the "App Review Segment" carefully.
2.  Based on the definitions and examples, determine which of the three categories it most accurately fits.
3.  Your final output MUST be only the category name (e.g., 'Feature Request'), without any additional text, explanation, or punctuation.

**App Review Segment:** '''{review_text}'''

**Classification:**
"""

# --- 5. LLM Interaction Function ---
def classify_with_ollama_model(review_text: str, model_name: str, prompt_template: str, **kwargs) -> dict:
    """
    Sends a classification request to the local Ollama model using a specified prompt.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    
    formatted_prompt = prompt_template.format(review_text=review_text, **kwargs)

    data = {
        "model": model_name,
        "prompt": formatted_prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 100
        }
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=120)
        response.raise_for_status()
        result = response.json()
        return {"success": True, "raw_response": result.get("response", "")}
    except requests.exceptions.ConnectionError:
        logger.error(f"Failed to connect to Ollama server at {OLLAMA_BASE_URL}. Is Ollama running?")
        return {"success": False, "raw_response": "Connection Error: Ollama server not reachable."}
    except requests.exceptions.Timeout:
        logger.error(f"Ollama request timed out for review: '{review_text[:50]}...' with model {model_name}")
        return {"success": False, "raw_response": "Timeout Error: Ollama request took too long."}
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err} - {response.text} with model {model_name}")
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred during Ollama call: {e} with model {model_name}")
        return {"success": False, "raw_response": f"Unexpected Error: {e}"}

# --- 6. Main Evaluation Loop for Few-Shot ---
all_models_results_fr = {}

for current_model_name in OLLAMA_MODELS_TO_TEST:
    print(f"\n{'='*20} Starting FEW-SHOT Classification Evaluation for Model: {current_model_name} {'='*20}")
    
    predictions = []
    start_time = time.time()

    for i, row in tqdm(fr_data.iterrows(), total=len(fr_data), desc=f"Classifying reviews with {current_model_name}"):
        response_data = classify_with_ollama_model(
            row['review'],
            current_model_name,
            prompt_template=few_shot_prompt_text_fr,
            few_shot_examples=formatted_few_shot_text_fr
        )
        
        if response_data["success"]:
            predicted_raw = response_data["raw_response"].strip()
            
            # Regex to capture the exact category name
            match = re.search(
                r"(feature request|bug report|other)",
                predicted_raw,
                re.IGNORECASE | re.DOTALL
            )
            
            pred = match.group(1).strip().lower() if match else "Failed Parsing"
        else:
            pred = "Failed"
            logger.warning(f"Classification failed for review: '{row['review'][:50]}...' with model {current_model_name}")
        
        predictions.append(pred)

    elapsed = time.time() - start_time
    print(f"\n✅ FEW-SHOT Classification with {current_model_name} completed in {elapsed/60:.2f} minutes")

    # --- 7. Prepare Results and Generate Classification Report ---
    results_df_current_model = fr_data.copy()
    results_df_current_model['predicted'] = predictions

    filtered_results = results_df_current_model[
        (results_df_current_model['predicted'] != 'failed') &
        (results_df_current_model['predicted'].isin(VALID_FR_LABELS))
    ]

    print(f"\n--- Sample of FEW-SHOT Predictions for {current_model_name} ---")
    print(results_df_current_model.head())

    print(f"\n--- FEW-SHOT Classification Report for {current_model_name} ---")
    if not filtered_results.empty:
        report = classification_report(
            filtered_results['ground_truth'],
            filtered_results['predicted'],
            labels=VALID_FR_LABELS,
            zero_division=0
        )
        print(report)
        all_models_results_fr[current_model_name] = {
            'accuracy': accuracy_score(filtered_results['ground_truth'], filtered_results['predicted']),
            'report': report
        }
    else:
        print(f"No valid predictions to generate a classification report for {current_model_name}.")
        all_models_results_fr[current_model_name] = {
            'accuracy': 0.0,
            'report': "No valid predictions."
        }
    
    print(f"\n{'='*20} FEW-SHOT Evaluation for {current_model_name} Complete {'='*20}\n")

print("\n\n========== ALL FEW-SHOT MODELS EVALUATION COMPLETE ==========\n")
print("Summary of FEW-SHOT Accuracies:")
for model, metrics in all_models_results_fr.items():
    print(f"{model}: Accuracy = {metrics['accuracy']:.2f}")

print("\n--- Final FEW-SHOT Evaluation End ---")

--- Loading and preparing Functional Requirements data from BOW_test.txt ---
Loaded 512 functional reviews from the full dataset.
Sample of loaded FR data:
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------



Classifying reviews with llama2: 100%|███████████████████████████████████████████████| 512/512 [21:35<00:00,  2.53s/it]



✅ FEW-SHOT Classification with llama2 completed in 21.59 minutes

--- Sample of FEW-SHOT Predictions for llama2 ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report       other
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for llama2 ---
                 precision    recall  f1-score   support

feature request       0.17      0.88      0.28        50
     bug report       0.87      0.62      0.73       288
          other       0.67      0.17      0.27       174

       accuracy                           0.50       512
      macro avg       0.57      0.56      0.43       512
   weighted avg     

Classifying reviews with mistral: 100%|██████████████████████████████████████████████| 512/512 [20:21<00:00,  2.39s/it]



✅ FEW-SHOT Classification with mistral completed in 20.37 minutes

--- Sample of FEW-SHOT Predictions for mistral ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for mistral ---
                 precision    recall  f1-score   support

feature request       0.50      0.72      0.59        50
     bug report       0.84      0.82      0.83       288
          other       0.75      0.68      0.71       174

       accuracy                           0.76       512
      macro avg       0.69      0.74      0.71       512
   weighted avg  

Classifying reviews with llama3:8b: 100%|████████████████████████████████████████████| 512/512 [21:35<00:00,  2.53s/it]



✅ FEW-SHOT Classification with llama3:8b completed in 21.60 minutes

--- Sample of FEW-SHOT Predictions for llama3:8b ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report       other
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for llama3:8b ---
                 precision    recall  f1-score   support

feature request       0.49      0.72      0.59        50
     bug report       0.84      0.88      0.86       288
          other       0.82      0.65      0.72       174

       accuracy                           0.79       512
      macro avg       0.72      0.75      0.72       512
   weighted

Classifying reviews with gemma:7b: 100%|█████████████████████████████████████████████| 512/512 [24:16<00:00,  2.84s/it]



✅ FEW-SHOT Classification with gemma:7b completed in 24.27 minutes

--- Sample of FEW-SHOT Predictions for gemma:7b ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for gemma:7b ---
                 precision    recall  f1-score   support

feature request       0.26      0.90      0.41        50
     bug report       0.84      0.84      0.84       288
          other       0.94      0.27      0.42       173

       accuracy                           0.66       511
      macro avg       0.68      0.67      0.56       511
   weighted av

Classifying reviews with phi3:mini: 100%|████████████████████████████████████████████| 512/512 [19:44<00:00,  2.31s/it]


✅ FEW-SHOT Classification with phi3:mini completed in 19.74 minutes

--- Sample of FEW-SHOT Predictions for phi3:mini ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- FEW-SHOT Classification Report for phi3:mini ---
                 precision    recall  f1-score   support

feature request       0.53      0.52      0.53        50
     bug report       0.80      0.86      0.83       288
          other       0.73      0.64      0.68       174

       accuracy                           0.75       512
      macro avg       0.68      0.67      0.68       512
   weighted




In [None]:
+ 