In [2]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
from string import Template

# --- 1. Logging Setup ---
# Root logger at INFO; each record shows timestamp, logger name, level, message.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
# Base URL of the Ollama HTTP server; endpoint paths are appended to it.
OLLAMA_BASE_URL = "http://localhost:11434"

# Models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini",
]

# --- 3. Data Loading ---
print("--- Loading and preparing Functional Requirements data from BOW_test.txt (Full Dataset) ---")
data_file_path = "datasets/BOW_test.txt"

# The file is read as two comma-separated columns with no header row.
# Lines pandas cannot parse are still skipped, but 'warn' (instead of 'skip')
# reports each one so silent shrinkage of the dataset becomes visible.
# NOTE(review): printed reviews appear wrapped in single quotes, which suggests
# the file quotes text with ' rather than " -- if so, reviews containing commas
# would be treated as bad lines; consider quotechar="'" after checking the file.
fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='warn'
)

# Canonical label spellings used everywhere downstream.
VALID_FR_LABELS = ["feature request", "bug report", "other"]
# Raw label spellings (after strip + lowercase) -> canonical spellings.
label_mapping = {'bugreport': 'bug report', 'featurerequest': 'feature request', 'other': 'other'}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)

# Keep only rows whose label is recognised. initial_len counts rows that
# survived parsing, so lines skipped by read_csv are not part of this tally.
initial_len = len(fr_data_raw)
fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)
if len(fr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(fr_data)} invalid rows.")

print(f"Loaded {len(fr_data)} reviews.")
print(fr_data.head())
print("-" * 40)

# --- 4. Prompt (Constraint-Based) ---
# string.Template text; ${review_text} is the only placeholder.
_CONSTRAINT_BASED_TEMPLATE = """
You are an expert classifier. Enforce the constraints below strictly.

TASK
Classify the app review into exactly ONE category:
- Feature Request
- Bug Report
- Other

CONSTRAINTS
1) Output MUST be valid JSON, no preface/suffix text.
2) "category" MUST be one of: ["Feature Request","Bug Report","Other"].
3) "evidence" MUST be an exact substring from the review (<=140 chars) justifying the label.
4) "justification" MUST be <= 20 words, no numbered steps or hidden reasoning.
5) "confidence" is a float in [0,1] (use 0.50, 0.75, 0.90, 0.95, or 0.99).
6) If error/failure implied → "Bug Report". If explicit ask for new/improved functionality without error → "Feature Request". Else → "Other".

OUTPUT JSON SCHEMA
{
  "category": "Feature Request|Bug Report|Other",
  "evidence": "substring from review",
  "justification": "≤20 words",
  "confidence": 0.50
}

REVIEW: '''${review_text}'''
ONLY RETURN THE JSON OBJECT:
"""

# Strategy name -> prompt template.
PROMPT_STRATEGIES = {"Constraint_Based_Prompt": _CONSTRAINT_BASED_TEMPLATE}

# --- Helpers ---
def format_prompt(template_str: str, review_text: str) -> str:
    """Fill the ``${review_text}`` placeholder of a prompt template.

    Rendering goes through ``string.Template.substitute``, so a template that
    references any other placeholder raises ``KeyError`` and a malformed
    ``$`` sequence raises ``ValueError``.
    """
    prompt_template = Template(template_str)
    values = {"review_text": review_text}
    return prompt_template.substitute(values)

def extract_json_object(text: str):
    """Best-effort extraction of a JSON value from raw model output.

    Strategies are tried in order and the first success wins:
      1. Parse the whole text as JSON.
      2. Parse the span from the first ``{`` to the last ``}``, as-is and then
         with every single quote replaced by a double quote.
      3. Walk each ``{`` from left to right and return the first object that
         decodes cleanly, ignoring whatever follows it. This rescues replies
         such as ``{...valid...} Note: {something else}`` where the greedy span
         of step 2 is not valid JSON.

    Returns:
        The decoded value (a dict for steps 2-3; step 1 may yield any JSON
        value), or ``None`` when nothing could be decoded or ``text`` is not
        a string.
    """
    # Try direct parse
    try:
        return json.loads(text)
    except (TypeError, ValueError, RecursionError):
        pass
    if not isinstance(text, str):
        # The regex/scan fallbacks below only make sense for text.
        return None

    # Greedy span: first "{" through last "}"
    m = re.search(r'\{.*\}', text, flags=re.DOTALL)
    if not m:
        return None
    snippet = m.group(0)
    for candidate in (snippet, snippet.replace("'", '"')):  # fix single quotes if needed
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError):
            continue

    # Last resort: first "{" from which a complete object can be decoded.
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            start = text.find('{', start + 1)
        else:
            return obj
    return None

def normalize_category(cat: str) -> str:
    """Map a model-supplied category onto a canonical lowercase label.

    Comparison ignores case and surrounding whitespace. Anything that is not
    a string, or that does not match a known label, yields "Failed Parsing".
    """
    unparsed = "Failed Parsing"
    if not isinstance(cat, str):
        return unparsed

    label = cat.strip().lower()
    if label in VALID_FR_LABELS:
        return label

    # Secondary lookup over the three canonical spellings.
    canonical = {name: name for name in ("feature request", "bug report", "other")}
    return canonical.get(label, unparsed)

# --- 5. LLM Call ---
def classify_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """Send one review to an Ollama model and return its raw completion.

    Args:
        review_text: Review to classify; substituted into the prompt template.
        model_name: Value sent as the request's "model" field.
        prompt_template: ``string.Template`` text containing ``${review_text}``.

    Returns:
        ``{"success": True, "raw_response": <str>}`` when the HTTP call
        succeeds (an absent or non-string "response" field becomes ``""`` so
        callers can always treat it as text), otherwise
        ``{"success": False, "raw_response": <short error description>}``.
        Errors raised by the request itself are caught and logged, not raised.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    prompt = format_prompt(prompt_template, review_text)

    data = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.0, "num_predict": 256}
    }

    try:
        r = requests.post(url, headers=headers, data=json.dumps(data), timeout=120)
        r.raise_for_status()
        completion = r.json().get("response", "")
        if not isinstance(completion, str):
            # e.g. JSON null: keep the "raw_response is text" contract.
            completion = ""
        return {"success": True, "raw_response": completion}
    except requests.exceptions.ConnectionError:
        logger.error("Failed to connect to Ollama at %s. Is it running?", OLLAMA_BASE_URL)
        return {"success": False, "raw_response": "Connection Error"}
    except requests.exceptions.Timeout:
        # str() guards against non-string reviews (e.g. NaN read from the CSV),
        # which would otherwise raise inside this handler.
        logger.error("Timeout for review: '%s...' with model %s", str(review_text)[:50], model_name)
        return {"success": False, "raw_response": "Timeout"}
    except requests.exceptions.HTTPError as http_err:
        logger.error("HTTP error: %s - %s", http_err, r.text)
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        # Boundary catch-all (bad JSON body, unexpected payload shape, ...):
        # one bad reply must not abort a long evaluation run.
        logger.error("Unexpected error: %s", e)
        return {"success": False, "raw_response": f"Unexpected: {e}"}

# --- 6. Evaluation ---
# Runs every (prompt strategy, model) pair over all reviews, prints a
# classification report per pair and a final accuracy summary.

# When the reply is not usable JSON, prefer the label written right after a
# "category" key. The negative lookahead skips an echoed schema line such as
# "Feature Request|Bug Report|Other".
_CATEGORY_FIELD_RE = re.compile(
    r'category["\']?\s*[:=]\s*["\']?\s*(feature request|bug report|other)\b(?!\s*\|)',
    re.IGNORECASE,
)
# Weakest signal: the first label mentioned anywhere in the reply.
_ANY_LABEL_RE = re.compile(r"\b(feature request|bug report|other)\b", re.IGNORECASE)


def _label_from_raw_text(raw: str) -> str:
    """Recover a label from non-JSON model output, or return "Failed Parsing"."""
    for pattern in (_CATEGORY_FIELD_RE, _ANY_LABEL_RE):
        found = pattern.search(raw)
        if found:
            return found.group(1).strip().lower()
    return "Failed Parsing"


all_evaluation_results = {}

for prompt_name, prompt_template in PROMPT_STRATEGIES.items():
    print(f"\n\n{'='*15} Evaluating with Prompt Strategy: {prompt_name} {'='*15}")
    all_evaluation_results[prompt_name] = {}

    for current_model_name in OLLAMA_MODELS_TO_TEST:
        print(f"\n{'='*10} Starting Classification for Model: {current_model_name} {'='*10}")
        predictions, start_time = [], time.time()

        for review in tqdm(fr_data['review'], total=len(fr_data), desc=f"Classifying with {current_model_name} ({prompt_name})"):
            resp = classify_with_ollama_model(review, current_model_name, prompt_template)
            if resp["success"]:
                # Guard against a missing/None completion before stripping.
                raw = str(resp.get("raw_response") or "").strip()
                obj = extract_json_object(raw)
                if isinstance(obj, dict) and "category" in obj:
                    pred = normalize_category(obj["category"])
                else:
                    # last-ditch: scan for label tokens
                    pred = _label_from_raw_text(raw)
                if pred == "Failed Parsing":
                    logger.warning("\nFailed Parsing for: '%s'\nRaw: '%s'", review, raw)
            else:
                pred = "Failed"
                # str() keeps the slice safe for non-string reviews (e.g. NaN).
                logger.warning("Classification failed for review: '%s...' with %s", str(review)[:50], current_model_name)

            predictions.append(pred)

        elapsed = time.time() - start_time
        print(f"\n✅ {current_model_name} using {prompt_name} finished in {elapsed/60:.2f} minutes")

        results_df = fr_data.copy()
        results_df['predicted'] = predictions

        # Only rows with a recognised label can be scored. "Failed" (request
        # error) and "Failed Parsing" rows drop out here, so the metrics below
        # may cover fewer reviews than were loaded -- the count is reported.
        filtered = results_df[results_df['predicted'].isin(VALID_FR_LABELS)]
        n_excluded = len(results_df) - len(filtered)

        print(f"\n--- Sample Predictions for {current_model_name} ({prompt_name}) ---")
        print(results_df.head())

        print(f"\n--- Classification Report for {current_model_name} ({prompt_name}) ---")
        if n_excluded:
            print(f"Note: {n_excluded} of {len(results_df)} reviews had no valid prediction and are excluded from the metrics.")
        if not filtered.empty:
            report = classification_report(
                filtered['ground_truth'],
                filtered['predicted'],
                labels=VALID_FR_LABELS,
                zero_division=0
            )
            print(report)
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': accuracy_score(filtered['ground_truth'], filtered['predicted']),
                'report': report,
                'n_evaluated': len(filtered),
                'n_excluded': n_excluded
            }
        else:
            print(f"No valid predictions for {current_model_name} ({prompt_name}).")
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': 0.0,
                'report': "No valid predictions.",
                'n_evaluated': 0,
                'n_excluded': n_excluded
            }
        print(f"\n{'='*10} Done: {current_model_name} ({prompt_name}) {'='*10}\n")

print("\n\n========== ALL EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies:")
for prompt_name, models_data in all_evaluation_results.items():
    print(f"\n--- Prompt Strategy: {prompt_name} ---")
    for model, metrics in models_data.items():
        n_total = metrics['n_evaluated'] + metrics['n_excluded']
        print(f"  {model}: Accuracy = {metrics['accuracy']:.2f} ({metrics['n_evaluated']}/{n_total} reviews scored)")
print("\n--- Final Evaluation End ---")


--- Loading and preparing Functional Requirements data from BOW_test.txt (Full Dataset) ---
Loaded 512 reviews.
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------





Classifying with llama2 (Constraint_Based_Prompt): 100%|███████████████████████████| 512/512 [2:13:43<00:00, 15.67s/it]



✅ llama2 using Constraint_Based_Prompt finished in 133.73 minutes

--- Sample Predictions for llama2 (Constraint_Based_Prompt) ---
                                              review ground_truth  \
0                'this version crashes all the time'   bug report   
1                    'it take a lot time in loading'   bug report   
2                               'pages freeze often'   bug report   
3  'still having problems uploading sometimes tho...   bug report   
4  'it wont load any of my notifications when i c...   bug report   

         predicted  
0       bug report  
1            other  
2  feature request  
3       bug report  
4       bug report  

--- Classification Report for llama2 (Constraint_Based_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.18      0.74      0.29        50
     bug report       0.84      0.48      0.61       288
          other       0.58      0.47      0.52       174

       accuracy              

Classifying with mistral (Constraint_Based_Prompt): 100%|██████████████████████████| 512/512 [2:00:28<00:00, 14.12s/it]



✅ mistral using Constraint_Based_Prompt finished in 120.48 minutes

--- Sample Predictions for mistral (Constraint_Based_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for mistral (Constraint_Based_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.34      0.84      0.49        50
     bug report       0.78      0.89      0.83       288
          other       0.98      0.33      0.49       174

       accuracy                           0.70       512
      macro avg       0.70      0.69      

Classifying with llama3:8b (Constraint_Based_Prompt): 100%|████████████████████████| 512/512 [2:36:38<00:00, 18.36s/it]



✅ llama3:8b using Constraint_Based_Prompt finished in 156.64 minutes

--- Sample Predictions for llama3:8b (Constraint_Based_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report       other
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama3:8b (Constraint_Based_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.24      0.80      0.37        50
     bug report       0.81      0.82      0.81       288
          other       0.87      0.26      0.41       174

       accuracy                           0.63       512
      macro avg       0.64      0.63

Classifying with gemma:7b (Constraint_Based_Prompt): 100%|█████████████████████████| 512/512 [4:17:56<00:00, 30.23s/it]



✅ gemma:7b using Constraint_Based_Prompt finished in 257.94 minutes

--- Sample Predictions for gemma:7b (Constraint_Based_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for gemma:7b (Constraint_Based_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.39      0.68      0.49        50
     bug report       0.73      0.94      0.82       288
          other       1.00      0.29      0.45       174

       accuracy                           0.70       512
      macro avg       0.71      0.64   

Classifying with phi3:mini (Constraint_Based_Prompt): 100%|██████████████████████████| 512/512 [59:28<00:00,  6.97s/it]



✅ phi3:mini using Constraint_Based_Prompt finished in 59.47 minutes

--- Sample Predictions for phi3:mini (Constraint_Based_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report       other
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for phi3:mini (Constraint_Based_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.37      0.80      0.51        50
     bug report       0.90      0.69      0.78       286
          other       0.66      0.69      0.68       174

       accuracy                           0.70       510
      macro avg       0.64      0.73 

In [None]:
import pandas as pd
import requests
import json
import logging
import re
from sklearn.metrics import classification_report, accuracy_score
import time
from tqdm import tqdm
from string import Template

# --- 1. Logging Setup ---
# Root logger at INFO; each record shows timestamp, logger name, level, message.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# --- 2. LLM Configuration ---
# Base URL of the Ollama HTTP server; endpoint paths are appended to it.
OLLAMA_BASE_URL = "http://localhost:11434"

# Models to test
OLLAMA_MODELS_TO_TEST = [
    "llama2",
    "mistral",
    "llama3:8b",
    "gemma:7b",
    "phi3:mini",
]

# --- 3. Data Loading ---
print("--- Loading and preparing Functional Requirements data from BOW_test.txt (Full Dataset) ---")
data_file_path = "datasets/BOW_test.txt"

# The file is read as two comma-separated columns with no header row.
# Lines pandas cannot parse are still skipped, but 'warn' (instead of 'skip')
# reports each one so silent shrinkage of the dataset becomes visible.
# NOTE(review): printed reviews appear wrapped in single quotes, which suggests
# the file quotes text with ' rather than " -- if so, reviews containing commas
# would be treated as bad lines; consider quotechar="'" after checking the file.
fr_data_raw = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=['review', 'ground_truth'],
    on_bad_lines='warn'
)

# Canonical label spellings used everywhere downstream.
VALID_FR_LABELS = ["feature request", "bug report", "other"]
# Raw label spellings (after strip + lowercase) -> canonical spellings.
label_mapping = {'bugreport': 'bug report', 'featurerequest': 'feature request', 'other': 'other'}
fr_data_raw['ground_truth'] = fr_data_raw['ground_truth'].str.strip().str.lower().replace(label_mapping)

# Keep only rows whose label is recognised. initial_len counts rows that
# survived parsing, so lines skipped by read_csv are not part of this tally.
initial_len = len(fr_data_raw)
fr_data = fr_data_raw[fr_data_raw['ground_truth'].isin(VALID_FR_LABELS)].reset_index(drop=True)
if len(fr_data) < initial_len:
    print(f"Warning: Removed {initial_len - len(fr_data)} invalid rows.")

print(f"Loaded {len(fr_data)} reviews.")
print(fr_data.head())
print("-" * 40)

# --- 4. Prompt (Constraint-Based) ---
# string.Template text; ${review_text} is the only placeholder.
_CONSTRAINT_BASED_TEMPLATE = """
You are an expert classifier. Enforce the constraints below strictly.

TASK
Classify the app review into exactly ONE category:
- Feature Request
- Bug Report
- Other

CONSTRAINTS
1) Output MUST be valid JSON, no preface/suffix text.
2) "category" MUST be one of: ["Feature Request","Bug Report","Other"].
3) "evidence" MUST be an exact substring from the review (<=140 chars) justifying the label.
4) "justification" MUST be <= 20 words, no numbered steps or hidden reasoning.
5) "confidence" is a float in [0,1] (use 0.50, 0.75, 0.90, 0.95, or 0.99).
6) If error/failure implied → "Bug Report". If explicit ask for new/improved functionality without error → "Feature Request". Else → "Other".

OUTPUT JSON SCHEMA
{
  "category": "Feature Request|Bug Report|Other",
  "evidence": "substring from review",
  "justification": "≤20 words",
  "confidence": 0.50
}

REVIEW: '''${review_text}'''
ONLY RETURN THE JSON OBJECT:
"""

# Strategy name -> prompt template.
PROMPT_STRATEGIES = {"Constraint_Based_Prompt": _CONSTRAINT_BASED_TEMPLATE}

# --- Helpers ---
def format_prompt(template_str: str, review_text: str) -> str:
    """Fill the ``${review_text}`` placeholder of a prompt template.

    Rendering goes through ``string.Template.substitute``, so a template that
    references any other placeholder raises ``KeyError`` and a malformed
    ``$`` sequence raises ``ValueError``.
    """
    prompt_template = Template(template_str)
    values = {"review_text": review_text}
    return prompt_template.substitute(values)

def extract_json_object(text: str):
    """Best-effort extraction of a JSON value from raw model output.

    Strategies are tried in order and the first success wins:
      1. Parse the whole text as JSON.
      2. Parse the span from the first ``{`` to the last ``}``, as-is and then
         with every single quote replaced by a double quote.
      3. Walk each ``{`` from left to right and return the first object that
         decodes cleanly, ignoring whatever follows it. This rescues replies
         such as ``{...valid...} Note: {something else}`` where the greedy span
         of step 2 is not valid JSON.

    Returns:
        The decoded value (a dict for steps 2-3; step 1 may yield any JSON
        value), or ``None`` when nothing could be decoded or ``text`` is not
        a string.
    """
    # Try direct parse
    try:
        return json.loads(text)
    except (TypeError, ValueError, RecursionError):
        pass
    if not isinstance(text, str):
        # The regex/scan fallbacks below only make sense for text.
        return None

    # Greedy span: first "{" through last "}"
    m = re.search(r'\{.*\}', text, flags=re.DOTALL)
    if not m:
        return None
    snippet = m.group(0)
    for candidate in (snippet, snippet.replace("'", '"')):  # fix single quotes if needed
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError):
            continue

    # Last resort: first "{" from which a complete object can be decoded.
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            start = text.find('{', start + 1)
        else:
            return obj
    return None

def normalize_category(cat: str) -> str:
    """Map a model-supplied category onto a canonical lowercase label.

    Comparison ignores case and surrounding whitespace. Anything that is not
    a string, or that does not match a known label, yields "Failed Parsing".
    """
    unparsed = "Failed Parsing"
    if not isinstance(cat, str):
        return unparsed

    label = cat.strip().lower()
    if label in VALID_FR_LABELS:
        return label

    # Secondary lookup over the three canonical spellings.
    canonical = {name: name for name in ("feature request", "bug report", "other")}
    return canonical.get(label, unparsed)

# --- 5. LLM Call ---
def classify_with_ollama_model(review_text: str, model_name: str, prompt_template: str) -> dict:
    """Send one review to an Ollama model and return its raw completion.

    Args:
        review_text: Review to classify; substituted into the prompt template.
        model_name: Value sent as the request's "model" field.
        prompt_template: ``string.Template`` text containing ``${review_text}``.

    Returns:
        ``{"success": True, "raw_response": <str>}`` when the HTTP call
        succeeds (an absent or non-string "response" field becomes ``""`` so
        callers can always treat it as text), otherwise
        ``{"success": False, "raw_response": <short error description>}``.
        Errors raised by the request itself are caught and logged, not raised.
    """
    url = f"{OLLAMA_BASE_URL}/api/generate"
    headers = {"Content-Type": "application/json"}
    prompt = format_prompt(prompt_template, review_text)

    data = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.0, "num_predict": 256}
    }

    try:
        r = requests.post(url, headers=headers, data=json.dumps(data), timeout=120)
        r.raise_for_status()
        completion = r.json().get("response", "")
        if not isinstance(completion, str):
            # e.g. JSON null: keep the "raw_response is text" contract.
            completion = ""
        return {"success": True, "raw_response": completion}
    except requests.exceptions.ConnectionError:
        logger.error("Failed to connect to Ollama at %s. Is it running?", OLLAMA_BASE_URL)
        return {"success": False, "raw_response": "Connection Error"}
    except requests.exceptions.Timeout:
        # str() guards against non-string reviews (e.g. NaN read from the CSV),
        # which would otherwise raise inside this handler.
        logger.error("Timeout for review: '%s...' with model %s", str(review_text)[:50], model_name)
        return {"success": False, "raw_response": "Timeout"}
    except requests.exceptions.HTTPError as http_err:
        logger.error("HTTP error: %s - %s", http_err, r.text)
        return {"success": False, "raw_response": f"HTTP Error: {http_err}"}
    except Exception as e:
        # Boundary catch-all (bad JSON body, unexpected payload shape, ...):
        # one bad reply must not abort a long evaluation run.
        logger.error("Unexpected error: %s", e)
        return {"success": False, "raw_response": f"Unexpected: {e}"}

# --- 6. Evaluation ---
# Runs every (prompt strategy, model) pair over all reviews, prints a
# classification report per pair and a final accuracy summary.

# When the reply is not usable JSON, prefer the label written right after a
# "category" key. The negative lookahead skips an echoed schema line such as
# "Feature Request|Bug Report|Other".
_CATEGORY_FIELD_RE = re.compile(
    r'category["\']?\s*[:=]\s*["\']?\s*(feature request|bug report|other)\b(?!\s*\|)',
    re.IGNORECASE,
)
# Weakest signal: the first label mentioned anywhere in the reply.
_ANY_LABEL_RE = re.compile(r"\b(feature request|bug report|other)\b", re.IGNORECASE)


def _label_from_raw_text(raw: str) -> str:
    """Recover a label from non-JSON model output, or return "Failed Parsing"."""
    for pattern in (_CATEGORY_FIELD_RE, _ANY_LABEL_RE):
        found = pattern.search(raw)
        if found:
            return found.group(1).strip().lower()
    return "Failed Parsing"


all_evaluation_results = {}

for prompt_name, prompt_template in PROMPT_STRATEGIES.items():
    print(f"\n\n{'='*15} Evaluating with Prompt Strategy: {prompt_name} {'='*15}")
    all_evaluation_results[prompt_name] = {}

    for current_model_name in OLLAMA_MODELS_TO_TEST:
        print(f"\n{'='*10} Starting Classification for Model: {current_model_name} {'='*10}")
        predictions, start_time = [], time.time()

        for review in tqdm(fr_data['review'], total=len(fr_data), desc=f"Classifying with {current_model_name} ({prompt_name})"):
            resp = classify_with_ollama_model(review, current_model_name, prompt_template)
            if resp["success"]:
                # Guard against a missing/None completion before stripping.
                raw = str(resp.get("raw_response") or "").strip()
                obj = extract_json_object(raw)
                if isinstance(obj, dict) and "category" in obj:
                    pred = normalize_category(obj["category"])
                else:
                    # last-ditch: scan for label tokens
                    pred = _label_from_raw_text(raw)
                if pred == "Failed Parsing":
                    logger.warning("\nFailed Parsing for: '%s'\nRaw: '%s'", review, raw)
            else:
                pred = "Failed"
                # str() keeps the slice safe for non-string reviews (e.g. NaN).
                logger.warning("Classification failed for review: '%s...' with %s", str(review)[:50], current_model_name)

            predictions.append(pred)

        elapsed = time.time() - start_time
        print(f"\n✅ {current_model_name} using {prompt_name} finished in {elapsed/60:.2f} minutes")

        results_df = fr_data.copy()
        results_df['predicted'] = predictions

        # Only rows with a recognised label can be scored. "Failed" (request
        # error) and "Failed Parsing" rows drop out here, so the metrics below
        # may cover fewer reviews than were loaded -- the count is reported.
        filtered = results_df[results_df['predicted'].isin(VALID_FR_LABELS)]
        n_excluded = len(results_df) - len(filtered)

        print(f"\n--- Sample Predictions for {current_model_name} ({prompt_name}) ---")
        print(results_df.head())

        print(f"\n--- Classification Report for {current_model_name} ({prompt_name}) ---")
        if n_excluded:
            print(f"Note: {n_excluded} of {len(results_df)} reviews had no valid prediction and are excluded from the metrics.")
        if not filtered.empty:
            report = classification_report(
                filtered['ground_truth'],
                filtered['predicted'],
                labels=VALID_FR_LABELS,
                zero_division=0
            )
            print(report)
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': accuracy_score(filtered['ground_truth'], filtered['predicted']),
                'report': report,
                'n_evaluated': len(filtered),
                'n_excluded': n_excluded
            }
        else:
            print(f"No valid predictions for {current_model_name} ({prompt_name}).")
            all_evaluation_results[prompt_name][current_model_name] = {
                'accuracy': 0.0,
                'report': "No valid predictions.",
                'n_evaluated': 0,
                'n_excluded': n_excluded
            }
        print(f"\n{'='*10} Done: {current_model_name} ({prompt_name}) {'='*10}\n")

print("\n\n========== ALL EVALUATION COMPLETE ==========\n")
print("Summary of Accuracies:")
for prompt_name, models_data in all_evaluation_results.items():
    print(f"\n--- Prompt Strategy: {prompt_name} ---")
    for model, metrics in models_data.items():
        n_total = metrics['n_evaluated'] + metrics['n_excluded']
        print(f"  {model}: Accuracy = {metrics['accuracy']:.2f} ({metrics['n_evaluated']}/{n_total} reviews scored)")
print("\n--- Final Evaluation End ---")


--- Loading and preparing Functional Requirements data from BOW_test.txt (Full Dataset) ---
Loaded 512 reviews.
                                              review ground_truth
0                'this version crashes all the time'   bug report
1                    'it take a lot time in loading'   bug report
2                               'pages freeze often'   bug report
3  'still having problems uploading sometimes tho...   bug report
4  'it wont load any of my notifications when i c...   bug report
----------------------------------------





Classifying with llama2 (Constraint_Based_Prompt):  43%|███████████▋               | 221/512 [52:48<1:10:03, 14.45s/it]2025-09-28 14:44:23,662 - __main__ - ERROR - Timeout for review: ''Great app but becomes sluggish every now and then...' with model llama2
Classifying with llama2 (Constraint_Based_Prompt):  43%|███████████▋               | 222/512 [54:51<3:47:21, 47.04s/it]2025-09-28 14:46:26,915 - __main__ - ERROR - Timeout for review: ''Its not a bad keyboard replacement but it insists...' with model llama2
Classifying with llama2 (Constraint_Based_Prompt):  44%|███████████▊               | 223/512 [56:55<5:38:45, 70.33s/it]2025-09-28 14:48:41,037 - __main__ - ERROR - Timeout for review: ''But the predictions bar doesnt appear for for the...' with model llama2
Classifying with llama2 (Constraint_Based_Prompt):  70%|█████████████████▍       | 358/512 [1:48:48<1:38:01, 38.19s/it]2025-09-28 15:40:39,399 - __main__ - ERROR - Timeout for review: ''i want my 20 fish bucks back'...' with m


✅ llama2 using Constraint_Based_Prompt finished in 168.20 minutes

--- Sample Predictions for llama2 (Constraint_Based_Prompt) ---
                                              review ground_truth  \
0                'this version crashes all the time'   bug report   
1                    'it take a lot time in loading'   bug report   
2                               'pages freeze often'   bug report   
3  'still having problems uploading sometimes tho...   bug report   
4  'it wont load any of my notifications when i c...   bug report   

         predicted  
0       bug report  
1            other  
2  feature request  
3       bug report  
4       bug report  

--- Classification Report for llama2 (Constraint_Based_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.18      0.72      0.28        50
     bug report       0.85      0.49      0.63       285
          other       0.62      0.49      0.55       173

       accuracy              

Classifying with mistral (Constraint_Based_Prompt): 100%|██████████████████████████| 512/512 [2:17:09<00:00, 16.07s/it]



✅ mistral using Constraint_Based_Prompt finished in 137.17 minutes

--- Sample Predictions for mistral (Constraint_Based_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for mistral (Constraint_Based_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.35      0.86      0.50        50
     bug report       0.78      0.89      0.83       288
          other       0.97      0.33      0.49       174

       accuracy                           0.70       512
      macro avg       0.70      0.69      

Classifying with llama3:8b (Constraint_Based_Prompt):  65%|██████████████▏       | 331/512 [1:49:29<1:33:23, 30.96s/it]2025-09-28 20:56:30,637 - __main__ - ERROR - Timeout for review: ''Great app but without emoji support its only sub ...' with model llama3:8b
Classifying with llama3:8b (Constraint_Based_Prompt): 100%|████████████████████████| 512/512 [4:02:31<00:00, 28.42s/it]



✅ llama3:8b using Constraint_Based_Prompt finished in 242.52 minutes

--- Sample Predictions for llama3:8b (Constraint_Based_Prompt) ---
                                              review ground_truth   predicted
0                'this version crashes all the time'   bug report  bug report
1                    'it take a lot time in loading'   bug report  bug report
2                               'pages freeze often'   bug report  bug report
3  'still having problems uploading sometimes tho...   bug report  bug report
4  'it wont load any of my notifications when i c...   bug report  bug report

--- Classification Report for llama3:8b (Constraint_Based_Prompt) ---
                 precision    recall  f1-score   support

feature request       0.23      0.80      0.36        49
     bug report       0.81      0.83      0.82       288
          other       0.88      0.25      0.39       174

       accuracy                           0.63       511
      macro avg       0.64      0.63

Classifying with gemma:7b (Constraint_Based_Prompt):   9%|██▍                       | 47/512 [24:42<4:05:14, 31.65s/it]