In [None]:
# (No installation required)

In [None]:
# Import required libraries for sentiment analysis with open-source LLMs
import warnings
import os

warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
    matthews_corrcoef,
)

# API setup for LLMs (Groq, dotenv)
from groq import Groq
from dotenv import load_dotenv
from transformers import pipeline
import torch

# Read GROQ_API_KEY (and anything else in a local .env file) into the environment.
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Bind groq_client unconditionally so that a missing key surfaces as a clear
# message here instead of a NameError deep inside the experiment loops.
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
if groq_client is None:
    print("⚠️ GROQ_API_KEY not set - Groq experiments (E4, E5) cannot produce predictions")

# Load FinBERT model for local inference
print("Loading FinBERT model...")
# transformers pipelines take a CUDA device index, or -1 to run on the CPU.
device = 0 if torch.cuda.is_available() else -1
finbert_pipeline = pipeline(
    "sentiment-analysis", model="ProsusAI/finbert", device=device
)

# Plot defaults shared by every figure in the notebook.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

print("✓ Setup complete for open-source LLM sentiment analysis")
print(f"✓ Groq API configured: {bool(GROQ_API_KEY)}")
print(f"✓ FinBERT loaded on {'GPU' if device == 0 else 'CPU'}")

## 1. Load Dataset

Load the FinancialPhraseBank dataset for sentiment analysis.

In [None]:
# Load the 100% agreement dataset (highest quality)
data_path = "../../DatasetAnalysis_FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"

sentences, sentiments = [], []

# Every usable line looks like "<sentence>@<label>". Splitting on the LAST "@"
# keeps any "@" that appears inside the sentence itself.
# NOTE(review): errors="ignore" silently drops bytes that are not valid UTF-8;
# confirm the file's real encoding if non-ASCII characters matter.
with open(data_path, "r", encoding="utf-8", errors="ignore") as handle:
    for raw_line in handle:
        text, separator, label = raw_line.strip().rpartition("@")
        if not separator:
            # No "@" on this line, so there is no label to read.
            continue
        sentences.append(text)
        sentiments.append(label)

df = pd.DataFrame({"sentence": sentences, "true_sentiment": sentiments})

print(f"Dataset loaded: {len(df)} sentences")
print("\nSentiment distribution:")
print(df["true_sentiment"].value_counts())

# Display sample
banner = "=" * 80
print("\n" + banner)
print("Sample sentences:")
print(banner)
display(df.sample(5, random_state=42))

## 2. Few-Shot Examples

Curated examples (2 positive, 3 negative, 1 neutral) representing typical financial sentiment patterns, deliberately weighted toward the negative class.

In [None]:
# Curated few-shot examples: 2 positive, 3 negative, 1 neutral.
# The set is intentionally weighted toward the negative class to emphasise
# negative detection. Each entry carries the sentence, its gold label and a
# one-line rationale that is shown to the model inside the prompt.
FEW_SHOT_EXAMPLES = [
    {
        "sentence": "Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007.",
        "sentiment": "positive",
        "rationale": "Operating profit increased significantly, indicating improved financial performance.",
    },
    {
        "sentence": "Net sales increased by 18.5% to EUR 167.8 million compared to the previous year.",
        "sentiment": "positive",
        "rationale": "Strong revenue growth of 18.5% signals business expansion and market success.",
    },
    {
        "sentence": "The company reported a net loss of EUR 2.5 million compared to a profit of EUR 1.2 million in the previous quarter.",
        "sentiment": "negative",
        "rationale": "Shift from profit to loss represents deteriorating financial health.",
    },
    {
        "sentence": "Sales decreased by 15% year-over-year due to weakening demand in key markets.",
        "sentiment": "negative",
        "rationale": "Significant sales decline indicates business challenges and market difficulties.",
    },
    {
        "sentence": "Operating loss widened to EUR 5.8 million from EUR 3.2 million in the same period last year.",
        "sentiment": "negative",
        "rationale": "Widening losses show worsening profitability and deteriorating business conditions.",
    },
    {
        "sentence": "The company announced the appointment of a new chief financial officer effective next month.",
        "sentiment": "neutral",
        "rationale": "Executive appointment is routine corporate news without clear financial impact.",
    },
]

print("Few-Shot Examples:")
print("=" * 80)

# Print the header first so each sentence/rationale pair appears under its own
# "Example N" heading (previously the heading was printed after the details,
# which visually attached it to the following example).
for i, ex in enumerate(FEW_SHOT_EXAMPLES, 1):
    print(f"\nExample {i} [{ex['sentiment'].upper()}]:")
    print(f"Sentence: {ex['sentence']}")
    print(f"Rationale: {ex['rationale']}")

## 3. Few-Shot Prompt Design

Prompt template for open-source LLMs using few-shot examples.

In [None]:
def create_few_shot_prompt(sentence, examples=None):
    """
    Build the few-shot classification prompt for one sentence.

    Args:
        sentence: The financial statement to classify.
        examples: Optional list of dicts with "sentence", "sentiment" and
            "rationale" keys to use as demonstrations. Defaults to the
            notebook-level FEW_SHOT_EXAMPLES.

    Returns:
        The full prompt string: task description, guidelines, one worked
        example per entry in ``examples``, the target sentence and the
        required JSON answer format.
    """
    if examples is None:
        examples = FEW_SHOT_EXAMPLES

    examples_text = ""
    for i, ex in enumerate(examples, 1):
        # json.dumps adds the surrounding quotes and escapes embedded quotes or
        # backslashes, so the demonstrated answer is always valid JSON. For
        # plain ASCII text this yields exactly the same characters as before.
        sentiment_json = json.dumps(ex["sentiment"])
        rationale_json = json.dumps(ex["rationale"])
        examples_text += f"""\nExample {i}:
Sentence: "{ex["sentence"]}"
Analysis:
{{
    "sentiment": {sentiment_json},
    "confidence": 0.95,
    "rationale": {rationale_json}
}}
"""

    prompt = f"""You are a financial sentiment analysis expert. Analyze financial statements with precision.

Classify the sentiment as "positive", "negative", or "neutral" from an investor's perspective.

Guidelines:
- Positive: Financial improvements, growth, profits, revenue increases, cost reductions, successful expansions
- Negative: Financial declines, losses, revenue drops, cost increases, widening losses, failed ventures, layoffs
- Neutral: Factual statements with no clear financial impact, routine announcements, balanced mixed signals

⚠️ IMPORTANT: Pay special attention to negative indicators (losses, declines, decreases, deterioration).

Here are {len(examples)} examples to learn from:
{examples_text}

Now classify this new statement:
Sentence: "{sentence}"

Return ONLY valid JSON in this exact format:
{{
    "sentiment": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "rationale": "Brief explanation"
}}
"""
    return prompt


# Test prompt
test_sentence = "The company's quarterly revenue exceeded analyst expectations by 12%."
print("=" * 80)
print("FEW-SHOT PROMPT EXAMPLE")
print("=" * 80)
# Show only the first 1000 characters to keep the cell output short.
# (The two print calls were previously fused on one line, a SyntaxError.)
print(create_few_shot_prompt(test_sentence)[:1000] + "...")

## 4. Model Inference Functions

Functions for calling open-source LLMs (Groq API) and parsing their responses.

In [None]:
# Call Llama or other OSS models via Groq API
def call_llama(prompt, model_name="finbert", temperature=0.0):
    """Send one prompt to a Groq-hosted chat model and return the reply text.

    Args:
        prompt: Full prompt, sent as a single user message.
        model_name: Groq model identifier. NOTE(review): the default
            "finbert" is kept for backward compatibility but does not look
            like a Groq chat model id - callers should pass the model
            explicitly, as the experiment cells do.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The message content of the first choice, or None when no Groq client
        is configured or all three attempts fail.
    """
    # Look the client up defensively: it may be unbound (or None) when
    # GROQ_API_KEY is missing. Failing fast avoids several seconds of
    # pointless retry back-off per sentence.
    client = globals().get("groq_client")
    if client is None:
        warnings.warn(
            "groq_client is not configured (missing GROQ_API_KEY?); call_llama returns None"
        )
        return None

    max_retries = 3
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=model_name,
                temperature=temperature,
                max_tokens=500,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            if attempt < max_retries - 1:
                # Exponential back-off: 1s, then 2s.
                time.sleep(2**attempt)
                continue
            # Report the final failure (same style as call_finbert) rather
            # than dropping it silently.
            print(f"Groq API error ({model_name}): {str(e)[:100]}")
            return None
    return None


def _load_json_object(candidate):
    """Return ``candidate`` decoded as a JSON object (dict), or None."""
    try:
        decoded = json.loads(candidate)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return decoded if isinstance(decoded, dict) else None


def parse_response(response_text):
    """Parse a model reply into a result dict.

    Tries, in order: the content of a ```json fence, the content of a plain
    ``` fence, the whole reply, and finally the outermost "{...}" span (for
    JSON wrapped in prose). If none of these decodes to a JSON object, falls
    back to keyword spotting on the raw text.

    Args:
        response_text: Raw text returned by the model (may be None or empty).

    Returns:
        A dict (normally with "sentiment", "confidence" and "rationale"; the
        "sentiment" value is stripped and lower-cased when it is a string), a
        keyword-derived dict with confidence 0.5, or None when nothing usable
        is found.
    """
    if not isinstance(response_text, str) or not response_text.strip():
        return None

    if "```json" in response_text:
        json_str = response_text.split("```json")[1].split("```")[0].strip()
    elif "```" in response_text:
        json_str = response_text.split("```")[1].strip()
    else:
        json_str = response_text.strip()

    result = _load_json_object(json_str)
    if result is None:
        # Models often wrap the JSON in an explanatory sentence; try the span
        # from the first "{" to the last "}" before giving up on JSON.
        start = response_text.find("{")
        end = response_text.rfind("}")
        if start != -1 and end > start:
            result = _load_json_object(response_text[start : end + 1])

    if result is not None:
        sentiment = result.get("sentiment")
        if isinstance(sentiment, str):
            # Downstream metrics only accept the lower-case label names.
            result["sentiment"] = sentiment.strip().lower()
        return result

    # Keyword fallback: "negative" wins whenever it is mentioned; "neutral"
    # is only chosen when neither polarity word appears.
    response_lower = response_text.lower()
    if "positive" in response_lower and "negative" not in response_lower:
        label = "positive"
    elif "negative" in response_lower:
        label = "negative"
    elif "neutral" in response_lower:
        label = "neutral"
    else:
        return None
    return {
        "sentiment": label,
        "confidence": 0.5,
        "rationale": "Parsed from text",
    }


def call_finbert(sentence):
    """Classify one sentence with the local FinBERT pipeline.

    Only the first 512 characters of ``sentence`` are passed to the pipeline.
    Labels other than positive/negative/neutral (case-insensitive) are
    reported as "neutral".

    Returns:
        A dict with "sentiment", "confidence" (the pipeline score) and
        "rationale", or None if inference raised (the error is printed).
    """
    known_labels = ("positive", "negative", "neutral")
    try:
        top = finbert_pipeline(sentence[:512])[0]  # FinBERT max length
        raw_label = top["label"]
        lowered = raw_label.lower()
        sentiment = lowered if lowered in known_labels else "neutral"
        return {
            "sentiment": sentiment,
            "confidence": top["score"],
            "rationale": f"FinBERT classification: {raw_label}",
        }
    except Exception as err:
        print(f"FinBERT error: {str(err)[:100]}")
        return None


print("✓ Inference functions defined")


## 5. Run Experiments

Run few-shot sentiment analysis using open-source LLMs:
- **E4:** Mixtral-8x7B (Few-Shot)
- **E5:** Llama-3.1-70B (Few-Shot)
- **E6:** FinBERT (Few-Shot)

### E4: Mixtral-8x7B (Few-Shot)

In [None]:
# Run on full dataset for comprehensive evaluation
test_df = df.copy()

# E4: Mixtral-8x7B (Few-Shot)
print("Running E4: Mixtral-8x7B (Few-Shot)...")
e4_results = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="E4 Progress"):
    prompt = create_few_shot_prompt(row["sentence"])
    response = call_llama(prompt, model_name="mixtral-8x7b-32768")
    parsed = parse_response(response) if response else None

    if isinstance(parsed, dict):
        outcome = {
            "predicted_sentiment": parsed.get("sentiment", "unknown"),
            "confidence": parsed.get("confidence", 0),
            "rationale": parsed.get("rationale", ""),
        }
    else:
        # Record failures too: previously a failed API call added no row at
        # all, so the sample vanished from the results and "Total Samples"
        # under-counted the dataset.
        outcome = {
            "predicted_sentiment": "error",
            "confidence": 0,
            "rationale": "Parse error" if response else "API error",
        }

    e4_results.append(
        {
            "sentence": row["sentence"],
            "true_sentiment": row["true_sentiment"],
            **outcome,
        }
    )

    time.sleep(0.5)  # Rate limiting

e4_df = pd.DataFrame(e4_results)
print(f"\n✓ E4 completed: {len(e4_df)} predictions")
display(e4_df.head())

### E5: Llama-3.1-70B (Few-Shot)

In [None]:
# E5: Llama-3.1-70B (Few-Shot)
print("Running E5: Llama-3.1-70B (Few-Shot)...")
e5_results = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="E5 Progress"):
    prompt = create_few_shot_prompt(row["sentence"])
    response = call_llama(prompt, model_name="llama-3.1-70b-versatile")
    parsed = parse_response(response) if response else None

    if isinstance(parsed, dict):
        outcome = {
            "predicted_sentiment": parsed.get("sentiment", "unknown"),
            "confidence": parsed.get("confidence", 0),
            "rationale": parsed.get("rationale", ""),
        }
    else:
        # Record failures too: previously a failed API call added no row at
        # all, so the sample vanished from the results and "Total Samples"
        # under-counted the dataset.
        outcome = {
            "predicted_sentiment": "error",
            "confidence": 0,
            "rationale": "Parse error" if response else "API error",
        }

    e5_results.append(
        {
            "sentence": row["sentence"],
            "true_sentiment": row["true_sentiment"],
            **outcome,
        }
    )

    time.sleep(0.5)  # Rate limiting

e5_df = pd.DataFrame(e5_results)
print(f"\n✓ E5 completed: {len(e5_df)} predictions")
display(e5_df.head())

### E6: FinBERT (Few-Shot)

⚠️ **Note:** FinBERT uses its pre-trained weights and cannot leverage few-shot examples (no in-context learning capability). This experiment uses the same FinBERT model as E3 (Zero-Shot) for consistency and cost-benefit comparison.

In [None]:
# E6: FinBERT (Few-Shot - Note: Cannot use few-shot examples)
print("Running E6: FinBERT (Few-Shot)...")
print("⚠️ Note: FinBERT uses pre-trained weights, cannot leverage few-shot examples")
e6_results = []

for _, sample in tqdm(test_df.iterrows(), total=len(test_df), desc="E6 Progress"):
    # Direct local inference: there is no prompt, so no few-shot examples.
    outcome = call_finbert(sample["sentence"])

    # Start from the fields every row shares, then add the prediction fields.
    record = {
        "sentence": sample["sentence"],
        "true_sentiment": sample["true_sentiment"],
    }
    if outcome:
        record["predicted_sentiment"] = outcome.get("sentiment", "unknown")
        record["confidence"] = outcome.get("confidence", 0)
        record["rationale"] = outcome.get("rationale", "")
    else:
        record["predicted_sentiment"] = "error"
        record["confidence"] = 0
        record["rationale"] = "FinBERT inference error"
    e6_results.append(record)

e6_df = pd.DataFrame(e6_results)
print(f"\n✓ E6 completed: {len(e6_df)} predictions")
display(e6_df.head())

## 6. Calculate Metrics

Compute accuracy, F1, precision, recall, and confusion matrices for each model.

In [None]:
def _empty_metrics(exp_name, total_samples):
    """Return an all-zero metrics row with the same keys as a populated row."""
    row = {
        "Experiment": exp_name,
        "Total Samples": total_samples,
        "Valid Predictions": 0,
        "Accuracy": 0,
        "Macro-F1": 0,
        "Weighted-F1": 0,
        "Macro-Precision": 0,
        "Macro-Recall": 0,
        "MCC": 0,
    }
    for label in ("Positive", "Negative", "Neutral"):
        for stat in ("Precision", "Recall", "F1"):
            row[f"{label}_{stat}"] = 0
    return row


def calculate_metrics(df, exp_name):
    """Calculate all evaluation metrics for one experiment.

    Args:
        df: Results frame with "true_sentiment" and "predicted_sentiment"
            columns (plus sentence/confidence/rationale).
        exp_name: Label stored in the "Experiment" field of the metrics row.

    Returns:
        A tuple ``(metrics, cm, valid_df)``:
        - metrics: dict of overall scores (accuracy, macro/weighted F1, macro
          precision/recall, MCC) and per-class precision/recall/F1. Every
          return path yields the same set of keys.
        - cm: 3x3 integer confusion matrix ordered positive, negative, neutral.
        - valid_df: rows whose prediction is one of the three labels.
        When there is nothing to score, metrics are all zero, cm is an integer
        zero matrix and valid_df is empty (but still has its columns).
    """
    labels = ["positive", "negative", "neutral"]

    # Check if dataframe is empty or missing required columns
    if df.empty or "predicted_sentiment" not in df.columns:
        print(f"⚠️ Warning: {exp_name} has no valid predictions!")
        empty_frame = pd.DataFrame(
            columns=[
                "sentence",
                "true_sentiment",
                "predicted_sentiment",
                "confidence",
                "rationale",
            ]
        )
        # Integer zeros so heatmaps drawn with fmt="d" keep working.
        return _empty_metrics(exp_name, 0), np.zeros((3, 3), dtype=int), empty_frame

    # Filter out errors / unknown labels
    valid_df = df[df["predicted_sentiment"].isin(labels)].copy()

    # Check if we have valid predictions
    if valid_df.empty:
        print(f"⚠️ Warning: {exp_name} has no valid predictions after filtering!")
        return _empty_metrics(exp_name, len(df)), np.zeros((3, 3), dtype=int), valid_df

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    # zero_division=0 keeps the same numeric result as the default ("warn")
    # but without emitting a warning for classes that are never predicted.
    metrics = {
        "Experiment": exp_name,
        "Total Samples": len(df),
        "Valid Predictions": len(valid_df),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Weighted-F1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "Macro-Precision": precision_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
        "Macro-Recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
        "MCC": matthews_corrcoef(y_true, y_pred),
    }

    # Per-class metrics
    precision_per_class = precision_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    recall_per_class = recall_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    f1_per_class = f1_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )

    for i, label in enumerate(labels):
        metrics[f"{label.capitalize()}_Precision"] = precision_per_class[i]
        metrics[f"{label.capitalize()}_Recall"] = recall_per_class[i]
        metrics[f"{label.capitalize()}_F1"] = f1_per_class[i]

    # Confusion Matrix (rows = true label, columns = predicted label)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    return metrics, cm, valid_df


# Calculate metrics for all experiments
# Each call yields (metrics dict, confusion matrix, valid-prediction frame).
(
    (e4_metrics, e4_cm, e4_valid),
    (e5_metrics, e5_cm, e5_valid),
    (e6_metrics, e6_cm, e6_valid),
) = [
    calculate_metrics(frame, name)
    for frame, name in (
        (e4_df, "E4: Mixtral-8x7B (Few-Shot)"),
        (e5_df, "E5: Llama-3.1-70B (Few-Shot)"),
        (e6_df, "E6: FinBERT (Few-Shot)"),
    )
]

# Create metrics comparison table (one row per experiment)
metrics_df = pd.DataFrame([e4_metrics, e5_metrics, e6_metrics])

headline_columns = [
    "Experiment",
    "Accuracy",
    "Macro-F1",
    "Macro-Precision",
    "Macro-Recall",
]

rule = "=" * 80
print("\n" + rule)
print("FEW-SHOT PERFORMANCE COMPARISON")
print(rule)
display(metrics_df[headline_columns].round(4))

## 7. Visualize Results

Visualize performance metrics and confusion matrices for all open-source LLMs.

In [None]:
# Performance comparison bar chart
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

model_metrics = [
    (e4_metrics, "Mixtral-8x7B"),
    (e5_metrics, "Llama-3.1-70B"),
    (e6_metrics, "FinBERT"),
]
bar_width = 0.25

# One entry per panel: (axis, tick names, tick name -> metrics key,
# x-axis label, y-axis label, title).
panels = [
    (
        axes[0],
        ["Accuracy", "Macro-F1", "Macro-Precision", "Macro-Recall"],
        lambda name: name,
        "Metrics",
        "Score",
        "Overall Performance Comparison (Few-Shot)",
    ),
    (
        axes[1],
        ["Positive", "Negative", "Neutral"],
        lambda name: f"{name}_F1",
        "Sentiment Class",
        "F1 Score",
        "Per-Class F1 Scores (Few-Shot)",
    ),
]

for ax, tick_names, key_for, x_label, y_label, panel_title in panels:
    positions = np.arange(len(tick_names))

    # Grouped bars: each model is shifted by one bar width within a group.
    for offset, (scores, model_label) in enumerate(model_metrics):
        heights = [scores[key_for(name)] for name in tick_names]
        ax.bar(
            positions + offset * bar_width,
            heights,
            bar_width,
            label=model_label,
            alpha=0.8,
        )

    ax.set_xlabel(x_label, fontsize=12, weight="bold")
    ax.set_ylabel(y_label, fontsize=12, weight="bold")
    ax.set_title(panel_title, fontsize=14, weight="bold")
    # Centre the tick under the middle bar of each three-bar group.
    ax.set_xticks(positions + bar_width)
    ax.set_xticklabels(tick_names)
    ax.legend()
    ax.set_ylim([0, 1])
    ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig("few_shot_performance_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Confusion matrices
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
labels = ["Positive", "Negative", "Neutral"]

for idx, (cm, title) in enumerate(
    [
        (e4_cm, "E4: Mixtral-8x7B (Few-Shot)"),
        (e5_cm, "E5: Llama-3.1-70B (Few-Shot)"),
        (e6_cm, "E6: FinBERT (Few-Shot)"),
    ]
):
    sns.heatmap(
        # fmt="d" only accepts integers. An experiment without valid
        # predictions can hand back a float zero matrix, which would raise
        # here, so cast explicitly (counts are whole numbers anyway).
        np.asarray(cm).astype(int),
        annot=True,
        fmt="d",
        cmap="Greens",
        xticklabels=labels,
        yticklabels=labels,
        ax=axes[idx],
    )
    axes[idx].set_title(title, fontsize=12, weight="bold")
    axes[idx].set_ylabel("True Label", fontsize=11, weight="bold")
    axes[idx].set_xlabel("Predicted Label", fontsize=11, weight="bold")

plt.suptitle(
    "Confusion Matrices - Few-Shot Sentiment Analysis",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("few_shot_confusion_matrices.png", dpi=300, bbox_inches="tight")
plt.show()

## 8. Save Results

Save experiment results and metrics to CSV files.

In [None]:
# One timestamp for the whole run so the four files can be matched up later.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# File-name stem -> frame to write; all files land in the working directory.
outputs = {
    "e4_mixtral_8x7b_few_shot": e4_df,
    "e5_llama_3_1_70b_few_shot": e5_df,
    "e6_finbert_few_shot": e6_df,
    "few_shot_metrics_summary": metrics_df,
}
for stem, frame in outputs.items():
    frame.to_csv(f"{stem}_{timestamp}.csv", index=False)

print(f"\n✓ Results saved with timestamp: {timestamp}")

## 9. Error Analysis

In [None]:
# Error Analysis: Most Common Misclassifications
print("=" * 80)
print("ERROR ANALYSIS: FEW-SHOT MISCLASSIFICATION PATTERNS")
print("=" * 80)

for df_result, exp_name in [
    (e4_valid, "E4: Mixtral-8x7B"),
    (e5_valid, "E5: Llama-3.1-70B"),
    (e6_valid, "E6: FinBERT"),
]:
    print(f"\n{exp_name}")
    print("-" * 80)

    # An experiment with no valid predictions (e.g. missing API key) yields an
    # empty frame; skip it instead of failing on a missing column or a
    # division by zero in the error-rate line below.
    if df_result.empty:
        print("\nNo valid predictions to analyse.")
        continue

    # Identify errors
    errors = df_result[df_result["true_sentiment"] != df_result["predicted_sentiment"]]

    # Count error types as (true label, predicted label) pairs
    error_types = (
        errors.groupby(["true_sentiment", "predicted_sentiment"])
        .size()
        .reset_index(name="count")
    )
    error_types = error_types.sort_values("count", ascending=False)

    print(
        f"\nTotal Errors: {len(errors)} / {len(df_result)} ({len(errors) / len(df_result) * 100:.2f}%)"
    )
    print("\nMost Common Error Types:")
    display(error_types.head(5))

    # Show examples of worst errors (high confidence, wrong prediction)
    if len(errors) > 0:
        # LLM-reported confidences may arrive as strings or be missing, which
        # makes the column non-numeric; coerce so ranking and the ".2f"
        # formatting cannot fail (unparseable values become NaN).
        ranked = errors.assign(
            confidence=pd.to_numeric(errors["confidence"], errors="coerce")
        )
        worst_errors = ranked.nlargest(3, "confidence")
        print("\nTop 3 High-Confidence Errors:")
        for idx, row in worst_errors.iterrows():
            print(
                f"\n  True: {row['true_sentiment']} | Predicted: {row['predicted_sentiment']} | Conf: {row['confidence']:.2f}"
            )
            print(f"  Sentence: {row['sentence'][:120]}...")
            print(f"  Rationale: {row['rationale']}")

# Class-wise Performance Comparison
divider = "=" * 80
print("\n" + divider)
print("CLASS-WISE PERFORMANCE BREAKDOWN")
print(divider)

# Long-format table: one row per (model, sentiment class) pair.
class_comparison = [
    {
        "Model": model_name,
        "Class": sentiment,
        "Precision": metrics[f"{sentiment}_Precision"],
        "Recall": metrics[f"{sentiment}_Recall"],
        "F1-Score": metrics[f"{sentiment}_F1"],
    }
    for metrics, model_name in (
        (e4_metrics, "Mixtral-8x7B"),
        (e5_metrics, "Llama-3.1-70B"),
        (e6_metrics, "FinBERT"),
    )
    for sentiment in ("Positive", "Negative", "Neutral")
]

class_df = pd.DataFrame(class_comparison)

# Pivot for better visualization: classes as rows, models as columns.
for metric in ("Precision", "Recall", "F1-Score"):
    print(f"\n{metric} by Class:")
    pivot = class_df.pivot(index="Class", columns="Model", values=metric)
    display(pivot.round(4))

print("\n" + divider)
print("COMPREHENSIVE METRICS TABLE")
print(divider)
display(metrics_df.round(4))

## 10. Confidence Analysis

In [None]:
# Confidence Analysis
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

model_frames = [
    (e4_valid, "Mixtral-8x7B"),
    (e5_valid, "Llama-3.1-70B"),
    (e6_valid, "FinBERT"),
]

for ax, (df_result, title) in zip(axes, model_frames):
    # Flag each row as right or wrong (stored on the frame as "correct").
    df_result["correct"] = (
        df_result["true_sentiment"] == df_result["predicted_sentiment"]
    )
    is_correct = df_result["correct"]

    correct_conf = df_result.loc[is_correct, "confidence"]
    incorrect_conf = df_result.loc[~is_correct, "confidence"]

    # Two histograms drawn together: confidences of right vs wrong predictions.
    ax.hist(
        [correct_conf, incorrect_conf],
        bins=20,
        label=["Correct", "Incorrect"],
        alpha=0.7,
        color=["green", "red"],
    )
    ax.set_xlabel("Confidence Score", fontsize=11, weight="bold")
    ax.set_ylabel("Frequency", fontsize=11, weight="bold")
    ax.set_title(
        f"{title}\nMean Conf: Correct={correct_conf.mean():.3f}, Incorrect={incorrect_conf.mean():.3f}",
        fontsize=11,
        weight="bold",
    )
    ax.legend()
    ax.grid(alpha=0.3)

plt.suptitle(
    "Confidence Distribution: Correct vs Incorrect Predictions (Few-Shot)",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("few_shot_confidence_analysis.png", dpi=300, bbox_inches="tight")
plt.show()

print("\n" + "=" * 80)
print("CONFIDENCE CALIBRATION ANALYSIS")
print("=" * 80)
for df_result, exp_name in [
    (e4_valid, "E4: Mixtral-8x7B"),
    (e5_valid, "E5: Llama-3.1-70B"),
    (e6_valid, "E6: FinBERT"),
]:
    # Local mask instead of writing a column back onto the frame, so this
    # block is idempotent and does not alter the *_valid frames.
    is_correct = df_result["true_sentiment"] == df_result["predicted_sentiment"]

    avg_conf_correct = df_result.loc[is_correct, "confidence"].mean()
    avg_conf_incorrect = df_result.loc[~is_correct, "confidence"].mean()
    # Positive gap = the model is more confident when it is right.
    calibration_gap = avg_conf_correct - avg_conf_incorrect

    print(f"\n{exp_name}:")
    print(f"  Average Confidence (Correct): {avg_conf_correct:.4f}")
    print(f"  Average Confidence (Incorrect): {avg_conf_incorrect:.4f}")
    print(f"  Calibration Gap: {calibration_gap:.4f}")
    print(f"  Total Correct: {is_correct.sum()} / {len(df_result)}")

    # Confidence by sentiment class
    print("\n  Confidence by Predicted Class:")
    for sentiment in ["positive", "negative", "neutral"]:
        # Deliberately NOT named `class_df`: that name holds the class-wise
        # comparison table built in the error-analysis section, and reusing
        # it here silently replaced that table.
        predicted_subset = df_result[df_result["predicted_sentiment"] == sentiment]
        if len(predicted_subset) > 0:
            print(
                f"    {sentiment.capitalize()}: {predicted_subset['confidence'].mean():.4f} (n={len(predicted_subset)})"
            )

## 11. Classification Reports

In [None]:
# Detailed Classification Reports
report_rule = "=" * 80
report_labels = ["positive", "negative", "neutral"]
report_names = ["Positive", "Negative", "Neutral"]

print("\n" + report_rule)
print("DETAILED CLASSIFICATION REPORTS")
print(report_rule)

# One scikit-learn text report per experiment, rows in a fixed label order.
for df_result, exp_name in (
    (e4_valid, "E4: Mixtral-8x7B"),
    (e5_valid, "E5: Llama-3.1-70B"),
    (e6_valid, "E6: FinBERT"),
):
    print("\n" + report_rule)
    print(exp_name)
    print(report_rule)
    report_text = classification_report(
        df_result["true_sentiment"],
        df_result["predicted_sentiment"],
        labels=report_labels,
        target_names=report_names,
    )
    print(report_text)

# Class-wise Metrics Summary Table
print("\n" + "=" * 80)
print("CLASS-WISE METRICS SUMMARY")
print("=" * 80)

# Column names are "<class abbreviation>_<stat abbreviation>", e.g. "Neg_F1".
class_abbreviations = {"Positive": "Pos", "Negative": "Neg", "Neutral": "Neu"}
stat_abbreviations = {"Precision": "P", "Recall": "R", "F1": "F1"}

summary_data = []
for metrics, model in (
    (e4_metrics, "Mixtral-8x7B"),
    (e5_metrics, "Llama-3.1-70B"),
    (e6_metrics, "FinBERT"),
):
    entry = {"Model": model}
    for class_name, class_short in class_abbreviations.items():
        for stat_name, stat_short in stat_abbreviations.items():
            entry[f"{class_short}_{stat_short}"] = metrics[f"{class_name}_{stat_name}"]
    summary_data.append(entry)

summary_df = pd.DataFrame(summary_data)
print("\nPer-Class Metrics (P=Precision, R=Recall, F1=F1-Score):")
display(summary_df.round(4))

## 12. Few-Shot vs Zero-Shot Comparison

In [None]:
# Compare with Zero-Shot results (if available)
import glob  # only used by this cell; `os` is imported in the setup cell

print("=" * 80)
print("FEW-SHOT vs ZERO-SHOT COMPARISON")
print("=" * 80)

# Try to load the latest zero-shot results
zero_shot_files = glob.glob("../Zero_Shot/zero_shot_metrics_summary_*.csv")

if zero_shot_files:
    # Get the most recent file
    latest_zero_shot = max(zero_shot_files, key=os.path.getctime)
    print(f"\nLoading Zero-Shot results from: {os.path.basename(latest_zero_shot)}")

    try:
        zero_shot_df = pd.read_csv(latest_zero_shot)

        # Combine Few-Shot and Zero-Shot for comparison
        few_shot_df = metrics_df.copy()
        few_shot_df["Approach"] = "Few-Shot"
        zero_shot_df["Approach"] = "Zero-Shot"

        # Select key metrics for comparison
        comparison_cols = [
            "Experiment",
            "Approach",
            "Accuracy",
            "Macro-F1",
            "Negative_F1",
            "Positive_F1",
            "Neutral_F1",
        ]

        # reindex() yields exactly these columns for both frames and fills any
        # column a file lacks with NaN. The previous "use the whole frame if a
        # column is missing" fallback was immediately followed by
        # combined[comparison_cols], which raised KeyError in that case.
        combined = pd.concat(
            [
                few_shot_df.reindex(columns=comparison_cols),
                zero_shot_df.reindex(columns=comparison_cols),
            ],
            ignore_index=True,  # avoid duplicate 0..2 row labels
        )

        print("\nKey Metrics Comparison:")
        display(combined.round(4))

        # Calculate improvements
        print("\n" + "=" * 80)
        print("IMPROVEMENT: Few-Shot vs Zero-Shot")
        print("=" * 80)

        # Rows are paired by position.
        # NOTE(review): this assumes the zero-shot summary lists its
        # experiments in the same model order - confirm against that notebook.
        model_names = ["Mixtral-8x7B", "Llama-3.1-70B", "FinBERT"]
        n_pairs = min(len(model_names), len(few_shot_df), len(zero_shot_df))

        for i in range(n_pairs):
            model_name = model_names[i]

            fs_macro = few_shot_df.iloc[i]["Macro-F1"]
            zs_macro = zero_shot_df.iloc[i]["Macro-F1"]
            # Relative change in percent; 0 when the baseline is zero.
            improvement = (
                ((fs_macro - zs_macro) / zs_macro * 100) if zs_macro > 0 else 0
            )

            fs_neg = few_shot_df.iloc[i]["Negative_F1"]
            zs_neg = zero_shot_df.iloc[i]["Negative_F1"]
            if zs_neg > 0:
                neg_improvement = (fs_neg - zs_neg) / zs_neg * 100
            elif fs_neg > 0:
                # Improvement from a zero baseline has no finite percentage.
                neg_improvement = float("inf")
            else:
                neg_improvement = 0

            print(f"\n{model_name}:")
            print(
                f"  Macro-F1: {zs_macro:.4f} → {fs_macro:.4f} ({improvement:+.2f}%)"
            )
            if neg_improvement == float("inf"):
                print(f"  Negative F1: {zs_neg:.4f} → {fs_neg:.4f} (∞% - from zero!)")
            else:
                print(
                    f"  Negative F1: {zs_neg:.4f} → {fs_neg:.4f} ({neg_improvement:+.2f}% improvement)"
                )

    except Exception as e:
        print(f"\n⚠️ Could not load zero-shot results: {str(e)}")
        print("Run Zero-Shot experiments first for comparison.")
else:
    print("\n⚠️ No Zero-Shot results found in ../Zero_Shot/")
    print("Run Zero-Shot experiments first to enable comparison.")

## 13. Key Findings & Conclusions

### Expected Conclusions from Few-Shot Experiment

### 1. **Few-Shot vs Zero-Shot Performance Gains**
   - **Hypothesis**: Few-shot learning should outperform zero-shot, especially on negative sentiment (minority class)
   - **Expected Improvement**: 5-15% boost in Macro-F1 compared to zero-shot baseline
   - **Negative Class Boost**: Few-shot examples specifically target negative detection (3/6 examples are negative)

### 2. **Impact of Curated Examples**
   - **Balanced Representation**: 6 examples (3 negative, 2 positive, 1 neutral) address class imbalance
   - **Explicit Negative Indicators**: Examples demonstrate a swing from profit to loss, a sales decline, and widening operating losses
   - **Learning Effect**: Models should better recognize financial distress signals

### 3. **Model Comparison Insights**
   - **Size vs Examples**: Does Llama-3.1-70B benefit more from examples than Mixtral-8x7B?
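   - **FinBERT Baseline**: Pretrained financial-domain knowledge may still be competitive, but FinBERT cannot use the few-shot examples, so E6 is a fixed reference point rather than a true few-shot result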
   - **Consistency**: Check if all models improve uniformly or if some benefit more

### 4. **Class-Specific Performance**
   - **Negative F1 Improvement**: Primary success metric - should increase significantly from zero-shot
   - **Positive/Neutral Stability**: Should maintain high performance while improving negative detection
   - **Confusion Reduction**: Fewer neutral→negative and negative→neutral misclassifications

### 5. **Confidence Calibration**
   - **Higher Confidence on Negatives**: Examples should reduce uncertainty on negative predictions
   - **Calibration Gap**: Expect narrower gap between correct/incorrect prediction confidence
   - **Class-Specific Confidence**: Negative predictions should approach positive/neutral confidence levels

### 6. **Error Analysis Patterns**
   - **Reduced High-Confidence Errors**: Examples should prevent overconfident misclassifications
   - **Boundary Cases**: Models may still struggle with subtle negatives (e.g., "widening losses")
   - **Context Understanding**: Few-shot should improve handling of comparative statements

### 7. **Prompt Engineering Validation**
   - **Example Quality**: 6 examples sufficient or need more for negative class?
   - **Format Consistency**: JSON compliance should improve with explicit examples
   - **Rationale Quality**: Models should provide more specific, example-aligned reasoning

### 8. **Production Readiness Assessment**
   - **Deployment Threshold**: Macro-F1 > 0.80 and Negative F1 > 0.50 for production use
   - **Cost-Benefit**: Few-shot adds ~500 tokens/request - is performance gain worth API cost increase?
   - **Comparison with CoT**: Establish baseline for Chain-of-Thought and Tree-of-Thought experiments
   - **Model Selection**: Identify best model for advancing to CoT (E7-E9) and ToT (E10-E12)

### 9. **Key Success Metrics**
   - **Primary**: Negative F1 > 0.40 (vs ~0.20-0.30 in zero-shot)
   - **Secondary**: Macro-F1 > 0.75
   - **Tertiary**: MCC (Matthews Correlation Coefficient) improvement shows better overall discrimination