In [None]:
# Install required packages
!pip install pandas numpy matplotlib seaborn scikit-learn google-generativeai groq python-dotenv tqdm -q

In [None]:
# Suppress deprecation warnings
import warnings
import os

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
)

# API setup
from groq import Groq
from dotenv import load_dotenv
from transformers import pipeline

# Load environment variables (e.g. GROQ_API_KEY) from a local .env file, if present
load_dotenv()

# Configure Groq API (for Mixtral and Llama)
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if GROQ_API_KEY:
    groq_client = Groq(api_key=GROQ_API_KEY)
    print("✓ Groq API configured")
else:
    # Keep the name defined (as None) so later cells can detect the missing client
    # instead of failing with a NameError.
    print("⚠️  Warning: GROQ_API_KEY not found in environment variables")
    groq_client = None

# Initialize FinBERT
print("Loading FinBERT model (ProsusAI/finbert)...")
try:
    finbert_pipeline = pipeline(
        "sentiment-analysis",
        model="ProsusAI/finbert",
        tokenizer="ProsusAI/finbert",
        device=-1,  # CPU
        truncation=True,
        max_length=512,
    )
    print("✓ FinBERT model loaded successfully")
except Exception as e:
    # Loading is best-effort: the notebook continues without FinBERT.
    print(f"⚠️  FinBERT loading failed: {e}")
    finbert_pipeline = None

# Set plot style
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

print("\n" + "=" * 80)
print("SETUP COMPLETE - FEW-SHOT RISK ASSESSMENT")
print("=" * 80)
print("Models configured:")
print("  • R4: Mixtral-8x7B-32768 (Groq API)")
print("  • R5: Llama-3.1-70B-Versatile (Groq API)")
print("  • R6: FinBERT (ProsusAI/finbert - Local)")
print("=" * 80)

## 1. Load Dataset

In [None]:
# Each line of the file is expected to look like "<sentence>@<label>".
data_path = "../../DatasetAnalysis_FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"

# Decode strictly as UTF-8 first; if that fails, fall back to latin-1 (which can
# decode any byte sequence) instead of silently dropping characters with
# errors="ignore".
# NOTE(review): the file is presumably ISO-8859-1 encoded - confirm against the dataset docs.
try:
    with open(data_path, "r", encoding="utf-8") as f:
        raw_lines = f.readlines()
except UnicodeDecodeError:
    with open(data_path, "r", encoding="latin-1") as f:
        raw_lines = f.readlines()

sentences = []
sentiments = []

for line in raw_lines:
    line = line.strip()
    if "@" in line:
        # Split on the LAST "@" so sentences that contain "@" stay intact.
        sentence, label = line.rsplit("@", 1)
        sentences.append(sentence)
        sentiments.append(label)

df = pd.DataFrame({"sentence": sentences, "true_sentiment": sentiments})

print(f"✓ Dataset loaded: {len(df)} sentences")
print("\nSentiment distribution:")
print(df["true_sentiment"].value_counts())
if df.empty:
    # Avoid an IndexError from iloc[0] when nothing could be parsed.
    print("\n⚠️  No labelled sentences were parsed - check data_path and the file format")
else:
    print("\nSample sentence:")
    print(f"  '{df.iloc[0]['sentence']}' -> {df.iloc[0]['true_sentiment']}")

## 2. Few-Shot Examples

Carefully curated examples (2 positive, 2 negative, 1 neutral) representing typical financial sentiment patterns.

In [None]:
# Curated few-shot demonstrations, written as (sentence, sentiment, rationale)
_FEW_SHOT_RECORDS = (
    (
        "Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007.",
        "positive",
        "Operating profit increased significantly, indicating improved financial performance.",
    ),
    (
        "Net sales increased by 18.5% to EUR 167.8 million compared to the previous year.",
        "positive",
        "Strong revenue growth of 18.5% signals business expansion and market success.",
    ),
    (
        "The company reported a net loss of EUR 2.5 million compared to a profit of EUR 1.2 million in the previous quarter.",
        "negative",
        "Shift from profit to loss represents deteriorating financial health.",
    ),
    (
        "Sales decreased by 15% year-over-year due to weakening demand in key markets.",
        "negative",
        "Significant sales decline indicates business challenges and market difficulties.",
    ),
    (
        "The company announced the appointment of a new chief financial officer effective next month.",
        "neutral",
        "Executive appointment is routine corporate news without clear financial impact.",
    ),
)

# Expand the tuples into the dict records consumed by the prompt builder.
FEW_SHOT_EXAMPLES = [
    {"sentence": text, "sentiment": label, "rationale": reason}
    for text, label, reason in _FEW_SHOT_RECORDS
]

print("Few-Shot Examples:")
print("=" * 80)
for number, example in enumerate(FEW_SHOT_EXAMPLES, start=1):
    tag = example["sentiment"].upper()
    print(f"\nExample {number} [{tag}]:")
    print(f"Sentence: {example['sentence']}")
    print(f"Rationale: {example['rationale']}")

## 3. Few-Shot Prompt Design

In [None]:
def create_few_shot_prompt(sentence, examples=None):
    """
    Build a few-shot classification prompt for one sentence.

    Args:
        sentence: The financial statement to classify.
        examples: Optional list of dicts with "sentence", "sentiment" and
            "rationale" keys. Defaults to the module-level FEW_SHOT_EXAMPLES.
            An empty list produces a prompt without an examples section.

    Returns:
        The prompt string. It asks the model for a JSON object with
        "sentiment", "confidence" and "rationale" fields.
    """
    if examples is None:
        examples = FEW_SHOT_EXAMPLES

    example_blocks = []
    for i, ex in enumerate(examples, 1):
        # json.dumps escapes quotes/backslashes so every demonstration is valid
        # JSON. With indent=4 the layout matches the response format requested
        # from the model below.
        analysis = json.dumps(
            {
                "sentiment": ex["sentiment"],
                "confidence": 0.95,
                "rationale": ex["rationale"],
            },
            indent=4,
            ensure_ascii=False,
        )
        example_blocks.append(
            f"\nExample {i}:\n"
            f"Sentence: \"{ex['sentence']}\"\n"
            f"Analysis:\n{analysis}\n"
        )
    examples_text = "".join(example_blocks)

    # Keep the announced count in sync with the number of examples actually given.
    count = len(examples)
    if count == 0:
        examples_section = ""
    else:
        if count == 1:
            intro = "Here is 1 example to learn from:"
        else:
            intro = f"Here are {count} examples to learn from:"
        examples_section = f"{intro}\n{examples_text}\n\n"

    prompt = f"""You are a financial sentiment analysis expert.

Classify the sentiment of financial statements as "positive", "negative", or "neutral" from an investor's perspective.

Guidelines:
- Positive: Good news for stock price (revenue increase, profit growth, expansion)
- Negative: Bad news for stock price (losses, declining sales, setbacks)
- Neutral: No clear impact or mixed signals

{examples_section}Now classify this new statement:
Sentence: "{sentence}"

Provide your response in JSON format:
{{
    "sentiment": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "rationale": "Brief explanation"
}}
"""
    return prompt


# Preview the first 1000 characters of a prompt built for a sample sentence
test_sentence = "The company's quarterly revenue exceeded analyst expectations by 12%."
print("=" * 80, "FEW-SHOT PROMPT EXAMPLE", "=" * 80, sep="\n")
print(f"{create_few_shot_prompt(test_sentence)[:1000]}...")

## 4. Model Inference Functions

Using Mixtral-8x7B, Llama-3.1-70B (via Groq), and FinBERT (local)

In [None]:
def _call_groq_chat(prompt, model, temperature, label, client=None, max_retries=3):
    """Send one user message to a Groq chat model, retrying with backoff.

    Args:
        prompt: Text sent as the single user message.
        model: Groq model identifier.
        temperature: Sampling temperature forwarded to the API.
        label: Human-readable model name used in the error message.
        client: Optional Groq-compatible client; defaults to the module-level
            ``groq_client``.
        max_retries: Total number of attempts before giving up.

    Returns:
        The content of the first completion choice, or None when no client is
        available or every attempt raised.
    """
    active_client = groq_client if client is None else client
    if not active_client:
        print("⚠️  Groq client not initialized")
        return None

    for attempt in range(max_retries):
        try:
            chat_completion = active_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=model,
                temperature=temperature,
                max_tokens=500,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            if attempt < max_retries - 1:
                # Exponential backoff: 1s, 2s, ... between attempts.
                time.sleep(2**attempt)
                continue
            print(f"Error calling {label}: {e}")
            return None
    return None


def call_mixtral(prompt, temperature=0.0, model="mixtral-8x7b-32768", client=None):
    """Call Mixtral-8x7B via Groq API.

    Returns the raw response text, or None on failure. ``model`` and ``client``
    are optional overrides; the defaults reproduce the original behaviour.
    NOTE(review): confirm the default model ID is still served by Groq.
    """
    return _call_groq_chat(prompt, model, temperature, "Mixtral", client=client)


def call_llama(prompt, temperature=0.0, model="llama-3.1-70b-versatile", client=None):
    """Call Llama-3.1-70B via Groq API.

    Returns the raw response text, or None on failure. ``model`` and ``client``
    are optional overrides; the defaults reproduce the original behaviour.
    NOTE(review): confirm the default model ID is still served by Groq.
    """
    return _call_groq_chat(prompt, model, temperature, "Llama", client=client)


def call_finbert(sentence, classifier=None):
    """Call FinBERT model for sentiment analysis.

    Args:
        sentence: Text to classify.
        classifier: Optional callable returning a list whose first item is a
            dict with "label" and "score" keys. Defaults to the module-level
            ``finbert_pipeline``.

    Returns:
        A JSON string with "sentiment", "confidence" and "rationale" keys (the
        same shape the LLM prompt asks for), or None when no classifier is
        available or inference fails.
    """
    active = finbert_pipeline if classifier is None else classifier
    if not active:
        print("⚠️  FinBERT pipeline not initialized")
        return None

    try:
        result = active(sentence)[0]

        # Labels are lower-cased; anything outside the three expected classes
        # falls back to "neutral", as before.
        label = result["label"].lower()
        sentiment = label if label in ("positive", "negative", "neutral") else "neutral"
        # Coerce to a built-in float so json.dumps never trips on a numpy/torch scalar.
        confidence = float(result["score"])

        # Create JSON response matching other models
        response = {
            "sentiment": sentiment,
            "confidence": confidence,
            "rationale": f"FinBERT classification with {confidence:.2%} confidence",
        }

        return json.dumps(response)

    except Exception as e:
        print(f"Error with FinBERT: {e}")
        return None


def _extract_json_object(text):
    """Return the first JSON object (dict) embedded anywhere in ``text``, else None."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    return None


def parse_response(response_text):
    """Parse a model response into a result dict.

    The first JSON object found in the text is used, whether it is bare, inside
    a ``` fence, or surrounded by prose. Its "sentiment" value, when it is a
    string, is stripped and lower-cased. If no JSON object is found, a keyword
    heuristic over the raw text is used with a fixed confidence of 0.5.

    Args:
        response_text: Raw model output.

    Returns:
        A dict (normally with "sentiment", "confidence" and "rationale" keys),
        or None for non-string/blank input or when nothing can be recognised.
    """
    if not isinstance(response_text, str) or not response_text.strip():
        return None

    parsed = _extract_json_object(response_text)
    if parsed is not None:
        sentiment = parsed.get("sentiment")
        if isinstance(sentiment, str):
            # Downstream filtering compares against lower-case labels.
            parsed["sentiment"] = sentiment.strip().lower()
        return parsed

    # Keyword fallback: "negative" wins if present, then "positive", then "neutral".
    response_lower = response_text.lower()
    for label in ("negative", "positive", "neutral"):
        if label in response_lower:
            return {
                "sentiment": label,
                "confidence": 0.5,
                "rationale": "Parsed from text",
            }
    return None


# Status summary of the helpers defined in this cell
print("✓ Model inference functions defined")
print("  • call_mixtral() - Mixtral-8x7B-32768")
print("  • call_llama() - Llama-3.1-70B-Versatile")
print("  • call_finbert() - FinBERT (ProsusAI/finbert)")
print("  • parse_response() - JSON parser")

## 5. Run Experiments

### R4: Mixtral-8x7B-32768 (Few-Shot)

In [None]:
# Test sample (remove .head(100) for full run)
# NOTE(review): head(100) keeps file order, so the class mix may not be representative.
test_df = df.head(100).copy()

# R4: Mixtral-8x7B-32768
print("=" * 80)
print("Running R4: Mixtral-8x7B-32768 (Few-Shot)")
print("=" * 80)
r4_results = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="R4 Progress"):
    # Start from the failure record; overwrite it only once a usable answer is parsed.
    record = {
        "sentence": row["sentence"],
        "true_sentiment": row["true_sentiment"],
        "predicted_sentiment": "error",
        "confidence": 0,
        "rationale": "API call failed",
    }

    response = call_mixtral(create_few_shot_prompt(row["sentence"]))
    if response:
        parsed = parse_response(response)
        if parsed and isinstance(parsed, dict):
            # The model may return the confidence as a string or something
            # non-numeric; keep the column numeric for the later analysis cells.
            try:
                confidence = float(parsed.get("confidence", 0))
            except (TypeError, ValueError):
                confidence = 0.0
            record.update(
                # Lower-case so "Positive" is not discarded as an invalid label later.
                predicted_sentiment=str(parsed.get("sentiment", "unknown")).strip().lower(),
                confidence=confidence,
                rationale=parsed.get("rationale", ""),
            )
        else:
            record["rationale"] = "Parse error"

    r4_results.append(record)
    time.sleep(0.5)  # pause between API calls

# Explicit columns keep the summary below working even if no rows were produced.
r4_df = pd.DataFrame(
    r4_results,
    columns=["sentence", "true_sentiment", "predicted_sentiment", "confidence", "rationale"],
)
print(f"\n✓ R4 completed: {len(r4_df)} predictions")
print(
    f"  Valid predictions: {len(r4_df[r4_df['predicted_sentiment'].isin(['positive', 'negative', 'neutral'])])}"
)
print(f"  Errors: {len(r4_df[r4_df['predicted_sentiment'] == 'error'])}")
display(r4_df.head())

### R5: Llama-3.1-70B-Versatile (Few-Shot)

In [None]:
# R5: Llama-3.1-70B-Versatile
print("\n" + "=" * 80)
print("Running R5: Llama-3.1-70B-Versatile (Few-Shot)")
print("=" * 80)
r5_results = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="R5 Progress"):
    # Start from the failure record; overwrite it only once a usable answer is parsed.
    record = {
        "sentence": row["sentence"],
        "true_sentiment": row["true_sentiment"],
        "predicted_sentiment": "error",
        "confidence": 0,
        "rationale": "API call failed",
    }

    response = call_llama(create_few_shot_prompt(row["sentence"]))
    if response:
        parsed = parse_response(response)
        if parsed and isinstance(parsed, dict):
            # The model may return the confidence as a string or something
            # non-numeric; keep the column numeric for the later analysis cells.
            try:
                confidence = float(parsed.get("confidence", 0))
            except (TypeError, ValueError):
                confidence = 0.0
            record.update(
                # Lower-case so "Positive" is not discarded as an invalid label later.
                predicted_sentiment=str(parsed.get("sentiment", "unknown")).strip().lower(),
                confidence=confidence,
                rationale=parsed.get("rationale", ""),
            )
        else:
            record["rationale"] = "Parse error"

    r5_results.append(record)
    time.sleep(0.5)  # pause between API calls

# Explicit columns keep the summary below working even if no rows were produced.
r5_df = pd.DataFrame(
    r5_results,
    columns=["sentence", "true_sentiment", "predicted_sentiment", "confidence", "rationale"],
)
print(f"\n✓ R5 completed: {len(r5_df)} predictions")
print(
    f"  Valid predictions: {len(r5_df[r5_df['predicted_sentiment'].isin(['positive', 'negative', 'neutral'])])}"
)
print(f"  Errors: {len(r5_df[r5_df['predicted_sentiment'] == 'error'])}")
display(r5_df.head())

### R6: FinBERT (ProsusAI/finbert - fine-tuned baseline, no few-shot prompt)

In [None]:
# R6: FinBERT
print("\n" + "=" * 80)
print("Running R6: FinBERT (ProsusAI/finbert - Few-Shot)")
print("=" * 80)
print("Note: FinBERT doesn't use few-shot examples - it's a fine-tuned model")
r6_results = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="R6 Progress"):
    # Start from the failure record; overwrite it only once a usable answer is parsed.
    record = {
        "sentence": row["sentence"],
        "true_sentiment": row["true_sentiment"],
        "predicted_sentiment": "error",
        "confidence": 0,
        "rationale": "Model inference failed",
    }

    # FinBERT doesn't need the full prompt - just the sentence
    response = call_finbert(row["sentence"])
    if response:
        parsed = parse_response(response)
        if parsed and isinstance(parsed, dict):
            # Keep the confidence column numeric for the later analysis cells.
            try:
                confidence = float(parsed.get("confidence", 0))
            except (TypeError, ValueError):
                confidence = 0.0
            record.update(
                predicted_sentiment=str(parsed.get("sentiment", "unknown")).strip().lower(),
                confidence=confidence,
                rationale=parsed.get("rationale", ""),
            )
        else:
            record["rationale"] = "Parse error"

    r6_results.append(record)
    # No sleep here: the model runs locally, so there is no rate limit to respect.

# Explicit columns keep the summary below working even if no rows were produced.
r6_df = pd.DataFrame(
    r6_results,
    columns=["sentence", "true_sentiment", "predicted_sentiment", "confidence", "rationale"],
)
print(f"\n✓ R6 completed: {len(r6_df)} predictions")
print(
    f"  Valid predictions: {len(r6_df[r6_df['predicted_sentiment'].isin(['positive', 'negative', 'neutral'])])}"
)
print(f"  Errors: {len(r6_df[r6_df['predicted_sentiment'] == 'error'])}")
display(r6_df.head())

## 6. Calculate Metrics

In [None]:
from sklearn.metrics import matthews_corrcoef


def _empty_metrics_result(exp_name, total_samples, valid_df):
    """Build the all-zero (metrics, confusion matrix, valid frame) fallback triple."""
    metrics = {
        "Experiment": exp_name,
        "Total Samples": total_samples,
        "Valid Predictions": 0,
        "Accuracy": 0,
        "Macro-F1": 0,
        "Weighted-F1": 0,
        "Macro-Precision": 0,
        "Macro-Recall": 0,
        "MCC": 0,
    }
    for label in ("Positive", "Negative", "Neutral"):
        for stat in ("Precision", "Recall", "F1"):
            metrics[f"{label}_{stat}"] = 0
    # Integer dtype so the matrix can be annotated with fmt="d" like a real one.
    return metrics, np.zeros((3, 3), dtype=int), valid_df


def calculate_metrics(df, exp_name):
    """Calculate all evaluation metrics including MCC.

    Args:
        df: Results frame with "true_sentiment" and "predicted_sentiment" columns.
        exp_name: Experiment label stored under the "Experiment" key.

    Returns:
        Tuple ``(metrics, cm, valid_df)``: a dict of overall and per-class
        scores, a 3x3 confusion matrix ordered positive/negative/neutral, and
        the rows whose prediction is one of those three labels. When nothing
        can be scored, the metrics are all zero, the matrix is all zero and the
        frame is empty but still carries the result columns.
    """
    # Check if dataframe is empty or missing required columns
    if df.empty or "predicted_sentiment" not in df.columns:
        print(f"⚠️ Warning: {exp_name} has no valid predictions!")
        # Keep the column names so downstream cells can still index the frame.
        placeholder = pd.DataFrame(
            columns=["sentence", "true_sentiment", "predicted_sentiment", "confidence", "rationale"]
        )
        return _empty_metrics_result(exp_name, 0, placeholder)

    # Filter out errors
    valid_df = df[
        df["predicted_sentiment"].isin(["positive", "negative", "neutral"])
    ].copy()

    # Check if we have valid predictions
    if valid_df.empty:
        print(f"⚠️ Warning: {exp_name} has no valid predictions after filtering!")
        return _empty_metrics_result(exp_name, len(df), valid_df)

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    # zero_division=0 yields the same values as the default but without
    # UndefinedMetricWarning when a class is never predicted.
    metrics = {
        "Experiment": exp_name,
        "Total Samples": len(df),
        "Valid Predictions": len(valid_df),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Weighted-F1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "Macro-Precision": precision_score(y_true, y_pred, average="macro", zero_division=0),
        "Macro-Recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
        "MCC": matthews_corrcoef(y_true, y_pred),
    }

    # Per-class metrics, in a fixed label order shared with the confusion matrix
    labels = ["positive", "negative", "neutral"]
    precision_per_class = precision_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    recall_per_class = recall_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    f1_per_class = f1_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )

    for i, label in enumerate(labels):
        metrics[f"{label.capitalize()}_Precision"] = precision_per_class[i]
        metrics[f"{label.capitalize()}_Recall"] = recall_per_class[i]
        metrics[f"{label.capitalize()}_F1"] = f1_per_class[i]

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    return metrics, cm, valid_df


# Score every experiment, then show three views of the combined table
print("\n" + "=" * 80)
print("CALCULATING METRICS")
print("=" * 80)

# Each calculate_metrics() call returns (metrics dict, confusion matrix, valid rows).
(
    (r4_metrics, r4_cm, r4_valid),
    (r5_metrics, r5_cm, r5_valid),
    (r6_metrics, r6_cm, r6_valid),
) = (
    calculate_metrics(_frame, _title)
    for _frame, _title in (
        (r4_df, "R4: Mixtral-8x7B (Few-Shot)"),
        (r5_df, "R5: Llama-3.1-70B (Few-Shot)"),
        (r6_df, "R6: FinBERT (Few-Shot)"),
    )
)

# One row per experiment
metrics_df = pd.DataFrame([r4_metrics, r5_metrics, r6_metrics])

for _heading, _columns in (
    (
        "FEW-SHOT RISK ASSESSMENT PERFORMANCE COMPARISON",
        ["Experiment", "Valid Predictions", "Accuracy", "Macro-F1", "MCC"],
    ),
    (
        "DETAILED METRICS",
        ["Experiment", "Macro-Precision", "Macro-Recall", "Weighted-F1"],
    ),
    (
        "PER-CLASS F1 SCORES",
        ["Experiment", "Positive_F1", "Negative_F1", "Neutral_F1"],
    ),
):
    print("\n" + "=" * 80)
    print(_heading)
    print("=" * 80)
    display(metrics_df[_columns].round(4))

## 7. Visualize Results

In [None]:
# Performance comparison bar chart: overall metrics (left) and per-class F1 (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

model_metrics = [
    (r4_metrics, "Mixtral-8x7B"),
    (r5_metrics, "Llama-3.1-70B"),
    (r6_metrics, "FinBERT"),
]
width = 0.25  # bar width; three bars per group

metrics_to_plot = ["Accuracy", "Macro-F1", "Macro-Precision", "Macro-Recall"]
classes = ["Positive", "Negative", "Neutral"]

# (axis, metric keys, tick labels, x label, y label, title) for each panel
panels = [
    (
        axes[0],
        metrics_to_plot,
        metrics_to_plot,
        "Metrics",
        "Score",
        "Overall Performance Comparison (Few-Shot Risk Assessment)",
    ),
    (
        axes[1],
        [f"{c}_F1" for c in classes],
        classes,
        "Sentiment Class",
        "F1 Score",
        "Per-Class F1 Scores (Few-Shot)",
    ),
]

for ax, keys, tick_labels, xlabel, ylabel, title in panels:
    positions = np.arange(len(keys))
    for i, (metrics, label) in enumerate(model_metrics):
        values = [metrics[k] for k in keys]
        ax.bar(positions + i * width, values, width, label=label, alpha=0.8)

    ax.set_xlabel(xlabel, fontsize=12, weight="bold")
    ax.set_ylabel(ylabel, fontsize=12, weight="bold")
    ax.set_title(title, fontsize=14, weight="bold")
    # Centre the tick under the middle bar of each group of three.
    ax.set_xticks(positions + width)
    ax.set_xticklabels(tick_labels)
    ax.legend()
    ax.set_ylim([0, 1])
    ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig("few_shot_risk_performance_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

print("✓ Performance comparison chart saved")

In [None]:
# Confusion matrices, one heatmap per model
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
labels = ["Positive", "Negative", "Neutral"]

for ax, (cm, title) in zip(
    axes,
    [
        (r4_cm, "R4: Mixtral-8x7B"),
        (r5_cm, "R5: Llama-3.1-70B"),
        (r6_cm, "R6: FinBERT"),
    ],
):
    sns.heatmap(
        # fmt="d" needs integers; an all-zero fallback matrix may arrive as float.
        np.asarray(cm).astype(int),
        annot=True,
        fmt="d",
        cmap="Greens",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
        cbar_kws={"label": "Count"},
    )
    ax.set_title(title, fontsize=12, weight="bold")
    ax.set_ylabel("True Label", fontsize=11, weight="bold")
    ax.set_xlabel("Predicted Label", fontsize=11, weight="bold")

plt.suptitle(
    "Confusion Matrices - Few-Shot Risk Assessment",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("few_shot_risk_confusion_matrices.png", dpi=300, bbox_inches="tight")
plt.show()

print("✓ Confusion matrices saved")

## 8. Save Results

In [None]:
# One timestamp shared by every CSV written in this run
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

print("\n" + "=" * 80)
print("SAVING RESULTS")
print("=" * 80)

# Each filename is defined once so the logged name always matches the file written.
csv_outputs = [
    (r4_df, f"r4_mixtral_8x7b_few_shot_risk_{timestamp}.csv"),
    (r5_df, f"r5_llama_3_1_70b_few_shot_risk_{timestamp}.csv"),
    (r6_df, f"r6_finbert_few_shot_risk_{timestamp}.csv"),
    (metrics_df, f"few_shot_risk_metrics_summary_{timestamp}.csv"),
]
for frame, filename in csv_outputs:
    frame.to_csv(filename, index=False)
    print(f"✓ {filename}")

# The PNGs were written by the plotting cells; their names are not timestamped.
print("\n✓ Visualizations saved:")
print("  • few_shot_risk_performance_comparison.png")
print("  • few_shot_risk_confusion_matrices.png")

print(f"\n🎉 All results saved with timestamp: {timestamp}")

## 9. Error Analysis

In [None]:
def analyze_errors(df, model_name):
    """Analyze misclassification patterns.

    Prints the error rate, the most frequent (true -> predicted) confusions,
    high-confidence mistakes and sample sentences.

    Args:
        df: Frame with "true_sentiment", "predicted_sentiment" and "sentence"
            columns; "confidence" is optional.
        model_name: Label used in the printed header.

    Returns:
        A copy of the misclassified rows. Empty when ``df`` is empty or lacks
        the label columns.
    """
    print(f"=== Error Analysis for {model_name} ===\n")

    # Nothing to analyze: avoid the division by zero / KeyError below.
    if df.empty or not {"true_sentiment", "predicted_sentiment"}.issubset(df.columns):
        print("No valid predictions to analyze.\n")
        return df.iloc[0:0].copy()

    errors = df[df["true_sentiment"] != df["predicted_sentiment"]].copy()

    print(
        f"Total errors: {len(errors)}/{len(df)} ({len(errors) / len(df) * 100:.2f}%)\n"
    )

    # Misclassification patterns
    print("Misclassification patterns:")
    confusion_pairs = (
        errors.groupby(["true_sentiment", "predicted_sentiment"])
        .size()
        .sort_values(ascending=False)
    )
    for (true_label, pred_label), count in confusion_pairs.items():
        print(
            f"  {true_label} → {pred_label}: {count} ({count / len(errors) * 100:.1f}% of errors)"
        )

    # High-confidence errors (confidence > 0.7)
    if "confidence" in df.columns:
        # Coerce to numbers first: string values would break both the
        # comparison and the ":.3f" format below.
        numeric_conf = pd.to_numeric(errors["confidence"], errors="coerce")
        high_conf_errors = errors[numeric_conf > 0.7]
        print(f"\nHigh-confidence errors (conf > 0.7): {len(high_conf_errors)}")
        if len(high_conf_errors) > 0:
            print("\nSample high-confidence errors:")
            for idx in high_conf_errors.head(3).index:
                row = df.loc[idx]
                print(
                    f"  True: {row['true_sentiment']}, Pred: {row['predicted_sentiment']} (conf: {numeric_conf.loc[idx]:.3f})"
                )
                print(f"  Text: {row['sentence'][:100]}...")
                print()

    # Sample errors by type (one sentence for each of the top three confusions)
    print("\nSample misclassifications:")
    for (true_label, pred_label), _ in confusion_pairs.head(3).items():
        sample = errors[
            (errors["true_sentiment"] == true_label)
            & (errors["predicted_sentiment"] == pred_label)
        ].iloc[0]
        print(f"  {true_label} → {pred_label}:")
        print(f"  {sample['sentence'][:120]}...")
        print()

    return errors


# Run the error analysis per model; a ruled separator is printed between reports
print("=== R4: Mixtral-8x7B-Instruct Few-Shot ===")
r4_errors = analyze_errors(r4_valid, "R4: Mixtral-8x7B")

print(f"\n{'=' * 80}\n")
print("=== R5: Llama-3.1-70B-Versatile Few-Shot ===")
r5_errors = analyze_errors(r5_valid, "R5: Llama-3.1-70B")

print(f"\n{'=' * 80}\n")
print("=== R6: FinBERT Few-Shot ===")
r6_errors = analyze_errors(r6_valid, "R6: FinBERT")

## 10. Confidence Calibration Analysis

In [None]:
# Confidence histograms per model, split into correct vs. incorrect predictions
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, (valid_df, model_name) in enumerate(
    (
        (r4_valid, "R4: Mixtral-8x7B"),
        (r5_valid, "R5: Llama-3.1-70B"),
        (r6_valid, "R6: FinBERT"),
    )
):
    # A frame without confidence scores leaves its panel empty.
    if "confidence" not in valid_df.columns:
        continue

    panel = axes[idx]
    correct = valid_df["true_sentiment"] == valid_df["predicted_sentiment"]

    # Green = correct predictions, red = incorrect ones.
    for subset, legend_label, colour in (
        (valid_df[correct], "Correct", "green"),
        (valid_df[~correct], "Incorrect", "red"),
    ):
        panel.hist(
            subset["confidence"],
            bins=20,
            alpha=0.5,
            label=legend_label,
            color=colour,
        )

    panel.set_xlabel("Confidence Score")
    panel.set_ylabel("Frequency")
    panel.set_title(f"{model_name}\nConfidence Distribution")
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
def analyze_confidence_calibration(df, model_name):
    """Analyze confidence calibration.

    Prints the mean confidence of correct and incorrect predictions, their
    difference (the "calibration gap") with a verdict, and per predicted class
    the mean confidence next to the accuracy.

    Args:
        df: Frame with "true_sentiment", "predicted_sentiment" and
            "confidence" columns.
        model_name: Label used in the printed header.

    Returns:
        None. All results are printed.
    """

    if "confidence" not in df.columns:
        print(f"No confidence scores for {model_name}")
        return

    # Coerce to numbers so string values do not break mean().
    confidence = pd.to_numeric(df["confidence"], errors="coerce")

    # Overall calibration
    correct = df["true_sentiment"] == df["predicted_sentiment"]
    avg_conf_correct = confidence[correct].mean()
    avg_conf_incorrect = confidence[~correct].mean()
    calibration_gap = avg_conf_correct - avg_conf_incorrect

    # The gap is NaN when one of the two groups is empty. Report that
    # explicitly instead of letting "nan > 0.15" fall through to "Poorly calibrated".
    if pd.isna(calibration_gap):
        verdict = "Undetermined (needs both correct and incorrect predictions)"
    elif calibration_gap > 0.15:
        verdict = "Well-calibrated"
    else:
        verdict = "Poorly calibrated"

    print(f"=== Confidence Calibration: {model_name} ===\n")
    print(f"Average confidence when CORRECT: {avg_conf_correct:.3f}")
    print(f"Average confidence when INCORRECT: {avg_conf_incorrect:.3f}")
    print(f"Calibration gap: {calibration_gap:.3f}")
    print(f"  → {verdict}\n")

    # Per-class confidence
    print("Per-class average confidence:")
    for label in ["positive", "negative", "neutral"]:
        class_mask = df["predicted_sentiment"] == label
        if class_mask.sum() > 0:
            avg_conf = confidence[class_mask].mean()
            accuracy = (df[class_mask]["true_sentiment"] == label).mean()
            print(f"  {label}: {avg_conf:.3f} (accuracy: {accuracy:.3f})")


# Calibration report per model, with a ruled separator between reports
analyze_confidence_calibration(r4_valid, "R4: Mixtral-8x7B")
print(f"\n{'=' * 80}\n")
analyze_confidence_calibration(r5_valid, "R5: Llama-3.1-70B")
print(f"\n{'=' * 80}\n")
analyze_confidence_calibration(r6_valid, "R6: FinBERT")

## 11. Classification Reports

In [None]:
# Create per-class metrics summary table
from sklearn.metrics import precision_recall_fscore_support


def create_metrics_table(y_true, y_pred, model_name):
    """Create a summary table of per-class metrics.

    Args:
        y_true: True sentiment labels.
        y_pred: Predicted sentiment labels.
        model_name: Label used in the printed header.

    Returns:
        DataFrame with one row per class (positive, negative, neutral) and the
        columns Class, Precision, Recall, F1-Score and Support. The table is
        also printed.
    """
    class_labels = ["positive", "negative", "neutral"]
    # zero_division=0 matches the per-class calls in calculate_metrics(): a class
    # that is never predicted scores 0 without an UndefinedMetricWarning.
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=class_labels, zero_division=0
    )

    table = pd.DataFrame(
        {
            "Class": class_labels,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1,
            "Support": support,
        }
    )

    print(f"\n{model_name} - Per-Class Metrics Summary:")
    print(table.to_string(index=False))
    return table


# Per-class tables for the three experiments, evaluated in R4, R5, R6 order
r4_class_metrics, r5_class_metrics, r6_class_metrics = (
    create_metrics_table(
        _valid["true_sentiment"], _valid["predicted_sentiment"], _name
    )
    for _valid, _name in (
        (r4_valid, "R4: Mixtral-8x7B"),
        (r5_valid, "R5: Llama-3.1-70B"),
        (r6_valid, "R6: FinBERT"),
    )
)

In [None]:
from sklearn.metrics import classification_report

# Fixed label order for every report. Passing labels= together with target_names
# keeps the report working when a class is missing from a sample; with
# target_names alone, scikit-learn raises on the length mismatch.
report_labels = ["negative", "neutral", "positive"]

for position, (heading, valid_frame) in enumerate(
    [
        ("R4: Mixtral-8x7B-Instruct Few-Shot Classification Report", r4_valid),
        ("R5: Llama-3.1-70B-Versatile Few-Shot Classification Report", r5_valid),
        ("R6: FinBERT Few-Shot Classification Report", r6_valid),
    ]
):
    # Only the second and third banners start with a blank line.
    print(("\n" if position else "") + "=" * 80)
    print(heading)
    print("=" * 80)

    if valid_frame.empty or "predicted_sentiment" not in valid_frame.columns:
        print("No valid predictions to report.")
        continue

    print(
        classification_report(
            valid_frame["true_sentiment"],
            valid_frame["predicted_sentiment"],
            labels=report_labels,
            target_names=report_labels,
            zero_division=0,
        )
    )

## 12. Few-Shot vs Zero-Shot Comparison

In [None]:
# Load zero-shot metrics summary for comparison
import os

# Compare few-shot metrics against a zero-shot metrics summary CSV, if one is
# available. This is best-effort: any failure is reported and the comparison
# is skipped so the rest of the notebook keeps running.
try:
    # Try to find zero-shot metrics summary file
    zero_shot_dir = "../Zero_Shot/"
    if os.path.exists(zero_shot_dir):
        # os.listdir returns entries in arbitrary order, so sort the matches
        # to make the selected file deterministic across runs and machines.
        summary_files = sorted(
            f
            for f in os.listdir(zero_shot_dir)
            if "metrics_summary" in f and f.endswith(".csv")
        )
        if summary_files:
            # Take the lexicographically last match.
            # NOTE(review): if the filenames embed a sortable timestamp this is
            # the newest summary - confirm against the zero-shot notebook.
            zero_shot_summary_path = os.path.join(zero_shot_dir, summary_files[-1])
            print(f"Loading zero-shot metrics from: {zero_shot_summary_path}")

            zero_shot_metrics_df = pd.read_csv(zero_shot_summary_path)
            # NOTE(review): ``metrics_df`` comes from an earlier cell and is
            # assumed to hold one row per model with "Accuracy" and "Macro-F1"
            # columns - confirm.
            few_shot_metrics_df = metrics_df.copy()

            # Rows are matched by position, not by model name, so both tables
            # must contain exactly one row per model in the order listed here.
            model_names = ["Mixtral-8x7B", "Llama-3.1-70B", "FinBERT"]
            if not (
                len(zero_shot_metrics_df)
                == len(few_shot_metrics_df)
                == len(model_names)
            ):
                raise ValueError(
                    f"expected {len(model_names)} rows in both metric tables, got "
                    f"{len(zero_shot_metrics_df)} zero-shot and "
                    f"{len(few_shot_metrics_df)} few-shot"
                )

            zero_shot_accuracy = zero_shot_metrics_df["Accuracy"].values
            few_shot_accuracy = few_shot_metrics_df["Accuracy"].values
            zero_shot_f1 = zero_shot_metrics_df["Macro-F1"].values
            few_shot_f1 = few_shot_metrics_df["Macro-F1"].values

            # Compare metrics
            print("\n" + "=" * 80)
            print("Few-Shot vs Zero-Shot Performance Comparison")
            print("=" * 80)

            comparison_df = pd.DataFrame(
                {
                    "Model": model_names,
                    "Zero-Shot Accuracy": zero_shot_accuracy,
                    "Few-Shot Accuracy": few_shot_accuracy,
                    "Accuracy Improvement": few_shot_accuracy - zero_shot_accuracy,
                    "Zero-Shot F1 (Macro)": zero_shot_f1,
                    "Few-Shot F1 (Macro)": few_shot_f1,
                    "F1 Improvement": few_shot_f1 - zero_shot_f1,
                }
            )

            print(comparison_df.to_string(index=False))

            # Visualize improvement
            fig, axes = plt.subplots(1, 2, figsize=(14, 5))

            x = np.arange(len(comparison_df))
            width = 0.35

            # Accuracy comparison
            axes[0].bar(
                x - width / 2,
                comparison_df["Zero-Shot Accuracy"],
                width,
                label="Zero-Shot",
                alpha=0.8,
            )
            axes[0].bar(
                x + width / 2,
                comparison_df["Few-Shot Accuracy"],
                width,
                label="Few-Shot",
                alpha=0.8,
            )
            axes[0].set_xlabel("Model")
            axes[0].set_ylabel("Accuracy")
            axes[0].set_title("Accuracy: Few-Shot vs Zero-Shot")
            axes[0].set_xticks(x)
            axes[0].set_xticklabels(comparison_df["Model"], rotation=15, ha="right")
            axes[0].legend()
            axes[0].grid(True, alpha=0.3)

            # F1 comparison
            axes[1].bar(
                x - width / 2,
                comparison_df["Zero-Shot F1 (Macro)"],
                width,
                label="Zero-Shot",
                alpha=0.8,
            )
            axes[1].bar(
                x + width / 2,
                comparison_df["Few-Shot F1 (Macro)"],
                width,
                label="Few-Shot",
                alpha=0.8,
            )
            axes[1].set_xlabel("Model")
            axes[1].set_ylabel("F1 Score (Macro)")
            axes[1].set_title("F1 Score: Few-Shot vs Zero-Shot")
            axes[1].set_xticks(x)
            axes[1].set_xticklabels(comparison_df["Model"], rotation=15, ha="right")
            axes[1].legend()
            axes[1].grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()

            # Key insights from comparison
            print("\n" + "=" * 80)
            print("Key Insights from Few-Shot Learning:")
            print("=" * 80)

            # The "+" format flag prints the correct sign for both gains and
            # regressions; a hard-coded "+" prefix would render "+-0.012".
            best_improvement = comparison_df.loc[
                comparison_df["Accuracy Improvement"].idxmax()
            ]
            print(
                f"1. Largest accuracy improvement: {best_improvement['Model']} ({best_improvement['Accuracy Improvement']:+.3f})"
            )

            avg_improvement = comparison_df["Accuracy Improvement"].mean()
            print(f"2. Average accuracy improvement: {avg_improvement:+.3f}")

            improved_mask = comparison_df["Accuracy Improvement"] > 0
            if improved_mask.all():
                print("3. All models benefited from few-shot examples")
            else:
                print(
                    f"3. {improved_mask.sum()}/{len(comparison_df)} models improved with few-shot learning"
                )

            print(
                f"4. F1 Score improvements range from {comparison_df['F1 Improvement'].min():.3f} to {comparison_df['F1 Improvement'].max():.3f}"
            )

        else:
            print("No zero-shot metrics summary file found - skipping comparison")
    else:
        print(f"Zero-shot directory not found: {zero_shot_dir} - skipping comparison")

except Exception as e:
    # Deliberately broad: the comparison is optional and must not abort the run.
    print(f"Could not load zero-shot metrics for comparison: {e}")
    print("Skipping few-shot vs zero-shot comparison")

## 13. Expected Conclusions from Few-Shot Risk Assessment Experiment

### Comprehensive Analysis of Few-Shot Learning for Financial Risk Assessment

**1. Few-Shot Learning Impact**
- Few-shot examples provide concrete demonstrations of risk assessment framing
- Models learn to differentiate between investor sentiment and financial risk signals
- 5 curated examples (2 positive, 2 negative, 1 neutral) establish clear classification patterns
- Examples include both the label and reasoning, improving model understanding

**2. Model Performance Comparison**
- **R4 (Mixtral-8x7B)**: Likely shows moderate improvement over zero-shot, balances speed and accuracy
- **R5 (Llama-3.1-70B)**: Expected to leverage examples effectively with strong pattern recognition
- **R6 (FinBERT)**: Domain-specific pretraining + few-shot examples should yield highest accuracy
- MCC scores indicate robustness across imbalanced classes

**3. Example Quality Effect**
- Carefully selected examples cover diverse financial scenarios (earnings, forecasts, operational changes)
- Each example demonstrates risk assessment reasoning (why a statement indicates positive/negative/neutral risk)
- Balance in example distribution (2-2-1) reflects realistic financial text class distribution
- JSON format examples provide structured, parseable guidance to models

**4. Risk vs Sentiment Framing**
- Few-shot examples explicitly frame task as "financial risk assessment" not sentiment analysis
- Examples teach models to evaluate: probability of loss, business uncertainty, operational threats
- Positive risk = opportunity/growth signals; Negative risk = threats/losses; Neutral = informational
- This framing differs from Task1 (Sentiment), requiring risk-specific reasoning

**5. Negative Class Detection**
- Few-shot examples likely improve negative (risk/threat) class detection
- Examples demonstrate subtle risk indicators: "challenging," "may not," "forecast cut"
- Models learn to distinguish between negative sentiment and actual business risk
- Expected reduction in false positives for negative class compared to zero-shot

**6. Neutral Class Handling**
- Neutral examples show informational statements without risk implications
- Models learn that factual updates ≠ risk assessment
- Neutral class precision expected to improve with clear examples
- Reduces misclassification of informational content as positive/negative risk

**7. Confidence Calibration**
- Few-shot learning should improve confidence calibration (larger gap between correct/incorrect predictions)
- Models express higher confidence when predictions align with example patterns
- Expected calibration gap > 0.15 indicates well-calibrated predictions
- Per-class confidence analysis reveals which risk categories models handle confidently

**8. Error Analysis Insights**
- Common misclassifications: Neutral → Negative (overpredicting risk in informational content)
- High-confidence errors indicate edge cases not covered by 5 examples
- Confusion between positive risk (opportunity) and neutral (factual growth statements)
- Few-shot reduces errors but doesn't eliminate ambiguous cases

**9. Few-Shot vs Zero-Shot Improvement**
- Expected accuracy improvement: +3-8% across models
- Largest gains in F1-macro due to better minority class handling
- FinBERT (R6) may show smaller improvement (already strong domain knowledge)
- LLMs (R4, R5) expected to benefit more from explicit examples

**10. Model-Specific Behaviors**
- **Mixtral-8x7B**: Fast inference, moderate gains from examples, good balance
- **Llama-3.1-70B**: Strong few-shot learning capability, may achieve highest improvement
- **FinBERT**: Domain expertise + examples = robust performance, especially on financial jargon

**11. Production Readiness**
- Few-shot approach requires maintaining 5 high-quality curated examples
- Examples must be updated if the risk assessment framing changes or the domain shifts
- Trade-off: Better accuracy vs increased prompt length and API costs
- Suitable for production when example maintenance is feasible

**12. Recommendations for Deployment**
- **Use Few-Shot when**: Need to establish specific risk assessment framing, have resources for example curation
- **Consider Zero-Shot when**: Speed/cost critical, examples difficult to maintain, domain stable
- **Next steps**: Chain-of-Thought (R7-R9) for explainability, Tree-of-Thought (R10-R12) for complex reasoning
- Monitor performance drift; update examples if financial language/risk patterns evolve
- Combine with confidence thresholds (e.g., manual review for conf < 0.7) for critical applications