In [None]:
# Install required packages
!pip install pandas numpy matplotlib seaborn scikit-learn google-generativeai groq python-dotenv tqdm -q

In [None]:
# Suppress deprecation warnings
import warnings
import os

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
)

# API setup
from groq import Groq
from dotenv import load_dotenv
from transformers import pipeline

# Read variables from a local .env file (if any) into the process environment
load_dotenv()

# One Groq client serves both hosted LLMs (Mixtral and Llama);
# it is left as None when no key is available so callers can detect that.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None

if groq_client is None:
    print("‚ö†Ô∏è  Warning: GROQ_API_KEY not found in environment variables")
else:
    print("‚úì Groq API configured")

# Build the local FinBERT classifier; on any failure the handle is set to None
print("Loading FinBERT model (ProsusAI/finbert)...")
_finbert_options = dict(
    model="ProsusAI/finbert",
    tokenizer="ProsusAI/finbert",
    device=-1,  # CPU
    truncation=True,
    max_length=512,
)
try:
    finbert_pipeline = pipeline("sentiment-analysis", **_finbert_options)
    print("‚úì FinBERT model loaded successfully")
except Exception as load_error:
    print(f"‚ö†Ô∏è  FinBERT loading failed: {load_error}")
    finbert_pipeline = None

# Global plotting defaults used by every figure in this notebook
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

# Setup summary, emitted as a single write (leading "" yields the blank line)
_rule = "=" * 80
print(
    "\n".join(
        [
            "",
            _rule,
            "SETUP COMPLETE",
            _rule,
            "Models configured:",
            "  ‚Ä¢ R1: Mixtral-8x7B-32768 (Groq API)",
            "  ‚Ä¢ R2: Llama-3.1-70B-Versatile (Groq API)",
            "  ‚Ä¢ R3: FinBERT (ProsusAI/finbert - Local)",
            _rule,
        ]
    )
)

## 1. Load Dataset

In [None]:
# Load the 100% agreement dataset (highest quality)
data_path = "../../DatasetAnalysis_FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"

# Every usable line has the form "<sentence>@<label>"
VALID_LABELS = ("positive", "negative", "neutral")

# Decode strictly instead of using errors="ignore": that option silently deletes
# every byte sequence that is not valid UTF-8 (e.g. accented letters in names),
# so the sentences sent to the models would differ from the dataset text.
# NOTE(review): the original FinancialPhraseBank-v1.0 release is believed to be
# ISO-8859-1 encoded - confirm against your copy. UTF-8 copies still load as UTF-8.
with open(data_path, "rb") as f:
    raw_bytes = f.read()
try:
    raw_text = raw_bytes.decode("utf-8")
except UnicodeDecodeError:
    raw_text = raw_bytes.decode("iso-8859-1")

sentences = []
sentiments = []
skipped_lines = 0  # non-empty lines without a separator or with an unknown label

for line in raw_text.split("\n"):
    line = line.strip()
    if not line:
        continue
    # Split on the LAST "@" so sentences that themselves contain "@" stay intact
    sentence_text, separator, label = line.rpartition("@")
    label = label.strip().lower()
    if not separator or label not in VALID_LABELS:
        skipped_lines += 1
        continue
    sentences.append(sentence_text.strip())
    sentiments.append(label)

df = pd.DataFrame({"sentence": sentences, "true_sentiment": sentiments})

print(f"Dataset loaded: {len(df)} sentences")
if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s)")
print(f"\nSentiment distribution:")
print(df["true_sentiment"].value_counts())

# Display sample (capped so tiny files do not make sample() raise)
print("\n" + "=" * 80)
print("Sample sentences:")
print("=" * 80)
display(df.sample(min(5, len(df)), random_state=42))

## 2. Zero-Shot Prompt Design

**Prompt Strategy**: Simple, direct instruction with no examples. Requests a strict JSON output format (a prompt cannot enforce it, so replies are parsed with a text fallback).

In [None]:
def create_zero_shot_prompt(sentence):
    """
    Build the zero-shot classification prompt for a single financial statement.

    Args:
        sentence: The statement to classify; it is inserted verbatim between
            double quotes.

    Returns:
        The full prompt text: role line, task description, class definitions,
        the quoted statement and the JSON reply template. No examples are
        included.
    """
    # Doubled braces are literal braces of the JSON template inside the f-string
    return f"""You are a financial sentiment analysis expert.

Classify the sentiment of the following financial statement as either "positive", "negative", or "neutral" from an investor's perspective.

Consider:
- Positive: Good news for stock price (revenue increase, profit growth, etc.)
- Negative: Bad news for stock price (losses, declining sales, etc.)
- Neutral: No clear impact on stock price or mixed signals

Financial Statement:
"{sentence}"

Provide your response in the following JSON format:
{{
    "sentiment": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "rationale": "Brief explanation in one sentence"
}}
"""


# Render the prompt once for a sample headline so the wording can be inspected
test_sentence = "Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007."
_demo_rule = "=" * 80
print(_demo_rule, "ZERO-SHOT PROMPT EXAMPLE", _demo_rule, sep="\n")
print(create_zero_shot_prompt(test_sentence))

## 3. Model Inference Functions

In [None]:
def _call_groq_chat(model, prompt, temperature, error_label, max_retries=3):
    """Send one user message to a Groq-hosted chat model, retrying on failure.

    Shared implementation behind call_mixtral() and call_llama(), which
    previously carried two copies of the same retry loop.

    Args:
        model: Groq model id passed to the chat-completions endpoint.
        prompt: Text sent as the single user message.
        temperature: Sampling temperature forwarded to the API.
        error_label: Model name used in the final error message.
        max_retries: Total number of attempts before giving up.

    Returns:
        The content of the first choice, or None when the client is not
        configured or every attempt raised.
    """
    if not groq_client:
        print("‚ö†Ô∏è  Groq client not initialized")
        return None

    for attempt in range(max_retries):
        try:
            chat_completion = groq_client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=model,
                temperature=temperature,
                max_tokens=500,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(2**attempt)  # Exponential backoff: 1s, 2s, ...
                continue
            # Last attempt failed: report once and fall through to return None
            print(f"Error calling {error_label}: {e}")
    return None


# NOTE(review): hosted model ids get retired by providers over time - confirm
# that both ids below are still served before a full run.
def call_mixtral(prompt, temperature=0.0):
    """Call Mixtral-8x7B via Groq API; returns the reply text or None on failure."""
    return _call_groq_chat("mixtral-8x7b-32768", prompt, temperature, "Mixtral")


def call_llama(prompt, temperature=0.0):
    """Call Llama-3.1-70B via Groq API; returns the reply text or None on failure."""
    return _call_groq_chat("llama-3.1-70b-versatile", prompt, temperature, "Llama")


def call_finbert(sentence):
    """Classify one sentence with the local FinBERT pipeline.

    Args:
        sentence: Raw statement text (no prompt wrapper is needed).

    Returns:
        A JSON string with "sentiment", "confidence" and "rationale" keys,
        mirroring the reply format requested from the LLMs, or None when the
        pipeline is unavailable or inference raises.
    """
    if not finbert_pipeline:
        print("‚ö†Ô∏è  FinBERT pipeline not initialized")
        return None

    known_labels = ("positive", "negative", "neutral")
    try:
        top_prediction = finbert_pipeline(sentence)[0]

        # Labels are lower-cased; anything unrecognised is reported as neutral
        raw_label = top_prediction["label"].lower()
        sentiment = raw_label if raw_label in known_labels else "neutral"
        confidence = top_prediction["score"]

        return json.dumps(
            {
                "sentiment": sentiment,
                "confidence": confidence,
                "rationale": f"FinBERT classification with {confidence:.2%} confidence",
            }
        )
    except Exception as e:
        print(f"Error with FinBERT: {e}")
        return None


def parse_response(response_text):
    """Extract a sentiment record from a model reply.

    Resolution order:
      1. The first JSON object found anywhere in the reply (plain, inside a
         code fence, or surrounded by prose).
      2. An explicit `"sentiment": "<label>"` field in malformed/truncated JSON.
      3. A keyword scan of the whole reply (last resort).

    Args:
        response_text: Raw reply text. Non-string or blank input yields None.

    Returns:
        A dict, or None when nothing usable is found. For JSON replies the
        decoded object is returned with "sentiment" lower-cased/stripped and
        "confidence" coerced to float (0.0 when it is not numeric). Fallback
        results use confidence 0.5 and rationale "Parsed from text".
    """
    import re  # local import: not provided by the notebook's import cell

    if not isinstance(response_text, str) or not response_text.strip():
        return None

    # 1) Decode the first JSON object, wherever it starts. Scanning from each
    #    "{" avoids the keyword fallback misreading words inside the rationale
    #    (e.g. "no negative signals") when the model adds prose around the JSON.
    decoder = json.JSONDecoder()
    start = response_text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(response_text, start)
        except ValueError:
            candidate = None
        if isinstance(candidate, dict):
            sentiment = candidate.get("sentiment")
            if isinstance(sentiment, str):
                # Models often answer "Positive"; downstream compares lower-case
                candidate["sentiment"] = sentiment.strip().lower()
            if "confidence" in candidate:
                # Keep the column numeric even if the model quoted the number
                try:
                    candidate["confidence"] = float(candidate["confidence"])
                except (TypeError, ValueError):
                    candidate["confidence"] = 0.0
            return candidate
        start = response_text.find("{", start + 1)

    def _fallback(label):
        return {
            "sentiment": label,
            "confidence": 0.5,
            "rationale": "Parsed from text",
        }

    # 2) Malformed or truncated JSON that still names the field explicitly.
    #    The lookahead skips an echoed "positive/negative/neutral" template.
    field = re.search(
        r'"?sentiment"?\s*[:=]\s*"?(positive|negative|neutral)(?![\w/|])',
        response_text,
        re.IGNORECASE,
    )
    if field:
        return _fallback(field.group(1).lower())

    # 3) Keyword scan of the free text
    response_lower = response_text.lower()
    if "positive" in response_lower and "negative" not in response_lower:
        return _fallback("positive")
    if "negative" in response_lower:
        return _fallback("negative")
    if "neutral" in response_lower:
        return _fallback("neutral")
    return None


# Summary of the helpers defined in this cell, written in one call
print(
    "\n".join(
        [
            "‚úì Model inference functions defined",
            "  ‚Ä¢ call_mixtral() - Mixtral-8x7B-32768",
            "  ‚Ä¢ call_llama() - Llama-3.1-70B-Versatile",
            "  ‚Ä¢ call_finbert() - FinBERT (ProsusAI/finbert)",
            "  ‚Ä¢ parse_response() - JSON parser",
        ]
    )
)

## 4. Run Experiments

### R1: Mixtral-8x7B-32768 (Zero-Shot)

In [None]:
# For testing, use a sample of the dataset (remove .head(100) for full run)
test_df = df.head(100).copy()  # Remove .head(100) for full dataset

# R1: Mixtral-8x7B-32768
print("=" * 80)
print("Running R1: Mixtral-8x7B-32768 (Zero-Shot)")
print("=" * 80)
r1_results = []

for _, sample in tqdm(test_df.iterrows(), total=len(test_df), desc="R1 Progress"):
    # Start from the "API call failed" record and upgrade it stage by stage
    record = {
        "sentence": sample["sentence"],
        "true_sentiment": sample["true_sentiment"],
        "predicted_sentiment": "error",
        "confidence": 0,
        "rationale": "API call failed",
    }

    reply = call_mixtral(create_zero_shot_prompt(sample["sentence"]))
    if reply:
        parsed_reply = parse_response(reply)
        if parsed_reply:
            record["predicted_sentiment"] = parsed_reply.get("sentiment", "unknown")
            record["confidence"] = parsed_reply.get("confidence", 0)
            record["rationale"] = parsed_reply.get("rationale", "")
        else:
            record["rationale"] = "Parse error"

    r1_results.append(record)
    time.sleep(0.5)  # Rate limiting

r1_df = pd.DataFrame(r1_results)
print(f"\n‚úì R1 completed: {len(r1_df)} predictions")
r1_is_valid = r1_df["predicted_sentiment"].isin(["positive", "negative", "neutral"])
print(f"  Valid predictions: {int(r1_is_valid.sum())}")
r1_is_error = r1_df["predicted_sentiment"] == "error"
print(f"  Errors: {int(r1_is_error.sum())}")
display(r1_df.head())

### R2: Llama-3.1-70B-Versatile (Zero-Shot)

In [None]:
# R2: Llama-3.1-70B-Versatile
print("\n" + "=" * 80)
print("Running R2: Llama-3.1-70B-Versatile (Zero-Shot)")
print("=" * 80)
r2_results = []

for _, sample in tqdm(test_df.iterrows(), total=len(test_df), desc="R2 Progress"):
    # Start from the "API call failed" record and upgrade it stage by stage
    record = {
        "sentence": sample["sentence"],
        "true_sentiment": sample["true_sentiment"],
        "predicted_sentiment": "error",
        "confidence": 0,
        "rationale": "API call failed",
    }

    reply = call_llama(create_zero_shot_prompt(sample["sentence"]))
    if reply:
        parsed_reply = parse_response(reply)
        if parsed_reply:
            record["predicted_sentiment"] = parsed_reply.get("sentiment", "unknown")
            record["confidence"] = parsed_reply.get("confidence", 0)
            record["rationale"] = parsed_reply.get("rationale", "")
        else:
            record["rationale"] = "Parse error"

    r2_results.append(record)
    time.sleep(0.5)

r2_df = pd.DataFrame(r2_results)
print(f"\n‚úì R2 completed: {len(r2_df)} predictions")
r2_is_valid = r2_df["predicted_sentiment"].isin(["positive", "negative", "neutral"])
print(f"  Valid predictions: {int(r2_is_valid.sum())}")
r2_is_error = r2_df["predicted_sentiment"] == "error"
print(f"  Errors: {int(r2_is_error.sum())}")
display(r2_df.head())

### R3: FinBERT (ProsusAI/finbert - Zero-Shot)

In [None]:
# R3: FinBERT
print("\n" + "=" * 80)
print("Running R3: FinBERT (ProsusAI/finbert - Zero-Shot)")
print("=" * 80)
r3_results = []

for _, sample in tqdm(test_df.iterrows(), total=len(test_df), desc="R3 Progress"):
    # Start from the failure record and upgrade it stage by stage
    record = {
        "sentence": sample["sentence"],
        "true_sentiment": sample["true_sentiment"],
        "predicted_sentiment": "error",
        "confidence": 0,
        "rationale": "Model inference failed",
    }

    # FinBERT doesn't need the full prompt - just the sentence
    reply = call_finbert(sample["sentence"])
    if reply:
        parsed_reply = parse_response(reply)
        if parsed_reply:
            record["predicted_sentiment"] = parsed_reply.get("sentiment", "unknown")
            record["confidence"] = parsed_reply.get("confidence", 0)
            record["rationale"] = parsed_reply.get("rationale", "")
        else:
            record["rationale"] = "Parse error"

    r3_results.append(record)
    # No sleep here: the model runs locally, so there is no API rate limit to
    # respect and a per-sentence delay only slows the run down.

r3_df = pd.DataFrame(r3_results)
print(f"\n‚úì R3 completed: {len(r3_df)} predictions")
r3_is_valid = r3_df["predicted_sentiment"].isin(["positive", "negative", "neutral"])
print(f"  Valid predictions: {int(r3_is_valid.sum())}")
r3_is_error = r3_df["predicted_sentiment"] == "error"
print(f"  Errors: {int(r3_is_error.sum())}")
display(r3_df.head())

## 5. Calculate Metrics

In [None]:
from sklearn.metrics import matthews_corrcoef


def _empty_metrics_result(exp_name, total_samples):
    """Build the all-zero (metrics, confusion matrix, rows) triple for an unscorable run."""
    metrics = {
        "Experiment": exp_name,
        "Total Samples": total_samples,
        "Valid Predictions": 0,
        "Accuracy": 0,
        "Macro-F1": 0,
        "Weighted-F1": 0,
        "Macro-Precision": 0,
        "Macro-Recall": 0,
        "MCC": 0,
    }
    for class_name in ("Positive", "Negative", "Neutral"):
        for stat in ("Precision", "Recall", "F1"):
            metrics[f"{class_name}_{stat}"] = 0
    # Integer dtype matters: the confusion-matrix heatmap formats cells with
    # fmt="d", which raises on the float array np.zeros() returns by default.
    return metrics, np.zeros((3, 3), dtype=int), pd.DataFrame()


def calculate_metrics(df, exp_name):
    """Calculate all evaluation metrics including MCC.

    Args:
        df: Per-sentence results with "true_sentiment" and
            "predicted_sentiment" columns.
        exp_name: Label stored under the "Experiment" key.

    Returns:
        A tuple (metrics, cm, valid_df):
          * metrics - dict of overall and per-class scores;
          * cm - 3x3 confusion matrix, rows/columns ordered
            positive, negative, neutral;
          * valid_df - the rows whose prediction is one of those three labels.
        When nothing can be scored, every score is 0, cm is an all-zero integer
        matrix and valid_df is an empty DataFrame.
    """
    # Check if dataframe is empty or missing required columns
    if df.empty or "predicted_sentiment" not in df.columns:
        print(f"‚ö†Ô∏è Warning: {exp_name} has no valid predictions!")
        return _empty_metrics_result(exp_name, 0)

    # Filter out errors and any label outside the three classes
    valid_df = df[
        df["predicted_sentiment"].isin(["positive", "negative", "neutral"])
    ].copy()

    # Check if we have valid predictions
    if valid_df.empty:
        print(f"‚ö†Ô∏è Warning: {exp_name} has no valid predictions after filtering!")
        return _empty_metrics_result(exp_name, len(df))

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    # zero_division=0 is what the per-class calls below already use; setting it
    # here keeps the values unchanged while stating the choice explicitly.
    metrics = {
        "Experiment": exp_name,
        "Total Samples": len(df),
        "Valid Predictions": len(valid_df),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Weighted-F1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "Macro-Precision": precision_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
        "Macro-Recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
        "MCC": matthews_corrcoef(y_true, y_pred),
    }

    # Per-class metrics, always reported in this fixed label order
    labels = ["positive", "negative", "neutral"]
    precision_per_class = precision_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    recall_per_class = recall_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    f1_per_class = f1_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )

    for i, label in enumerate(labels):
        metrics[f"{label.capitalize()}_Precision"] = precision_per_class[i]
        metrics[f"{label.capitalize()}_Recall"] = recall_per_class[i]
        metrics[f"{label.capitalize()}_F1"] = f1_per_class[i]

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    return metrics, cm, valid_df


# Score the three runs
print("\n" + "=" * 80)
print("CALCULATING METRICS")
print("=" * 80)

r1_metrics, r1_cm, r1_valid = calculate_metrics(r1_df, "R1: Mixtral-8x7B (Zero-Shot)")
r2_metrics, r2_cm, r2_valid = calculate_metrics(r2_df, "R2: Llama-3.1-70B (Zero-Shot)")
r3_metrics, r3_cm, r3_valid = calculate_metrics(r3_df, "R3: FinBERT (Zero-Shot)")

# One row per experiment; the tables below are column subsets of this frame
metrics_df = pd.DataFrame([r1_metrics, r2_metrics, r3_metrics])

summary_tables = [
    (
        "ZERO-SHOT RISK ASSESSMENT PERFORMANCE COMPARISON",
        ["Experiment", "Valid Predictions", "Accuracy", "Macro-F1", "MCC"],
    ),
    (
        "DETAILED METRICS",
        ["Experiment", "Macro-Precision", "Macro-Recall", "Weighted-F1"],
    ),
    (
        "PER-CLASS F1 SCORES",
        ["Experiment", "Positive_F1", "Negative_F1", "Neutral_F1"],
    ),
]

for table_title, table_columns in summary_tables:
    print("\n" + "=" * 80)
    print(table_title)
    print("=" * 80)
    display(metrics_df[table_columns].round(4))

## 6. Visualize Results

In [None]:
# Performance comparison: grouped bars, one bar per model in every group
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
width = 0.25

model_runs = [
    (r1_metrics, "Mixtral-8x7B"),
    (r2_metrics, "Llama-3.1-70B"),
    (r3_metrics, "FinBERT"),
]

metrics_to_plot = ["Accuracy", "Macro-F1", "Macro-Precision", "Macro-Recall"]
classes = ["Positive", "Negative", "Neutral"]

# (axis, metric keys to read, tick labels, x label, y label, title)
panels = [
    (
        axes[0],
        metrics_to_plot,
        metrics_to_plot,
        "Metrics",
        "Score",
        "Overall Performance Comparison (Zero-Shot Risk Assessment)",
    ),
    (
        axes[1],
        [f"{c}_F1" for c in classes],
        classes,
        "Sentiment Class",
        "F1 Score",
        "Per-Class F1 Scores (Zero-Shot)",
    ),
]

for ax, metric_keys, tick_labels, x_label, y_label, panel_title in panels:
    group_positions = np.arange(len(metric_keys))
    for slot, (run_metrics, run_label) in enumerate(model_runs):
        heights = [run_metrics[key] for key in metric_keys]
        ax.bar(
            group_positions + slot * width, heights, width, label=run_label, alpha=0.8
        )

    ax.set_xlabel(x_label, fontsize=12, weight="bold")
    ax.set_ylabel(y_label, fontsize=12, weight="bold")
    ax.set_title(panel_title, fontsize=14, weight="bold")
    ax.set_xticks(group_positions + width)  # centre ticks under the middle bar
    ax.set_xticklabels(tick_labels)
    ax.legend()
    ax.set_ylim([0, 1])
    ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig("zero_shot_risk_performance_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

print("‚úì Performance comparison chart saved")

In [None]:
# Confusion matrices: one heatmap per experiment, side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
labels = ["Positive", "Negative", "Neutral"]

matrix_panels = [
    (r1_cm, "R1: Mixtral-8x7B"),
    (r2_cm, "R2: Llama-3.1-70B"),
    (r3_cm, "R3: FinBERT"),
]

for ax, (cm, title) in zip(axes, matrix_panels):
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
        cbar_kws={"label": "Count"},
    )
    ax.set_title(title, fontsize=12, weight="bold")
    ax.set_ylabel("True Label", fontsize=11, weight="bold")
    ax.set_xlabel("Predicted Label", fontsize=11, weight="bold")

plt.suptitle(
    "Confusion Matrices - Zero-Shot Risk Assessment",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("zero_shot_risk_confusion_matrices.png", dpi=300, bbox_inches="tight")
plt.show()

print("‚úì Confusion matrices saved")

## 7. Save Results

In [None]:
# Save detailed results; one timestamp ties together the files of a single run
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

print("\n" + "=" * 80)
print("SAVING RESULTS")
print("=" * 80)

# Per-experiment predictions plus the metrics summary
csv_exports = [
    (r1_df, f"r1_mixtral_8x7b_zero_shot_risk_{timestamp}.csv"),
    (r2_df, f"r2_llama_3_1_70b_zero_shot_risk_{timestamp}.csv"),
    (r3_df, f"r3_finbert_zero_shot_risk_{timestamp}.csv"),
    (metrics_df, f"zero_shot_risk_metrics_summary_{timestamp}.csv"),
]
for frame, csv_name in csv_exports:
    frame.to_csv(csv_name, index=False)
    print(f"‚úì {csv_name}")

# Only report figures that are actually on disk. The confidence-analysis figure
# is written by a later cell, so on a top-to-bottom run it does not exist yet
# and used to be announced as saved anyway.
figure_files = [
    "zero_shot_risk_performance_comparison.png",
    "zero_shot_risk_confusion_matrices.png",
    "zero_shot_risk_confidence_analysis.png",
]
print(f"\n‚úì Visualizations saved:")
for figure_name in figure_files:
    if os.path.exists(figure_name):
        print(f"  ‚Ä¢ {figure_name}")
    else:
        print(f"  (not found yet: {figure_name})")

print(f"\nüéâ All results saved with timestamp: {timestamp}")

## 8. Error Analysis

In [None]:
# Error analysis: which label confusions dominate, and which errors were confident
print("=" * 80)
print("ERROR ANALYSIS: TOP MISCLASSIFIED PATTERNS")
print("=" * 80)

experiments_to_review = [
    (r1_valid, "R1: Mixtral-8x7B"),
    (r2_valid, "R2: Llama-3.1-70B"),
    (r3_valid, "R3: FinBERT"),
]

for scored_rows, exp_name in experiments_to_review:
    if scored_rows.empty:
        print(f"\n{exp_name}: No valid predictions to analyze")
        continue

    print(f"\n{exp_name}:")
    is_wrong = scored_rows["true_sentiment"] != scored_rows["predicted_sentiment"]
    errors = scored_rows[is_wrong]

    if errors.empty:
        print(f"  ‚úì No errors - perfect classification!")
        continue

    # Count (true, predicted) pairs and list the five most frequent
    confusion_pairs = errors.groupby(["true_sentiment", "predicted_sentiment"]).size()
    print(f"Total Errors: {len(errors)}")
    print("\nMost Common Misclassifications:")
    top_pairs = confusion_pairs.sort_values(ascending=False).head(5)
    for (true_label, pred_label), count in top_pairs.items():
        print(f"  {true_label} ‚Üí {pred_label}: {count} errors")

    # Wrong answers given with confidence above 0.7
    if "confidence" in scored_rows.columns:
        high_conf_errors = errors[errors["confidence"] > 0.7]
        if not high_conf_errors.empty:
            print(
                f"\nHigh-Confidence Errors (confidence > 0.7): {len(high_conf_errors)}"
            )
            print("Sample high-confidence misclassifications:")
            for _, wrong_row in high_conf_errors.head(2).iterrows():
                print(f"\n  Sentence: {wrong_row['sentence'][:100]}...")
                print(
                    f"  True: {wrong_row['true_sentiment']} | Predicted: {wrong_row['predicted_sentiment']} | Confidence: {wrong_row['confidence']:.3f}"
                )

    # First few misclassified sentences, regardless of confidence
    print(f"\nSample Misclassified Sentences:")
    for _, wrong_row in errors.head(3).iterrows():
        print(f"\n  Sentence: {wrong_row['sentence'][:100]}...")
        print(
            f"  True: {wrong_row['true_sentiment']} | Predicted: {wrong_row['predicted_sentiment']} | Confidence: {wrong_row.get('confidence', 0):.3f}"
        )

## 9. Confidence Calibration Analysis

In [None]:
print("\n" + "=" * 80)
print("CONFIDENCE CALIBRATION ANALYSIS")
print("=" * 80)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# The loop variable must not be called `df`: at cell level that would rebind the
# notebook-wide `df` (the loaded dataset) to the last experiment's rows, so any
# earlier cell re-run afterwards would silently work on the wrong data.
for idx, (valid_preds, name) in enumerate(
    [
        (r1_valid, "R1: Mixtral-8x7B"),
        (r2_valid, "R2: Llama-3.1-70B"),
        (r3_valid, "R3: FinBERT"),
    ]
):
    ax = axes[idx]

    if valid_preds.empty or "confidence" not in valid_preds.columns:
        # Nothing to plot for this experiment: show a placeholder panel
        ax.text(
            0.5,
            0.5,
            "No data available",
            ha="center",
            va="center",
            fontsize=14,
            color="red",
        )
        ax.set_title(name, fontsize=12, weight="bold")
        continue

    # Split reported confidence by whether the prediction was correct
    scored = valid_preds.copy()
    scored["correct"] = scored["true_sentiment"] == scored["predicted_sentiment"]

    correct_conf = scored[scored["correct"]]["confidence"]
    incorrect_conf = scored[~scored["correct"]]["confidence"]

    ax.hist(
        correct_conf,
        bins=20,
        alpha=0.6,
        label=f"Correct (n={len(correct_conf)})",
        color="green",
    )
    ax.hist(
        incorrect_conf,
        bins=20,
        alpha=0.6,
        label=f"Incorrect (n={len(incorrect_conf)})",
        color="red",
    )

    ax.set_xlabel("Confidence Score", fontsize=11, weight="bold")
    ax.set_ylabel("Frequency", fontsize=11, weight="bold")
    ax.set_title(name, fontsize=12, weight="bold")
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    # Calibration statistics: gap = mean confidence when right minus when wrong
    avg_conf_correct = correct_conf.mean() if len(correct_conf) > 0 else 0
    avg_conf_incorrect = incorrect_conf.mean() if len(incorrect_conf) > 0 else 0
    calibration_gap = avg_conf_correct - avg_conf_incorrect

    print(f"\n{name}:")
    print(f"  Correct predictions - Mean confidence: {avg_conf_correct:.3f}")
    print(f"  Incorrect predictions - Mean confidence: {avg_conf_incorrect:.3f}")
    print(f"  Calibration gap: {calibration_gap:.3f}")

    # Mean confidence grouped by the PREDICTED class
    print(f"  Per-class average confidence:")
    for sentiment in ["positive", "negative", "neutral"]:
        class_df = scored[scored["predicted_sentiment"] == sentiment]
        if not class_df.empty:
            print(
                f"    {sentiment.capitalize()}: {class_df['confidence'].mean():.3f}"
            )

plt.suptitle(
    "Confidence Score Distribution - Zero-Shot Risk Assessment",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("zero_shot_risk_confidence_analysis.png", dpi=300, bbox_inches="tight")
plt.show()

print("\n‚úì Confidence analysis visualization saved")

## 10. Classification Reports

In [None]:
print("\n" + "=" * 80)
print("DETAILED CLASSIFICATION REPORTS")
print("=" * 80)

report_inputs = [
    ("R1: Mixtral-8x7B (Zero-Shot)", r1_valid),
    ("R2: Llama-3.1-70B (Zero-Shot)", r2_valid),
    ("R3: FinBERT (Zero-Shot)", r3_valid),
]

for exp_name, valid_df in report_inputs:
    # Header is the same whether or not there is anything to report
    print(f"\n{exp_name}")
    print("-" * 80)

    if valid_df.empty:
        print("‚ö†Ô∏è  No valid predictions to report")
        continue

    report_text = classification_report(
        valid_df["true_sentiment"],
        valid_df["predicted_sentiment"],
        labels=["positive", "negative", "neutral"],
        target_names=["Positive", "Negative", "Neutral"],
        zero_division=0,
    )
    print(report_text)

# Per-class precision / recall / F1 for every experiment in one table
print("\n" + "=" * 80)
print("PER-CLASS METRICS SUMMARY")
print("=" * 80)
per_class_columns = [
    "Experiment",
    "Positive_Precision",
    "Positive_Recall",
    "Positive_F1",
    "Negative_Precision",
    "Negative_Recall",
    "Negative_F1",
    "Neutral_Precision",
    "Neutral_Recall",
    "Neutral_F1",
]
class_metrics_summary = metrics_df[per_class_columns]
display(class_metrics_summary.round(4))

## 11. Expected Conclusions from Zero-Shot Risk Assessment

### Zero-Shot Performance Analysis

**1. Model Performance Ranking**
- **Best Overall**: Identify which model achieves highest Macro-F1 and MCC scores
- **Expected Leader**: FinBERT (R3) likely outperforms due to financial domain specialization
- **LLM Comparison**: Llama-3.1-70B (R2) expected to beat Mixtral-8x7B (R1) due to larger parameter count
- **Accuracy vs F1**: Check if rankings differ between accuracy and F1 (important for imbalanced classes)

**2. Zero-Shot Baseline Establishment**
- **Purpose**: This experiment establishes baseline performance without any examples
- **Benchmark**: All future experiments (Few-Shot, CoT, ToT) should exceed this baseline
- **Minimum Threshold**: Zero-Shot Macro-F1 should be > 0.60 for production viability
- **Model Comparison**: Establishes which model architecture (LLM vs domain-specific) works best for financial sentiment

**3. Class-Specific Performance (Critical for Risk Assessment)**

   **Negative Class Detection** (Highest Priority):
   - **Business Impact**: Missing negative sentiment = missing financial risks
   - **Expected Challenge**: Negative class typically has lowest recall across all models
   - **Risk Assessment Goal**: Negative_Recall > 0.70 is acceptable, > 0.80 is excellent
   - **False Negatives**: Count how many true negative cases were missed (critical errors)

   **Positive Class Detection**:
   - **Expected Performance**: Highest F1 score (clearest signals: profit, growth, revenue increase)
   - **Common Errors**: May confuse neutral statements with positive (overly optimistic bias)

   **Neutral Class Detection**:
   - **Expected Challenge**: Hardest to classify (ambiguous signals, mixed news)
   - **Error Pattern**: Often misclassified as positive or negative
   - **Acceptable F1**: Neutral_F1 > 0.50 is acceptable given inherent ambiguity

**4. Matthews Correlation Coefficient (MCC) Analysis**
- **Why MCC**: Better than accuracy for imbalanced classes, ranges from -1 to +1
- **Interpretation**: 
  - MCC > 0.5 = Good performance
  - MCC > 0.7 = Excellent performance
  - MCC < 0.3 = Model barely better than random guessing
- **Expected Ranking**: MCC should rank models similarly to Macro-F1
- **Class Balance Check**: If MCC << Accuracy, model is biased toward majority class

**5. Confidence Calibration Quality**
- **Well-Calibrated Model**: Avg confidence for correct predictions >> avg confidence for incorrect predictions
- **Calibration Gap**: Target gap > 0.10 (well-calibrated), > 0.20 (excellent)
- **Overconfidence Risk**: If incorrect predictions have high confidence, model is dangerously overconfident
- **Expected Behavior**:
  - FinBERT: Best calibration (fine-tuned on financial data with proper probability outputs)
  - LLMs: May be overconfident (trained to sound confident even when uncertain)

**6. Error Pattern Analysis**

   **Most Common Misclassifications** (Expected):
   - **neutral → negative**: Model interprets caution/uncertainty as bad news
   - **neutral → positive**: Model misses subtle negative signals in mixed statements
   - **positive → neutral**: Model underestimates positive impact
   - **negative → neutral**: Most dangerous - missing risk signals

   **High-Confidence Errors**:
   - When model is confident but wrong, indicates systematic misunderstanding
   - Review these cases to understand model's fundamental limitations
   - May require prompt engineering or few-shot examples to fix

**7. Model-Specific Behaviors**

   **R1: Mixtral-8x7B** (Groq API):
   - **Architecture**: Sparse Mixture-of-Experts (8 experts per layer, 2 active per token)
   - **Expected Strength**: Fast inference, cost-effective, good general reasoning
   - **Expected Weakness**: Limited financial domain knowledge, may miss subtle signals
   - **Prediction**: Moderate performance (Macro-F1: 0.60-0.70)

   **R2: Llama-3.1-70B-Versatile** (Groq API):
   - **Architecture**: Dense 70B parameter model
   - **Expected Strength**: Best general language understanding, complex reasoning
   - **Expected Weakness**: No financial specialization, may interpret statements too literally
   - **Prediction**: Best among LLMs (Macro-F1: 0.65-0.75)

   **R3: FinBERT** (ProsusAI/finbert - Local):
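   - **Architecture**: BERT-base further pre-trained on financial news text (a Reuters TRC2 subset), then fine-tuned for sentiment classification on Financial PhraseBank
   - **Caveat**: The evaluation data here also comes from Financial PhraseBank, so FinBERT may have seen these sentences during fine-tuning; its scores are not a true zero-shot result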
   - **Expected Strength**: Deep financial domain knowledge, well-calibrated probabilities
   - **Expected Weakness**: Cannot reason beyond training patterns, no context beyond 512 tokens
   - **Prediction**: Best overall (Macro-F1: 0.75-0.85)

**8. Zero-Shot vs Future Approaches**

   **Expected Improvements**:
   - **Few-Shot (R4-R6)**: +5-10% F1 improvement from providing curated examples
   - **Chain-of-Thought (R7-R9)**: +3-7% F1 improvement from structured reasoning
   - **Tree-of-Thought (R10-R12)**: +2-5% F1 improvement from multi-path hypothesis exploration

   **When Zero-Shot is Sufficient**:
   - If Macro-F1 > 0.75 and Negative_Recall > 0.75
   - No computational budget for complex prompting
   - Real-time inference requirements (latency-sensitive)

**9. Production Deployment Considerations**

   **Best for Accuracy** (regardless of cost):
   - Model with highest Macro-F1 and MCC
   - Priority: Minimize all misclassifications
   - Use Case: High-stakes financial decisions, regulatory compliance

   **Best for Risk Detection** (highest negative recall):
   - Model with highest Negative_Recall score
   - Priority: Never miss financial risks
   - Use Case: Early warning systems, portfolio risk monitoring

   **Best for Cost-Efficiency**:
   - Acceptable F1 (> 0.65) at lowest inference cost
   - Priority: Balance accuracy and operational cost
   - Use Case: High-volume sentiment monitoring, preliminary screening

   **Recommended Approach**:
   - **Primary**: FinBERT (R3) for accuracy and speed
   - **Backup**: Llama-3.1-70B (R2) for cases requiring reasoning
   - **Cost-Optimized**: Mixtral-8x7B (R1) for high-volume batch processing

**10. Limitations and Constraints**

   **Zero-Shot Limitations**:
   - No examples = model relies entirely on pretrained knowledge
   - Cannot guide model toward specific interpretation patterns
   - May misinterpret domain-specific terminology
   - No control over output format consistency

   **Data Limitations**:
   - Sentences_AllAgree.txt = 100% annotator agreement (2,217 samples)
   - High-quality but limited size
   - May not represent full diversity of financial statements
   - Imbalanced class distribution (positive/negative/neutral) may not match the class skew of real-world financial text

   **Task Framing**:
   - "Risk Assessment" uses same sentiment labels as Task1
   - Risk = negative sentiment, opportunity = positive sentiment
   - Real financial risk assessment more complex (volatility, uncertainty, exposure)

**11. Next Steps and Improvements**

   **If Zero-Shot Performance is Good (Macro-F1 > 0.70)**:
   - Skip to production deployment testing
   - May not need Few-Shot or CoT approaches
   - Focus on error analysis and edge case handling

   **If Zero-Shot Performance is Weak (Macro-F1 < 0.65)**:
   - Proceed to Few-Shot experiments (R4-R6) with curated examples
   - Design examples specifically targeting weak classes (likely negative and neutral)
   - Consider Chain-of-Thought for complex reasoning cases

   **Prompt Engineering Opportunities**:
   - Add specific financial terminology definitions
   - Provide clearer distinction between neutral and negative
   - Include examples of edge cases in system prompt
   - Request confidence scores and reasoning (improve calibration)

**12. Key Validation Questions**
- ✓/✗ Does any model achieve Macro-F1 > 0.70?
- ✓/✗ Does FinBERT outperform LLMs in zero-shot setting?
- ✓/✗ Is Negative_Recall > 0.70 (the "acceptable" threshold) for best model?
- ✓/✗ Is calibration gap > 0.10 for at least one model?
- ✓/✗ Are most errors neutral↔positive/negative (expected pattern)?
- ✓/✗ Does MCC ranking match Macro-F1 ranking?
- ✓/✗ Is zero-shot baseline sufficient for production deployment?