In [None]:
# Updated imports for new LLMs and libraries
import warnings
import os

warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
    matthews_corrcoef,
)

# API setup for LLMs (Groq, dotenv)
from groq import Groq
from dotenv import load_dotenv
from transformers import pipeline
import torch

# Pull settings such as GROQ_API_KEY from a local .env file, if one exists.
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Bind groq_client unconditionally: with no key it is an explicit None (plus a
# warning here) instead of a NameError the first time a later cell touches it.
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
if groq_client is None:
    print("⚠️ GROQ_API_KEY is not set; Groq API calls will not work")

# Load FinBERT model for local inference
print("Loading FinBERT model...")
# Device index handed to the transformers pipeline: 0 when CUDA is available, else -1.
device = 0 if torch.cuda.is_available() else -1
finbert_pipeline = pipeline(
    "sentiment-analysis", model="ProsusAI/finbert", device=device
)

# Plot defaults shared by every figure in the notebook.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

print(f"✓ FinBERT loaded on {'GPU' if device == 0 else 'CPU'}")
print(f"✓ Groq API configured: {bool(GROQ_API_KEY)}")
print("✓ Updated setup complete for new LLM sentiment analysis")

## 1. Load Dataset

In [None]:
# Load the 100% agreement dataset (highest quality)
data_path = "../../DatasetAnalysis_FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"


def _read_text_lines(path):
    """Return the file's lines, decoding as UTF-8 and falling back to Latin-1.

    Decoding with errors="ignore" silently deletes every byte that is not
    valid UTF-8, which corrupts sentences containing accented characters.
    Latin-1 maps every byte to a character, so the fallback cannot fail.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return list(f)
    except UnicodeDecodeError:
        with open(path, "r", encoding="latin-1") as f:
            return list(f)


sentences = []
sentiments = []

# Each record is "<sentence>@<label>"; split on the LAST "@" so that an "@"
# inside the sentence itself does not break the record apart.
for raw_line in _read_text_lines(data_path):
    sentence, separator, label = raw_line.strip().rpartition("@")
    if separator:
        sentences.append(sentence)
        sentiments.append(label)

df = pd.DataFrame({"sentence": sentences, "true_sentiment": sentiments})

print(f"Dataset loaded: {len(df)} sentences")
print("\nSentiment distribution:")
print(df["true_sentiment"].value_counts())

# Display sample
print("\n" + "=" * 80)
print("Sample sentences:")
print("=" * 80)
display(df.sample(5, random_state=42))

## 2. Chain-of-Thought Prompt Design

**Prompt Strategy**: Stepwise reasoning with explicit JSON output format.

In [None]:
def create_cot_prompt(sentence):
    """
    Build the Chain-of-Thought classification prompt for one financial sentence.

    The prompt walks the model through five numbered reasoning steps (with extra
    emphasis on negative signals) and asks for a JSON object holding
    "sentiment", "confidence" and "rationale".
    """
    # Doubled braces are literal braces in the JSON example; {sentence} is the
    # only substitution slot.
    template = """You are a financial sentiment analysis expert. Use step-by-step reasoning to classify this financial statement.

Financial Statement: "{sentence}"

Think through this systematically:

1. IDENTIFY Key Metrics: What financial numbers, trends, or events are mentioned?

2. POSITIVE Signals: Any growth, profit increases, revenue gains, expansions, cost reductions?

3. NEGATIVE Signals: Any losses, declines, revenue drops, widening losses, layoffs, failed ventures?
   ⚠️ Pay special attention to: "loss", "decline", "decrease", "fell", "dropped", "worse", "weak"

4. NET IMPACT: From an investor's perspective, does this help or hurt the stock price?

5. FINAL CLASSIFICATION:
   - Positive: Financial improvements, growth, profitability increases
   - Negative: Financial deterioration, losses, declining metrics
   - Neutral: No clear financial impact or balanced signals

Provide ONLY this JSON format (no markdown, no extra text):
{{
    "sentiment": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "rationale": "Your step-by-step reasoning summary"
}}
"""
    return template.format(sentence=sentence)

## 3. Model Inference Functions

In [None]:
def call_llama(prompt, model_name, temperature=0.0):
    """Send a single-turn prompt to a Groq-hosted chat model.

    Args:
        prompt: Text sent as the only user message.
        model_name: Groq model identifier passed through to the API.
        temperature: Sampling temperature (defaults to 0.0).

    Returns:
        The content of the first completion choice, or None when the client is
        not configured or all three attempts fail.
    """
    max_retries = 3

    # The client only exists when an API key was available at setup time.
    # Fail fast in that case: retrying (and sleeping) cannot fix missing config.
    client = globals().get("groq_client")
    if client is None:
        print("Groq client is not configured (is GROQ_API_KEY set?); skipping API call")
        return None

    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=model_name,
                temperature=temperature,
                max_tokens=500,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:  # best-effort: any API/network failure is retried
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
            if attempt < max_retries - 1:
                # Exponential backoff between attempts: 1s, then 2s.
                time.sleep(2**attempt)
    return None


def parse_response(response_text):
    """Extract the sentiment JSON object from a model reply.

    The reply may be bare JSON, JSON inside a markdown code fence, or JSON
    embedded in free-form reasoning text. Every "{" is tried as the start of a
    JSON object; the first object containing a "sentiment" key wins.

    Args:
        response_text: Raw text returned by the model (may be None or empty).

    Returns:
        A dict with a lower-cased "sentiment" (and a float "confidence" when
        that key is present), a keyword-based fallback dict with confidence 0.5
        when no JSON object can be decoded, or None when nothing usable is found.
    """
    import re

    if not response_text or not isinstance(response_text, str):
        return None

    decoder = json.JSONDecoder()
    first_object = None
    start = response_text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(response_text, start)
        except ValueError:
            candidate = None
        if isinstance(candidate, dict):
            if "sentiment" in candidate:
                result = dict(candidate)
                # Models often answer "Positive"/"NEGATIVE"; downstream filtering
                # only accepts the lower-case labels.
                result["sentiment"] = str(result["sentiment"]).strip().lower()
                if "confidence" in result:
                    try:
                        result["confidence"] = float(result["confidence"])
                    except (TypeError, ValueError):
                        result["confidence"] = 0.0
                return result
            if first_object is None:
                first_object = candidate
        start = response_text.find("{", start + 1)

    if first_object is not None:
        # A JSON object without a "sentiment" key: hand it back unchanged so the
        # caller's .get() defaults apply.
        return first_object

    print("Parse error: no JSON object with a sentiment found")
    print("Raw response was:\n{}".format(response_text))

    response_lower = response_text.lower()
    # Prefer an explicit `sentiment: <label>` statement; the lookahead skips the
    # prompt's own "positive/negative/neutral" placeholder if it was echoed back.
    explicit = re.search(
        r'sentiment"?\s*[:=]\s*"?\s*(positive|negative|neutral)\b(?!\s*/)',
        response_lower,
    )
    if explicit:
        label = explicit.group(1)
    elif "positive" in response_lower and "negative" not in response_lower:
        label = "positive"
    elif "negative" in response_lower:
        label = "negative"
    elif "neutral" in response_lower:
        label = "neutral"
    else:
        return None
    return {
        "sentiment": label,
        "confidence": 0.5,
        "rationale": "Parsed from text",
    }


def call_finbert(sentence):
    """Classify one sentence with the local FinBERT pipeline.

    Args:
        sentence: Text to classify.

    Returns:
        A dict with "sentiment" (positive/negative/neutral), "confidence" (the
        pipeline's score) and "rationale", or None when inference raises.
    """
    known_labels = ("positive", "negative", "neutral")
    try:
        # Truncate in the tokenizer (512 tokens) instead of slicing the string:
        # a 512-character slice discards text that would still fit the model.
        result = finbert_pipeline(sentence, truncation=True, max_length=512)
        top = result[0]
        label = top["label"].lower()
        return {
            # Unrecognised labels keep falling back to "neutral", as before.
            "sentiment": label if label in known_labels else "neutral",
            "confidence": top["score"],
            "rationale": f"FinBERT classification: {top['label']}",
        }
    except Exception as e:  # best-effort: report the failure and let the caller record it
        print(f"FinBERT error: {str(e)[:100]}")
        return None


print("✓ Model inference functions defined")

## 4. Run Experiments

In [None]:
# Run on full dataset for comprehensive evaluation
test_df = df.copy()


def _result_row(row, predicted, confidence, rationale):
    """Build one output record for a dataset row and its prediction."""
    return {
        "sentence": row["sentence"],
        "true_sentiment": row["true_sentiment"],
        "predicted_sentiment": predicted,
        "confidence": confidence,
        "rationale": rationale,
    }


def _run_llm_experiment(data, model_name, desc, pause=0.5):
    """Classify every row of `data` with a Groq-hosted model using the CoT prompt.

    Every input row yields exactly one record: API failures and unparseable
    replies are stored with predicted_sentiment="error" rather than dropped,
    so the sample count stays comparable across experiments.
    """
    records = []
    for _, row in tqdm(data.iterrows(), total=len(data), desc=desc):
        response = call_llama(create_cot_prompt(row["sentence"]), model_name=model_name)
        parsed = parse_response(response) if response else None
        if isinstance(parsed, dict):
            records.append(
                _result_row(
                    row,
                    parsed.get("sentiment", "unknown"),
                    parsed.get("confidence", 0),
                    parsed.get("rationale", ""),
                )
            )
        else:
            reason = "Parse error" if response else "API error"
            records.append(_result_row(row, "error", 0, reason))
        time.sleep(pause)  # fixed pause between API requests
    return records


# E7: Updated Mixtral-8x7B (Chain-of-Thought)
print("Running E7: Updated Mixtral-8x7B (Chain-of-Thought)...")
e7_results = _run_llm_experiment(test_df, "mixtral-8x7b-32768", "E7 Progress")
e7_df = pd.DataFrame(e7_results)
print(f"\n✓ E7 completed: {len(e7_df)} predictions")
display(e7_df.head())

# E8: Updated Llama-3.1-70B (Chain-of-Thought)
print("Running E8: Updated Llama-3.1-70B (Chain-of-Thought)...")
e8_results = _run_llm_experiment(test_df, "llama-3.1-70b-versatile", "E8 Progress")
e8_df = pd.DataFrame(e8_results)
print(f"\n✓ E8 completed: {len(e8_df)} predictions")
display(e8_df.head())

# E9: FinBERT (Chain-of-Thought - Note: Cannot use reasoning)
print("Running E9: FinBERT (Chain-of-Thought)...")
print(
    "⚠️ Note: FinBERT uses pre-trained weights, cannot perform chain-of-thought reasoning"
)
e9_results = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="E9 Progress"):
    # FinBERT uses direct inference (CoT reasoning is not applicable)
    result = call_finbert(row["sentence"])
    if result:
        e9_results.append(
            _result_row(
                row,
                result.get("sentiment", "unknown"),
                result.get("confidence", 0),
                result.get("rationale", ""),
            )
        )
    else:
        e9_results.append(_result_row(row, "error", 0, "FinBERT inference error"))
e9_df = pd.DataFrame(e9_results)
print(f"\n✓ E9 completed: {len(e9_df)} predictions")
display(e9_df.head())

## 5. Calculate Metrics

In [None]:
# Updated metrics calculation for new LLMs
def _empty_metrics(exp_name, total_samples):
    """Return the all-zero metrics row used when an experiment has nothing to score."""
    metrics = {
        "Experiment": exp_name,
        "Total Samples": total_samples,
        "Valid Predictions": 0,
    }
    for key in (
        "Accuracy",
        "Macro-F1",
        "Weighted-F1",
        "Macro-Precision",
        "Macro-Recall",
        "MCC",
    ):
        metrics[key] = 0
    for label in ("Positive", "Negative", "Neutral"):
        for suffix in ("Precision", "Recall", "F1"):
            metrics[f"{label}_{suffix}"] = 0
    return metrics


def calculate_metrics(df, exp_name):
    """Calculate all evaluation metrics for one experiment.

    Rows whose predicted_sentiment is not positive/negative/neutral (for
    example "error" or "unknown") are excluded before scoring.

    Args:
        df: Results frame with "true_sentiment" and "predicted_sentiment" columns.
        exp_name: Label stored under the "Experiment" key.

    Returns:
        A (metrics, confusion_matrix, valid_df) tuple. When nothing can be
        scored, the metrics are all zero with the same keys as a normal result,
        the matrix is a 3x3 integer zero array, and valid_df is an empty frame
        that keeps the input's columns.
    """
    labels = ["positive", "negative", "neutral"]

    if df.empty or "predicted_sentiment" not in df.columns:
        print(f"⚠️ Warning: {exp_name} has no valid predictions!")
        return (
            _empty_metrics(exp_name, len(df)),
            # Integer dtype so the matrix can be rendered with an integer format.
            np.zeros((3, 3), dtype=int),
            df.iloc[0:0].copy(),
        )

    valid_df = df[df["predicted_sentiment"].isin(labels)].copy()
    if valid_df.empty:
        print(f"⚠️ Warning: {exp_name} has no valid predictions after filtering!")
        return (
            _empty_metrics(exp_name, len(df)),
            np.zeros((3, 3), dtype=int),
            valid_df,
        )

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    metrics = {
        "Experiment": exp_name,
        "Total Samples": len(df),
        "Valid Predictions": len(valid_df),
        "Accuracy": accuracy_score(y_true, y_pred),
        # zero_division=0 matches the per-class calls below: a class that is
        # never predicted scores 0 instead of emitting a warning.
        "Macro-F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Weighted-F1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "Macro-Precision": precision_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
        "Macro-Recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
        "MCC": matthews_corrcoef(y_true, y_pred),
    }

    precision_per_class = precision_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    recall_per_class = recall_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    f1_per_class = f1_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )

    for i, label in enumerate(labels):
        metrics[f"{label.capitalize()}_Precision"] = precision_per_class[i]
        metrics[f"{label.capitalize()}_Recall"] = recall_per_class[i]
        metrics[f"{label.capitalize()}_F1"] = f1_per_class[i]

    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return metrics, cm, valid_df


# Score each experiment: (metrics dict, confusion matrix, rows with a valid prediction)
(
    (e7_metrics, e7_cm, e7_valid),
    (e8_metrics, e8_cm, e8_valid),
    (e9_metrics, e9_cm, e9_valid),
) = (
    calculate_metrics(frame, title)
    for frame, title in (
        (e7_df, "E7: Updated Mixtral-8x7B (CoT)"),
        (e8_df, "E8: Updated Llama-3.1-70B (CoT)"),
        (e9_df, "E9: Updated FinBERT (CoT)"),
    )
)

# One summary row per experiment.
metrics_df = pd.DataFrame([e7_metrics, e8_metrics, e9_metrics])

banner = "=" * 80
print("\n" + banner)
print("CHAIN-OF-THOUGHT PERFORMANCE COMPARISON")
print(banner)

# Headline columns only; the full table is shown in a later cell.
headline_columns = [
    "Experiment",
    "Accuracy",
    "Macro-F1",
    "Macro-Precision",
    "Macro-Recall",
]
display(metrics_df[headline_columns].round(4))

## 6. Visualize Results

In [None]:
# Performance comparison bar chart
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (metrics dict, legend label) for every experiment, in plotting order.
experiment_series = [
    (e7_metrics, "Mixtral-8x7B (CoT)"),
    (e8_metrics, "Llama-3.1-70B (CoT)"),
    (e9_metrics, "FinBERT (CoT)"),
]
width = 0.25


def _plot_grouped_bars(ax, metric_keys, tick_labels, xlabel, ylabel, title):
    """Draw one bar group per metric key, with one bar per experiment."""
    positions = np.arange(len(metric_keys))
    for offset, (metrics, label) in enumerate(experiment_series):
        # .get(..., 0): plot a missing metric as 0 instead of raising KeyError.
        values = [metrics.get(key, 0) for key in metric_keys]
        ax.bar(positions + offset * width, values, width, label=label, alpha=0.8)
    ax.set_xlabel(xlabel, fontsize=12, weight="bold")
    ax.set_ylabel(ylabel, fontsize=12, weight="bold")
    ax.set_title(title, fontsize=14, weight="bold")
    # With three bars per group, the middle bar sits at positions + width.
    ax.set_xticks(positions + width)
    ax.set_xticklabels(tick_labels)
    ax.legend()
    ax.set_ylim([0, 1])
    ax.grid(axis="y", alpha=0.3)


# Overall metrics
metrics_to_plot = ["Accuracy", "Macro-F1", "Macro-Precision", "Macro-Recall"]
_plot_grouped_bars(
    axes[0],
    metrics_to_plot,
    metrics_to_plot,
    "Metrics",
    "Score",
    "Overall Performance Comparison (Chain-of-Thought)",
)

# Per-class F1 scores
classes = ["Positive", "Negative", "Neutral"]
_plot_grouped_bars(
    axes[1],
    [f"{c}_F1" for c in classes],
    classes,
    "Sentiment Class",
    "F1 Score",
    "Per-Class F1 Scores (Chain-of-Thought)",
)

plt.tight_layout()
plt.savefig("cot_performance_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

# Confusion matrices
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
labels = ["Positive", "Negative", "Neutral"]

for ax, (cm, title) in zip(
    axes,
    [
        (e7_cm, "E7: Mixtral-8x7B (CoT)"),
        (e8_cm, "E8: Llama-3.1-70B (CoT)"),
        (e9_cm, "E9: FinBERT (CoT)"),
    ],
):
    sns.heatmap(
        # fmt="d" only accepts integers; cast in case the matrix holds floats.
        np.asarray(cm).astype(int),
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
        cbar_kws={"label": "Count"},
    )
    ax.set_title(title, fontsize=12, weight="bold")
    ax.set_ylabel("True Label", fontsize=11, weight="bold")
    ax.set_xlabel("Predicted Label", fontsize=11, weight="bold")

plt.suptitle(
    "Confusion Matrices - Chain-of-Thought Sentiment Analysis",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("cot_confusion_matrices.png", dpi=300, bbox_inches="tight")
plt.show()

## 7. Save Results

In [None]:
# Persist per-experiment predictions and the metrics summary as timestamped CSVs
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Output file name -> frame to write (insertion order is the write order).
csv_outputs = {
    f"e7_Mixtral_8x7B_cot_{timestamp}.csv": e7_df,
    f"e8_Llama_3_1_70B_cot_{timestamp}.csv": e8_df,
    f"e9_FinBERT_cot_{timestamp}.csv": e9_df,
    f"cot_metrics_summary_{timestamp}.csv": metrics_df,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)

print(f"\n✓ Chain-of-Thought results saved with timestamp: {timestamp}")
print("\nFiles created:")
# The two PNG names are listed alongside the CSVs written above.
for created in [
    *csv_outputs,
    "cot_performance_comparison.png",
    "cot_confusion_matrices.png",
]:
    print(f"  - {created}")

## 8. Error Analysis

In [None]:
# Error Analysis: Most Common Misclassifications
print("=" * 80)
print("ERROR ANALYSIS: CHAIN-OF-THOUGHT MISCLASSIFICATION PATTERNS")
print("=" * 80)

for df_result, exp_name in [
    (e7_valid, "E7: Mixtral-8x7B"),
    (e8_valid, "E8: Llama-3.1-70B"),
    (e9_valid, "E9: FinBERT"),
]:
    print(f"\n{exp_name}")
    print("-" * 80)

    # An experiment without valid predictions has nothing to analyse; skipping
    # it also avoids dividing by zero in the error-rate line below.
    if df_result.empty:
        print("\nNo valid predictions available - skipping error analysis")
        continue

    # Identify errors
    errors = df_result[df_result["true_sentiment"] != df_result["predicted_sentiment"]]

    # Count error types as (true label, predicted label) pairs, most frequent first
    error_types = (
        errors.groupby(["true_sentiment", "predicted_sentiment"])
        .size()
        .reset_index(name="count")
        .sort_values("count", ascending=False)
    )

    error_rate = len(errors) / len(df_result) * 100
    print(f"\nTotal Errors: {len(errors)} / {len(df_result)} ({error_rate:.2f}%)")
    print("\nMost Common Error Types:")
    display(error_types.head(5))

    # Show examples of worst errors (high confidence, wrong prediction)
    if len(errors) > 0:
        # Model-reported confidences may not be numeric; coerce them so the
        # ranking and the numeric formatting below cannot fail.
        ranked = errors.assign(
            confidence=pd.to_numeric(errors["confidence"], errors="coerce")
        )
        worst_errors = ranked.nlargest(3, "confidence")
        print("\nTop 3 High-Confidence Errors:")
        for _, row in worst_errors.iterrows():
            print(
                f"\n  True: {row['true_sentiment']} | Predicted: {row['predicted_sentiment']} | Conf: {row['confidence']:.2f}"
            )
            print(f"  Sentence: {row['sentence'][:120]}...")
            print(f"  Rationale: {row['rationale']}")

# Class-wise Performance Comparison
print("\n" + "=" * 80)
print("CLASS-WISE PERFORMANCE BREAKDOWN")
print("=" * 80)

# One row per (model, class) holding that class's precision, recall and F1.
class_comparison = [
    {
        "Model": model_name,
        "Class": sentiment,
        "Precision": metrics[f"{sentiment}_Precision"],
        "Recall": metrics[f"{sentiment}_Recall"],
        "F1-Score": metrics[f"{sentiment}_F1"],
    }
    for metrics, model_name in [
        (e7_metrics, "Mixtral-8x7B"),
        (e8_metrics, "Llama-3.1-70B"),
        (e9_metrics, "FinBERT"),
    ]
    for sentiment in ["Positive", "Negative", "Neutral"]
]

class_df = pd.DataFrame(class_comparison)

# Pivot for better visualization
for metric in ["Precision", "Recall", "F1-Score"]:
    print(f"\n{metric} by Class:")
    pivot = class_df.pivot(index="Class", columns="Model", values=metric)
    display(pivot.round(4))

print("\n" + "=" * 80)
print("COMPREHENSIVE METRICS TABLE")
print("=" * 80)
display(metrics_df.round(4))

## 9. Confidence Analysis

In [None]:
# Confidence Analysis
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (df_result, title) in zip(
    axes,
    [(e7_valid, "Mixtral-8x7B"), (e8_valid, "Llama-3.1-70B"), (e9_valid, "FinBERT")],
):
    # Nothing to plot for an experiment without valid predictions.
    if df_result.empty:
        ax.set_title(f"{title}\nNo valid predictions", fontsize=11, weight="bold")
        continue

    df_result["correct"] = (
        df_result["true_sentiment"] == df_result["predicted_sentiment"]
    )
    # Model-reported confidences may not be numeric; coerce them (NaN on
    # failure) so the histogram and the means below are well defined.
    df_result["confidence"] = pd.to_numeric(df_result["confidence"], errors="coerce")

    correct_conf = df_result.loc[df_result["correct"], "confidence"].dropna()
    incorrect_conf = df_result.loc[~df_result["correct"], "confidence"].dropna()

    ax.hist(
        [correct_conf, incorrect_conf],
        bins=20,
        label=["Correct", "Incorrect"],
        alpha=0.7,
        color=["green", "red"],
    )
    ax.set_xlabel("Confidence Score", fontsize=11, weight="bold")
    ax.set_ylabel("Frequency", fontsize=11, weight="bold")
    ax.set_title(
        f"{title}\nMean Conf: Correct={correct_conf.mean():.3f}, Incorrect={incorrect_conf.mean():.3f}",
        fontsize=11,
        weight="bold",
    )
    ax.legend()
    ax.grid(alpha=0.3)

plt.suptitle(
    "Confidence Distribution: Correct vs Incorrect Predictions (Chain-of-Thought)",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("cot_confidence_analysis.png", dpi=300, bbox_inches="tight")
plt.show()

print("\n" + "=" * 80)
print("CONFIDENCE CALIBRATION ANALYSIS")
print("=" * 80)
for df_result, exp_name in [
    (e7_valid, "E7: Mixtral-8x7B"),
    (e8_valid, "E8: Llama-3.1-70B"),
    (e9_valid, "E9: FinBERT"),
]:
    print(f"\n{exp_name}:")
    if df_result.empty:
        print("  No valid predictions available")
        continue

    df_result["correct"] = (
        df_result["true_sentiment"] == df_result["predicted_sentiment"]
    )
    confidence = pd.to_numeric(df_result["confidence"], errors="coerce")

    avg_conf_correct = confidence[df_result["correct"]].mean()
    avg_conf_incorrect = confidence[~df_result["correct"]].mean()
    # Gap between mean confidence on correct and on incorrect predictions.
    calibration_gap = avg_conf_correct - avg_conf_incorrect

    print(f"  Average Confidence (Correct): {avg_conf_correct:.4f}")
    print(f"  Average Confidence (Incorrect): {avg_conf_incorrect:.4f}")
    print(f"  Calibration Gap: {calibration_gap:.4f}")
    print(f"  Total Correct: {df_result['correct'].sum()} / {len(df_result)}")

    # Confidence by sentiment class
    print("\n  Confidence by Predicted Class:")
    for sentiment in ["positive", "negative", "neutral"]:
        # Local name chosen so the notebook-level `class_df` table is not overwritten.
        predicted_mask = df_result["predicted_sentiment"] == sentiment
        if predicted_mask.any():
            print(
                f"    {sentiment.capitalize()}: {confidence[predicted_mask].mean():.4f} (n={int(predicted_mask.sum())})"
            )

## 10. Classification Reports

In [None]:
# Per-experiment scikit-learn classification reports
rule = "=" * 80
print("\n" + rule)
print("DETAILED CLASSIFICATION REPORTS")
print(rule)

report_inputs = (
    (e7_valid, "E7: Mixtral-8x7B"),
    (e8_valid, "E8: Llama-3.1-70B"),
    (e9_valid, "E9: FinBERT"),
)
for scored, heading in report_inputs:
    print("\n" + rule)
    print(f"{heading}")
    print(rule)
    report_text = classification_report(
        scored["true_sentiment"],
        scored["predicted_sentiment"],
        labels=["positive", "negative", "neutral"],
        target_names=["Positive", "Negative", "Neutral"],
    )
    print(report_text)

# Compact per-class table: one row per model, one column per class/metric pair
print("\n" + rule)
print("CLASS-WISE METRICS SUMMARY")
print(rule)

# Column prefix -> class name, and column suffix -> metric name, as used in the metrics dicts.
class_prefixes = {"Pos": "Positive", "Neg": "Negative", "Neu": "Neutral"}
metric_suffixes = {"P": "Precision", "R": "Recall", "F1": "F1"}

summary_data = [
    {
        "Model": model,
        **{
            f"{prefix}_{suffix}": metrics[f"{class_name}_{metric_name}"]
            for prefix, class_name in class_prefixes.items()
            for suffix, metric_name in metric_suffixes.items()
        },
    }
    for metrics, model in (
        (e7_metrics, "Mixtral-8x7B"),
        (e8_metrics, "Llama-3.1-70B"),
        (e9_metrics, "FinBERT"),
    )
]

summary_df = pd.DataFrame(summary_data)
print("\nPer-Class Metrics (P=Precision, R=Recall, F1=F1-Score):")
display(summary_df.round(4))

## 11. CoT vs Few-Shot vs Zero-Shot Comparison

In [None]:
# Compare with Few-Shot and Zero-Shot results (if available)
import glob
import os

print("=" * 80)
print("CHAIN-OF-THOUGHT vs FEW-SHOT vs ZERO-SHOT COMPARISON")
print("=" * 80)


def _load_latest_summary(pattern, approach, lead=""):
    """Load the newest metrics CSV matching `pattern` (by os.path.getctime).

    Returns the frame with an added "Approach" column, or None when no file
    matches or the newest file cannot be read.
    """
    matches = glob.glob(pattern)
    if not matches:
        print(f"{lead}⚠️ No {approach} results found")
        return None
    latest = max(matches, key=os.path.getctime)
    print(f"{lead}✓ Loading {approach} results from: {os.path.basename(latest)}")
    try:
        frame = pd.read_csv(latest)
    except Exception as e:  # best-effort: a broken file only disables the comparison
        print(f"  ⚠️ Could not load {approach.lower()} results: {str(e)}")
        return None
    frame["Approach"] = approach
    return frame


def _metric_at(frame, position, column):
    """Return frame[column] at row `position`, or None when unavailable."""
    if frame is None or position >= len(frame) or column not in frame.columns:
        return None
    return frame.iloc[position][column]


def _relative_change(value, baseline):
    """Format the percentage change of `value` over `baseline`.

    A missing baseline is reported as "n/a"; a non-positive baseline yields
    "+0.00%" (no meaningful ratio can be formed).
    """
    if value is None or baseline is None:
        return "n/a"
    change = ((value - baseline) / baseline * 100) if baseline > 0 else 0
    return f"{change:+.2f}%"


# Try to load Few-Shot and Zero-Shot results
few_shot_df = _load_latest_summary(
    "../Few_Shot/few_shot_metrics_summary_*.csv", "Few-Shot", lead="\n"
)
zero_shot_df = _load_latest_summary(
    "../Zero_Shot/zero_shot_metrics_summary_*.csv", "Zero-Shot"
)

comparison_data = [metrics_df.copy()]
comparison_data[0]["Approach"] = "Chain-of-Thought"
comparison_data.extend(
    frame for frame in (few_shot_df, zero_shot_df) if frame is not None
)

if len(comparison_data) > 1:
    combined = pd.concat(comparison_data, ignore_index=True)

    # Select key metrics for comparison
    comparison_cols = [
        "Experiment",
        "Approach",
        "Accuracy",
        "Macro-F1",
        "MCC",
        "Negative_F1",
        "Positive_F1",
        "Neutral_F1",
    ]

    available_cols = [col for col in comparison_cols if col in combined.columns]

    print("\n" + "=" * 80)
    print("KEY METRICS COMPARISON")
    print("=" * 80)
    display(combined[available_cols].round(4))

    # Calculate improvements
    if few_shot_df is not None and zero_shot_df is not None:
        print("\n" + "=" * 80)
        print("PERFORMANCE PROGRESSION")
        print("=" * 80)

        # NOTE(review): rows are matched by position, which assumes the other
        # summaries list their models in the same order as metrics_df.
        model_names = ["Mixtral-8x7B", "Llama-3.1-70B", "FinBERT"]
        for i, model_name in enumerate(model_names[: len(metrics_df)]):
            print(f"\n{model_name}:")

            # Look every baseline up afresh per model so that a shorter summary
            # can never leak a value from the previous iteration.
            zs_macro = _metric_at(zero_shot_df, i, "Macro-F1")
            fs_macro = _metric_at(few_shot_df, i, "Macro-F1")
            cot_macro = metrics_df.iloc[i]["Macro-F1"]

            if zs_macro is not None:
                print(f"  Zero-Shot Macro-F1: {zs_macro:.4f}")
            if fs_macro is not None:
                print(
                    f"  Few-Shot Macro-F1:  {fs_macro:.4f} ({_relative_change(fs_macro, zs_macro)} vs Zero-Shot)"
                )
            print(
                f"  CoT Macro-F1:       {cot_macro:.4f} ({_relative_change(cot_macro, fs_macro)} vs Few-Shot, {_relative_change(cot_macro, zs_macro)} vs Zero-Shot)"
            )

            # Negative F1 comparison
            print("\n  Negative F1 Progression:")
            zs_neg = _metric_at(zero_shot_df, i, "Negative_F1")
            fs_neg = _metric_at(few_shot_df, i, "Negative_F1")
            if zs_neg is not None:
                print(f"    Zero-Shot: {zs_neg:.4f}")
            if fs_neg is not None:
                print(f"    Few-Shot:  {fs_neg:.4f}")
            cot_neg = metrics_df.iloc[i]["Negative_F1"]
            print(f"    CoT:       {cot_neg:.4f}")
else:
    print("\n⚠️ Run Few-Shot and Zero-Shot experiments for complete comparison")

## 12. Expected Conclusions from Chain-of-Thought Experiment

### 1. **CoT vs Few-Shot vs Zero-Shot Performance**
   - **Key Question**: Does explicit step-by-step reasoning improve accuracy?
   - **Expected**: CoT should outperform Few-Shot by 5-15% on complex cases
   - **Reality Check**: Previous 100-sample run showed E8 (Llama) had 100% parsing failures
   - **Metric to Watch**: Negative F1 improvement (was 0.0 in previous run for E7 & E8)

### 2. **Reasoning Quality Assessment**
   - **Step-by-Step Analysis**: Do models actually follow the 5-step reasoning process?
   - **Rationale Depth**: Are CoT rationales more detailed than Few-Shot/Zero-Shot?
   - **Error Transparency**: Can we identify where reasoning goes wrong?
   - **Parsing Success Rate**: Critical for Llama (previous: 0% success on CoT format)

### 3. **Model-Specific CoT Effectiveness**
   - **GPT OSS Models**: Expected to handle structured prompts better
   - **Llama**: Previous run showed catastrophic JSON formatting failure
   - **Format Compliance**: New simplified format should reduce parsing errors
   - **Reasoning Capability**: Which model provides best step-by-step analysis?

### 4. **Negative Class Detection (Critical)**
   - **Previous Results**: E7 & E8 had 0% Negative F1 despite CoT reasoning
   - **Enhanced Prompt**: New version explicitly highlights negative indicators with ⚠️
   - **Target**: Negative F1 > 0.60 (297 negative samples in full dataset)
   - **Hypothesis**: Step 3 (NEGATIVE Signals) should force models to look for losses/declines

### 5. **JSON Format Compliance**
   - **Previous Issue**: Llama had 100/100 parsing errors
   - **Root Cause**: Complex CoT format with multiple steps confused the model
   - **Fix**: Simplified format, clearer JSON instruction ("no markdown, no extra text")
   - **Success Metric**: Parsing error rate < 10%

### 6. **Confidence Calibration in CoT**
   - **Expected**: CoT should produce more calibrated confidence scores
   - **Reasoning**: Step-by-step analysis allows uncertainty assessment
   - **Previous**: Llama all predictions had 0.5 confidence (parsing fallback)
   - **Comparison**: CoT vs Few-Shot confidence distribution

### 7. **Computational Cost Analysis**
   - **Longer Prompts**: CoT uses ~2-3x more tokens than Few-Shot
   - **API Costs**: Higher per-prediction cost due to prompt length
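   - **Latency**: The fixed 0.5s pause alone adds 2,217 samples × 2 API models × 0.5s ≈ 37 minutes (FinBERT runs locally without a pause); including API response time, budget ~2 hours for the full dataset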
   - **Trade-off**: Is performance gain worth the added cost?

### 8. **Error Pattern Analysis**
   - **Common Mistakes**: Do all models fail on same sentences?
   - **Reasoning Failures**: Where does step-by-step logic break down?
   - **High-Confidence Errors**: Are wrong predictions still confident?
   - **Class Confusion**: Negative → Neutral or Negative → Positive?

### 9. **CoT Prompt Engineering Effectiveness**
   - **Structured Steps**: Does numbered reasoning improve consistency?
   - **Warning Symbol ⚠️**: Does emphasis on negatives help detection?
   - **Explicit Keywords**: Do listed negative terms ("loss", "decline") trigger recognition?
   - **Simplified Format**: Does cleaner JSON reduce Llama parsing errors?

### 10. **Production Deployment Viability**
   - **Accuracy Threshold**: Need Macro-F1 > 0.75 for deployment
   - **Parsing Reliability**: Must have < 5% format errors
   - **Negative Detection**: Critical for risk assessment (can't miss bad news)
   - **Cost-Benefit**: CoT worth it ONLY if significantly better than Few-Shot

### 11. **Critical Success Factors**

**For Experiment to be Valuable:**
✅ Llama parsing success rate > 90% (vs 0% previously)
✅ Negative F1 > 0.50 for at least one model (vs 0.0 previously)
✅ Macro-F1 improvement over Few-Shot by > 10%
✅ Reasoning quality demonstrably better than Few-Shot

**If These Fail:**
❌ CoT not suitable for these models
❌ Stick with simpler Few-Shot approach
❌ Consider fine-tuning instead of prompt engineering

### 12. **Actionable Recommendations Based on Results**

- **If E8 still fails**: Llama incompatible with complex CoT formats
- **If Negative F1 < 0.3**: Dataset imbalance too severe, need different approach
- **If CoT ≈ Few-Shot**: Simpler prompts preferred (lower cost, same performance)
- **If parsing errors > 20%**: Format still too complex, simplify further