In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn google-generativeai groq python-dotenv tqdm -q

In [None]:
# Suppress deprecation warnings
import warnings
import os

warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
    matthews_corrcoef,
)

from groq import Groq
from dotenv import load_dotenv
from transformers import pipeline
import torch

# Load GROQ_API_KEY (and any other settings) from a local .env file, if one exists.
load_dotenv()

# Always bind groq_client, so that later cells can test it instead of raising a
# NameError (which the retry loop in call_llama would swallow) when the key is missing.
groq_api_key = os.getenv("GROQ_API_KEY")
if groq_api_key:
    groq_client = Groq(api_key=groq_api_key)
else:
    groq_client = None
    print(
        "⚠️ GROQ_API_KEY is not set: Groq-backed experiments (E10/E11) will return no predictions."
    )

# Initialize FinBERT for E12
device = 0 if torch.cuda.is_available() else -1  # first GPU when available, otherwise CPU
finbert_pipeline = pipeline(
    "sentiment-analysis", model="ProsusAI/finbert", device=device
)

sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

print("✓ Setup complete (FinBERT loaded)")

## 1. Load Dataset

In [None]:
data_path = "../../DatasetAnalysis_FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"

# Decode strictly rather than with errors="ignore": that option silently deletes
# every byte that is not valid UTF-8, which changes the sentences sent to the models.
# NOTE(review): the upstream FinancialPhraseBank files are reportedly ISO-8859-1
# encoded - confirm against the local copy. UTF-8 is tried first in case the file
# has been re-encoded; ISO-8859-1 can decode any byte sequence, so it is the fallback.
raw_lines = []
for candidate_encoding in ("utf-8", "iso-8859-1"):
    try:
        with open(data_path, "r", encoding=candidate_encoding) as f:
            raw_lines = f.readlines()
        break
    except UnicodeDecodeError:
        continue

sentences = []
sentiments = []

for line in raw_lines:
    line = line.strip()
    # Each record is "<sentence>@<label>". Split on the LAST "@" because the
    # sentence itself may contain that character.
    if "@" in line:
        sentence_text, label = line.rsplit("@", 1)
        sentences.append(sentence_text)
        sentiments.append(label)

df = pd.DataFrame({"sentence": sentences, "true_sentiment": sentiments})
print(f"Dataset loaded: {len(df)} sentences")
print(f"\nSentiment distribution:")
print(df["true_sentiment"].value_counts())

## 2. Tree-of-Thought Prompt Design

**Multi-Path Reasoning**:
- Path 1: Consider "positive" hypothesis
- Path 2: Consider "negative" hypothesis  
- Path 3: Consider "neutral" hypothesis
- Evaluation: Score each path's evidence strength
- Selection: Choose the most supported hypothesis

In [None]:
def create_tot_prompt(sentence) -> str:
    """
    Build the Tree-of-Thought prompt for a single financial statement.

    The prompt asks the model to weigh three hypotheses (positive, negative,
    neutral), give a confidence score for each, and finish with a JSON object
    holding "sentiment", "confidence", "rationale" and "path_scores".

    Args:
        sentence: Text inserted verbatim, between double quotes, into the prompt.

    Returns:
        The complete prompt string.

    NOTE(review): the FINAL DECISION section contains two consecutive format
    instructions ("Provide ONLY this JSON format (no markdown, no extra text):"
    and "Provide your final answer in this exact JSON format:"). The first one
    forbids extra text while the three paths above ask for written evidence;
    confirm which behaviour is intended before changing the wording, since any
    edit to this literal changes what the models are asked.
    """
    prompt = f"""You are a financial sentiment analysis expert. Analyze this statement using a tree-of-thought approach.

Financial Statement:
"{sentence}"

TASK: Explore three possible sentiment classifications and select the best one.

---
PATH 1: Hypothesis = POSITIVE
Consider if this statement represents positive news for investors.
- What evidence supports this being positive?
- What evidence contradicts this being positive?
- Confidence score (0-1) for this hypothesis:

PATH 2: Hypothesis = NEGATIVE ⚠️
Consider if this statement represents FINANCIAL DETERIORATION for investors.
Look specifically for: losses, declines, revenue drops, margin compression, layoffs, failed ventures, widening losses, falling sales, cost increases, debt problems.
- What evidence supports this being negative?
- What evidence contradicts this being negative?
- Confidence score (0-1) for this hypothesis:

PATH 3: Hypothesis = NEUTRAL
Consider if this statement has no clear market impact.
- What evidence supports this being neutral?
- What evidence contradicts this being neutral?
- Confidence score (0-1) for this hypothesis:

---
FINAL DECISION:
Based on evaluating all three paths, select the hypothesis with the strongest evidence.
Provide ONLY this JSON format (no markdown, no extra text):
Provide your final answer in this exact JSON format:
{{
    "sentiment": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "rationale": "Explanation of why this hypothesis was selected over the others",
    "path_scores": {{
        "positive": 0.0-1.0,
        "negative": 0.0-1.0,
        "neutral": 0.0-1.0
    }}
}}
"""
    return prompt


# Render the prompt once for a deliberately mixed-signal sentence so the
# template can be inspected before any API calls are made.
test_sentence = "The company reported mixed results with revenue up 10% but margins declining."
banner = "=" * 80
print(banner, "TREE-OF-THOUGHT PROMPT EXAMPLE", banner, sep="\n")
print(create_tot_prompt(test_sentence))

## 3. Model Inference Functions

In [None]:
def call_llama(prompt, model_name, temperature=0.0, max_retries=3):
    """
    Send one prompt to a Groq-hosted chat model and return the reply text.

    Args:
        prompt: Content of the single user message.
        model_name: Groq model identifier passed as ``model``.
        temperature: Sampling temperature passed to the API.
        max_retries: Number of attempts made before giving up.

    Returns:
        The message content of the first completion choice, or None when the
        Groq client is not configured or every attempt raised.
    """
    # The setup cell only creates groq_client when GROQ_API_KEY is set. Without
    # this check every call would raise NameError, be swallowed by the handler
    # below, and still wait through the whole back-off schedule.
    client = globals().get("groq_client")
    if client is None:
        warnings.warn(
            "call_llama: groq_client is not configured (is GROQ_API_KEY set?)",
            RuntimeWarning,
        )
        return None

    last_error = None
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=model_name,
                temperature=temperature,
                max_tokens=1500,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            last_error = e
            if attempt < max_retries - 1:
                time.sleep(2**attempt)  # exponential back-off: 1s, 2s, 4s, ...

    # Report why the sample was lost instead of returning None silently.
    warnings.warn(
        f"call_llama: {model_name} failed after {max_retries} attempts: {last_error!r}",
        RuntimeWarning,
    )
    return None


def parse_response(response_text):
    """
    Extract the final classification from a Tree-of-Thought model reply.

    Parsing is attempted in three stages:
      1. A JSON object (inside a ```json fence, or spanning the first "{" to
         the last "}").
      2. An explicit ``"sentiment": "<label>"`` field found in otherwise
         unparseable text.
      3. A keyword scan of the whole reply (last resort).

    Args:
        response_text: Raw reply text from the model.

    Returns:
        A dict with at least a "sentiment" key, or None when the input is not a
        non-empty string or no label can be recovered. For JSON replies a string
        "sentiment" is lower-cased and stripped, and "confidence" is coerced to
        float (0.0 when it cannot be converted). Fallback results carry
        confidence 0.5, rationale "Parsed" and empty path_scores.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    if not isinstance(response_text, str) or not response_text.strip():
        return None

    # Stage 1: structured JSON.
    try:
        if "```json" in response_text:
            json_str = response_text.split("```json")[1].split("```")[0].strip()
        elif "{" in response_text:
            start = response_text.find("{")
            end = response_text.rfind("}") + 1
            json_str = response_text[start:end]
        else:
            json_str = response_text.strip()
        result = json.loads(json_str)
    except (ValueError, IndexError):  # json.JSONDecodeError is a ValueError
        result = None

    # Only a JSON object is usable: callers read the result with .get().
    if isinstance(result, dict):
        sentiment = result.get("sentiment")
        if isinstance(sentiment, str):
            # "Positive" / " NEGATIVE " would otherwise be discarded as invalid
            # labels when metrics are computed.
            result["sentiment"] = sentiment.strip().lower()
        if "confidence" in result:
            try:
                result["confidence"] = float(result["confidence"])
            except (TypeError, ValueError):
                result["confidence"] = 0.0
        return result

    response_lower = response_text.lower()

    # Stage 2: an explicit sentiment field in malformed or truncated JSON. The
    # look-ahead skips an echoed template such as "positive/negative/neutral".
    field_match = re.search(
        r'"?sentiment"?\s*[:=]\s*"?\s*(positive|negative|neutral)\b(?!\s*[/|])',
        response_lower,
    )

    # Stage 3: keyword scan. ToT reasoning usually mentions every label, so this
    # is only reached when no explicit field is present.
    if field_match:
        sentiment = field_match.group(1)
    elif "positive" in response_lower and "negative" not in response_lower:
        sentiment = "positive"
    elif "negative" in response_lower:
        sentiment = "negative"
    elif "neutral" in response_lower:
        sentiment = "neutral"
    else:
        return None

    return {
        "sentiment": sentiment,
        "confidence": 0.5,
        "rationale": "Parsed",
        "path_scores": {},
    }


def call_finbert(sentence):
    """
    Classify one sentence with the FinBERT pipeline.

    FinBERT is a plain classifier, so no Tree-of-Thought prompt is involved;
    the result is shaped like a parsed LLM reply so both can share the same
    evaluation code.

    Returns:
        A dict with "sentiment", "confidence", "rationale" and an empty
        "path_scores", or None if the pipeline call raised. A label outside
        positive/negative/neutral is reported as "neutral".
    """
    known_labels = ("positive", "negative", "neutral")
    try:
        top_prediction = finbert_pipeline(sentence)[0]
        predicted_label = top_prediction["label"].lower()
        if predicted_label not in known_labels:
            predicted_label = "neutral"
        outcome = {
            "sentiment": predicted_label,
            "confidence": top_prediction["score"],
            "rationale": "FinBERT direct inference (ToT reasoning not applicable)",
            "path_scores": {},
        }
    except Exception as e:
        print(f"FinBERT error: {e}")
        outcome = None
    return outcome


print("✓ Inference functions defined (including FinBERT)")

## 4. Run Experiments

In [None]:
# Run on full dataset for comprehensive evaluation
# NOTE(review): the sample count cited on the next line is not checked anywhere in
# the notebook; compare it with the "Dataset loaded" line printed by the loading cell.
test_df = df.copy()  # Use complete dataset (2,217 samples)


# Updated Tree-of-Thought experiments for new LLMs
def run_tot_experiment(test_df, model_func, model_name, exp_id):
    """
    Run one Tree-of-Thought experiment over every row of ``test_df``.

    Args:
        test_df: Frame with "sentence" and "true_sentiment" columns.
        model_func: Callable taking a prompt string and returning the reply
            text, or None/empty on failure.
        model_name: Display name used in the progress message.
        exp_id: Experiment id (e.g. "E10") used in progress output.

    Returns:
        DataFrame with one row per input sentence. Rows whose API call failed
        or whose reply could not be parsed are kept with
        predicted_sentiment="error", so that "Total Samples" reported by
        calculate_metrics reflects the whole test set rather than only the
        successes (calculate_metrics already filters to the three valid labels).
    """
    print(f"Running {exp_id}: {model_name} (Tree-of-Thought)...")
    results = []
    failed = 0

    for idx, row in tqdm(
        test_df.iterrows(), total=len(test_df), desc=f"{exp_id} Progress"
    ):
        prompt = create_tot_prompt(row["sentence"])
        response = model_func(prompt)
        parsed = parse_response(response) if response else None
        raw_text = response[:700] if isinstance(response, str) else ""

        if isinstance(parsed, dict):
            record = {
                "sentence": row["sentence"],
                "true_sentiment": row["true_sentiment"],
                "predicted_sentiment": parsed.get("sentiment", "unknown"),
                "confidence": parsed.get("confidence", 0),
                "rationale": parsed.get("rationale", ""),
                "path_scores": str(parsed.get("path_scores", {})),
                "full_response": raw_text,
            }
        else:
            # Keep the failure visible instead of silently shrinking the sample.
            failed += 1
            record = {
                "sentence": row["sentence"],
                "true_sentiment": row["true_sentiment"],
                "predicted_sentiment": "error",
                "confidence": 0,
                "rationale": "",
                "path_scores": str({}),
                "full_response": raw_text,
            }
        results.append(record)

        time.sleep(0.5)  # Adjusted for new LLMs

    results_df = pd.DataFrame(results)
    print(f"\n✓ {exp_id} completed: {len(results_df) - failed} predictions")
    if failed:
        print(f"⚠️ {exp_id}: {failed} of {len(results_df)} samples had no usable response")
    return results_df


# Run Tree-of-Thought experiments with new LLMs
# NOTE(review): hosted model ids get retired over time; confirm that both ids
# below are still served by Groq before launching a full run.
e10_df = run_tot_experiment(
    test_df,
    lambda p: call_llama(p, model_name="mixtral-8x7b-32768"),
    "Mixtral-8x7B",
    "E10",
)
e11_df = run_tot_experiment(
    test_df,
    lambda p: call_llama(p, model_name="llama-3.1-70b-versatile"),
    "Llama-3.1-70B",
    "E11",
)

# E12: FinBERT (direct inference, no ToT reasoning)
print("Running E12: FinBERT (Tree-of-Thought - Direct Classification)...")
e12_results = []
for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="E12 Progress"):
    result = call_finbert(row["sentence"])
    if result:
        e12_results.append(
            {
                "sentence": row["sentence"],
                "true_sentiment": row["true_sentiment"],
                "predicted_sentiment": result["sentiment"],
                "confidence": result["confidence"],
                "rationale": result["rationale"],
                "path_scores": str(result["path_scores"]),
                "full_response": f"FinBERT: {result['sentiment']} ({result['confidence']:.2f})",
            }
        )
    # No sleep here: FinBERT runs locally through finbert_pipeline, so there is
    # no API rate limit to respect and a per-row delay only lengthens the run.

e12_df = pd.DataFrame(e12_results)
print(f"\n✓ E12 completed: {len(e12_df)} predictions")

display(e10_df.head())
display(e11_df.head())
display(e12_df.head())

## 5. Calculate Metrics

In [None]:
def _empty_metric_result(exp_name, total_samples, source_df):
    """Return the all-zero (metrics, confusion matrix, frame) triple for an unscorable experiment."""
    metrics = {
        "Experiment": exp_name,
        "Total Samples": total_samples,
        "Valid Predictions": 0,
    }
    for overall_key in (
        "Accuracy",
        "Macro-F1",
        "Weighted-F1",
        "Macro-Precision",
        "Macro-Recall",
        "MCC",
    ):
        metrics[overall_key] = 0
    for class_name in ("Positive", "Negative", "Neutral"):
        for stat in ("Precision", "Recall", "F1"):
            metrics[f"{class_name}_{stat}"] = 0

    # Keep the columns that later cells index, so an empty experiment does not
    # surface there as a KeyError.
    columns = list(source_df.columns)
    for required in ("sentence", "true_sentiment", "predicted_sentiment", "confidence"):
        if required not in columns:
            columns.append(required)

    # Integer dtype: the confusion-matrix heatmap formats cells with fmt="d".
    return metrics, np.zeros((3, 3), dtype=int), pd.DataFrame(columns=columns)


def calculate_metrics(df, exp_name):
    """
    Calculate all evaluation metrics for one experiment.

    Args:
        df: Results frame with "true_sentiment" and "predicted_sentiment" columns.
        exp_name: Label stored under the "Experiment" key.

    Returns:
        Tuple ``(metrics, cm, valid_df)``:
          * metrics - dict of overall and per-class scores,
          * cm - 3x3 confusion matrix ordered positive, negative, neutral,
          * valid_df - copy of the rows whose prediction is one of those labels.
        When nothing can be scored, every score is 0, ``cm`` is an integer zero
        matrix and ``valid_df`` is an empty frame that still has the expected
        columns.
    """
    labels = ["positive", "negative", "neutral"]

    # Check if dataframe is empty or missing required columns
    if df.empty or "predicted_sentiment" not in df.columns:
        print(f"⚠️ Warning: {exp_name} has no valid predictions!")
        return _empty_metric_result(exp_name, 0, df)

    # Filter out errors
    valid_df = df[df["predicted_sentiment"].isin(labels)].copy()

    # Check if we have valid predictions
    if valid_df.empty:
        print(f"⚠️ Warning: {exp_name} has no valid predictions after filtering!")
        return _empty_metric_result(exp_name, len(df), df)

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    # labels/zero_division are passed to the averaged scores as well, so they are
    # computed over the same three classes as the per-class columns below and a
    # never-predicted class yields 0 instead of an UndefinedMetricWarning.
    averaged = {"labels": labels, "zero_division": 0}
    metrics = {
        "Experiment": exp_name,
        "Total Samples": len(df),
        "Valid Predictions": len(valid_df),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro", **averaged),
        "Weighted-F1": f1_score(y_true, y_pred, average="weighted", **averaged),
        "Macro-Precision": precision_score(y_true, y_pred, average="macro", **averaged),
        "Macro-Recall": recall_score(y_true, y_pred, average="macro", **averaged),
        "MCC": matthews_corrcoef(y_true, y_pred),
    }

    # Per-class metrics
    precision_per_class = precision_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    recall_per_class = recall_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    f1_per_class = f1_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )

    for i, label in enumerate(labels):
        metrics[f"{label.capitalize()}_Precision"] = precision_per_class[i]
        metrics[f"{label.capitalize()}_Recall"] = recall_per_class[i]
        metrics[f"{label.capitalize()}_F1"] = f1_per_class[i]

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    return metrics, cm, valid_df


# Score every experiment. Each call yields (metrics dict, confusion matrix,
# frame restricted to scorable predictions).
(
    (e10_metrics, e10_cm, e10_valid),
    (e11_metrics, e11_cm, e11_valid),
    (e12_metrics, e12_cm, e12_valid),
) = [
    calculate_metrics(result_frame, experiment_title)
    for result_frame, experiment_title in (
        (e10_df, "E10: Mixtral-8x7B (ToT)"),
        (e11_df, "E11: Llama-3.1-70B (ToT)"),
        (e12_df, "E12: FinBERT (ToT)"),
    )
]

# One row per experiment, used for the summary table and the later CSV export.
metrics_df = pd.DataFrame([e10_metrics, e11_metrics, e12_metrics])

headline_columns = [
    "Experiment",
    "Accuracy",
    "Macro-F1",
    "Macro-Precision",
    "Macro-Recall",
    "MCC",
]
separator = "=" * 80
print("\n" + separator)
print("TREE-OF-THOUGHT PERFORMANCE COMPARISON")
print(separator)
display(metrics_df[headline_columns].round(4))

## 6. Visualize Results

In [None]:
# Two grouped bar charts side by side: overall metrics (left) and per-class F1 (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
width = 0.25

model_results = [
    (e10_metrics, "Mixtral-8x7B"),
    (e11_metrics, "Llama-3.1-70B"),
    (e12_metrics, "FinBERT"),
]

metrics_to_plot = ["Accuracy", "Macro-F1", "Macro-Precision", "Macro-Recall", "MCC"]
classes = ["Positive", "Negative", "Neutral"]

# One entry per panel:
# (axis, metric keys, tick labels, tick-label options, x label, y label, title)
panel_specs = [
    (
        axes[0],
        metrics_to_plot,
        metrics_to_plot,
        {"rotation": 15, "ha": "right"},
        "Metrics",
        "Score",
        "Overall Performance Comparison (Tree-of-Thought)",
    ),
    (
        axes[1],
        [f"{c}_F1" for c in classes],
        classes,
        {},
        "Sentiment Class",
        "F1 Score",
        "Per-Class F1 Scores (Tree-of-Thought)",
    ),
]

for ax, metric_keys, tick_labels, tick_options, x_label, y_label, panel_title in panel_specs:
    positions = np.arange(len(metric_keys))

    # Each model's bars are shifted by one bar width within every group.
    for offset, (model_metrics, model_label) in enumerate(model_results):
        heights = [model_metrics[key] for key in metric_keys]
        ax.bar(positions + offset * width, heights, width, label=model_label, alpha=0.8)

    ax.set_xlabel(x_label, fontsize=12, weight="bold")
    ax.set_ylabel(y_label, fontsize=12, weight="bold")
    ax.set_title(panel_title, fontsize=14, weight="bold")
    ax.set_xticks(positions + width)  # centre the tick under the middle bar
    ax.set_xticklabels(tick_labels, **tick_options)
    ax.legend()
    ax.set_ylim([0, 1])
    ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig("tot_performance_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Confusion matrices
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
labels = ["Positive", "Negative", "Neutral"]

for idx, (cm, title) in enumerate(
    [
        (e10_cm, "E10: Mixtral-8x7B"),
        (e11_cm, "E11: Llama-3.1-70B"),
        (e12_cm, "E12: FinBERT"),
    ]
):
    # fmt="d" only accepts integers. calculate_metrics falls back to
    # np.zeros((3, 3)), a float array, when an experiment has no valid
    # predictions, so cast here to keep the annotation from raising.
    cm_counts = np.asarray(cm).astype(int)
    sns.heatmap(
        cm_counts,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=axes[idx],
        cbar_kws={"label": "Count"},
    )
    axes[idx].set_title(title, fontsize=12, weight="bold")
    axes[idx].set_ylabel("True Label", fontsize=11, weight="bold")
    axes[idx].set_xlabel("Predicted Label", fontsize=11, weight="bold")

plt.suptitle(
    "Confusion Matrices - Tree-of-Thought Sentiment Analysis",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("tot_confusion_matrices.png", dpi=300, bbox_inches="tight")
plt.show()

## 7. Save Results

In [None]:
# Save detailed results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Map each output file to the frame written into it, so the files written and
# the files reported below cannot drift apart.
csv_outputs = {
    f"e10_Mixtral_8x7B_tot_{timestamp}.csv": e10_df,
    f"e11_Llama_3.1_70B_tot_{timestamp}.csv": e11_df,
    f"e12_FinBERT_tot_{timestamp}.csv": e12_df,
    f"tot_metrics_summary_{timestamp}.csv": metrics_df,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)

print(f"\n✓ Results saved with timestamp: {timestamp}")
print("\nFiles created:")
for csv_name in csv_outputs:
    print(f"  - {csv_name}")
print("  - tot_performance_comparison.png")
print("  - tot_confusion_matrices.png")
# tot_confidence_analysis.png is saved by the confidence-calibration cell further
# down, so it does not exist yet when this cell runs in notebook order.
print("  - tot_confidence_analysis.png (written later by the confidence analysis cell)")

## 8. Error Analysis

In [None]:
# Error Analysis: Most Common Misclassifications
print("=" * 80)
print("ERROR ANALYSIS: TOP MISCLASSIFIED PATTERNS")
print("=" * 80)

for df_result, exp_name in [
    (e10_valid, "E10: Mixtral-8x7B"),
    (e11_valid, "E11: Llama-3.1-70B"),
    (e12_valid, "E12: FinBERT"),
]:
    print(f"\n{exp_name}:")

    # calculate_metrics returns a bare pd.DataFrame() when an experiment has no
    # valid predictions; indexing its columns would raise KeyError.
    if df_result.empty or not {"true_sentiment", "predicted_sentiment"}.issubset(
        df_result.columns
    ):
        print("No valid predictions to analyse.")
        continue

    errors = df_result[df_result["true_sentiment"] != df_result["predicted_sentiment"]]

    # Count confusion pairs
    confusion_pairs = errors.groupby(["true_sentiment", "predicted_sentiment"]).size()
    print(f"Total Errors: {len(errors)}")
    print("\nMost Common Misclassifications:")
    for (true_label, pred_label), count in (
        confusion_pairs.sort_values(ascending=False).head(5).items()
    ):
        print(f"  {true_label} → {pred_label}: {count} errors")

    # Show sample errors
    print(f"\nSample Misclassified Sentences:")
    for idx, row in errors.head(3).iterrows():
        # LLM replies can carry confidence as text; coerce so the float format
        # below cannot raise (unparseable values print as nan).
        confidence_value = pd.to_numeric(row.get("confidence"), errors="coerce")
        print(f"\n  Sentence: {row['sentence'][:100]}...")
        print(
            f"  True: {row['true_sentiment']} | Predicted: {row['predicted_sentiment']} | Confidence: {confidence_value:.3f}"
        )

## 9. Confidence Calibration Analysis

In [None]:
# Confidence analysis
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, (df_result, title) in enumerate(
    [(e10_valid, "Mixtral-8x7B"), (e11_valid, "Llama-3.1-70B"), (e12_valid, "FinBERT")]
):
    # An experiment without valid predictions comes back as a bare
    # pd.DataFrame(); label the panel instead of failing on a missing column.
    if df_result.empty or not {
        "true_sentiment",
        "predicted_sentiment",
        "confidence",
    }.issubset(df_result.columns):
        axes[idx].set_title(f"{title}\nNo valid predictions", fontsize=11, weight="bold")
        axes[idx].set_xlabel("Confidence Score", fontsize=11, weight="bold")
        axes[idx].set_ylabel("Frequency", fontsize=11, weight="bold")
        continue

    df_result["correct"] = (
        df_result["true_sentiment"] == df_result["predicted_sentiment"]
    )

    # Confidence values parsed from LLM replies may be text or missing; coerce to
    # numbers and drop what cannot be used, because hist() cannot bin NaN/strings.
    confidence = pd.to_numeric(df_result["confidence"], errors="coerce")
    correct_conf = confidence[df_result["correct"]].dropna()
    incorrect_conf = confidence[~df_result["correct"]].dropna()

    axes[idx].hist(
        [correct_conf, incorrect_conf],
        bins=20,
        label=["Correct", "Incorrect"],
        alpha=0.7,
        color=["green", "red"],
    )
    axes[idx].set_xlabel("Confidence Score", fontsize=11, weight="bold")
    axes[idx].set_ylabel("Frequency", fontsize=11, weight="bold")
    axes[idx].set_title(
        f"{title}\nMean Conf: Correct={correct_conf.mean():.3f}, Incorrect={incorrect_conf.mean():.3f}",
        fontsize=11,
        weight="bold",
    )
    axes[idx].legend()
    axes[idx].grid(alpha=0.3)

plt.suptitle(
    "Confidence Distribution: Correct vs Incorrect Predictions (ToT)",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("tot_confidence_analysis.png", dpi=300, bbox_inches="tight")
plt.show()

## 10. Classification Reports

In [None]:
# Print detailed metrics for each experiment
print("\n" + "=" * 80)
print("DETAILED CLASSIFICATION REPORTS")
print("=" * 80)

for df_result, exp_name in [
    (e10_valid, "E10: Mixtral-8x7B (ToT)"),
    (e11_valid, "E11: Llama-3.1-70B (ToT)"),
    (e12_valid, "E12: FinBERT (ToT)"),
]:
    print("\n" + "=" * 80)
    print(f"{exp_name}")
    print("=" * 80)

    # Experiments without valid predictions arrive as a bare pd.DataFrame();
    # report that instead of failing on the missing columns.
    if df_result.empty or not {"true_sentiment", "predicted_sentiment"}.issubset(
        df_result.columns
    ):
        print("No valid predictions - classification report skipped.")
        continue

    print(
        classification_report(
            df_result["true_sentiment"],
            df_result["predicted_sentiment"],
            labels=["positive", "negative", "neutral"],
            target_names=["Positive", "Negative", "Neutral"],
            zero_division=0,  # same convention as the per-class scores in calculate_metrics
        )
    )

# Per-class metrics summary table
print("\n" + "=" * 80)
print("PER-CLASS METRICS SUMMARY")
print("=" * 80)
class_metrics_summary = metrics_df[
    [
        "Experiment",
        "Positive_Precision",
        "Positive_Recall",
        "Positive_F1",
        "Negative_Precision",
        "Negative_Recall",
        "Negative_F1",
        "Neutral_Precision",
        "Neutral_Recall",
        "Neutral_F1",
    ]
]
display(class_metrics_summary.round(4))

## 11. Tree-of-Thought vs CoT vs Few-Shot vs Zero-Shot Comparison

In [None]:
# Load results from all prompting approaches for comprehensive comparison
import glob  # NOTE(review): ideally moved to the notebook's import cell

# Metrics summaries written by the sibling notebooks, keyed by approach name.
prior_result_patterns = {
    "Zero-Shot": "../Zero_Shot/zero_shot_metrics_summary_*.csv",
    "Few-Shot": "../Few_Shot/few_shot_metrics_summary_*.csv",
    "Chain-of-Thought": "../Chain_of_Thought/cot_metrics_summary_*.csv",
}

all_approaches = []
missing_approaches = []

for approach_name, pattern in prior_result_patterns.items():
    matches = sorted(glob.glob(pattern))
    if not matches:
        missing_approaches.append(approach_name)
        continue
    # The lexicographically last file is taken as the most recent one; this
    # presumes the other notebooks use the same %Y%m%d_%H%M%S suffix as this one.
    approach_df = pd.read_csv(matches[-1])
    approach_df["Approach"] = approach_name
    all_approaches.append(approach_df)

# The ToT results of this run are always part of the comparison, so the list
# itself is never empty; missing prior results are tracked separately so that
# the warning below can actually fire.
if len(missing_approaches) == len(prior_result_patterns):
    print(
        "⚠️ No comparison files found. Run other experiments first to enable full comparison."
    )
elif missing_approaches:
    print(f"⚠️ No results found for: {', '.join(missing_approaches)}")

# Add current ToT results
tot_current = metrics_df.copy()
tot_current["Approach"] = "Tree-of-Thought"
all_approaches.append(tot_current)

# Combine all results
comparison_df = pd.concat(all_approaches, ignore_index=True)

# Reorder columns for better readability
cols = [
    "Approach",
    "Experiment",
    "Accuracy",
    "Macro-F1",
    "MCC",
    "Macro-Precision",
    "Macro-Recall",
]
available_cols = [c for c in cols if c in comparison_df.columns]

print("\n" + "=" * 100)
print("COMPREHENSIVE PROMPTING APPROACH COMPARISON")
print("=" * 100)
display(comparison_df[available_cols].round(4))

# Best performance by approach
print("\n" + "=" * 100)
print("BEST PERFORMANCE BY APPROACH")
print("=" * 100)
best_by_approach = comparison_df.groupby("Approach")[
    ["Accuracy", "Macro-F1", "MCC"]
].max()
display(best_by_approach.round(4))

# Analysis
print("\n" + "=" * 100)
print("KEY INSIGHTS")
print("=" * 100)
for metric_name in ("Accuracy", "Macro-F1", "MCC"):
    best_row = comparison_df.loc[comparison_df[metric_name].idxmax()]
    print(
        f"✓ Best {metric_name}: {best_row['Experiment']} ({best_row['Approach']}) = {best_row[metric_name]:.4f}"
    )

## 12. Expected Conclusions from Tree-of-Thought Experiment

### Multi-Path Reasoning Analysis

**1. Hypothesis Exploration Impact**
- **Strength**: ToT forces models to systematically evaluate all three sentiment possibilities (positive, negative, neutral) before making a decision
- **Expected Behavior**: Should reduce "first impression bias" where models commit to an answer too quickly
- **Path Score Analysis**: When all three paths have similar scores, this indicates an ambiguous/difficult sentence that may benefit from human review
- **Justification Quality**: ToT provides explicit rationale comparing all options, making predictions more explainable than zero-shot

**2. Computational Cost vs Performance Trade-Off**
- **Token Usage**: ~3x higher than Zero-Shot due to exploring three hypothesis paths
- **Cost Implication**: If ToT improves Macro-F1 by <5%, it may not justify 3x cost increase
- **Latency**: Longer prompts = slower inference, problematic for real-time applications
- **Recommendation**: Use ToT only if accuracy gains outweigh cost, or for high-stakes predictions where explainability matters

**3. Negative Sentiment Detection (Critical)**
- **Challenge**: Negative class is typically hardest to detect (lowest recall in all approaches)
- **ToT Advantage**: Explicit "negative hypothesis" path with listed negative indicators (losses, declines, revenue drops, etc.) should improve negative recall
- **Expected Improvement**: If ToT negative recall > CoT/Few-Shot/Zero-Shot, this validates the multi-path approach
- **Business Impact**: Better negative detection = earlier risk identification in financial monitoring

**4. Model-Specific Behaviors**

   **Mixtral-8x7B (E10)**:
   - **Architecture**: Mixture-of-Experts (8 specialists)
   - **Expected Strength**: Different experts may naturally align with different hypothesis paths
   - **Potential Weakness**: Only about 13B of its roughly 47B parameters are active per token (2 of 8 experts are routed per layer), which may limit complex multi-step reasoning
   - **Prediction**: Moderate performance, good cost-efficiency

   **Llama-3.1-70B (E11)**:
   - **Architecture**: Dense 70B parameter model
   - **Expected Strength**: Larger capacity = better complex reasoning and hypothesis evaluation
   - **Expected Weakness**: May still lack financial domain knowledge
   - **Prediction**: Best general performance among LLMs

   **FinBERT (E12)**:
   - **Architecture**: BERT-base fine-tuned on financial text
   - **Expected Strength**: Deep financial domain knowledge, fast inference
   - **Expected Weakness**: Cannot perform actual ToT reasoning (no multi-path exploration)
   - **Note**: E12 uses direct classification, not ToT reasoning (included for comparison only)
   - **Prediction**: Competitive performance despite no ToT reasoning, due to domain specialization

**5. Confidence Calibration**
- **Well-Calibrated**: Correct predictions should have higher avg confidence than incorrect ones
- **ToT Advantage**: Path score agreement can indicate confidence (all paths agree → high confidence)
- **Calibration Gap**: Measure difference between avg confidence for correct vs incorrect predictions
- **Expected Finding**: ToT should have better calibration than Zero-Shot due to multi-path validation

**6. Error Pattern Analysis**
- **Common Errors**: neutral→negative, positive→neutral (from all previous experiments)
- **ToT Mitigation**: Explicit neutral hypothesis path should reduce false positives (mislabeling neutral as positive/negative)
- **High-Confidence Errors**: When ToT is confident but wrong, examine path scores to understand why all paths converged on wrong answer
- **Learning Opportunity**: Error analysis reveals whether ToT reasoning is fundamentally flawed or just needs better path prompts

**7. Decision Quality Metrics**
- **MCC (Matthews Correlation Coefficient)**: Better metric than accuracy for imbalanced classes
- **Per-Class F1**: Track Positive_F1, Negative_F1, Neutral_F1 separately to identify class-specific strengths/weaknesses
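- **Macro-F1 vs Weighted-F1**: Macro treats all classes equally, so it is the better headline metric for this imbalanced dataset, where the minority negative class matters most; Weighted scales each class by its support and is therefore dominated by the majority class
- **Expected Ranking**: MCC should rank models similarly to Macro-F1, but it penalizes models that neglect a minority class more heavily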

**8. Comparison with Other Approaches**

   **vs Zero-Shot**:
   - **Expected**: ToT should outperform due to structured reasoning
   - **Trade-Off**: 3x cost, slower inference
   - **Decision**: Use ToT if accuracy gain > 5%

   **vs Few-Shot**:
   - **Expected**: Close competition (Few-Shot has 6 examples, ToT has 3 paths)
   - **Advantage ToT**: No need to curate examples
   - **Advantage Few-Shot**: Provides concrete patterns to mimic

   **vs Chain-of-Thought**:
   - **Expected**: ToT should match or exceed CoT
   - **CoT**: Linear 5-step reasoning
   - **ToT**: Parallel 3-path hypothesis exploration
   - **Key Difference**: ToT explores multiple possibilities, CoT follows single reasoning chain

**9. Production Deployment Considerations**

   **Best for Accuracy** (regardless of cost):
   - Highest Macro-F1 model
   - Priority: Minimize misclassifications

   **Best for Cost-Efficiency**:
   - Acceptable F1 (>0.75) at lowest token cost
   - Priority: Balance accuracy and operational cost
   - Likely winner: Zero-Shot or FinBERT

   **Best for Reliability** (lowest variance, highest negative recall):
   - Consistent performance across all classes
   - Priority: Avoid missing negative signals
   - Metric: Highest Negative_Recall + lowest stddev across runs

   **Best for Explainability**:
   - ToT provides path scores and explicit reasoning
   - Priority: Audit trail for regulatory compliance
   - Use case: High-stakes decisions requiring justification

**10. Hypothesis Validation**
- **Hypothesis 1**: ToT improves negative recall over simpler approaches ✓/✗
- **Hypothesis 2**: ToT provides better confidence calibration ✓/✗
- **Hypothesis 3**: ToT justifies 3x cost increase with >5% accuracy improvement ✓/✗
- **Hypothesis 4**: Larger models (Llama-70B) benefit more from ToT reasoning than smaller models ✓/✗
- **Hypothesis 5**: FinBERT's domain knowledge compensates for lack of ToT reasoning ✓/✗

**11. Limitations and Future Work**
- **Path Design**: Current 3-path design is manual; could optimize with automatic path generation
- **Path Scoring**: Models may not accurately self-score hypothesis strength
- **Prompt Sensitivity**: ToT performance heavily depends on how negative indicators are phrased
- **Computational Cost**: 3x token usage limits scalability for high-volume applications
- **Future**: Experiment with 2-path (positive/negative only) or 4-path (adding "mixed" sentiment) designs

**12. Final Recommendation**
- **Academic/Research**: Use ToT for explainability and to understand model reasoning processes
- **Production/Enterprise**: Use ToT only if:
  1. Accuracy gain > 5% over simpler approaches
  2. Explainability is required for compliance
  3. Computational cost is acceptable for use case
- **Most Practical**: FinBERT (E12) or best Few-Shot model likely offers best accuracy/cost trade-off for real-world financial sentiment monitoring