# Comprehensive Comparative Analysis: Prompt Engineering for Financial Sentiment Analysis

**Experiment Matrix**: 4 Approaches × 3 Models = 12 Experiments

| Approach | Mixtral-8x7B | Llama-3.1-70B | FinBERT |
|----------|--------------|---------------|---------|
| **Zero-Shot** | E1 | E2 | E3 |
| **Few-Shot** | E4 | E5 | E6 |
| **Chain-of-Thought** | E7 | E8 | E9 |
| **Tree-of-Thought** | E10 | E11 | E12 |

**Dataset**: FinancialPhraseBank Sentences_AllAgree.txt (2,217 samples: 297 negative, 1,361 neutral, 559 positive)

**Research Questions**:
1. Which model performs best across all prompting strategies?
2. Does prompting complexity improve performance (Zero→Few→CoT→ToT)?
3. Can prompt engineering beat domain-specific fine-tuning (FinBERT)?
4. What is the cost-benefit trade-off for each approach?
5. Which combination is best for production deployment?

In [None]:
# Install required packages
!pip install pandas numpy matplotlib seaborn scikit-learn scipy statsmodels -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
)
from scipy import stats
import warnings

# Suppress every warning category for the rest of the session so the
# notebook output stays compact.
warnings.filterwarnings("ignore")

# Shared plotting defaults: seaborn grid style and palette, plus a wide
# default matplotlib figure size.
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams.update({"figure.figsize": (16, 8)})

print("‚úì Libraries imported successfully")

## 1. Load All Experiment Results

In [None]:
# Define experiment metadata with actual file patterns
# One row per experiment: (id, model label, prompting approach, CSV glob).
# NOTE(review): the model labels (Mixtral / Llama-3.1 / FinBERT) do not match
# the model names embedded in the file patterns (gpt_oss_20b / gpt_oss_120b /
# llama) - confirm which of the two is authoritative.
_EXPERIMENT_TABLE = [
    ("E1", "Mixtral-8x7B", "Zero-Shot", "../Zero_Shot/e1_gpt_oss_20b_zero_shot_*.csv"),
    ("E2", "Llama-3.1-70B", "Zero-Shot", "../Zero_Shot/e2_gpt_oss_120b_zero_shot_*.csv"),
    ("E3", "FinBERT", "Zero-Shot", "../Zero_Shot/e3_llama_zero_shot_*.csv"),
    ("E4", "Mixtral-8x7B", "Few-Shot", "../Few_Shot/e4_gpt_oss_20b_few_shot_*.csv"),
    ("E5", "Llama-3.1-70B", "Few-Shot", "../Few_Shot/e5_gpt_oss_120b_few_shot_*.csv"),
    ("E6", "FinBERT", "Few-Shot", "../Few_Shot/e6_llama_few_shot_*.csv"),
    ("E7", "Mixtral-8x7B", "Chain-of-Thought", "../Chain_of_Thought/e7_GPT_OSS_20B_cot_*.csv"),
    ("E8", "Llama-3.1-70B", "Chain-of-Thought", "../Chain_of_Thought/e8_GPT_OSS_120B_cot_*.csv"),
    ("E9", "FinBERT", "Chain-of-Thought", "../Chain_of_Thought/e9_Llama-3.3-70B_cot_*.csv"),
    ("E10", "Mixtral-8x7B", "Tree-of-Thought", "../Tree_of_Thought/e10_GPT_OSS_20B_tot_*.csv"),
    ("E11", "Llama-3.1-70B", "Tree-of-Thought", "../Tree_of_Thought/e11_GPT_OSS_120B_flash_tot_*.csv"),
    ("E12", "FinBERT", "Tree-of-Thought", "../Tree_of_Thought/e12_Llama_3.3_70B_tot_*.csv"),
]

# Experiment metadata keyed by experiment id, in E1..E12 order.
experiments = {
    exp_key: {"model": model_name, "approach": approach_name, "file": file_glob}
    for exp_key, model_name, approach_name, file_glob in _EXPERIMENT_TABLE
}

# Load results with robust error handling
import glob
import os


def load_experiment_robust(exp_id, exp_info):
    """Load the newest result CSV for one experiment, or return None.

    Args:
        exp_id: Experiment identifier, used only in the printed messages.
        exp_info: Mapping whose ``"file"`` entry is a glob pattern for the
            experiment's result CSV files.

    Returns:
        The DataFrame read from the most recently modified matching file.
        Returns None (after printing the reason) when no file matches, when
        the ``true_sentiment`` / ``predicted_sentiment`` columns are missing,
        when the file has no rows, or when any exception is raised.
    """
    try:
        pattern = exp_info["file"]
        candidates = glob.glob(pattern)
        if len(candidates) == 0:
            print(f"‚ö†Ô∏è  {exp_id}: No files found matching '{pattern}'")
            return None

        # Several runs may match the pattern; keep the newest by mtime.
        latest_file = max(candidates, key=os.path.getmtime)
        file_name = os.path.basename(latest_file)
        frame = pd.read_csv(latest_file)

        # Both label columns are needed by every downstream metric.
        needed = {"true_sentiment", "predicted_sentiment"}
        if not needed.issubset(frame.columns):
            print(f"‚ùå {exp_id}: Missing required columns in {file_name}")
            return None

        row_count = len(frame)
        if row_count == 0:
            print(f"‚ö†Ô∏è  {exp_id}: Empty dataframe in {file_name}")
            return None

        print(f"‚úì {exp_id}: Loaded {row_count:,} samples from {file_name}")
        return frame

    except Exception as err:
        # Best-effort loader: report the problem and let the caller continue.
        print(f"‚ùå {exp_id}: Error loading data - {err}")
        return None


# ---------------------------------------------------------------------------
# Load every configured experiment, then report per approach and overall.
# ---------------------------------------------------------------------------
print("\n" + "=" * 100)
print("üìä LOADING ALL EXPERIMENT RESULTS")
print("=" * 100)
print("\nAttempting to load 12 experiments across 4 approaches...\n")

# results: experiment id -> loaded DataFrame (only experiments that loaded).
# loading_status: approach -> list of (experiment id, loaded?, row count).
results = {}
loading_status = {
    approach_name: []
    for approach_name in (
        "Zero-Shot",
        "Few-Shot",
        "Chain-of-Thought",
        "Tree-of-Thought",
    )
}

for exp_id, exp_info in experiments.items():
    df = load_experiment_robust(exp_id, exp_info)
    was_loaded = df is not None
    if was_loaded:
        results[exp_id] = df
    loading_status[exp_info["approach"]].append(
        (exp_id, was_loaded, len(df) if was_loaded else 0)
    )

# Detailed summary by approach
print("\n" + "=" * 100)
print("üìà LOADING SUMMARY BY APPROACH")
print("=" * 100)

for approach, status_list in loading_status.items():
    total = len(status_list)
    loaded = sum(1 for entry in status_list if entry[1])

    if loaded == total:
        icon, status = "‚úÖ", "COMPLETE"
    elif loaded > 0:
        icon, status = "‚ö†Ô∏è ", "PARTIAL"
    else:
        icon, status = "‚ùå", "MISSING"

    print(f"\n{icon} {approach}: {loaded}/{total} {status}")
    for exp_id, success, count in status_list:
        exp_model = experiments[exp_id]["model"]
        if not success:
            print(f"   ‚úó {exp_id} ({exp_model}): Not found")
            continue
        print(f"   ‚úì {exp_id} ({exp_model}): {count:,} samples")

# Overall summary
loaded_count = len(results)
total_count = len(experiments)
success_rate = (loaded_count / total_count) * 100

print("\n" + "=" * 100)
if loaded_count == total_count:
    print(f"‚úÖ SUCCESS: All {total_count} experiments loaded ({success_rate:.0f}%)")
    total_samples = sum(map(len, results.values()))
    print(f"üìä Total samples across all experiments: {total_samples:,}")
elif loaded_count > 0:
    print(
        f"‚ö†Ô∏è  PARTIAL SUCCESS: {loaded_count}/{total_count} experiments loaded ({success_rate:.0f}%)"
    )
    print(
        f"\nüí° TIP: Missing {total_count - loaded_count} experiments. Run corresponding notebooks to complete analysis."
    )
    missing = [key for key in experiments if key not in results]
    print(f"   Missing: {', '.join(missing)}")
else:
    print(f"‚ùå ERROR: No experiments loaded (0/{total_count})")
    print("\nüîç TROUBLESHOOTING GUIDE:")
    print("   1. Verify experiment notebooks exist:")
    for notebook_hint in (
        "      ‚Ä¢ Zero_Shot/E1_E2_E3_zero_shot_sentiment_All_agree.ipynb",
        "      ‚Ä¢ Few_Shot/E4_E5_E6_few_shot_sentiment.ipynb",
        "      ‚Ä¢ Chain_of_Thought/E7_E8_E9_cot_sentiment.ipynb",
        "      ‚Ä¢ Tree_of_Thought/E10_tot_sentiment.ipynb",
    ):
        print(notebook_hint)
    print("\n   2. Run experiment notebooks to generate CSV files")
    print("\n   3. Check file patterns match expectations:")
    # Show only the first three patterns as examples.
    for exp_id, exp_info in list(experiments.items())[:3]:
        print(f"      ‚Ä¢ {exp_id}: {exp_info['file']}")
    print("      ...")

print("=" * 100)

## 2. Calculate Comprehensive Metrics

In [None]:
def calculate_all_metrics(df, exp_id, model, approach):
    """Compute overall and per-class classification metrics for one experiment.

    Rows whose ``predicted_sentiment`` is not one of positive / negative /
    neutral are counted as parsing errors and excluded from every score.

    Args:
        df: Experiment results with ``true_sentiment`` and
            ``predicted_sentiment`` columns; a ``confidence`` column is used
            when present.
        exp_id: Experiment identifier copied into the result.
        model: Model label copied into the result.
        approach: Prompting approach label copied into the result.

    Returns:
        A dict of metrics (including the confusion matrix), or None when no
        row has a valid prediction.
    """
    labels = ["positive", "negative", "neutral"]
    pos_idx, neg_idx, neu_idx = (labels.index(name) for name in labels)

    # Keep only predictions that parsed into a known label.
    parsed_mask = df["predicted_sentiment"].isin(labels)
    valid_df = df[parsed_mask].copy()
    if valid_df.empty:
        return None

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    # Per-class scores, ordered like `labels`.
    per_class_kwargs = {"labels": labels, "average": None, "zero_division": 0}
    precision_per_class = precision_score(y_true, y_pred, **per_class_kwargs)
    recall_per_class = recall_score(y_true, y_pred, **per_class_kwargs)
    f1_per_class = f1_score(y_true, y_pred, **per_class_kwargs)

    # Rows are true labels, columns are predictions, both ordered like `labels`.
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # False negative rate for the negative class: share of truly negative
    # samples that were not predicted as negative.
    actual_negatives = (y_true == "negative").sum()
    missed_negatives = actual_negatives - cm[neg_idx, neg_idx]
    if actual_negatives > 0:
        fnr = missed_negatives / actual_negatives
    else:
        fnr = 0

    n_total = len(df)
    n_valid = len(valid_df)
    n_unparsed = n_total - n_valid

    # Confidence is optional in the result files; report 0 when absent.
    if "confidence" in valid_df.columns:
        avg_confidence = valid_df["confidence"].mean()
    else:
        avg_confidence = 0

    return {
        "Experiment": exp_id,
        "Model": model,
        "Approach": approach,
        "Total_Samples": n_total,
        "Valid_Predictions": n_valid,
        "Parsing_Errors": n_unparsed,
        "Parsing_Error_Rate": n_unparsed / n_total,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro_F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Weighted_F1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "Macro_Precision": precision_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
        "Macro_Recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
        "Positive_Precision": precision_per_class[pos_idx],
        "Positive_Recall": recall_per_class[pos_idx],
        "Positive_F1": f1_per_class[pos_idx],
        "Negative_Precision": precision_per_class[neg_idx],
        "Negative_Recall": recall_per_class[neg_idx],
        "Negative_F1": f1_per_class[neg_idx],  # CRITICAL METRIC
        "Negative_FNR": fnr,  # False negative rate
        "Neutral_Precision": precision_per_class[neu_idx],
        "Neutral_Recall": recall_per_class[neu_idx],
        "Neutral_F1": f1_per_class[neu_idx],
        "Avg_Confidence": avg_confidence,
        "Confusion_Matrix": cm,
    }


# Calculate metrics for all experiments
all_metrics = []
for exp_id, df in results.items():
    exp_info = experiments[exp_id]
    metrics = calculate_all_metrics(df, exp_id, exp_info["model"], exp_info["approach"])
    # calculate_all_metrics returns None when an experiment has no valid predictions.
    if metrics:
        all_metrics.append(metrics)

if all_metrics:
    metrics_df = pd.DataFrame(all_metrics)
    print("\n" + "=" * 100)
    print("COMPREHENSIVE METRICS SUMMARY")
    print("=" * 100)
    display(
        metrics_df[
            [
                "Experiment",
                "Model",
                "Approach",
                "Accuracy",
                "Macro_F1",
                "Negative_F1",
                "Parsing_Error_Rate",
            ]
        ].round(4)
    )
else:
    print("\n‚ö†Ô∏è  No valid metrics calculated - no experiment data available")
    # Later cells test `metrics_df.empty`, so always define the name.
    metrics_df = pd.DataFrame()  # Empty dataframe for safety

## 3. Model Comparison (Across Approaches)

In [None]:
# Average performance by model across all approaches
if not metrics_df.empty:
    print("\n" + "=" * 100)
    print("üîÑ Analyzing model performance across all approaches...")

    model_summary = metrics_df.groupby("Model")[
        ["Accuracy", "Macro_F1", "Negative_F1", "Parsing_Error_Rate"]
    ].mean()

    print(f"‚úì Analyzed {len(model_summary)} models: {', '.join(model_summary.index)}")

    print("\n" + "=" * 80)
    print("AVERAGE PERFORMANCE BY MODEL (across all prompting strategies)")
    print("=" * 80)
    display(model_summary.round(4))

    # Visualization: one bar chart per metric, same colours in each panel.
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    bar_colors = ["#FF6B6B", "#4ECDC4", "#45B7D1"]

    # Macro F1 by model
    model_summary["Macro_F1"].plot(kind="bar", ax=axes[0], color=bar_colors)
    axes[0].set_title("Average Macro-F1 by Model", fontsize=14, weight="bold")
    axes[0].set_ylabel("Macro-F1 Score", fontsize=12)
    axes[0].set_xlabel("Model", fontsize=12)
    axes[0].set_ylim([0, 1])
    axes[0].grid(axis="y", alpha=0.3)
    axes[0].tick_params(axis="x", rotation=45)

    # Negative F1 by model (CRITICAL)
    model_summary["Negative_F1"].plot(kind="bar", ax=axes[1], color=bar_colors)
    axes[1].set_title(
        "Average Negative F1 by Model (Critical Metric)", fontsize=14, weight="bold"
    )
    axes[1].set_ylabel("Negative F1 Score", fontsize=12)
    axes[1].set_xlabel("Model", fontsize=12)
    axes[1].set_ylim([0, 1])
    axes[1].axhline(y=0.5, color="red", linestyle="--", label="Minimum Threshold")
    axes[1].legend()
    axes[1].grid(axis="y", alpha=0.3)
    axes[1].tick_params(axis="x", rotation=45)

    # Parsing error rate
    model_summary["Parsing_Error_Rate"].plot(kind="bar", ax=axes[2], color=bar_colors)
    axes[2].set_title("Parsing Error Rate by Model", fontsize=14, weight="bold")
    axes[2].set_ylabel("Error Rate", fontsize=12)
    axes[2].set_xlabel("Model", fontsize=12)
    axes[2].set_ylim([0, 0.5])
    axes[2].grid(axis="y", alpha=0.3)
    axes[2].tick_params(axis="x", rotation=45)

    plt.tight_layout()
    # Save before show(): some backends clear the figure once it is shown.
    plt.savefig("model_comparison.png", dpi=300, bbox_inches="tight")
    plt.show()
else:
    print("‚ö†Ô∏è  Skipping model comparison - no data available")


## 4. Approach Comparison (Across Models)

In [None]:
# Average performance by approach across all models
if not metrics_df.empty:
    print("\n" + "=" * 100)
    print("üîÑ Analyzing approach effectiveness across all models...")

    approach_summary = metrics_df.groupby("Approach")[
        ["Accuracy", "Macro_F1", "Negative_F1", "Avg_Confidence"]
    ].mean()

    # Order by complexity, keeping only the approaches that actually loaded.
    approach_order = ["Zero-Shot", "Few-Shot", "Chain-of-Thought", "Tree-of-Thought"]
    approach_summary = approach_summary.reindex(
        [a for a in approach_order if a in approach_summary.index]
    )

    print(f"‚úì Analyzed {len(approach_summary)} approaches: {', '.join(approach_summary.index)}")

    print("\n" + "=" * 80)
    print("AVERAGE PERFORMANCE BY APPROACH (across all models)")
    print("=" * 80)
    display(approach_summary.round(4))

    # Visualize complexity vs performance
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Size the x positions and tick labels from the summary itself, so the
    # plot still works when only some of the approaches are available.
    x = np.arange(len(approach_summary))

    # Macro F1 progression
    axes[0].plot(
        x,
        approach_summary["Macro_F1"],
        marker="o",
        linewidth=2,
        markersize=10,
        label="Macro-F1",
    )
    axes[0].plot(
        x,
        approach_summary["Negative_F1"],
        marker="s",
        linewidth=2,
        markersize=10,
        label="Negative-F1",
    )
    axes[0].set_xlabel("Prompting Complexity ‚Üí", fontsize=12, weight="bold")
    axes[0].set_ylabel("F1 Score", fontsize=12, weight="bold")
    axes[0].set_title("Does Complexity Improve Performance?", fontsize=14, weight="bold")
    axes[0].set_xticks(x)
    axes[0].set_xticklabels(list(approach_summary.index), rotation=15, ha="right")
    axes[0].legend()
    axes[0].grid(axis="y", alpha=0.3)
    axes[0].set_ylim([0, 1])

    # Confidence by approach
    approach_summary["Avg_Confidence"].plot(
        kind="bar", ax=axes[1], color=["#95E1D3", "#F38181", "#AA96DA", "#FCBAD3"]
    )
    axes[1].set_title("Average Confidence by Approach", fontsize=14, weight="bold")
    axes[1].set_ylabel("Confidence Score", fontsize=12)
    axes[1].set_xlabel("Approach", fontsize=12)
    axes[1].set_ylim([0, 1])
    axes[1].grid(axis="y", alpha=0.3)
    axes[1].tick_params(axis="x", rotation=15)

    plt.tight_layout()
    # Save before show(): some backends clear the figure once it is shown.
    plt.savefig("approach_comparison.png", dpi=300, bbox_inches="tight")
    plt.show()
else:
    print("‚ö†Ô∏è  Skipping approach comparison - no data available")


## 5. Heatmap: Model × Approach Performance

In [None]:
# Create pivot table for heatmap
if not metrics_df.empty and len(metrics_df) >= 3:
    approach_order = ["Zero-Shot", "Few-Shot", "Chain-of-Thought", "Tree-of-Thought"]

    # Model x Approach grids, with columns ordered by prompting complexity.
    pivot_macro_f1 = metrics_df.pivot(index="Model", columns="Approach", values="Macro_F1")
    pivot_macro_f1 = pivot_macro_f1.reindex(
        columns=[a for a in approach_order if a in pivot_macro_f1.columns]
    )

    pivot_neg_f1 = metrics_df.pivot(index="Model", columns="Approach", values="Negative_F1")
    pivot_neg_f1 = pivot_neg_f1.reindex(
        columns=[a for a in approach_order if a in pivot_neg_f1.columns]
    )

    fig, axes = plt.subplots(1, 2, figsize=(18, 6))

    # Macro-F1 heatmap
    sns.heatmap(
        pivot_macro_f1,
        annot=True,
        fmt=".3f",
        cmap="YlGnBu",
        ax=axes[0],
        cbar_kws={"label": "Macro-F1"},
    )
    axes[0].set_title("Macro-F1: Model √ó Approach", fontsize=14, weight="bold")
    axes[0].set_xlabel("Approach", fontsize=12)
    axes[0].set_ylabel("Model", fontsize=12)

    # Negative-F1 heatmap (CRITICAL); fixed 0-1 colour scale.
    sns.heatmap(
        pivot_neg_f1,
        annot=True,
        fmt=".3f",
        cmap="RdYlGn",
        vmin=0,
        vmax=1,
        ax=axes[1],
        cbar_kws={"label": "Negative-F1"},
    )
    axes[1].set_title(
        "Negative-F1: Model √ó Approach (Critical Metric)", fontsize=14, weight="bold"
    )

    plt.tight_layout()
    plt.savefig("model_approach_heatmap.png", dpi=300, bbox_inches="tight")
    plt.show()

    print("\n" + "=" * 80)
    print("BEST PERFORMING COMBINATIONS")
    print("=" * 80)
    best_macro = metrics_df.loc[metrics_df["Macro_F1"].idxmax()]
    best_neg = metrics_df.loc[metrics_df["Negative_F1"].idxmax()]
    best_acc = metrics_df.loc[metrics_df["Accuracy"].idxmax()]

    print(f"Best Macro-F1: {best_macro['Experiment']} ({best_macro['Model']} + {best_macro['Approach']}) = {best_macro['Macro_F1']:.4f}")
    print(f"Best Negative-F1: {best_neg['Experiment']} ({best_neg['Model']} + {best_neg['Approach']}) = {best_neg['Negative_F1']:.4f}")
    print(f"Best Accuracy: {best_acc['Experiment']} ({best_acc['Model']} + {best_acc['Approach']}) = {best_acc['Accuracy']:.4f}")
else:
    print("\n" + "=" * 100)
    print("‚ö†Ô∏è  SKIPPING HEATMAP ANALYSIS")
    print("=" * 100)
    if metrics_df.empty:
        print("Reason: No metrics data available")
    else:
        print(f"Reason: Insufficient data (need ‚â•3 experiments, have {len(metrics_df)})")
    print("üí° Run more experiment notebooks to enable this visualization")

## 6. Statistical Significance Testing

In [None]:
# McNemar's test to compare paired experiments
if len(results) >= 2:
    from statsmodels.stats.contingency_tables import mcnemar

    def compare_experiments(exp1_id, exp2_id):
        """Compare two loaded experiments with McNemar's test.

        Args:
            exp1_id: Key of the first experiment in ``results``.
            exp2_id: Key of the second experiment in ``results``.

        Returns:
            A dict with the comparison label, test statistic, p-value, a
            "Yes"/"No" significance flag at 0.05, and the winner (the
            experiment that is correct more often where the two disagree,
            or "Tie"). Returns None when either experiment is not loaded or
            when the two have a different number of valid predictions.
        """
        if exp1_id not in results or exp2_id not in results:
            return None

        df1 = results[exp1_id]
        df2 = results[exp2_id]

        # Filter valid predictions
        valid_labels = ["positive", "negative", "neutral"]
        valid1 = df1[df1["predicted_sentiment"].isin(valid_labels)].copy()
        valid2 = df2[df2["predicted_sentiment"].isin(valid_labels)].copy()

        # Ensure same samples
        # NOTE(review): only the counts are compared; pairing below relies on
        # both frames sharing the same row index for the same sentence -
        # confirm the result files preserve dataset order.
        if len(valid1) != len(valid2):
            print(
                f"‚ö†Ô∏è Sample size mismatch: {exp1_id}={len(valid1)}, {exp2_id}={len(valid2)}"
            )
            return None

        # Create contingency table
        correct1 = (valid1["true_sentiment"] == valid1["predicted_sentiment"]).astype(int)
        correct2 = (valid2["true_sentiment"] == valid2["predicted_sentiment"]).astype(int)

        # McNemar table: [[both_correct, exp1_only], [exp2_only, both_wrong]]
        both_correct = ((correct1 == 1) & (correct2 == 1)).sum()
        exp1_only = ((correct1 == 1) & (correct2 == 0)).sum()
        exp2_only = ((correct1 == 0) & (correct2 == 1)).sum()
        both_wrong = ((correct1 == 0) & (correct2 == 0)).sum()

        table = [[both_correct, exp1_only], [exp2_only, both_wrong]]

        result = mcnemar(table, exact=False, correction=True)

        # The winner is decided by the discordant cells only.
        if exp1_only > exp2_only:
            winner = exp1_id
        elif exp2_only > exp1_only:
            winner = exp2_id
        else:
            winner = "Tie"

        return {
            "Comparison": f"{exp1_id} vs {exp2_id}",
            "Statistic": result.statistic,
            "P-value": result.pvalue,
            "Significant": "Yes" if result.pvalue < 0.05 else "No",
            "Winner": winner,
        }

    # Compare key pairs
    comparisons = [
        ("E1", "E2"),  # Mixtral vs Llama (Zero-Shot)
        ("E1", "E3"),  # Mixtral vs FinBERT (Zero-Shot)
        ("E2", "E3"),  # Llama vs FinBERT (Zero-Shot)
        ("E1", "E4"),  # Mixtral: Zero-Shot vs Few-Shot
    ]

    sig_results = []
    for exp1, exp2 in comparisons:
        if exp1 in results and exp2 in results:
            result = compare_experiments(exp1, exp2)
            if result:
                sig_results.append(result)

    if sig_results:
        sig_df = pd.DataFrame(sig_results)
        print("\n" + "=" * 80)
        print("STATISTICAL SIGNIFICANCE TESTS (McNemar's Test)")
        print("=" * 80)
        display(sig_df)
    else:
        print("‚ö†Ô∏è  Not enough paired experiments for statistical testing")
else:
    print("‚ö†Ô∏è  Skipping statistical testing - insufficient experiments loaded")

## 7. Cost-Benefit Analysis

In [None]:
# Token costs (approximate)
if not metrics_df.empty:
    token_costs = {
        "Mixtral-8x7B": {"input": 0.27, "output": 0.27},  # per 1M tokens (Groq)
        "Llama-3.1-70B": {"input": 0.59, "output": 0.79},  # per 1M tokens (Groq)
        "FinBERT": {"input": 0, "output": 0},  # Free (local)
    }

    # Approximate token usage per approach
    token_usage = {
        "Zero-Shot": {"input": 150, "output": 50},  # Average tokens per sample
        "Few-Shot": {"input": 400, "output": 50},
        "Chain-of-Thought": {"input": 300, "output": 150},
        "Tree-of-Thought": {"input": 450, "output": 200},
    }

    # Calculate costs
    total_samples = 2217

    cost_analysis = []
    for _, row in metrics_df.iterrows():
        model = row["Model"]
        approach = row["Approach"]

        # Experiments without a known price or usage estimate are left out.
        if model in token_costs and approach in token_usage:
            input_tokens = token_usage[approach]["input"] * total_samples
            output_tokens = token_usage[approach]["output"] * total_samples

            input_cost = (input_tokens / 1_000_000) * token_costs[model]["input"]
            output_cost = (output_tokens / 1_000_000) * token_costs[model]["output"]
            total_cost = input_cost + output_cost

            # Cost per F1 point; infinite when the experiment scored zero.
            cost_per_f1 = (
                total_cost / row["Macro_F1"] if row["Macro_F1"] > 0 else float("inf")
            )

            cost_analysis.append(
                {
                    "Experiment": row["Experiment"],
                    "Model": model,
                    "Approach": approach,
                    "Input_Tokens": input_tokens,
                    "Output_Tokens": output_tokens,
                    "Total_Cost_USD": total_cost,
                    "Macro_F1": row["Macro_F1"],
                    "Cost_per_F1": cost_per_f1,
                }
            )

    cost_df = pd.DataFrame(cost_analysis)
    print("\n" + "=" * 80)
    print("COST-BENEFIT ANALYSIS")
    print("=" * 80)
    display(
        cost_df[
            ["Experiment", "Model", "Approach", "Total_Cost_USD", "Macro_F1", "Cost_per_F1"]
        ].round(4)
    )

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Cost vs Performance scatter, one series per model, points labelled by experiment.
    for model in metrics_df["Model"].unique():
        model_data = cost_df[cost_df["Model"] == model]
        axes[0].scatter(
            model_data["Total_Cost_USD"],
            model_data["Macro_F1"],
            s=200,
            alpha=0.6,
            label=model,
        )
        for _, row in model_data.iterrows():
            axes[0].annotate(
                row["Experiment"],
                (row["Total_Cost_USD"], row["Macro_F1"]),
                fontsize=9,
                ha="center",
            )

    axes[0].set_xlabel("Total Cost (USD)", fontsize=12, weight="bold")
    axes[0].set_ylabel("Macro-F1", fontsize=12, weight="bold")
    axes[0].set_title("Cost vs Performance Trade-off", fontsize=14, weight="bold")
    axes[0].legend()
    axes[0].grid(alpha=0.3)

    # Cost efficiency (Cost per F1 point): the six cheapest experiments.
    top_efficient = cost_df.nsmallest(6, "Cost_per_F1")
    top_efficient.plot(
        x="Experiment", y="Cost_per_F1", kind="bar", ax=axes[1], legend=False
    )
    axes[1].set_title("Most Cost-Efficient Experiments", fontsize=14, weight="bold")
    axes[1].set_ylabel("Cost per F1 Point (USD)", fontsize=12)
    axes[1].set_xlabel("Experiment", fontsize=12)
    axes[1].tick_params(axis="x", rotation=45)
    axes[1].grid(axis="y", alpha=0.3)

    plt.tight_layout()
    plt.savefig("cost_benefit_analysis.png", dpi=300, bbox_inches="tight")
    plt.show()
else:
    print("‚ö†Ô∏è  Skipping cost analysis - no data available")

## 8. Production Recommendation

In [None]:
# Define production criteria
if not metrics_df.empty and 'cost_df' in locals():
    PRODUCTION_CRITERIA = {
        "Minimum_Macro_F1": 0.75,
        "Minimum_Negative_F1": 0.50,  # Critical - can't miss bad financial news
        "Maximum_Parsing_Error": 0.05,
        "Maximum_Cost_per_1000": 1.0,  # USD
    }

    # Evaluate each experiment against criteria
    production_ready = metrics_df.copy()
    production_ready["Meets_Macro_F1"] = (
        production_ready["Macro_F1"] >= PRODUCTION_CRITERIA["Minimum_Macro_F1"]
    )
    production_ready["Meets_Negative_F1"] = (
        production_ready["Negative_F1"] >= PRODUCTION_CRITERIA["Minimum_Negative_F1"]
    )
    production_ready["Meets_Parsing"] = (
        production_ready["Parsing_Error_Rate"]
        <= PRODUCTION_CRITERIA["Maximum_Parsing_Error"]
    )

    # Add cost criterion. The merge is an inner join, so experiments without
    # a cost estimate drop out of the evaluation.
    production_ready = production_ready.merge(
        cost_df[["Experiment", "Total_Cost_USD"]], on="Experiment"
    )
    # Total_Cost_USD was estimated for the 2,217-sample dataset; rescale to 1,000 samples.
    dataset_size = 2217
    production_ready["Cost_per_1000"] = (
        production_ready["Total_Cost_USD"] / dataset_size
    ) * 1000
    production_ready["Meets_Cost"] = (
        production_ready["Cost_per_1000"] <= PRODUCTION_CRITERIA["Maximum_Cost_per_1000"]
    )

    production_ready["All_Criteria_Met"] = (
        production_ready["Meets_Macro_F1"]
        & production_ready["Meets_Negative_F1"]
        & production_ready["Meets_Parsing"]
        & production_ready["Meets_Cost"]
    )

    print("\n" + "=" * 100)
    print("PRODUCTION READINESS EVALUATION")
    print("=" * 100)
    print("Criteria:")
    print(f"  - Macro-F1 ‚â• {PRODUCTION_CRITERIA['Minimum_Macro_F1']}")
    print(f"  - Negative-F1 ‚â• {PRODUCTION_CRITERIA['Minimum_Negative_F1']} (critical)")
    print(f"  - Parsing errors ‚â§ {PRODUCTION_CRITERIA['Maximum_Parsing_Error'] * 100}%")
    print(f"  - Cost ‚â§ ${PRODUCTION_CRITERIA['Maximum_Cost_per_1000']}/1000 samples")
    print()

    display(
        production_ready[
            [
                "Experiment",
                "Model",
                "Approach",
                "Macro_F1",
                "Negative_F1",
                "Parsing_Error_Rate",
                "Cost_per_1000",
                "All_Criteria_Met",
            ]
        ].round(4)
    )

    # Highlight production-ready experiments
    production_ready_list = production_ready[production_ready["All_Criteria_Met"]]
    if len(production_ready_list) > 0:
        print("\n‚úÖ PRODUCTION-READY EXPERIMENTS:")
        for _, row in production_ready_list.iterrows():
            print(f"  {row['Experiment']}: {row['Model']} + {row['Approach']}")
            print(
                f"    Macro-F1: {row['Macro_F1']:.4f}, Negative-F1: {row['Negative_F1']:.4f}, Cost: ${row['Cost_per_1000']:.4f}/1000"
            )
    else:
        print("\n‚ö†Ô∏è  NO EXPERIMENTS MEET ALL PRODUCTION CRITERIA")
        print("\nTop candidates:")
        display(
            production_ready.nlargest(3, "Macro_F1")[
                ["Experiment", "Model", "Approach", "Macro_F1", "Negative_F1"]
            ].round(4)
        )
else:
    print("‚ö†Ô∏è  Skipping production evaluation - no cost data or metrics available")

## 9. Final Recommendations

### Key Findings:

1. **Best Overall Model**: [To be determined after running experiments]
2. **Best Overall Approach**: [To be determined]
3. **Best Combination**: [To be determined]

### Critical Insights:

#### Negative Class Detection (Most Important)
- **Critical Metric**: Negative F1 must be ≥ 0.50 for production
- **Business Impact**: False negatives = missing bad financial news = unacceptable risk
- **Best Performer**: [Will show which model/approach best detects negatives]

#### Prompt Engineering vs Fine-Tuning
- **FinBERT Advantage**: Domain-specific pre-training on financial texts
- **LLM Advantage**: Flexibility, reasoning capability, handles edge cases
- **Verdict**: [Compare FinBERT vs best prompt-engineered LLM]

#### Complexity vs Performance
- **Zero-Shot Baseline**: Establishes minimum without examples
- **Few-Shot Improvement**: In-context learning boost
- **CoT Reasoning**: Step-by-step analysis benefit
- **ToT Exploration**: Diminishing returns?
- **Conclusion**: [Optimal complexity level for this task]

#### Cost-Benefit Analysis
- **Free Option**: FinBERT (local inference, one-time download)
- **Budget Option**: Mixtral-8x7B with Few-Shot (~$0.X for 2,217 samples)
- **Premium Option**: Llama-3.1-70B with CoT (~$0.X for 2,217 samples)
- **Best Value**: [Cost per F1 point winner]

### Production Deployment Recommendation:

**Scenario 1: Budget Unlimited, Performance Critical**
- Use: [Best performing experiment]
- Rationale: Highest Macro-F1 and Negative-F1

**Scenario 2: Cost-Conscious, Good Performance Needed**
- Use: [Best cost-efficient experiment]
- Rationale: 90% of performance at 50% of cost

**Scenario 3: Free/Local Deployment**
- Use: FinBERT (E3/E6/E9/E12)
- Rationale: No API costs, fast inference, proven financial domain performance

### Future Work:
1. **Ensemble Approach**: Combine FinBERT + best LLM (vote or confidence-weighted)
2. **Fine-Tuning**: Fine-tune Mixtral/Llama on FinancialPhraseBank for best of both worlds
3. **Active Learning**: Focus on high-disagreement samples between models
4. **Domain Expansion**: Test on other financial datasets (earnings calls, news, etc.)

In [None]:
# Save comprehensive results
if not metrics_df.empty:
    from datetime import datetime
    import os

    print("\n" + "=" * 100)
    print("üíæ EXPORTING COMPREHENSIVE ANALYSIS RESULTS")
    print("=" * 100)

    # One timestamp is shared by every CSV written in this run.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    print(f"\nTimestamp: {timestamp}")
    print(f"Export location: {os.getcwd()}\n")

    csv_files = []  # (file name, row count, size in KB)
    viz_files = []  # (file name, size in KB)

    # Save main results
    print("üìÑ Saving CSV files...")
    main_file = f"comprehensive_comparative_analysis_{timestamp}.csv"
    metrics_df.to_csv(main_file, index=False)
    size_kb = os.path.getsize(main_file) / 1024
    csv_files.append((main_file, len(metrics_df), size_kb))
    print(f"  ‚úì {main_file} ({len(metrics_df)} experiments, {size_kb:.1f} KB)")

    # Save cost analysis (only if the cost cell ran and produced rows)
    if 'cost_df' in locals() and not cost_df.empty:
        cost_file = f"cost_benefit_analysis_{timestamp}.csv"
        cost_df.to_csv(cost_file, index=False)
        size_kb = os.path.getsize(cost_file) / 1024
        csv_files.append((cost_file, len(cost_df), size_kb))
        print(f"  ‚úì {cost_file} ({len(cost_df)} rows, {size_kb:.1f} KB)")

    # Save production readiness (only if that cell ran and produced rows)
    if 'production_ready' in locals() and not production_ready.empty:
        prod_file = f"production_readiness_evaluation_{timestamp}.csv"
        production_ready.to_csv(prod_file, index=False)
        size_kb = os.path.getsize(prod_file) / 1024
        csv_files.append((prod_file, len(production_ready), size_kb))
        print(f"  ‚úì {prod_file} ({len(production_ready)} rows, {size_kb:.1f} KB)")

    # Check for visualizations
    # NOTE(review): this only tests that the files exist, so PNGs left over
    # from an earlier run are reported as generated too.
    print("\nüìä Checking generated visualizations...")
    potential_viz = [
        "model_comparison.png",
        "approach_comparison.png",
        "model_approach_heatmap.png",
        "cost_benefit_analysis.png",
    ]

    for viz in potential_viz:
        if os.path.exists(viz):
            size_kb = os.path.getsize(viz) / 1024
            viz_files.append((viz, size_kb))
            print(f"  ‚úì {viz} ({size_kb:.1f} KB)")
        else:
            print(f"  ‚úó {viz} (not generated)")

    # Final summary
    print("\n" + "=" * 100)
    print("‚úÖ EXPORT COMPLETE")
    print("=" * 100)

    print(f"\nüìÅ Summary:")
    print(f"   ‚Ä¢ CSV files: {len(csv_files)}")
    csv_total_size = sum(size for _, _, size in csv_files)
    print(f"   ‚Ä¢ Total CSV size: {csv_total_size:.1f} KB ({csv_total_size/1024:.2f} MB)")
    if viz_files:
        print(f"   ‚Ä¢ Visualizations: {len(viz_files)}")
        viz_total_size = sum(size for _, size in viz_files)
        print(f"   ‚Ä¢ Total viz size: {viz_total_size:.1f} KB ({viz_total_size/1024:.2f} MB)")
    else:
        print("   ‚Ä¢ Visualizations: 0 (sections may have been skipped due to insufficient data)")

    print(f"\nüéâ Analysis complete! All results saved with timestamp: {timestamp}")
    print(f"üìÇ Files are in: {os.getcwd()}")

    # Usage tip
    print("\nüí° Next steps:")
    print("   ‚Ä¢ Review CSV files for detailed metrics")
    print("   ‚Ä¢ Check PNG files for visual comparisons")
    print("   ‚Ä¢ Use insights for model/approach selection")
    print("   ‚Ä¢ Share comprehensive_comparative_analysis_*.csv with stakeholders")
else:
    print("\n" + "=" * 100)
    print("‚ùå EXPORT FAILED: NO DATA AVAILABLE")
    print("=" * 100)
    print("\n‚ö†Ô∏è  Cannot export - no experiment metrics calculated\n")
    print("üìã Required steps:")
    print("   1. Run experiment notebooks to generate results:")
    print("      ‚Ä¢ E1-E3: Zero_Shot/E1_E2_E3_zero_shot_sentiment_All_agree.ipynb")
    print("      ‚Ä¢ E4-E6: Few_Shot/E4_E5_E6_few_shot_sentiment.ipynb")
    print("      ‚Ä¢ E7-E9: Chain_of_Thought/E7_E8_E9_cot_sentiment.ipynb")
    print("      ‚Ä¢ E10-E12: Tree_of_Thought/E10_tot_sentiment.ipynb")
    print("\n   2. Re-run this notebook once CSV files are generated")
    print("\nüí° TIP: You can run experiments incrementally. This notebook will analyze whatever is available.")
    