# Comprehensive Results Comparison: All 12 Experiments

**Experiment Matrix**: 4 Approaches × 3 Models = 12 Experiments

| Approach | Mixtral-8x7B-32768 | Llama-3.1-70B-Versatile | FinBERT |
|----------|-------------------|-------------------------|---------|
| **Zero-Shot** | E1 | E2 | E3 |
| **Few-Shot** | E4 | E5 | E6 |
| **Chain-of-Thought** | E7 | E8 | E9 |
| **Tree-of-Thought** | E10 | E11 | E12 |

**Dataset**: FinancialPhraseBank Sentences_AllAgree.txt (2,217 samples)

This notebook provides a comprehensive comparison across all experiments with automatic data loading.

In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn plotly -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Global plotting defaults shared by every figure produced in this notebook.
sns.set_style("whitegrid")  # seaborn axes style
sns.set_palette("husl")  # default seaborn colour palette
plt.rcParams["figure.figsize"] = (16, 8)  # default figure size (width, height) in inches

# Confirmation banner printed once the imports and defaults above have run.
print("‚úì Libraries loaded successfully")

## 1. Load All Results

Load metrics summaries from all prompting strategy experiments.

In [None]:
def calculate_metrics(df, exp_id, model, strategy):
    """Summarise classification quality for a single experiment.

    Rows whose ``predicted_sentiment`` is not one of ``positive``,
    ``negative`` or ``neutral`` are counted as parsing errors and are
    excluded from every score.

    Args:
        df: Results frame with ``true_sentiment`` and
            ``predicted_sentiment`` columns.
        exp_id: Experiment identifier, stored under ``"Experiment"``.
        model: Model label, stored under ``"Model"``.
        strategy: Prompting-strategy label, stored under ``"Strategy"``.

    Returns:
        A dict of sample counts and scores, or ``None`` (after printing a
        warning) when no row carries a valid prediction.
    """
    class_names = ["positive", "negative", "neutral"]

    # Keep only rows whose prediction parsed to one of the three classes
    has_valid_label = df["predicted_sentiment"].isin(class_names)
    valid_df = df[has_valid_label].copy()

    if valid_df.empty:
        print(f"‚ö†Ô∏è  {exp_id}: No valid predictions found")
        return None

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    # Everything that was filtered out above counts as a parsing error
    total_rows = len(df)
    parsing_errors = total_rows - len(valid_df)
    if total_rows > 0:
        error_rate = parsing_errors / total_rows
    else:
        error_rate = 0

    # Per-class F1, returned in the order given by `class_names`
    positive_f1, negative_f1, neutral_f1 = f1_score(
        y_true, y_pred, labels=class_names, average=None, zero_division=0
    )

    summary = {
        "Experiment": exp_id,
        "Model": model,
        "Strategy": strategy,
        "Total_Samples": total_rows,
        "Valid_Predictions": len(valid_df),
        "Parsing_Errors": parsing_errors,
        "Error_Rate": error_rate,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Weighted-F1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "Precision": precision_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
        "Recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
        "Positive-F1": positive_f1,
        "Negative-F1": negative_f1,
        "Neutral-F1": neutral_f1,
    }
    return summary


# Calculate metrics for all loaded experiments.
# NOTE: this cell reads `experiments` and `results_data`, which are created by
# the experiment-loading cell; that cell must have been executed first.

# Announce the step *before* doing the work so that any per-experiment
# warnings printed by calculate_metrics() appear under this banner.
print("\n" + "=" * 100)
print("üìä CALCULATING METRICS")
print("=" * 100)
print(f"\nProcessing {len(results_data)} experiments...\n")

all_metrics = []
for exp_id, df in results_data.items():
    exp_info = experiments[exp_id]
    metrics = calculate_metrics(df, exp_id, exp_info["model"], exp_info["strategy"])
    # calculate_metrics() returns None when an experiment has no valid predictions
    if metrics is not None:
        all_metrics.append(metrics)

# Create metrics DataFrame (one row per experiment)
if all_metrics:
    results_df = pd.DataFrame(all_metrics)

    # Validation summary: totals across experiments, plus the mean of the
    # per-experiment error rates
    total_samples = results_df["Total_Samples"].sum()
    valid_predictions = results_df["Valid_Predictions"].sum()
    parsing_errors = results_df["Parsing_Errors"].sum()
    avg_error_rate = results_df["Error_Rate"].mean()

    print("‚úÖ Metrics calculated successfully!\n")
    print(f"üìä Data Summary:")
    print(f"   ‚Ä¢ Total samples processed: {total_samples:,}")
    print(
        f"   ‚Ä¢ Valid predictions: {valid_predictions:,} ({valid_predictions / total_samples * 100:.1f}%)"
    )
    print(
        f"   ‚Ä¢ Parsing errors: {parsing_errors:,} ({avg_error_rate * 100:.2f}% average)"
    )

    print("\n" + "=" * 100)
    print("üìà COMPREHENSIVE METRICS SUMMARY")
    print("=" * 100)
    display(
        results_df[
            [
                "Experiment",
                "Model",
                "Strategy",
                "Accuracy",
                "Macro-F1",
                "Negative-F1",
                "Error_Rate",
            ]
        ].round(4)
    )

    # Performance alerts
    print("\n" + "=" * 100)
    print("‚ö° PERFORMANCE ALERTS")
    print("=" * 100)

    # Experiments where more than 5% of the rows had no usable prediction
    high_error = results_df[results_df["Error_Rate"] > 0.05]
    if len(high_error) > 0:
        print(f"\n‚ö†Ô∏è  HIGH PARSING ERROR RATE (>5%): {len(high_error)} experiments")
        for _, row in high_error.iterrows():
            print(
                f"   ‚Ä¢ {row['Experiment']}: {row['Error_Rate'] * 100:.2f}% errors ({row['Parsing_Errors']} samples)"
            )

    # Experiments whose F1 on the negative class falls below 0.5
    low_neg_f1 = results_df[results_df["Negative-F1"] < 0.5]
    if len(low_neg_f1) > 0:
        print(f"\n‚ö†Ô∏è  LOW NEGATIVE-F1 (<0.50): {len(low_neg_f1)} experiments")
        print("   (Critical for financial risk detection!)")
        for _, row in low_neg_f1.iterrows():
            print(f"   ‚Ä¢ {row['Experiment']}: Negative-F1 = {row['Negative-F1']:.4f}")

    if len(high_error) == 0 and len(low_neg_f1) == 0:
        print("\n‚úÖ No critical issues detected!")

else:
    # Later cells test `results_df.empty`, so always leave the name defined
    results_df = pd.DataFrame()  # Empty dataframe for safety
    print("‚ùå ERROR: No metrics calculated - no valid experiment data\n")
    print("üîç Possible causes:")
    print("   ‚Ä¢ No experiment files loaded")
    print("   ‚Ä¢ All predictions failed validation")
    print("   ‚Ä¢ Missing required columns in CSV files")

In [None]:
# Registry of the 12 experiments, one row per experiment:
# (experiment id, model label, prompting strategy, glob pattern of its results CSV).
# NOTE(review): the model labels (Mixtral / Llama-3.1 / FinBERT) differ from the
# model names embedded in the file patterns (gpt_oss_20b / gpt_oss_120b / llama) -
# confirm which naming is current.
_EXPERIMENT_ROWS = [
    # Zero-Shot
    ("E1", "Mixtral-8x7B", "Zero-Shot", "../Zero_Shot/e1_gpt_oss_20b_zero_shot_*.csv"),
    ("E2", "Llama-3.1-70B", "Zero-Shot", "../Zero_Shot/e2_gpt_oss_120b_zero_shot_*.csv"),
    ("E3", "FinBERT", "Zero-Shot", "../Zero_Shot/e3_llama_zero_shot_*.csv"),
    # Few-Shot
    ("E4", "Mixtral-8x7B", "Few-Shot", "../Few_Shot/e4_gpt_oss_20b_few_shot_*.csv"),
    ("E5", "Llama-3.1-70B", "Few-Shot", "../Few_Shot/e5_gpt_oss_120b_few_shot_*.csv"),
    ("E6", "FinBERT", "Few-Shot", "../Few_Shot/e6_llama_few_shot_*.csv"),
    # Chain-of-Thought
    (
        "E7",
        "Mixtral-8x7B",
        "Chain-of-Thought",
        "../Chain_of_Thought/e7_GPT_OSS_20B_cot_*.csv",
    ),
    (
        "E8",
        "Llama-3.1-70B",
        "Chain-of-Thought",
        "../Chain_of_Thought/e8_GPT_OSS_120B_cot_*.csv",
    ),
    (
        "E9",
        "FinBERT",
        "Chain-of-Thought",
        "../Chain_of_Thought/e9_Llama-3.3-70B_cot_*.csv",
    ),
    # Tree-of-Thought
    (
        "E10",
        "Mixtral-8x7B",
        "Tree-of-Thought",
        "../Tree_of_Thought/e10_GPT_OSS_20B_tot_*.csv",
    ),
    (
        "E11",
        "Llama-3.1-70B",
        "Tree-of-Thought",
        "../Tree_of_Thought/e11_GPT_OSS_120B_flash_tot_*.csv",
    ),
    (
        "E12",
        "FinBERT",
        "Tree-of-Thought",
        "../Tree_of_Thought/e12_Llama_3.3_70B_tot_*.csv",
    ),
]

# exp_id -> {"model", "strategy", "file"}; insertion order follows the table above
experiments = {
    row_id: {"model": row_model, "strategy": row_strategy, "file": row_pattern}
    for row_id, row_model, row_strategy, row_pattern in _EXPERIMENT_ROWS
}


# Loader for one experiment's results CSV; prints one status line per outcome
def load_experiment_data(exp_id, exp_info):
    """Read the results CSV belonging to a single experiment.

    Args:
        exp_id: Experiment identifier, used only in the printed messages.
        exp_info: Mapping whose ``"file"`` entry is a glob pattern.

    Returns:
        The loaded DataFrame, or ``None`` when no file matches the pattern,
        when a required column is absent, or when any exception is raised
        (the reason is printed in each case).
    """
    try:
        candidates = glob(exp_info["file"])
        if len(candidates) == 0:
            print(f"‚ö†Ô∏è  {exp_id}: No files found matching '{exp_info['file']}'")
            return None

        # The lexicographically greatest path is treated as the latest run.
        # NOTE(review): this equals "most recent" only if the file names end
        # in a sortable timestamp - confirm against the experiment notebooks.
        latest_file = sorted(candidates)[-1]
        df = pd.read_csv(latest_file)

        # Both label columns are needed by the metric calculations
        absent = [
            col
            for col in ("true_sentiment", "predicted_sentiment")
            if col not in df.columns
        ]
        if absent:
            print(f"‚ùå {exp_id}: Missing required columns in {latest_file}")
            return None

        print(
            f"‚úì {exp_id}: Loaded {len(df):,} samples from {os.path.basename(latest_file)}"
        )
        return df

    except Exception as e:
        # Best-effort loader: report the failure and let the caller carry on
        print(f"‚ùå {exp_id}: Error - {str(e)}")
        return None


# Load all experiment results
print("\n" + "=" * 100)
print("üìä LOADING EXPERIMENT RESULTS")
print("=" * 100)
print("\nAttempting to load 12 experiments (E1-E12)...\n")

results_data = {}  # exp_id -> loaded DataFrame
loading_errors = []  # exp_ids whose file exists but could not be used
missing_files = []  # descriptions of experiments with no matching file

for exp_id, exp_info in experiments.items():
    df = load_experiment_data(exp_id, exp_info)
    if df is not None:
        results_data[exp_id] = df
    elif not glob(exp_info["file"]):
        # Nothing on disk matches the pattern
        missing_files.append(f"{exp_id} ({exp_info['model']} + {exp_info['strategy']})")
    else:
        # A file exists but reading or validating it failed
        loading_errors.append(exp_id)

# Summary
print("\n" + "=" * 100)
print("üìà LOADING SUMMARY")
print("=" * 100)

loaded_count = len(results_data)
total_count = len(experiments)
success_rate = (loaded_count / total_count) * 100 if total_count else 0.0

# `results_data` is filled in the iteration order of `experiments`, so joining
# its keys directly lists E1..E12 in natural order; a plain string sort would
# put E10-E12 before E2.
loaded_ids = ", ".join(results_data)

if loaded_count == total_count:
    print(f"\n‚úÖ SUCCESS: All {total_count} experiments loaded ({success_rate:.0f}%)")
    print(f"\n‚úì Loaded: {loaded_ids}")
else:
    if loaded_count > 0:
        print(
            f"\n‚ö†Ô∏è  PARTIAL: {loaded_count}/{total_count} experiments loaded ({success_rate:.0f}%)"
        )
        print(f"\n‚úì Loaded: {loaded_ids}")
    else:
        print(f"\n‚ùå ERROR: No experiments loaded (0/{total_count})")
        print("\nüîç Troubleshooting:")
        print("   1. Run experiment notebooks (E1-E12) to generate CSV files")
        print("   2. Check that notebooks are in correct directories:")
        print("      ‚Ä¢ Zero_Shot/E1_E2_E3_zero_shot_sentiment_All_agree.ipynb")
        print("      ‚Ä¢ Few_Shot/E4_E5_E6_few_shot_sentiment.ipynb")
        print("      ‚Ä¢ Chain_of_Thought/E7_E8_E9_cot_sentiment.ipynb")
        print("      ‚Ä¢ Tree_of_Thought/E10_tot_sentiment.ipynb")
        print(
            "   3. Verify CSV files contain 'true_sentiment' and 'predicted_sentiment' columns"
        )

    # Problem details are reported whether some or none of the experiments
    # loaded, so unreadable files are never silently dropped from the summary.
    if loading_errors:
        print(
            f"\n‚ö†Ô∏è  Loading errors ({len(loading_errors)}): {', '.join(loading_errors)}"
        )
        print("   Check file format and required columns.")

    if missing_files:
        print(f"\n‚ùå Missing files ({len(missing_files)}):")
        for mf in missing_files:
            print(f"   ‚Ä¢ {mf}")
        print(
            "\nüí° TIP: Run the corresponding experiment notebooks to generate these files."
        )

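## 2. Calculate Metrics for All Experiments

Metrics are computed by the `calculate_metrics` cell earlier in this notebook. That cell reads `experiments` and `results_data`, so run the loading cell before it.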

## 3. Strategy-wise Performance Comparison

In [None]:
if results_df.empty:
    print("‚ö†Ô∏è  Skipping strategy comparison - no data available")
else:
    print("\n" + "=" * 100)
    print("üîÑ Analyzing strategy performance...")

    # Mean of each score per prompting strategy
    strategy_score_cols = ["Accuracy", "Macro-F1", "Precision", "Recall", "Negative-F1"]
    strategy_summary = results_df.groupby("Strategy")[strategy_score_cols].mean()

    strategies_found = len(strategy_summary)
    print(f"‚úì Found {strategies_found} unique strategies in data")

    # Show rows in the fixed order below, keeping only the strategies that
    # actually occur in the data
    strategy_order = ["Zero-Shot", "Few-Shot", "Chain-of-Thought", "Tree-of-Thought"]
    ordered_present = [
        name for name in strategy_order if name in strategy_summary.index
    ]
    strategy_summary = strategy_summary.reindex(ordered_present)

    print("\n" + "=" * 80)
    print("AVERAGE PERFORMANCE BY PROMPTING STRATEGY")
    print("=" * 80)
    display(strategy_summary.round(4))

    # Two side-by-side panels: grouped score bars and the Negative-F1 line
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    score_ax, negative_ax = axes
    axis_label_style = {"fontsize": 13, "weight": "bold"}
    panel_title_style = {"fontsize": 15, "weight": "bold", "pad": 20}

    # Left panel: the four headline scores as grouped bars
    headline_scores = strategy_summary[["Accuracy", "Macro-F1", "Precision", "Recall"]]
    headline_scores.plot(kind="bar", ax=score_ax, width=0.8, alpha=0.8)
    score_ax.set_title(
        "Performance Comparison Across Prompting Strategies", **panel_title_style
    )
    score_ax.set_xlabel("Prompting Strategy", **axis_label_style)
    score_ax.set_ylabel("Score", **axis_label_style)
    score_ax.set_xticklabels(strategy_summary.index, rotation=45, ha="right")
    score_ax.legend(title="Metrics", fontsize=11)
    score_ax.set_ylim([0, 1])
    score_ax.grid(axis="y", alpha=0.3)

    # Right panel: Negative-F1 per strategy with a reference line at 0.5
    x = np.arange(len(strategy_summary))
    negative_ax.plot(
        x,
        strategy_summary["Negative-F1"],
        marker="o",
        linewidth=3,
        markersize=10,
        color="#E74C3C",
    )
    negative_ax.axhline(
        y=0.5, color="red", linestyle="--", linewidth=2, label="Minimum Threshold (0.5)"
    )
    negative_ax.set_title(
        "Negative Class Detection (Critical Metric)", **panel_title_style
    )
    negative_ax.set_xlabel("Prompting Complexity ‚Üí", **axis_label_style)
    negative_ax.set_ylabel("Negative-F1 Score", **axis_label_style)
    negative_ax.set_xticks(x)
    negative_ax.set_xticklabels(strategy_summary.index, rotation=45, ha="right")
    negative_ax.set_ylim([0, 1])
    negative_ax.grid(alpha=0.3)
    negative_ax.legend()

    plt.tight_layout()
    plt.savefig("strategy_comparison.png", dpi=300, bbox_inches="tight")
    plt.show()

## 4. Model-wise Performance Comparison

In [None]:
if not results_df.empty:
    print("\n" + "=" * 100)
    print("üîÑ Analyzing model performance...")

    # Mean of each score per model, averaged over that model's experiments
    model_summary = results_df.groupby("Model")[
        ["Accuracy", "Macro-F1", "Precision", "Recall", "Negative-F1", "Error_Rate"]
    ].mean()

    models_found = len(model_summary)
    print(f"‚úì Found {models_found} unique models in data")

    print("\n" + "=" * 80)
    print("AVERAGE PERFORMANCE BY MODEL (across all strategies)")
    print("=" * 80)
    display(model_summary.round(4))

    # Visualize model comparison
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Performance metrics.
    # In a grouped bar chart the colours are assigned per plotted column
    # (metric), not per model, so one colour is needed for each of the four
    # metrics. With only three colours the list is cycled and "Recall" is
    # drawn in the same colour as "Accuracy".
    model_summary[["Accuracy", "Macro-F1", "Precision", "Recall"]].plot(
        kind="bar",
        ax=axes[0],
        width=0.7,
        alpha=0.8,
        color=["#FF6B6B", "#4ECDC4", "#45B7D1", "#F9CA24"],
    )
    axes[0].set_xlabel("Model", fontsize=13, weight="bold")
    axes[0].set_ylabel("Score", fontsize=13, weight="bold")
    axes[0].set_title(
        "Performance Comparison Across Models", fontsize=15, weight="bold", pad=20
    )
    axes[0].set_xticklabels(model_summary.index, rotation=45, ha="right")
    axes[0].legend(title="Metrics", fontsize=11)
    axes[0].set_ylim([0, 1])
    axes[0].grid(axis="y", alpha=0.3)

    # Negative-F1 and Error Rate, drawn side by side for each model
    x = np.arange(len(model_summary))
    width = 0.35

    ax2 = axes[1]
    bars1 = ax2.bar(
        x - width / 2,
        model_summary["Negative-F1"],
        width,
        label="Negative-F1",
        color="#27AE60",
        alpha=0.8,
    )

    # The error rate gets its own y-axis because its range differs from F1's
    ax2_twin = ax2.twinx()
    bars2 = ax2_twin.bar(
        x + width / 2,
        model_summary["Error_Rate"],
        width,
        label="Error Rate",
        color="#E74C3C",
        alpha=0.8,
    )

    ax2.set_xlabel("Model", fontsize=13, weight="bold")
    ax2.set_ylabel("Negative-F1 Score", fontsize=12, weight="bold", color="#27AE60")
    ax2_twin.set_ylabel(
        "Parsing Error Rate", fontsize=12, weight="bold", color="#E74C3C"
    )
    ax2.set_title(
        "Model Reliability: Detection vs Errors", fontsize=15, weight="bold", pad=20
    )
    ax2.set_xticks(x)
    ax2.set_xticklabels(model_summary.index, rotation=45, ha="right")
    ax2.set_ylim([0, 1])
    # Upper limit is at least 0.5, or 20% above the largest observed error rate
    ax2_twin.set_ylim([0, max(0.5, model_summary["Error_Rate"].max() * 1.2)])
    ax2.tick_params(axis="y", labelcolor="#27AE60")
    ax2_twin.tick_params(axis="y", labelcolor="#E74C3C")
    ax2.grid(axis="y", alpha=0.3)

    # Combined legend: merge the handles of both y-axes into a single box
    lines1, labels1 = ax2.get_legend_handles_labels()
    lines2, labels2 = ax2_twin.get_legend_handles_labels()
    ax2.legend(lines1 + lines2, labels1 + labels2, loc="upper right")

    plt.tight_layout()
    plt.savefig("model_comparison.png", dpi=300, bbox_inches="tight")
    plt.show()
else:
    print("‚ö†Ô∏è  Skipping model comparison - no data available")

## 5. Heatmap: Model × Strategy Performance

In [None]:
if results_df.empty or len(results_df) < 3:
    print("‚ö†Ô∏è  Skipping heatmaps - insufficient data")
else:
    strategy_order = ["Zero-Shot", "Few-Shot", "Chain-of-Thought", "Tree-of-Thought"]

    def _model_by_strategy(metric):
        """Pivot one metric into a Model x Strategy table with ordered columns."""
        table = results_df.pivot(index="Model", columns="Strategy", values=metric)
        kept_columns = [s for s in strategy_order if s in table.columns]
        return table.reindex(columns=kept_columns)

    # One pivot table per metric shown below
    pivot_accuracy = _model_by_strategy("Accuracy")
    pivot_f1 = _model_by_strategy("Macro-F1")
    pivot_neg_f1 = _model_by_strategy("Negative-F1")

    # Three heatmaps in a row, all on a fixed 0-1 colour scale
    fig, axes = plt.subplots(1, 3, figsize=(20, 5))

    # (table, colour map, colour-bar label, panel title) for each panel
    heatmap_panels = [
        (pivot_accuracy, "YlGnBu", "Accuracy", "Accuracy by Model and Strategy"),
        (pivot_f1, "RdYlGn", "Macro-F1", "Macro-F1 by Model and Strategy"),
        (
            pivot_neg_f1,
            "Reds",
            "Negative-F1",
            "Negative-F1 by Model and Strategy (Critical)",
        ),
    ]
    for heat_ax, (table, cmap_name, bar_label, panel_title) in zip(
        axes, heatmap_panels
    ):
        sns.heatmap(
            table,
            annot=True,
            fmt=".3f",
            cmap=cmap_name,
            cbar_kws={"label": bar_label},
            ax=heat_ax,
            vmin=0,
            vmax=1,
        )
        heat_ax.set_title(panel_title, fontsize=13, weight="bold", pad=15)
        heat_ax.set_xlabel("Prompting Strategy", fontsize=11, weight="bold")
        heat_ax.set_ylabel("Model", fontsize=11, weight="bold")

    plt.tight_layout()
    plt.savefig("performance_heatmaps.png", dpi=300, bbox_inches="tight")
    plt.show()

    # Row holding the maximum of each metric
    print("\n" + "=" * 80)
    print("BEST PERFORMING COMBINATIONS")
    print("=" * 80)
    best_acc = results_df.loc[results_df["Accuracy"].idxmax()]
    best_f1 = results_df.loc[results_df["Macro-F1"].idxmax()]
    best_neg = results_df.loc[results_df["Negative-F1"].idxmax()]

    for metric_name, winner in (
        ("Accuracy", best_acc),
        ("Macro-F1", best_f1),
        ("Negative-F1", best_neg),
    ):
        print(
            f"Best {metric_name}: {winner['Experiment']} ({winner['Model']} + {winner['Strategy']}) = {winner[metric_name]:.4f}"
        )

## 6. Best Performing Configurations

In [None]:
if results_df.empty:
    print("‚ö†Ô∏è  No configurations to rank - no data available")
else:
    ranking_columns = [
        "Experiment",
        "Model",
        "Strategy",
        "Accuracy",
        "Macro-F1",
        "Negative-F1",
    ]

    def _top_five(metric):
        """Return up to five rows with the largest value of ``metric``."""
        how_many = min(5, len(results_df))
        return results_df.nlargest(how_many, metric)[ranking_columns]

    # One ranking table per metric
    top_accuracy = _top_five("Accuracy")
    top_f1 = _top_five("Macro-F1")
    top_neg = _top_five("Negative-F1")

    ranking_tables = [
        ("TOP 5 CONFIGURATIONS BY ACCURACY", top_accuracy),
        ("TOP 5 CONFIGURATIONS BY MACRO-F1", top_f1),
        (
            "TOP 5 CONFIGURATIONS BY NEGATIVE-F1 (Critical for Financial Risk)",
            top_neg,
        ),
    ]
    for heading, ranking in ranking_tables:
        print("\n" + "=" * 80)
        print(heading)
        print("=" * 80)
        display(ranking.round(4))

    # Single best row, chosen by Macro-F1
    best_overall = results_df.loc[results_df["Macro-F1"].idxmax()]
    print("\n" + "=" * 80)
    print("üèÜ BEST OVERALL CONFIGURATION (by Macro-F1)")
    print("=" * 80)
    for caption in ("Experiment", "Model", "Strategy"):
        print(f"{caption}: {best_overall[caption]}")
    for caption, column in (
        ("Accuracy", "Accuracy"),
        ("Macro-F1", "Macro-F1"),
        ("Negative-F1", "Negative-F1"),
        ("Error Rate", "Error_Rate"),
    ):
        print(f"{caption}: {best_overall[column]:.4f}")

## 7. Statistical Analysis & Improvement Trends

In [None]:
if results_df.empty or "Zero-Shot" not in results_df["Strategy"].values:
    print(
        "‚ö†Ô∏è  Skipping improvement analysis - insufficient data or no Zero-Shot baseline"
    )
else:
    # Baseline: mean scores of the zero-shot experiments
    zero_shot_rows = results_df[results_df["Strategy"] == "Zero-Shot"]
    baseline_acc = zero_shot_rows["Accuracy"].mean()
    baseline_f1 = zero_shot_rows["Macro-F1"].mean()

    def _percent_gain(value, baseline):
        """Relative change versus ``baseline`` in percent; 0 if baseline is not positive."""
        if baseline > 0:
            return (value - baseline) / baseline * 100
        return 0

    # One summary row per strategy, in order of first appearance in results_df
    improvements = []
    for strategy in results_df["Strategy"].unique():
        strategy_data = results_df[results_df["Strategy"] == strategy]
        strategy_acc = strategy_data["Accuracy"].mean()
        strategy_f1 = strategy_data["Macro-F1"].mean()
        strategy_neg = strategy_data["Negative-F1"].mean()

        acc_improvement = _percent_gain(strategy_acc, baseline_acc)
        f1_improvement = _percent_gain(strategy_f1, baseline_f1)

        improvements.append(
            {
                "Strategy": strategy,
                "Mean_Accuracy": strategy_acc,
                "Mean_Macro-F1": strategy_f1,
                "Mean_Negative-F1": strategy_neg,
                "Accuracy_Improvement_%": acc_improvement,
                "F1_Improvement_%": f1_improvement,
            }
        )

    improvement_df = pd.DataFrame(improvements)

    print("\n" + "=" * 80)
    print("IMPROVEMENT OVER ZERO-SHOT BASELINE")
    print("=" * 80)
    display(improvement_df.round(2))

    # Sort the rows into the fixed strategy order before plotting
    strategy_order = ["Zero-Shot", "Few-Shot", "Chain-of-Thought", "Tree-of-Thought"]
    improvement_df["Strategy"] = pd.Categorical(
        improvement_df["Strategy"], categories=strategy_order, ordered=True
    )
    improvement_df = improvement_df.sort_values("Strategy")

    # Paired bars per strategy: accuracy gain (left) and Macro-F1 gain (right)
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(improvement_df))
    width = 0.35
    half_width = width / 2

    bars1 = ax.bar(
        x - half_width,
        improvement_df["Accuracy_Improvement_%"],
        width,
        label="Accuracy Improvement",
        alpha=0.8,
    )
    bars2 = ax.bar(
        x + half_width,
        improvement_df["F1_Improvement_%"],
        width,
        label="Macro-F1 Improvement",
        alpha=0.8,
    )

    ax.set_title("Performance Improvement Trend", fontsize=15, weight="bold", pad=20)
    ax.set_xlabel("Prompting Strategy Complexity ‚Üí", fontsize=13, weight="bold")
    ax.set_ylabel("% Improvement over Zero-Shot", fontsize=13, weight="bold")
    ax.set_xticks(x)
    ax.set_xticklabels(improvement_df["Strategy"], rotation=30, ha="right")
    # Zero line separates gains from losses relative to the baseline
    ax.axhline(y=0, color="black", linestyle="-", linewidth=0.8)
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    plt.tight_layout()
    plt.savefig("improvement_trend.png", dpi=300, bbox_inches="tight")
    plt.show()

## 8. Radar Chart: Multi-Metric Comparison

In [None]:
if results_df.empty:
    print("‚ö†Ô∏è  Skipping radar chart - no data available")
else:
    from math import pi

    # One spoke per metric; the first angle is repeated so each outline closes
    categories = ["Accuracy", "Macro-F1", "Precision", "Recall", "Negative-F1"]
    N = len(categories)

    angles = [spoke / float(N) * 2 * pi for spoke in range(N)]
    angles.append(angles[0])

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection="polar"))

    strategy_order = ["Zero-Shot", "Few-Shot", "Chain-of-Thought", "Tree-of-Thought"]
    colors = ["#3498DB", "#2ECC71", "#F39C12", "#E74C3C"]

    # Draw one outline per strategy that occurs in the data
    for strategy, outline_color in zip(strategy_order, colors):
        if strategy not in results_df["Strategy"].values:
            continue

        rows_for_strategy = results_df[results_df["Strategy"] == strategy]
        strategy_data = rows_for_strategy[categories].mean().values.tolist()
        strategy_data.append(strategy_data[0])  # close the polygon

        ax.plot(
            angles,
            strategy_data,
            "o-",
            linewidth=2,
            label=strategy,
            color=outline_color,
        )
        ax.fill(angles, strategy_data, alpha=0.15, color=outline_color)

    # Label each spoke (the duplicated closing angle is left out)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, size=12)
    ax.set_ylim(0, 1)
    ax.set_title(
        "Multi-Metric Comparison of Prompting Strategies",
        size=15,
        weight="bold",
        pad=20,
    )
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1), fontsize=11)
    ax.grid(True)

    plt.tight_layout()
    plt.savefig("radar_chart_comparison.png", dpi=300, bbox_inches="tight")
    plt.show()

## 9. Cost-Performance Analysis

Estimate computational costs based on token usage and compare with performance.

In [None]:
if results_df.empty:
    print("‚ö†Ô∏è  Skipping cost analysis - no data available")
else:
    # Relative cost per prompting strategy (zero-shot = 1.0)
    cost_mapping = {
        "Zero-Shot": 1.0,  # Baseline
        "Few-Shot": 2.5,  # ~2.5x more tokens for examples
        "Chain-of-Thought": 3.0,  # ~3x for reasoning
        "Tree-of-Thought": 4.5,  # ~4.5x for multi-path exploration
    }

    # Relative cost per model (Groq API pricing)
    model_cost_multiplier = {
        "Mixtral-8x7B": 1.0,  # $0.24/1M tokens (baseline)
        "Llama-3.1-70B": 2.5,  # $0.59/1M tokens (~2.5x)
        "FinBERT": 0.0,  # Free (local)
    }

    def _cost_efficiency(row):
        """Macro-F1 per unit of cost; zero-cost rows get Macro-F1 * 1000."""
        total_cost = row["Relative_Total_Cost"]
        if total_cost > 0:
            return row["Macro-F1"] / total_cost
        return row["Macro-F1"] * 1000

    # Derived cost columns are written back onto results_df
    results_df["Strategy_Cost"] = results_df["Strategy"].map(cost_mapping)
    results_df["Model_Cost"] = results_df["Model"].map(model_cost_multiplier)
    results_df["Relative_Total_Cost"] = (
        results_df["Strategy_Cost"] * results_df["Model_Cost"]
    )
    results_df["Cost_Efficiency"] = results_df.apply(_cost_efficiency, axis=1)

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    scatter_ax, ranking_ax = axes

    # Left panel: cost against Macro-F1, one scatter series per model
    for model in results_df["Model"].unique():
        model_data = results_df[results_df["Model"] == model]
        scatter_ax.scatter(
            model_data["Relative_Total_Cost"],
            model_data["Macro-F1"],
            s=200,
            alpha=0.6,
            label=model,
        )

        # Tag every point with its experiment id
        for _, row in model_data.iterrows():
            scatter_ax.annotate(
                row["Experiment"],
                (row["Relative_Total_Cost"], row["Macro-F1"]),
                fontsize=9,
                ha="center",
                va="bottom",
            )

    scatter_ax.set_title(
        "Cost-Performance Trade-off Analysis", fontsize=15, weight="bold", pad=20
    )
    scatter_ax.set_xlabel("Relative Computational Cost", fontsize=13, weight="bold")
    scatter_ax.set_ylabel("Macro-F1 Score", fontsize=13, weight="bold")
    scatter_ax.legend(title="Model", fontsize=11)
    scatter_ax.grid(alpha=0.3)
    scatter_ax.set_ylim([0, 1])

    # Right panel: up to eight experiments with the highest efficiency
    top_efficient = results_df.nlargest(min(8, len(results_df)), "Cost_Efficiency")
    bar_positions = range(len(top_efficient))

    bars = ranking_ax.barh(bar_positions, top_efficient["Cost_Efficiency"], alpha=0.8)
    ranking_ax.set_yticks(bar_positions)
    bar_captions = [
        f"{exp_name} ({model_name[:10]}+{strategy_name[:5]})"
        for exp_name, model_name, strategy_name in zip(
            top_efficient["Experiment"],
            top_efficient["Model"],
            top_efficient["Strategy"],
        )
    ]
    ranking_ax.set_yticklabels(bar_captions, fontsize=10)
    ranking_ax.set_xlabel("Cost Efficiency (F1 / Cost)", fontsize=13, weight="bold")
    ranking_ax.set_title(
        "Most Cost-Efficient Experiments", fontsize=15, weight="bold", pad=20
    )
    ranking_ax.grid(axis="x", alpha=0.3)

    # Colour each bar by its model; unknown models fall back to grey
    model_colors = {
        "Mixtral-8x7B": "#FF6B6B",
        "Llama-3.1-70B": "#4ECDC4",
        "FinBERT": "#45B7D1",
    }
    for bar, model_name in zip(bars, top_efficient["Model"]):
        bar.set_color(model_colors.get(model_name, "#95A5A6"))

    plt.tight_layout()
    plt.savefig("cost_performance_tradeoff.png", dpi=300, bbox_inches="tight")
    plt.show()

    # Full table, most efficient first
    print("\n" + "=" * 100)
    print("COST EFFICIENCY RANKING (F1 / Relative Cost)")
    print("=" * 100)
    cost_table_columns = [
        "Experiment",
        "Model",
        "Strategy",
        "Macro-F1",
        "Relative_Total_Cost",
        "Cost_Efficiency",
    ]
    cost_ranked = results_df[cost_table_columns].sort_values(
        "Cost_Efficiency", ascending=False
    )
    display(cost_ranked.round(4))

## 10. Key Findings & Recommendations

### Summary of Results

This section will be populated with insights once experiments are run.

In [None]:
# Production-readiness thresholds. They are named once so that the filter
# below and the criteria line printed to the user cannot drift apart.
MIN_MACRO_F1 = 0.70
MIN_NEGATIVE_F1 = 0.50
MAX_ERROR_RATE = 0.05

# Print headline findings and deployment recommendations derived from
# `results_df` (one row per experiment). Reads the columns "Experiment",
# "Model", "Strategy", "Macro-F1", "Accuracy", "Negative-F1", "Error_Rate",
# "Cost_Efficiency" and "Relative_Total_Cost"; when `results_df` is empty it
# only prints a hint to run the experiment notebooks first.
if not results_df.empty:
    print("\n" + "=" * 100)
    print("KEY FINDINGS & RECOMMENDATIONS")
    print("=" * 100)

    # 1. Best model: highest Macro-F1 averaged over all of that model's rows.
    #    The grouped mean is computed once and reused for both label and value.
    model_means = results_df.groupby("Model")["Macro-F1"].mean()
    best_model = model_means.idxmax()
    best_model_f1 = model_means.max()

    print(f"\n1Ô∏è‚É£  BEST OVERALL MODEL: {best_model}")
    print(f"   Average Macro-F1 across all strategies: {best_model_f1:.4f}")

    # 2. Best strategy: highest Macro-F1 averaged over all of that strategy's rows.
    strategy_means = results_df.groupby("Strategy")["Macro-F1"].mean()
    best_strategy = strategy_means.idxmax()
    best_strategy_f1 = strategy_means.max()

    print(f"\n2Ô∏è‚É£  BEST PROMPTING STRATEGY: {best_strategy}")
    print(f"   Average Macro-F1 across all models: {best_strategy_f1:.4f}")

    # 3. Best single experiment row by Macro-F1.
    best_combo = results_df.loc[results_df["Macro-F1"].idxmax()]

    print(f"\n3Ô∏è‚É£  BEST OVERALL COMBINATION: {best_combo['Experiment']}")
    print(f"   Model: {best_combo['Model']}")
    print(f"   Strategy: {best_combo['Strategy']}")
    print(f"   Macro-F1: {best_combo['Macro-F1']:.4f}")
    print(f"   Accuracy: {best_combo['Accuracy']:.4f}")
    print(f"   Negative-F1: {best_combo['Negative-F1']:.4f}")

    # 4. Experiment row with the highest Cost_Efficiency value.
    most_efficient = results_df.loc[results_df["Cost_Efficiency"].idxmax()]

    print(f"\n4Ô∏è‚É£  MOST COST-EFFICIENT: {most_efficient['Experiment']}")
    print(f"   {most_efficient['Model']} + {most_efficient['Strategy']}")
    print(f"   Macro-F1: {most_efficient['Macro-F1']:.4f}")
    print(f"   Relative Cost: {most_efficient['Relative_Total_Cost']:.2f}x baseline")

    # 5. Experiment row with the highest F1 on the negative class.
    best_negative = results_df.loc[results_df["Negative-F1"].idxmax()]

    print(f"\n5Ô∏è‚É£  BEST NEGATIVE CLASS DETECTION: {best_negative['Experiment']}")
    print(f"   {best_negative['Model']} + {best_negative['Strategy']}")
    print(
        f"   Negative-F1: {best_negative['Negative-F1']:.4f} (Critical for financial risk)"
    )

    # 6. Production recommendations
    print("\n" + "=" * 100)
    print("PRODUCTION DEPLOYMENT RECOMMENDATIONS")
    print("=" * 100)

    # Keep only rows that satisfy all three thresholds, best Macro-F1 first.
    meets_criteria = (
        (results_df["Macro-F1"] >= MIN_MACRO_F1)
        & (results_df["Negative-F1"] >= MIN_NEGATIVE_F1)
        & (results_df["Error_Rate"] <= MAX_ERROR_RATE)
    )
    production_candidates = results_df[meets_criteria].sort_values(
        "Macro-F1", ascending=False
    )

    if not production_candidates.empty:
        print("\n‚úÖ PRODUCTION-READY CONFIGURATIONS:")
        # The criteria text is rendered from the same constants used in the filter.
        print(
            f"   (Macro-F1 ‚â• {MIN_MACRO_F1:.2f}, "
            f"Negative-F1 ‚â• {MIN_NEGATIVE_F1:.2f}, "
            f"Error Rate ‚â§ {MAX_ERROR_RATE:.0%})\n"
        )
        for _, row in production_candidates.iterrows():
            print(f"   ‚Ä¢ {row['Experiment']}: {row['Model']} + {row['Strategy']}")
            print(
                f"     Macro-F1: {row['Macro-F1']:.4f}, Negative-F1: {row['Negative-F1']:.4f}, "
                f"Cost: {row['Relative_Total_Cost']:.2f}x"
            )
    else:
        print("\n‚ö†Ô∏è  No configurations meet all production criteria")
        print("   Consider relaxing thresholds or improving prompts")

    print("\nüìä SCENARIO-BASED RECOMMENDATIONS:")
    print("\n   üí∞ Budget Unlimited (Best Performance):")
    print(
        f"      ‚Üí Use {best_combo['Experiment']}: {best_combo['Model']} + {best_combo['Strategy']}"
    )
    print(f"      ‚Üí Macro-F1: {best_combo['Macro-F1']:.4f}")

    print("\n   üí∏ Cost-Conscious (Best Value):")
    print(
        f"      ‚Üí Use {most_efficient['Experiment']}: {most_efficient['Model']} + {most_efficient['Strategy']}"
    )
    print(
        f"      ‚Üí Macro-F1: {most_efficient['Macro-F1']:.4f} at {most_efficient['Relative_Total_Cost']:.2f}x cost"
    )

    # The local-deployment scenario is printed only if FinBERT rows were loaded.
    finbert_exps = results_df[results_df["Model"] == "FinBERT"].sort_values(
        "Macro-F1", ascending=False
    )
    if not finbert_exps.empty:
        best_finbert = finbert_exps.iloc[0]
        print("\n   üÜì Free/Local Deployment:")
        print(
            f"      ‚Üí Use {best_finbert['Experiment']}: FinBERT + {best_finbert['Strategy']}"
        )
        print(f"      ‚Üí Macro-F1: {best_finbert['Macro-F1']:.4f} (No API costs)")

else:
    print("‚ö†Ô∏è  No data available for generating recommendations")
    print("Please run experiment notebooks first to generate results files.")

## 11. Export Complete Results

In [None]:
# Export the comparison tables as timestamped CSV files in the current working
# directory, list any known visualization PNGs already present there, and print
# a summary. When `results_df` is empty nothing is written and a short
# "next steps" guide is printed instead.
if not results_df.empty:
    from datetime import datetime
    import os

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    print("\n" + "=" * 100)
    print("üíæ EXPORTING RESULTS")
    print("=" * 100)
    print(f"\nTimestamp: {timestamp}\n")

    csv_files = []  # (filename, row count, description) for each CSV written
    viz_files = []  # (filename, size in KB) for each visualization found

    # Save complete results
    print("Saving CSV files...")
    results_file = f"complete_results_comparison_{timestamp}.csv"
    results_df.to_csv(results_file, index=False)
    csv_files.append(
        (results_file, len(results_df), "Complete results with all metrics")
    )
    print(f"  ‚úì {results_file} ({len(results_df)} experiments)")

    # Optional tables that earlier cells may or may not have created:
    # (variable name, file stem, description, unit label, write the index?)
    optional_exports = [
        ("strategy_summary", "strategy_summary", "Strategy-wise averages", "strategies", True),
        ("model_summary", "model_summary", "Model-wise averages", "models", True),
        ("improvement_df", "improvement_analysis", "Improvement over baseline", "strategies", False),
    ]
    # At cell (module) level locals() is the notebook namespace, so a table is
    # looked up by name and skipped when its cell was never run.
    namespace = locals()
    for var_name, file_stem, description, unit, keep_index in optional_exports:
        frame = namespace.get(var_name)
        if frame is None or frame.empty:
            continue
        out_file = f"{file_stem}_{timestamp}.csv"
        frame.to_csv(out_file, index=keep_index)
        csv_files.append((out_file, len(frame), description))
        print(f"  ‚úì {out_file} ({len(frame)} {unit})")

    # List visualizations
    print("\nChecking visualizations...")
    potential_viz = [
        "strategy_comparison.png",
        "model_comparison.png",
        "performance_heatmaps.png",
        "improvement_trend.png",
        "radar_chart_comparison.png",
        "cost_performance_tradeoff.png",
    ]

    for viz in potential_viz:
        if os.path.exists(viz):
            size_kb = os.path.getsize(viz) / 1024
            viz_files.append((viz, size_kb))
            print(f"  ‚úì {viz} ({size_kb:.1f} KB)")

    # Final summary
    print("\n" + "=" * 100)
    print("‚úÖ EXPORT COMPLETE")
    print("=" * 100)

    print(f"\nüìÑ CSV Files ({len(csv_files)}):")
    for filename, _row_count, desc in csv_files:
        print(f"   ‚Ä¢ {filename}")
        print(f"     ‚îî‚îÄ {desc}")

    if viz_files:
        total_size = sum(size for _, size in viz_files)
        print(f"\nüìä Visualizations ({len(viz_files)}):")
        for filename, size in viz_files:
            print(f"   ‚Ä¢ {filename} ({size:.1f} KB)")
        print(f"\n   Total size: {total_size:.1f} KB ({total_size / 1024:.2f} MB)")

    print(
        f"\nüéâ Analysis complete! {len(csv_files)} CSV files and {len(viz_files)} visualizations ready."
    )
    # Every CSV above is written with a bare filename and every PNG is looked
    # up by bare filename, so the outputs are in the working directory itself,
    # not in a "Results/" subfolder of it.
    print(f"üìÅ Location: {os.getcwd()}")

else:
    print("\n" + "=" * 100)
    print("‚ùå EXPORT FAILED")
    print("=" * 100)
    print("\n‚ö†Ô∏è  No results to export - no experiment data loaded\n")
    print("üìã Next steps:")
    print("   1. Navigate to experiment folders (Zero_Shot, Few_Shot, etc.)")
    print("   2. Run notebooks E1-E12 to generate CSV results")
    print("   3. Return to this notebook to perform comparative analysis")
    print(
        "\nüí° TIP: You can run experiments incrementally - this notebook will analyze whatever is available."
    )