In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn plotly -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os

# Notebook-wide plotting defaults: seaborn grid style, "husl" colour palette,
# and a default figure size for figures that do not pass their own `figsize`.
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams.update({"figure.figsize": (16, 8)})

print("✓ Libraries loaded")

## 1. Load All Results

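Collect the metric summaries from all prompting-strategy experiments. The values entered below are placeholders (all 0.0); replace them with the actual results from the individual experiment notebooks before interpreting any of the analysis that follows.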

In [None]:
# Placeholder results table: one record per (model, prompting strategy) run.
# Every metric is 0.0 until the real numbers from the individual experiment
# notebooks are entered here (in practice, load them from the saved CSV files).

RESULT_FIELDS = (
    "Experiment",
    "Model",
    "Strategy",
    "Accuracy",
    "Macro-F1",
    "Precision",
    "Recall",
)

# Each tuple follows RESULT_FIELDS:
# (experiment id, model, strategy, accuracy, macro-F1, precision, recall)
result_rows = [
    # Zero-Shot Results
    ("E1", "Gemini Pro", "Zero-Shot", 0.0, 0.0, 0.0, 0.0),
    ("E2", "Gemini Flash", "Zero-Shot", 0.0, 0.0, 0.0, 0.0),
    ("E3", "Llama-3.3-70B", "Zero-Shot", 0.0, 0.0, 0.0, 0.0),
    # Few-Shot Results
    ("E4", "Gemini Pro", "Few-Shot", 0.0, 0.0, 0.0, 0.0),
    ("E5", "Gemini Flash", "Few-Shot", 0.0, 0.0, 0.0, 0.0),
    ("E6", "Llama-3.3-70B", "Few-Shot", 0.0, 0.0, 0.0, 0.0),
    # Chain-of-Thought Results
    ("E7", "Gemini Pro", "Chain-of-Thought", 0.0, 0.0, 0.0, 0.0),
    ("E8", "Gemini Flash", "Chain-of-Thought", 0.0, 0.0, 0.0, 0.0),
    ("E9", "Llama-3.3-70B", "Chain-of-Thought", 0.0, 0.0, 0.0, 0.0),
    # Tree-of-Thought Results
    ("E10", "Gemini Pro", "Tree-of-Thought", 0.0, 0.0, 0.0, 0.0),
    ("E10b", "Gemini Flash", "Tree-of-Thought", 0.0, 0.0, 0.0, 0.0),
    ("E10c", "Llama-3.3-70B", "Tree-of-Thought", 0.0, 0.0, 0.0, 0.0),
]

# Expand the compact rows into the list-of-dicts form the DataFrame is built from.
all_results = [dict(zip(RESULT_FIELDS, row)) for row in result_rows]
results_df = pd.DataFrame(all_results)

banner = "=" * 80
print(f"\n{banner}")
print("ALL EXPERIMENT RESULTS")
print(banner)
display(results_df)

print("\nNote: Run individual experiment notebooks first to populate actual results.")
print("This notebook demonstrates the analysis framework.")

## 2. Strategy-wise Performance Comparison

In [None]:
# Mean of every metric per prompting strategy, averaged over the models.
metric_columns = ["Accuracy", "Macro-F1", "Precision", "Recall"]
strategy_summary = results_df.groupby("Strategy")[metric_columns].mean()

rule = "=" * 80
print(f"\n{rule}")
print("AVERAGE PERFORMANCE BY PROMPTING STRATEGY")
print(rule)
display(strategy_summary.round(4))

# Grouped bar chart: one cluster per strategy, one bar per metric.
fig, ax = plt.subplots(figsize=(12, 6))
strategy_summary.plot.bar(ax=ax, width=0.8, alpha=0.8)

axis_label_style = {"fontsize": 13, "weight": "bold"}
ax.set_xlabel("Prompting Strategy", **axis_label_style)
ax.set_ylabel("Score", **axis_label_style)
ax.set_title(
    "Performance Comparison Across Prompting Strategies",
    fontsize=15,
    weight="bold",
    pad=20,
)
ax.set_xticklabels(strategy_summary.index, rotation=45, ha="right")
ax.legend(title="Metrics", fontsize=11)
# All metrics are plotted on a fixed 0-1 axis so the charts are comparable.
ax.set_ylim(0, 1)
ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig("strategy_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

## 3. Model-wise Performance Comparison

In [None]:
# Mean of every metric per model, averaged over the prompting strategies.
model_summary = results_df.groupby("Model")[
    ["Accuracy", "Macro-F1", "Precision", "Recall"]
].mean()

print("\n" + "=" * 80)
print("AVERAGE PERFORMANCE BY MODEL")
print("=" * 80)
display(model_summary.round(4))

# Grouped bar chart: one cluster per model, one bar per metric.
fig, ax = plt.subplots(figsize=(10, 6))

# Colours are applied per plotted column (metric), not per model, so the list
# needs one entry for each of the four metrics. With only three colours pandas
# wraps around and draws "Recall" in the same colour as "Accuracy", which makes
# the legend ambiguous.
metric_colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#9B59B6"]

model_summary.plot(kind="bar", ax=ax, width=0.7, alpha=0.8, color=metric_colors)
ax.set_xlabel("Model", fontsize=13, weight="bold")
ax.set_ylabel("Score", fontsize=13, weight="bold")
ax.set_title("Performance Comparison Across Models", fontsize=15, weight="bold", pad=20)
ax.set_xticklabels(model_summary.index, rotation=45, ha="right")
ax.legend(title="Metrics", fontsize=11)
# All metrics are plotted on a fixed 0-1 axis so the charts are comparable.
ax.set_ylim([0, 1])
ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig("model_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

## 4. Heatmap: All Experiments

In [None]:
# Model x Strategy grids, one per metric. `pivot` needs every (Model, Strategy)
# pair to appear at most once in results_df.
pivot_accuracy = results_df.pivot(index="Model", columns="Strategy", values="Accuracy")
pivot_f1 = results_df.pivot(index="Model", columns="Strategy", values="Macro-F1")

# Side-by-side heatmaps sharing the same 0-1 colour scale.
fig, axes = plt.subplots(1, 2, figsize=(18, 5))

# (grid to draw, metric name used in labels, colormap)
heatmap_specs = [
    (pivot_accuracy, "Accuracy", "YlGnBu"),
    (pivot_f1, "Macro-F1", "RdYlGn"),
]

for panel, (grid, metric_name, colormap) in zip(axes, heatmap_specs):
    sns.heatmap(
        grid,
        annot=True,
        fmt=".3f",
        cmap=colormap,
        cbar_kws={"label": metric_name},
        ax=panel,
        vmin=0,
        vmax=1,
    )
    panel.set_title(
        f"{metric_name} by Model and Strategy", fontsize=13, weight="bold", pad=15
    )
    panel.set_xlabel("Prompting Strategy", fontsize=11, weight="bold")
    panel.set_ylabel("Model", fontsize=11, weight="bold")

plt.tight_layout()
plt.savefig("performance_heatmaps.png", dpi=300, bbox_inches="tight")
plt.show()

## 5. Best Performing Configurations

In [None]:
# Leaderboards: the five best runs by accuracy and by macro-F1, followed by the
# single best run by macro-F1.
summary_columns = ["Experiment", "Model", "Strategy", "Accuracy", "Macro-F1"]
rule = "=" * 80

print(f"\n{rule}")
print("TOP 5 CONFIGURATIONS BY ACCURACY")
print(rule)
top_accuracy = results_df.nlargest(5, "Accuracy")[summary_columns]
display(top_accuracy)

print(f"\n{rule}")
print("TOP 5 CONFIGURATIONS BY MACRO-F1")
print(rule)
top_f1 = results_df.nlargest(5, "Macro-F1")[summary_columns]
display(top_f1)

# Row with the highest Macro-F1; on ties idxmax picks the first such row.
best_row_label = results_df["Macro-F1"].idxmax()
best_overall = results_df.loc[best_row_label]

print(f"\n{rule}")
print("BEST OVERALL CONFIGURATION")
print(rule)
for field in ("Experiment", "Model", "Strategy"):
    print(f"{field}: {best_overall[field]}")
for metric in ("Accuracy", "Macro-F1"):
    print(f"{metric}: {best_overall[metric]:.4f}")

## 6. Statistical Analysis

In [None]:
# Percentage change in mean accuracy of every non-baseline strategy, relative to
# the mean accuracy of the Zero-Shot runs.
baseline = results_df.loc[results_df["Strategy"] == "Zero-Shot", "Accuracy"].mean()

# A relative improvement is undefined when the baseline is 0 (as it is with the
# placeholder results) or NaN (no Zero-Shot rows at all). Report NaN in those
# cases instead of dividing by zero.
baseline_is_usable = bool(pd.notna(baseline) and baseline != 0)
if not baseline_is_usable:
    print(
        f"Warning: Zero-Shot baseline accuracy is {baseline}; "
        "% improvement is undefined and reported as NaN."
    )

improvements = []
for strategy in results_df["Strategy"].unique():
    if strategy == "Zero-Shot":
        continue  # the baseline is not compared against itself
    strategy_mean = results_df.loc[
        results_df["Strategy"] == strategy, "Accuracy"
    ].mean()
    if baseline_is_usable:
        improvement = ((strategy_mean - baseline) / baseline) * 100
    else:
        improvement = float("nan")
    improvements.append(
        {
            "Strategy": strategy,
            "Mean Accuracy": strategy_mean,
            "% Improvement over Zero-Shot": improvement,
        }
    )

# Explicit columns keep the frame well-formed even if there is no strategy other
# than the baseline (an empty list would otherwise give a column-less frame).
improvement_df = pd.DataFrame(
    improvements,
    columns=["Strategy", "Mean Accuracy", "% Improvement over Zero-Shot"],
)

print("\n" + "=" * 80)
print("IMPROVEMENT OVER ZERO-SHOT BASELINE")
print("=" * 80)
display(improvement_df.round(2))

## 7. Radar Chart: Multi-Metric Comparison

In [None]:
from math import pi

# Radar chart: one polygon per prompting strategy, one spoke per metric, using
# the strategy's mean value for each metric.
categories = ["Accuracy", "Macro-F1", "Precision", "Recall"]
N = len(categories)

# Evenly spaced spoke angles; the first angle is repeated at the end so that
# every polygon is drawn closed.
angles = []
for spoke in range(N):
    angles.append(spoke / float(N) * 2 * pi)
angles.append(angles[0])

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={"projection": "polar"})

for strategy in results_df["Strategy"].unique():
    in_strategy = results_df["Strategy"] == strategy
    mean_scores = results_df.loc[in_strategy, categories].mean().tolist()
    # Repeat the first score to match the repeated first angle.
    closed_scores = mean_scores + mean_scores[:1]
    ax.plot(angles, closed_scores, "o-", linewidth=2, label=strategy)
    ax.fill(angles, closed_scores, alpha=0.15)

# Label only the N distinct spokes (skip the duplicated closing angle).
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, size=12)
ax.set_ylim(0, 1)
ax.set_title(
    "Multi-Metric Comparison of Prompting Strategies", size=15, weight="bold", pad=20
)
# Anchor the legend outside the polar axes.
ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1), fontsize=11)
ax.grid(True)

plt.tight_layout()
plt.savefig("radar_chart_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

## 8. Cost-Performance Analysis

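Estimate the relative computational cost of each prompting strategy and compare it with performance (Macro-F1). The cost multipliers used below are rough assumptions, not measurements.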

In [None]:
# Assumed cost multiplier of each strategy relative to Zero-Shot
# (covering tokens, API calls, time). These are estimates, not measurements.
cost_mapping = {
    "Zero-Shot": 1.0,  # Baseline
    "Few-Shot": 1.5,  # 50% more tokens for examples
    "Chain-of-Thought": 2.0,  # 2x for reasoning
    "Tree-of-Thought": 3.0,  # 3x for multi-path exploration
}

# Both columns are added to results_df itself, so they are also part of any
# later use of that frame (e.g. when it is written to CSV).
results_df["Relative_Cost"] = results_df["Strategy"].map(cost_mapping)
results_df["Cost_Efficiency"] = results_df["Macro-F1"].div(results_df["Relative_Cost"])

# Scatter of cost against Macro-F1: one colour per model, one point per experiment.
fig, ax = plt.subplots(figsize=(12, 7))

for model in results_df["Model"].unique():
    model_data = results_df.loc[results_df["Model"] == model]
    costs = model_data["Relative_Cost"]
    scores = model_data["Macro-F1"]
    ax.scatter(costs, scores, s=200, alpha=0.6, label=model)

    # Tag every point with its experiment id.
    for experiment_id, cost, score in zip(model_data["Experiment"], costs, scores):
        ax.annotate(experiment_id, (cost, score), fontsize=9, ha="right")

ax.set_xlabel("Relative Computational Cost", fontsize=13, weight="bold")
ax.set_ylabel("Macro-F1 Score", fontsize=13, weight="bold")
ax.set_title("Cost-Performance Trade-off Analysis", fontsize=15, weight="bold", pad=20)
ax.legend(title="Model", fontsize=11)
ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig("cost_performance_tradeoff.png", dpi=300, bbox_inches="tight")
plt.show()

# Experiments ordered from most to least Macro-F1 per unit of relative cost.
rule = "=" * 80
print(f"\n{rule}")
print("COST EFFICIENCY RANKING (F1 / Cost)")
print(rule)
ranking_columns = [
    "Experiment",
    "Model",
    "Strategy",
    "Macro-F1",
    "Relative_Cost",
    "Cost_Efficiency",
]
cost_ranked = results_df[ranking_columns].sort_values(
    "Cost_Efficiency", ascending=False
)
display(cost_ranked.round(4))

## 9. Key Findings & Recommendations

### Summary of Results:

Based on the comprehensive analysis above:

1. **Best Performing Strategy**: [To be determined after running experiments]
2. **Best Model**: [To be determined after running experiments]
3. **Best Overall Configuration**: [Experiment ID + details]

### Recommendations:

**For Production Deployment**:
- If cost is not a constraint: Use [Best F1 configuration]
- If cost-efficiency matters: Use [Best cost-efficiency configuration]
- If speed is critical: Use [Fastest configuration with acceptable performance]

**For Research**:
- Advanced reasoning (CoT, ToT) shows [improvement %] over baseline
- Few-shot learning is expected to provide a good balance of performance and simplicity [confirm once results are populated]
- Trade-offs between model size and prompting strategy: [to be summarized after running experiments]

### Next Steps:
1. Run experiments on full dataset (not just sample)
2. Perform statistical significance testing
3. Conduct error analysis on misclassifications
4. Test on different financial domains for generalization

## 10. Export Complete Results

In [None]:
from datetime import datetime

# A single timestamp shared by all CSV files written by this run.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Build each filename once so the names written and the names reported match.
results_csv = f"complete_results_comparison_{timestamp}.csv"
strategy_csv = f"strategy_summary_{timestamp}.csv"
model_csv = f"model_summary_{timestamp}.csv"

# The full results table is written without its index; the two summaries keep
# theirs (the strategy / model labels).
results_df.to_csv(results_csv, index=False)
strategy_summary.to_csv(strategy_csv)
model_summary.to_csv(model_csv)

print(f"\n✓ All comparison results saved with timestamp: {timestamp}")
print("\nFiles created:")
for csv_name in (results_csv, strategy_csv, model_csv):
    print(f"  - {csv_name}")

# Figures written by the plotting cells earlier in the notebook.
print("\nVisualizations saved:")
for png_name in (
    "strategy_comparison.png",
    "model_comparison.png",
    "performance_heatmaps.png",
    "radar_chart_comparison.png",
    "cost_performance_tradeoff.png",
):
    print(f"  - {png_name}")