# Comprehensive Results Comparison: All 12 Risk Assessment Experiments

**Experiment Matrix**: 4 Approaches × 3 Models = 12 Experiments

| Approach | Mistral-8x7B-32768 | Llama-3.1-70B-Versatile | Llama-3.3-70B |
|----------|-------------------|------------------------|---------------|
| **Zero-Shot** | R1 | R2 | R3 |
| **Few-Shot** | R4 | R5 | R6 |
| **Chain-of-Thought** | R7 | R8 | R9 |
| **Tree-of-Thought** | R10 | R11 | R12 |

**Dataset**: FinancialPhraseBank Sentences_AllAgree.txt (2,217 samples)

This notebook provides a comprehensive comparison across all risk assessment experiments with automatic data loading.

In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn plotly -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    matthews_corrcoef,
)

# Global plotting defaults for the figures created in this notebook.
sns.set_style("whitegrid")  # seaborn "whitegrid" style
sns.set_palette("husl")  # seaborn "husl" colour palette
plt.rcParams["figure.figsize"] = (16, 8)  # default figure size: (width, height) in inches

# Confirmation that the imports and plotting setup above ran without error.
print("‚úì Libraries loaded successfully")

## 2. Calculate Comprehensive Metrics

Calculate all evaluation metrics including MCC for each experiment.

In [None]:
# Experiment metadata.
# IDs R1..R12 walk the strategy x model grid row by row: the prompting
# strategy changes every three IDs, the model changes with every ID.
_strategy_dirs = [
    ("Zero-Shot", "Zero_Shot"),
    ("Few-Shot", "Few_Shot"),
    ("Chain-of-Thought", "Chain_of_Thought"),
    ("Tree-of-Thought", "Tree_of_Thought"),
]
_model_names = ["Mistral-8x7B", "Llama-3.1-70B", "Llama-3.3-70B"]

experiments = {
    f"R{row * len(_model_names) + col + 1}": {
        "model": model_name,
        "strategy": strategy_name,
        "dir": dir_name,
    }
    for row, (strategy_name, dir_name) in enumerate(_strategy_dirs)
    for col, model_name in enumerate(_model_names)
}

# File patterns for each experiment
# Glob patterns keyed by experiment ID; the "../" prefix makes them relative to
# the parent of the current working directory. The trailing "*" is a wildcard,
# so more than one file may match a pattern.
# NOTE(review): the R1/R4/R7/R10 and R2/R5/R8/R11 filenames mention GPT_OSS_20B /
# GPT_OSS_120B, while the experiment metadata labels those runs Mistral-8x7B /
# Llama-3.1-70B — confirm which naming is correct.
# NOTE(review): R9 uses hyphens ("Llama-3.3-70B") and R11 carries an extra
# "flash" token, unlike the other entries — verify against the files on disk.
file_patterns = {
    "R1": "../Zero_Shot/r1_GPT_OSS_20B_zero_shot_*.csv",
    "R2": "../Zero_Shot/r2_GPT_OSS_120B_zero_shot_*.csv",
    "R3": "../Zero_Shot/r3_Llama_3.3_70B_zero_shot_*.csv",
    "R4": "../Few_Shot/r4_GPT_OSS_20B_few_shot_*.csv",
    "R5": "../Few_Shot/r5_GPT_OSS_120B_few_shot_*.csv",
    "R6": "../Few_Shot/r6_Llama_3.3_70B_few_shot_*.csv",
    "R7": "../Chain_of_Thought/r7_GPT_OSS_20B_cot_*.csv",
    "R8": "../Chain_of_Thought/r8_GPT_OSS_120B_cot_*.csv",
    "R9": "../Chain_of_Thought/r9_Llama-3.3-70B_cot_*.csv",
    "R10": "../Tree_of_Thought/r10_GPT_OSS_20B_tot_*.csv",
    "R11": "../Tree_of_Thought/r11_GPT_OSS_120B_flash_tot_*.csv",
    "R12": "../Tree_of_Thought/r12_Llama_3.3_70B_tot_*.csv",
}

# Load results: for every experiment, read the match that sorts last by
# filename into results_data; experiments without a match are reported
# and skipped.
banner = "=" * 100
print(banner)
print("üìÅ LOADING EXPERIMENT RESULTS")
print(banner)

results_data = {}
for exp_id, pattern in file_patterns.items():
    matches = sorted(glob(pattern))
    if not matches:
        print(f"‚ö†Ô∏è  {exp_id}: No files found for pattern {pattern}")
        continue

    # Lexicographically last match is treated as the latest file.
    newest = matches[-1]
    frame = pd.read_csv(newest)
    results_data[exp_id] = frame
    print(f"‚úì {exp_id}: {os.path.basename(newest)} ({len(frame)} samples)")

print(f"\n‚úì Loaded {len(results_data)} experiments successfully")

## 1. Load All Results

Load the most recent per-sample results CSV for each prompting-strategy experiment.

In [None]:
def calculate_metrics(df, exp_id, model, strategy):
    """Summarise one experiment's predictions as a flat dict of metrics.

    Rows whose ``predicted_sentiment`` is not ``positive``, ``negative`` or
    ``neutral`` are counted as parsing errors and excluded from every score.

    Args:
        df: DataFrame with ``true_sentiment`` and ``predicted_sentiment``
            columns.
        exp_id: Experiment identifier, stored under ``Experiment``.
        model: Model label, stored under ``Model``.
        strategy: Prompting-strategy label, stored under ``Strategy``.

    Returns:
        A dict holding sample counts, the parsing-error rate, overall scores
        (accuracy, macro/weighted F1, macro precision/recall, MCC) and
        per-class F1/precision/recall; ``None`` (after printing a warning)
        when no row has a valid prediction.
    """
    class_labels = ["positive", "negative", "neutral"]

    # Only rows with a recognised label take part in scoring.
    scored = df[df["predicted_sentiment"].isin(class_labels)].copy()
    if scored.empty:
        print(f"‚ö†Ô∏è  {exp_id}: No valid predictions found")
        return None

    truth = scored["true_sentiment"]
    guess = scored["predicted_sentiment"]

    n_total = len(df)
    n_valid = len(scored)
    n_unparsed = n_total - n_valid

    summary = {
        "Experiment": exp_id,
        "Model": model,
        "Strategy": strategy,
        "Total_Samples": n_total,
        "Valid_Predictions": n_valid,
        "Parsing_Errors": n_unparsed,
        "Error_Rate": n_unparsed / n_total if n_total > 0 else 0,
        "Accuracy": accuracy_score(truth, guess),
        "Macro-F1": f1_score(truth, guess, average="macro", zero_division=0),
        "Weighted-F1": f1_score(truth, guess, average="weighted", zero_division=0),
        "Precision": precision_score(truth, guess, average="macro", zero_division=0),
        "Recall": recall_score(truth, guess, average="macro", zero_division=0),
        "MCC": matthews_corrcoef(truth, guess),
    }

    # Per-class scores come back as arrays ordered like class_labels.
    per_class_scorers = (
        ("F1", f1_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    )
    per_class = {
        suffix: scorer(
            truth, guess, labels=class_labels, average=None, zero_division=0
        )
        for suffix, scorer in per_class_scorers
    }
    for position, label in enumerate(class_labels):
        for suffix, _ in per_class_scorers:
            summary[f"{label.capitalize()}-{suffix}"] = per_class[suffix][position]

    return summary


# Calculate metrics for all loaded experiments.
# The banner is printed first so that any warnings emitted by
# calculate_metrics appear underneath it rather than above it.
print("\n" + "=" * 100)
print("üìä CALCULATING METRICS")
print("=" * 100)
print(f"\nProcessing {len(results_data)} experiments...\n")

all_metrics = []
for exp_id, df in results_data.items():
    exp_info = experiments[exp_id]
    metrics = calculate_metrics(df, exp_id, exp_info["model"], exp_info["strategy"])
    # calculate_metrics returns None when an experiment has no valid predictions.
    if metrics is not None:
        all_metrics.append(metrics)

# Create metrics DataFrame (one row per successfully scored experiment)
metrics_df = pd.DataFrame(all_metrics)

if metrics_df.empty:
    # An empty list yields a DataFrame without columns, so the summary below
    # would fail with a KeyError; report the situation explicitly instead.
    print(
        "No metrics could be calculated: no experiment produced valid "
        "predictions. Check the result files loaded above."
    )
else:
    # Display summary
    print("‚úì Metrics calculated for all experiments")
    print(f"\nTotal experiments: {len(metrics_df)}")
    print(f"Strategies: {metrics_df['Strategy'].nunique()}")
    print(f"Models: {metrics_df['Model'].nunique()}")

    display(
        metrics_df[
            ["Experiment", "Model", "Strategy", "Accuracy", "Macro-F1", "MCC"]
        ].round(4)
    )

In [None]:
# Export the full metrics table (every column, no index) to CSV.
output_file = "risk_assessment_comprehensive_metrics.csv"
metrics_df.to_csv(output_file, index=False)

n_experiments, n_columns = metrics_df.shape
print(f"\n‚úì Comprehensive metrics saved to: {output_file}")
print(f"  Total experiments: {n_experiments}")
print(f"  Columns: {n_columns}")

closing_rule = "=" * 100
print("\n" + closing_rule)
print("‚úì COMPREHENSIVE RESULTS COMPARISON COMPLETE")
print(closing_rule)

## 7. Export Summary

Save comprehensive results to CSV for further analysis.

In [None]:
# Top-3 experiments per headline metric, plus the three lowest error rates.
rule = "=" * 100
print("\n" + rule)
print("üèÜ BEST PERFORMERS")
print(rule)

score_cols = ["Experiment", "Model", "Strategy", "Accuracy", "Macro-F1", "MCC"]

rankings = {}
for position, ranked_by in enumerate(["Accuracy", "Macro-F1", "MCC"], start=1):
    print(f"\n{position}. Highest {ranked_by}:")
    rankings[ranked_by] = metrics_df.nlargest(3, ranked_by)[score_cols]
    display(rankings[ranked_by].round(4))

# Keep the individual tables available under their usual names.
best_acc = rankings["Accuracy"]
best_f1 = rankings["Macro-F1"]
best_mcc = rankings["MCC"]

print("\n4. Lowest Error Rate:")
error_cols = ["Experiment", "Model", "Strategy", "Error_Rate", "Valid_Predictions"]
best_error = metrics_df.nsmallest(3, "Error_Rate")[error_cols]
display(best_error.round(4))

## 6. Best Performers

In [None]:
# Grouped bar chart: one group per experiment, one bar per sentiment class.
fig, ax = plt.subplots(figsize=(16, 8))

x = np.arange(len(metrics_df))
width = 0.25

# (class label, horizontal shift of its bar inside each group, bar colour)
class_series = [
    ("Positive", -width, "#2ecc71"),
    ("Negative", 0, "#e74c3c"),
    ("Neutral", width, "#95a5a6"),
]
bars1, bars2, bars3 = [
    ax.bar(
        x + shift,
        metrics_df[f"{cls}-F1"],
        width,
        label=cls,
        alpha=0.8,
        color=shade,
    )
    for cls, shift, shade in class_series
]

axis_font = {"fontsize": 12, "weight": "bold"}
ax.set_xlabel("Experiment", **axis_font)
ax.set_ylabel("F1 Score", **axis_font)
ax.set_title(
    "Per-Class F1 Scores Across All Risk Assessment Experiments",
    fontsize=14,
    weight="bold",
)
ax.set_xticks(x)
ax.set_xticklabels(metrics_df["Experiment"], rotation=0)
ax.legend(fontsize=11)
ax.set_ylim([0, 1])
ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig("risk_assessment_per_class_f1.png", dpi=300, bbox_inches="tight")
plt.show()

print("‚úì Per-class F1 chart saved")

## 5. Per-Class Performance Analysis

In [None]:
# Model comparison: average Accuracy / Macro-F1 / MCC per model.
#
# Everything this cell needs is defined here so it can run on its own.
# The model order is taken from the experiment metadata, which is where the
# "Model" labels in metrics_df come from, so the bar labels always match.
metrics_to_plot = ["Accuracy", "Macro-F1", "MCC"]
colors = ["#FF6B6B", "#4ECDC4", "#45B7D1"]
models = list(dict.fromkeys(info["model"] for info in experiments.values()))

fig, axes = plt.subplots(1, len(metrics_to_plot), figsize=(18, 6))

for ax, metric, color in zip(axes, metrics_to_plot, colors):
    # Models with no loaded experiment become NaN after reindexing.
    model_data = metrics_df.groupby("Model")[metric].mean().reindex(models)
    positions = range(len(models))
    bars = ax.bar(
        positions,
        model_data,
        color=color,
        alpha=0.7,
        edgecolor="black",
        linewidth=1.5,
    )

    # Add value labels (skipped for models without data)
    for bar, val in zip(bars, model_data):
        if pd.isna(val):
            continue
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.01,
            f"{val:.4f}",
            ha="center",
            va="bottom",
            fontsize=10,
            weight="bold",
        )

    ax.set_xlabel("Model", fontsize=12, weight="bold")
    ax.set_ylabel(metric, fontsize=12, weight="bold")
    ax.set_title(f"Average {metric} by Model", fontsize=14, weight="bold")
    ax.set_xticks(positions)
    ax.set_xticklabels(models, rotation=45, ha="right")

    # MCC gets a tighter upper limit; Series.max() ignores NaN, and a
    # non-positive or missing maximum falls back to 1 so the limits stay valid.
    upper = 1
    if metric == "MCC":
        peak = model_data.max()
        if peak > 0:
            upper = peak * 1.1
    ax.set_ylim([0, upper])
    ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig("risk_assessment_model_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

print("‚úì Model comparison chart saved")

In [None]:
# Strategy comparison: average Accuracy / Macro-F1 / MCC per prompting strategy.
#
# Everything this cell needs is defined here so it can run on its own.
# The strategy order is taken from the experiment metadata, which is where the
# "Strategy" labels in metrics_df come from.
metrics_to_plot = ["Accuracy", "Macro-F1", "MCC"]
colors = ["#FF6B6B", "#4ECDC4", "#45B7D1"]
strategies = list(dict.fromkeys(info["strategy"] for info in experiments.values()))

fig, axes = plt.subplots(1, len(metrics_to_plot), figsize=(18, 6))

for ax, metric, color in zip(axes, metrics_to_plot, colors):
    # Strategies with no loaded experiment become NaN after reindexing.
    strategy_data = metrics_df.groupby("Strategy")[metric].mean().reindex(strategies)
    positions = range(len(strategies))
    bars = ax.bar(
        positions,
        strategy_data,
        color=color,
        alpha=0.7,
        edgecolor="black",
        linewidth=1.5,
    )

    # Add value labels (skipped for strategies without data)
    for bar, val in zip(bars, strategy_data):
        if pd.isna(val):
            continue
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.01,
            f"{val:.4f}",
            ha="center",
            va="bottom",
            fontsize=10,
            weight="bold",
        )

    ax.set_xlabel("Prompting Strategy", fontsize=12, weight="bold")
    ax.set_ylabel(metric, fontsize=12, weight="bold")
    ax.set_title(f"Average {metric} by Strategy", fontsize=14, weight="bold")
    ax.set_xticks(positions)
    ax.set_xticklabels(strategies, rotation=45, ha="right")

    # MCC gets a tighter upper limit; Series.max() ignores NaN, and a
    # non-positive or missing maximum falls back to 1 so the limits stay valid.
    upper = 1
    if metric == "MCC":
        peak = strategy_data.max()
        if peak > 0:
            upper = peak * 1.1
    ax.set_ylim([0, upper])
    ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig("risk_assessment_strategy_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

print("‚úì Strategy comparison chart saved")

In [None]:
# Performance heatmap: one strategy x model grid per metric.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))

# Prepare data for heatmaps.
# Row/column order comes from the experiment metadata, the same source as the
# "Strategy" and "Model" labels in metrics_df. A hard-coded list with other
# model names would not match any pivot column and would raise a KeyError.
strategies = list(dict.fromkeys(info["strategy"] for info in experiments.values()))
models = list(dict.fromkeys(info["model"] for info in experiments.values()))

for ax, metric in zip(axes.flat, ["Accuracy", "Macro-F1", "MCC", "Error_Rate"]):
    # Create pivot table
    pivot_data = metrics_df.pivot_table(
        values=metric, index="Strategy", columns="Model", aggfunc="mean"
    )
    # reindex() keeps the full grid: strategy/model combinations that were not
    # loaded show up as NaN cells instead of raising a KeyError.
    pivot_data = pivot_data.reindex(index=strategies, columns=models)

    # Create heatmap (colour scale reversed for Error_Rate, where lower is better)
    sns.heatmap(
        pivot_data,
        annot=True,
        fmt=".4f",
        cmap="RdYlGn" if metric != "Error_Rate" else "RdYlGn_r",
        ax=ax,
        cbar_kws={"label": metric},
        vmin=0,
        vmax=1 if metric != "MCC" else None,
    )
    ax.set_title(f"{metric} by Strategy and Model", fontsize=14, weight="bold")
    ax.set_xlabel("Model", fontsize=12)
    ax.set_ylabel("Strategy", fontsize=12)

plt.tight_layout()
plt.savefig("risk_assessment_performance_heatmap.png", dpi=300, bbox_inches="tight")
plt.show()

print("‚úì Performance heatmap saved")

## 4. Visualizations

In [None]:
# Full comparison table followed by per-strategy and per-model statistics.
rule = "=" * 100

print("\n" + rule)
print("üìà COMPREHENSIVE PERFORMANCE COMPARISON")
print(rule)

overview_cols = [
    "Experiment",
    "Model",
    "Strategy",
    "Valid_Predictions",
    "Accuracy",
    "Macro-F1",
    "Weighted-F1",
    "Precision",
    "Recall",
    "MCC",
]
display(metrics_df[overview_cols].round(4))

# Summary statistics
print("\n" + rule)
print("üìä SUMMARY STATISTICS")
print(rule)

headline_metrics = ["Accuracy", "Macro-F1", "MCC"]
aggregations = ["mean", "std", "min", "max"]

print("\nBy Strategy:")
strategy_summary = metrics_df.groupby("Strategy")[headline_metrics].agg(aggregations)
display(strategy_summary.round(4))

print("\nBy Model:")
model_summary = metrics_df.groupby("Model")[headline_metrics].agg(aggregations)
display(model_summary.round(4))

## 3. Overall Performance Comparison