## Set Up

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
from pathlib import Path

# Resolve the working directory only the first time this cell runs; on later
# runs the name already exists in the notebook namespace and is reused, so the
# repeated chdir below does not climb one directory higher each time.
try:
    workding_dir
except NameError:
    workding_dir = str(Path.cwd().parent)

# Make the chosen directory both the process cwd and an import root.
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("workding dir:", workding_dir)

from dotenv import find_dotenv, load_dotenv

# Prefer a real .env file; when that lookup comes back empty, fall back to the
# .env.example file instead.
found_dotenv = find_dotenv(".env") or find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=True)

In [None]:
%%time

from llm_toolkit.eval_openai import *
from tqdm.notebook import tqdm

tqdm.pandas()

# Read the run configuration from the environment in one pass. Each value is
# whatever os.getenv returns: a string, or None when the variable is unset.
data_path, results_path, num_ctx = (
    os.getenv(name) for name in ("DATA_PATH", "RESULTS_PATH", "NUM_CTX")
)
data_path, results_path, num_ctx

In [None]:
# run cells above before running anything below

## Impact of Few-Shot Examples

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Start from matplotlib's default style, then apply seaborn's "husl" palette.
plt.style.use("default")
sns.set_palette("husl")

# Load the per-dataset metric tables (paths are relative to the working dir).
goemotions_df, amazon_df, imdb_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/GoEmotions_results_metrics.csv",
        "results/amazon_reviews_results_metrics.csv",
        "results/imdb_reviews_metrics.csv",
    )
)


# Function to extract model data across shots
def extract_model_data(df, model_name, use_5_level=False):
    """Collect one model's per-shot accuracy and F1 as percentages.

    Args:
        df: Metrics table with "model" and "shots" columns plus the metric
            columns selected by ``use_5_level``.
        model_name: Exact value to match in the "model" column.
        use_5_level: When True, read "accuracy_5_level" / "f1_5_level"
            instead of "accuracy" / "f1".

    Returns:
        None when no row matches ``model_name``; otherwise a dict with
        "shots", "accuracy" and "f1" arrays ordered by ascending shots, the
        two metric arrays multiplied by 100.
    """
    rows = df.loc[df["model"] == model_name]
    if rows.empty:
        return None

    rows = rows.sort_values("shots")
    metric_columns = {
        "accuracy": "accuracy_5_level" if use_5_level else "accuracy",
        "f1": "f1_5_level" if use_5_level else "f1",
    }

    result = {"shots": rows["shots"].values}
    for key, column in metric_columns.items():
        result[key] = rows[column].values * 100  # fraction -> percentage
    return result


# Extract the per-shot series for the two headline models.
# The IMDB lookups use "DeepSeek-R1" / "GPT-4o" while the other two tables are
# queried with lower-case names; each call returns None when nothing matches.
deepseek_goemotions = extract_model_data(goemotions_df, "deepseek-r1")
deepseek_amazon = extract_model_data(amazon_df, "deepseek-r1", use_5_level=True)
deepseek_imdb = extract_model_data(imdb_df, "DeepSeek-R1")

gpt4o_goemotions = extract_model_data(goemotions_df, "gpt-4o")
gpt4o_amazon = extract_model_data(amazon_df, "gpt-4o", use_5_level=True)
gpt4o_imdb = extract_model_data(imdb_df, "GPT-4o")

# Create the multi-panel figure with smaller height
fig, axes = plt.subplots(1, 3, figsize=(15, 3))
# fig.suptitle('Few-Shot Learning Efficiency Across Sentiment Analysis Tasks', fontsize=16, fontweight='bold')

# Colors for models
deepseek_color = "#2E8B57"  # Sea Green
gpt4o_color = "#FF6B6B"  # Coral Red


def _annotate_best_f1(ax, series, color, offset):
    """Add a callout at the highest-F1 point of ``series``.

    The label's shot count is read from the data at the arg-max position, so
    the text always agrees with the point the arrow targets.

    Args:
        ax: Axes to annotate.
        series: Dict from ``extract_model_data`` (or None, in which case
            nothing is drawn).
        color: Colour used for the callout box and arrow.
        offset: (dx, dy) text offset in points relative to the data point.
    """
    if not series:
        return
    best = np.argmax(series["f1"])
    best_shots = series["shots"][best]
    best_f1 = series["f1"][best]
    ax.annotate(
        f"{int(best_shots)} shots\n{best_f1:.2f}%",
        xy=(best_shots, best_f1),
        xytext=offset,
        textcoords="offset points",
        bbox=dict(boxstyle="round,pad=0.3", facecolor=color, alpha=0.3),
        arrowprops=dict(arrowstyle="->", color=color),
    )


def _draw_f1_panel(ax, title, deepseek, gpt4o, y_lim, deepseek_offset, gpt4o_offset):
    """Draw one F1-vs-shots panel comparing DeepSeek-R1 and GPT-4o.

    Both curves are drawn only when both series are available. The
    best-point callouts are added per model whenever that model has data.

    Args:
        ax: Axes to draw on.
        title: Panel title.
        deepseek: DeepSeek-R1 series dict or None.
        gpt4o: GPT-4o series dict or None.
        y_lim: (low, high) limits for the y axis.
        deepseek_offset: Callout text offset (points) for DeepSeek-R1.
        gpt4o_offset: Callout text offset (points) for GPT-4o.
    """
    curves_drawn = bool(deepseek and gpt4o)
    if curves_drawn:
        ax.plot(
            deepseek["shots"],
            deepseek["f1"],
            "o-",
            color=deepseek_color,
            linewidth=2,
            markersize=6,
            label="DeepSeek-R1",
        )
        ax.plot(
            gpt4o["shots"],
            gpt4o["f1"],
            "s--",
            color=gpt4o_color,
            linewidth=2,
            markersize=6,
            label="GPT-4o",
        )

    ax.set_title(title, fontweight="bold")
    ax.set_xlabel("Number of Shots")
    ax.set_ylabel("F1 Score (%)")
    ax.grid(True, alpha=0.3)
    if curves_drawn:
        # Without labelled curves a legend would be empty and only trigger a
        # matplotlib warning.
        ax.legend()
    ax.set_ylim(*y_lim)

    _annotate_best_f1(ax, deepseek, deepseek_color, deepseek_offset)
    _annotate_best_f1(ax, gpt4o, gpt4o_color, gpt4o_offset)


# Panel A: IMDB (Binary Sentiment) - Using F1 score
ax1 = axes[0]
_draw_f1_panel(
    ax1,
    "(a) IMDB Binary Sentiment",
    deepseek_imdb,
    gpt4o_imdb,
    y_lim=(60, 105),
    deepseek_offset=(15, -50),  # callouts moved down into blank space
    gpt4o_offset=(5, -50),
)

# Panel B: Amazon (5-Level Sentiment) - Using F1 score
ax2 = axes[1]
_draw_f1_panel(
    ax2,
    "(b) Amazon 5-Level Sentiment",
    deepseek_amazon,
    gpt4o_amazon,
    y_lim=(65, 95),
    deepseek_offset=(-25, -40),
    gpt4o_offset=(20, 12),
)

# Panel C: GoEmotions (27-Class Emotion) - Using F1 score
ax3 = axes[2]
_draw_f1_panel(
    ax3,
    "(c) GoEmotions 27-Class Emotion",
    deepseek_goemotions,
    gpt4o_goemotions,
    y_lim=(38, 48),
    deepseek_offset=(20, 5),
    gpt4o_offset=(-15, -50),
)

# Adjust layout
plt.tight_layout()

# Save the figure
# plt.savefig('results/few_shot_learning_analysis.png', dpi=300, bbox_inches='tight')
plt.savefig("results/few_shot_learning_analysis.pdf", bbox_inches="tight")

# Display the plot
plt.show()

def _shot_ratio_note(deepseek_shots, gpt4o_shots):
    """Describe how many times more shots GPT-4o needs than DeepSeek-R1.

    Returns a fixed "undefined" note when DeepSeek-R1's optimum is 0 shots,
    because the ratio would otherwise be a division by zero (inf/nan plus a
    NumPy runtime warning).
    """
    if deepseek_shots == 0:
        return "ratio undefined: DeepSeek-R1 peaks at 0 shots"
    return f"{gpt4o_shots / deepseek_shots:.2f}× difference"


# Print summary statistics using F1 scores
print("=== Few-Shot Learning Analysis Summary (F1 Scores) ===")
print("\nOptimal Shot Requirements:")
if deepseek_imdb and gpt4o_imdb:
    d_opt_imdb = deepseek_imdb["shots"][np.argmax(deepseek_imdb["f1"])]
    g_opt_imdb = gpt4o_imdb["shots"][np.argmax(gpt4o_imdb["f1"])]
    print(
        f"IMDB Binary Sentiment: DeepSeek-R1 = {d_opt_imdb} shots, GPT-4o = {g_opt_imdb} shots ({_shot_ratio_note(d_opt_imdb, g_opt_imdb)})"
    )

if deepseek_amazon and gpt4o_amazon:
    d_opt_amz = deepseek_amazon["shots"][np.argmax(deepseek_amazon["f1"])]
    g_opt_amz = gpt4o_amazon["shots"][np.argmax(gpt4o_amazon["f1"])]
    print(
        f"Amazon 5-Level Sentiment: DeepSeek-R1 = {d_opt_amz} shots, GPT-4o = {g_opt_amz} shots ({_shot_ratio_note(d_opt_amz, g_opt_amz)})"
    )

if deepseek_goemotions and gpt4o_goemotions:
    d_opt_go = deepseek_goemotions["shots"][np.argmax(deepseek_goemotions["f1"])]
    g_opt_go = gpt4o_goemotions["shots"][np.argmax(gpt4o_goemotions["f1"])]
    print(
        f"GoEmotions 27-Class Emotion: DeepSeek-R1 = {d_opt_go} shots, GPT-4o = {g_opt_go} shots ({_shot_ratio_note(d_opt_go, g_opt_go)})"
    )

print("\nBest F1 Performance Achieved:")
# One line per (dataset, model) pair that has data, in a fixed order.
for best_label, best_series in (
    ("IMDB - DeepSeek-R1", deepseek_imdb),
    ("IMDB - GPT-4o", gpt4o_imdb),
    ("Amazon - DeepSeek-R1", deepseek_amazon),
    ("Amazon - GPT-4o", gpt4o_amazon),
    ("GoEmotions - DeepSeek-R1", deepseek_goemotions),
    ("GoEmotions - GPT-4o", gpt4o_goemotions),
):
    if best_series:
        print(f"{best_label}: {np.max(best_series['f1']):.2f}%")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import os

# Start from matplotlib's default style, then apply seaborn's "husl" palette.
plt.style.use("default")
sns.set_palette("husl")

# Load the metric tables; the Amazon results are spread over two files.
goemotions_df, amazon_df, amazon_df2, imdb_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/GoEmotions_results_metrics.csv",
        "results/amazon_reviews_results_metrics.csv",
        "results/amazon_reviews_results_all_metrics.csv",
        "results/imdb_reviews_metrics.csv",
    )
)
amazon_df = pd.concat([amazon_df, amazon_df2], ignore_index=True)

# The IMDB table spells the model "DeepSeek-R1"; rename it to the lower-case
# name used for lookups in the rest of this cell.
imdb_df["model"] = imdb_df["model"].replace({"DeepSeek-R1": "deepseek-r1"})


# Function to extract model data
def extract_model_data(df, model_name, use_5_level=False):
    """Return one model's shots / accuracy / F1 series, sorted by shots.

    Args:
        df: Metrics table with "model", "shots" and the metric columns.
        model_name: Exact value to match in the "model" column.
        use_5_level: Select the "*_5_level" metric columns instead of the
            plain "accuracy" / "f1" columns.

    Returns:
        None if ``model_name`` has no rows, else a dict of arrays keyed
        "shots", "accuracy" and "f1" (metrics scaled by 100).
    """
    selected = df[df["model"] == model_name]
    if len(selected) == 0:
        return None

    ordered = selected.sort_values("shots")
    suffix = "_5_level" if use_5_level else ""
    accuracy_pct = ordered["accuracy" + suffix].values * 100
    f1_pct = ordered["f1" + suffix].values * 100
    return {
        "shots": ordered["shots"].values,
        "accuracy": accuracy_pct,
        "f1": f1_pct,
    }


# One row per DeepSeek model, in plotting order:
# (model name, colour, line style, marker, legend label)
_MODEL_STYLE_TABLE = (
    ("deepseek-r1:8b", "#9467bd", "-", "o", "deepseek-r1:8b"),  # circle
    ("deepseek-r1:14b", "#8c564b", "--", "s", "deepseek-r1:14b"),  # square
    ("deepseek-r1:32b", "#1f77b4", "-.", "^", "deepseek-r1:32b"),  # triangle up
    ("deepseek-r1:70b", "#ff7f0e", ":", "D", "deepseek-r1:70b"),  # diamond
    ("deepseek-r1", "#2E8B57", "-", "P", "DeepSeek-R1"),  # filled plus
)

# Model names to look up in every dataset.
model_labels = [row[0] for row in _MODEL_STYLE_TABLE]

# Per-model plot styling: colour, line style, marker and legend text.
model_styles = {
    name: {
        "color": color,
        "linestyle": linestyle,
        "marker": marker,
        "legend": legend,
    }
    for name, color, linestyle, marker, legend in _MODEL_STYLE_TABLE
}

# Series lookup: data[dataset_key][model_name] -> dict from extract_model_data,
# or None when that model has no rows in that dataset. Only the Amazon table
# is read through its 5-level metric columns.
_dataset_sources = (
    ("imdb", imdb_df, False),
    ("amazon", amazon_df, True),
    ("goemotions", goemotions_df, False),
)
data = {}
for _key, _frame, _five_level in _dataset_sources:
    data[_key] = {
        m: extract_model_data(_frame, m, use_5_level=_five_level)
        for m in model_labels
    }

# One row of three panels, one per dataset.
fig, axes = plt.subplots(1, 3, figsize=(16, 3.5))


def add_annotations(ax, best_performances, dataset_key, y_lim):
    """Label each model's best-F1 point with a callout placed inside the axes.

    Callout positions are hand-tuned per dataset and per model, expressed as
    fractions of the x range (from ``ax.get_xlim()``) and of ``y_lim``. Each
    position is looked up by the entry's "model" name, so a model that has no
    data for a dataset (and is therefore absent from ``best_performances``)
    does not shift the remaining callouts into another model's slot.

    Args:
        ax: Axes-like object providing ``get_xlim()`` and ``annotate()``.
        best_performances: Sequence of dicts with "shots", "f1" and "color",
            and normally "model". Entries without a recognised "model" fall
            back to their position in the sequence.
        dataset_key: "imdb", "amazon" or "goemotions"; for any other key
            every callout is placed at the centre of the axes.
        y_lim: (y_min, y_max) used to convert the y fractions.
    """
    # Slot order of the per-dataset position tables below.
    slot_models = (
        "deepseek-r1:8b",
        "deepseek-r1:14b",
        "deepseek-r1:32b",
        "deepseek-r1:70b",
        "deepseek-r1",
    )

    # (x_frac, y_frac) per model slot, tuned by eye for each dataset.
    slot_fractions = {
        "imdb": [(0.3, 0.2), (0.35, 0.4), (0.85, 0.3), (0.65, 0.4), (0.15, 0.2)],
        "amazon": [(0.85, 0.2), (0.35, 0.67), (0.85, 0.9), (0.67, 0.78), (0.2, 0.9)],
        "goemotions": [
            (0.9, 0.25),
            (0.7, 0.15),
            (0.85, 0.53),
            (0.3, 0.5),
            (0.45, 0.5),
        ],
    }

    x_min, x_max = ax.get_xlim()
    y_min, y_max = y_lim
    slots = slot_fractions.get(dataset_key)

    for i, perf in enumerate(best_performances):
        if slots is None:
            # Unknown dataset: no tuned layout, centre the callout.
            x_frac, y_frac = 0.5, 0.5
        else:
            model_name = perf.get("model")
            slot = slot_models.index(model_name) if model_name in slot_models else i
            if slot >= len(slots):
                continue  # no position defined for this entry
            x_frac, y_frac = slots[slot]

        # Convert fractional positions to data coordinates.
        text_x = x_min + x_frac * (x_max - x_min)
        text_y = y_min + y_frac * (y_max - y_min)

        ax.annotate(
            f"{int(perf['shots'])} shots\n{perf['f1']:.2f}%",
            xy=(perf["shots"], perf["f1"]),  # arrow points at the data point
            xytext=(text_x, text_y),  # text sits at the tuned position
            bbox=dict(
                boxstyle="round,pad=0.3",
                facecolor=perf["color"],
                alpha=0.4,
                edgecolor=perf["color"],
                linewidth=1.5,
            ),
            arrowprops=dict(arrowstyle="->", color=perf["color"], lw=2, alpha=1.0),
            fontsize=8,
            ha="center",
            va="center",
            color="black",  # black text for contrast on the tinted box
            weight="bold",
            clip_on=True,
        )


# Enhanced plot function with improved annotations and different markers
def plot_panel(ax, dataset_key, title, y_lim):
    """Draw every model's F1-vs-shots curve for one dataset and annotate peaks.

    Args:
        ax: Axes to draw on.
        dataset_key: Key into the module-level ``data`` dict.
        title: Panel title.
        y_lim: (low, high) y-axis limits, also used to place the callouts.

    Returns:
        A list with one dict per plotted model describing its best-F1 point
        ("model", "shots", "f1") together with its "color", "legend" and
        "marker" styling. Models without data are skipped.
    """
    peaks = []

    for name, series in data[dataset_key].items():
        if not series:
            continue

        look = model_styles[name]
        ax.plot(
            series["shots"],
            series["f1"],
            marker=look["marker"],  # distinct marker per model
            linestyle=look["linestyle"],
            label=look["legend"],
            color=look["color"],
            linewidth=2,
            markersize=10,  # large markers so every shape stays readable
            markerfacecolor=look["color"],
            markeredgecolor="white",  # white rim separates overlapping markers
            markeredgewidth=1,
        )

        # Remember where this model peaks so the point can be annotated.
        peak = np.argmax(series["f1"])
        peaks.append(
            {
                "model": name,
                "shots": series["shots"][peak],
                "f1": series["f1"][peak],
                "color": look["color"],
                "legend": look["legend"],
                "marker": look["marker"],
            }
        )

    ax.set_title(title, fontweight="bold")
    ax.set_xlabel("Number of Shots")
    ax.set_ylabel("F1 Score (%)")
    ax.grid(True, alpha=0.3)
    ax.set_ylim(y_lim)

    add_annotations(ax, peaks, dataset_key, y_lim)

    return peaks


# Panels
best_perf_1 = plot_panel(axes[0], "imdb", "(a) IMDB Binary Sentiment", (77, 101))
best_perf_2 = plot_panel(axes[1], "amazon", "(b) Amazon 5-Level Sentiment", (55, 95))
best_perf_3 = plot_panel(
    axes[2], "goemotions", "(c) GoEmotions 27-Class Emotion", (20, 47)
)

# Create a single legend for all subplots. Handles are gathered from every
# panel (first occurrence of each label wins) so that a model missing from the
# first panel but plotted in another one still gets a legend entry; entries
# are then listed in model_labels order.
legend_entries = {}
for panel_ax in axes:
    panel_handles, panel_labels = panel_ax.get_legend_handles_labels()
    for panel_handle, panel_label in zip(panel_handles, panel_labels):
        legend_entries.setdefault(panel_label, panel_handle)

labels = [
    model_styles[m]["legend"]
    for m in model_labels
    if model_styles[m]["legend"] in legend_entries
]
handles = [legend_entries[legend_label] for legend_label in labels]

fig.legend(
    handles,
    labels,
    loc="lower center",
    bbox_to_anchor=(0.5, -0.07),
    ncol=5,
    fontsize=10,
    frameon=True,
    fancybox=True,
    shadow=True,
)

# Finalize
plt.tight_layout()
# Adjust layout to make room for the legend at the bottom
plt.subplots_adjust(bottom=0.15)
plt.savefig("results/few_shot_learning_analysis_v2.pdf", bbox_inches="tight")
plt.show()


# Generate comprehensive results CSV (updated with marker info)
def create_results_csv():
    """Write every (dataset, model, shots) measurement to a long-format CSV.

    Reads the module-level ``data``, ``model_labels`` and ``model_styles``,
    creates the "results" directory if needed and writes
    "results/few_shot_analysis_results.csv".

    Returns:
        The DataFrame that was written, one row per measurement, including
        the plot styling (color, linestyle, marker) used for that model.
    """
    dataset_titles = {
        "imdb": "IMDB Binary Sentiment",
        "amazon": "Amazon 5-Level Sentiment",
        "goemotions": "GoEmotions 27-Class Emotion",
    }

    rows = []
    for key, title in dataset_titles.items():
        for name in model_labels:
            series = data[key][name]
            if not series:
                continue  # model has no data for this dataset

            look = model_styles[name]
            rows.extend(
                {
                    "dataset": title,
                    "dataset_key": key,
                    "model": name,
                    "model_display": look["legend"],
                    "shots": shots,
                    "f1_score": f1_value,
                    "accuracy": accuracy_value,
                    "color": look["color"],
                    "linestyle": look["linestyle"],
                    "marker": look["marker"],
                }
                for shots, f1_value, accuracy_value in zip(
                    series["shots"], series["f1"], series["accuracy"]
                )
            )

    results_df = pd.DataFrame(rows)

    # Make sure the output directory exists before writing.
    os.makedirs("results", exist_ok=True)

    csv_filename = "results/few_shot_analysis_results.csv"
    results_df.to_csv(csv_filename, index=False)
    print(f"\nResults saved to: {csv_filename}")

    return results_df


# Generate CSV
results_df = create_results_csv()


# Summary statistics CSV
def create_summary_csv():
    """Write one summary row per (dataset, model) pair to a CSV.

    Reads the module-level ``data``, ``model_labels`` and ``model_styles`` and
    writes "results/few_shot_analysis_summary.csv". Each row holds the best
    and worst F1 (with the shot counts where they occur), the mean F1, the
    best-minus-worst gap and the number of shot settings tested.

    Returns:
        The DataFrame that was written.
    """
    dataset_titles = {
        "imdb": "IMDB Binary Sentiment",
        "amazon": "Amazon 5-Level Sentiment",
        "goemotions": "GoEmotions 27-Class Emotion",
    }

    records = []
    for key, title in dataset_titles.items():
        for name in model_labels:
            series = data[key][name]
            if not series:
                continue  # model has no data for this dataset

            f1_values = series["f1"]
            shot_counts = series["shots"]
            top = np.argmax(f1_values)
            bottom = np.argmin(f1_values)
            look = model_styles[name]

            records.append(
                {
                    "dataset": title,
                    "dataset_key": key,
                    "model": name,
                    "model_display": look["legend"],
                    "max_f1_score": f1_values[top],
                    "max_f1_shots": shot_counts[top],
                    "min_f1_score": f1_values[bottom],
                    "min_f1_shots": shot_counts[bottom],
                    "avg_f1_score": np.mean(f1_values),
                    "f1_improvement": f1_values[top] - f1_values[bottom],
                    "total_shots_tested": len(shot_counts),
                    "marker": look["marker"],
                }
            )

    summary_df = pd.DataFrame(records)
    summary_filename = "results/few_shot_analysis_summary.csv"
    summary_df.to_csv(summary_filename, index=False)
    print(f"Summary statistics saved to: {summary_filename}")

    return summary_df


# Generate summary CSV
summary_df = create_summary_csv()

# Enhanced summary print
print("=== Few-Shot Learning Analysis Summary (F1 Scores) ===")


def print_summary(name, dataset):
    """Print the max, min and mean F1 of one series plus their spread.

    Args:
        name: Label printed at the start of the line.
        dataset: Dict with "f1" and "shots" sequences, or a falsy value, in
            which case nothing is printed.
    """
    if not dataset:
        return

    f1_values = dataset["f1"]
    peak = np.argmax(f1_values)
    trough = np.argmin(f1_values)
    spread = f1_values[peak] - f1_values[trough]

    parts = (
        f"{name}: Max F1 = {f1_values[peak]:.2f}% at {dataset['shots'][peak]} shots",
        f"Min F1 = {f1_values[trough]:.2f}%",
        f"Avg F1 = {np.mean(f1_values):.2f}%",
        f"Improvement = {spread:.2f}%",
    )
    print(" | ".join(parts))


# Per-dataset summary: one line per model that has data.
for dataset_key, label in (
    ("imdb", "IMDB Binary Sentiment"),
    ("amazon", "Amazon 5-Level Sentiment"),
    ("goemotions", "GoEmotions 27-Class Emotion"),
):
    print(f"\n{label}")
    for model_name in model_labels:
        print_summary(
            model_styles[model_name]["legend"], data[dataset_key][model_name]
        )

# Show the shape and the first rows of both generated tables for verification.
for preview_heading, preview_frame, preview_rows in (
    ("\n=== Preview of Generated Results CSV ===", results_df, 10),
    ("\n=== Preview of Summary CSV ===", summary_df, 5),
):
    print(preview_heading)
    print(f"Shape: {preview_frame.shape}")
    print(preview_frame.head(preview_rows).to_string(index=False))

# Print marker mapping for reference
print("\n=== Marker Mapping ===")
for model_name in model_labels:
    style = model_styles[model_name]
    print("{legend}: {marker} ({color})".format(**style))