In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
from pathlib import Path

# Pick the project root: reuse `workding_dir` when it is already defined
# (presumably injected by a caller or left over from an earlier run of this
# cell - confirm), otherwise fall back to the parent of the current directory.
if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)

# Make the project root both the working directory and an import root.
os.chdir(workding_dir)
# Guard the append so that re-running this cell does not keep stacking the
# same directory onto sys.path.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("workding dir:", workding_dir)

In [None]:
from dotenv import find_dotenv, load_dotenv

# Prefer a real ".env"; when the lookup comes back empty, fall back to the
# checked-in ".env.example" template.
found_dotenv = find_dotenv(".env") or find_dotenv(".env.example")

print(f"loading env vars from: {found_dotenv}")
# override=True: values from the file replace variables already set in the
# process environment.
load_dotenv(found_dotenv, override=True)

In [None]:
from llm_toolkit.data_utils import *

# Run configuration. The results CSV location is fixed here rather than read
# from the RESULTS_PATH environment variable; DATA_PATH and NUM_CTX come from
# the environment and are None when unset.
results_path = "results/amazon_reviews_results_all_metrics.csv"
data_path = os.environ.get("DATA_PATH")
num_ctx = os.environ.get("NUM_CTX")

# Echo the effective configuration as the cell output.
(data_path, results_path, num_ctx)

In [None]:
import pandas as pd

# Load the primary results table from `results_path` and print its
# column / dtype / non-null summary.
df = pd.read_csv(filepath_or_buffer=results_path)
df.info()

In [None]:
# Display the primary results table as the cell output.
df

In [None]:
# Load the second metrics file and print its column / dtype / non-null summary.
df2 = pd.read_csv(
    filepath_or_buffer="results/amazon_reviews_results_metrics.csv",
)
df2.info()

In [None]:
# Combine the primary results (re-read from `results_path`) with `df2`.
#
# The frame is rebuilt from its source instead of concatenating onto the
# current `df`: with `df = pd.concat([df, df2])`, every re-run of this cell
# appended `df2` again and silently duplicated its rows.
# ignore_index=True gives the combined frame a clean 0..n-1 index rather than
# two overlapping runs of row labels.
df = pd.concat([pd.read_csv(results_path), df2], ignore_index=True)
df

In [None]:
# Distinct model names found in the combined results, as a plain list.
models = list(df["model"].unique())
models

In [None]:
# Working copy of the rows whose model is listed in `models`, re-indexed from
# zero so later edits never touch `df`.
metrics_df = df.loc[df["model"].isin(models)].reset_index(drop=True).copy()
metrics_df.info()

In [None]:
def _display_model_name(name):
    """Return `name` with the display-casing substitutions applied in order.

    The last pair deliberately undoes the "DeepSeek-R1" re-casing for names
    where it is followed by a colon, so those keep the lowercase
    "deepseek-r1:" prefix.
    """
    substitutions = (
        ("gpt-4o", "GPT-4o"),
        ("deepseek-v3", "DeepSeek-V3"),
        ("deepseek-r1", "DeepSeek-R1"),
        ("DeepSeek-R1:", "deepseek-r1:"),
    )
    for old, new in substitutions:
        name = name.replace(old, new)
    return name


# Rewrite the model column in place with the display names.
metrics_df["model"] = metrics_df["model"].apply(_display_model_name)
metrics_df

In [None]:
# Persist the normalised metrics table; the row index is not written.
metrics_df.to_csv(
    path_or_buf="results/amazon_reviews_metrics.csv",
    index=False,
)

In [None]:
# Refresh `models` so it holds the normalised display names used for plotting.
models = pd.unique(metrics_df["model"]).tolist()
models

In [None]:
from llm_toolkit.data_utils import *

# Plot the `f1_5_level` column for every model, labelled
# "Multi-Level F1 Score".
plot_metrics_vs_shots(
    metrics_df, models, markers,
    ["f1_5_level"],
    ["Multi-Level F1 Score"],
    ylimits_offset=0.0005,
    bbox_to_anchor=(0.5, -0.44),
)

In [None]:
from llm_toolkit.data_utils import *

# Plot the aggregate `f1` column for every model.
# Earlier experiments with ylimits=(0.6, 0.8) and log_scales=[False, True]
# are intentionally left disabled.
plot_metrics_vs_shots(
    metrics_df, models, markers,
    ["f1"],
    ["Aggregate F1 Score"],
    ylimits_offset=0.0005,
    bbox_to_anchor=(0.5, -0.44),
)

In [None]:
from llm_toolkit.data_utils import *

# Plot `f1_5_level` next to `accuracy_5_level`, with the y-axes synchronised.
# Earlier experiments with ylimits=(0.6, 0.8) and log_scales=[False, True]
# are intentionally left disabled.
plot_metrics_vs_shots(
    metrics_df, models, markers,
    ["f1_5_level", "accuracy_5_level"],
    ["F1 Score", "Accuracy"],
    sync_y_axis=True,
    bbox_to_anchor=(0.5, -0.75),
)

In [None]:
from llm_toolkit.data_utils import *

# Plot `f1_5_level` next to `eval_time` (labelled as mean evaluation time in
# seconds).
# Earlier experiments with ylimits=(0.6, 0.8) and log_scales=[False, True]
# are intentionally left disabled.
plot_metrics_vs_shots(
    metrics_df, models, markers,
    ["f1_5_level", "eval_time"],
    ["F1 Score", "Mean Eval Time (s)"],
    bbox_to_anchor=(0.5, -0.75),
)

In [None]:
from llm_toolkit.data_utils import *

# Plot `f1_5_level` next to `eval_speed` (labelled as throughput in token/s).
# Earlier experiments with ylimits=(0.6, 0.8) and log_scales=[False, True]
# are intentionally left disabled.
plot_metrics_vs_shots(
    metrics_df, models, markers,
    ["f1_5_level", "eval_speed"],
    ["F1 Score", "Throughput (token/s)"],
    bbox_to_anchor=(0.5, -0.75),
)

In [None]:
from llm_toolkit.data_utils import *

# Build the zero-shot summary table via the project helper, keyed on the
# `f1_5_level` column (presumably one zero-shot row per model - confirm
# against llm_toolkit.data_utils).
zero_shot_metrics_df = get_zero_shot_metrics_df(
    metrics_df,
    col="f1_5_level",
)
zero_shot_metrics_df

In [None]:
def _with_shot_suffix(row):
    """Return the row's model name plus a second line reading "(<shots>-shot)"."""
    return row["model"] + f"\n({row['shots']}-shot)"


# Build the top-results table via the project helper, keyed on `f1_5_level`
# (presumably the best row per model - confirm against
# llm_toolkit.data_utils), then label each model with its shot count.
top_metrics_df = get_top_metrics_df(metrics_df, col="f1_5_level")
top_metrics_df["model"] = top_metrics_df.apply(_with_shot_suffix, axis=1)
top_metrics_df

In [None]:
from llm_toolkit.data_utils import plot_barcharts_for_dual_metrics

# Dual-metric bar chart with the helper's default columns and title; only
# `decimal_places` is overridden (presumably the label precision for the
# first and second metric - confirm).
plot_barcharts_for_dual_metrics(
    top_metrics_df,
    decimal_places=(3, 0),
)

In [None]:
# Aggregate-level F1 vs. accuracy, both passed with use_percentage=True.
plot_barcharts_for_dual_metrics(
    top_metrics_df,
    columns=("f1", "accuracy"),
    title="Aggregate Level F1 and Accuracy Scores Across Models",
    ylabels=("F1 Score (%)", "Accuracy (%)"),
    y_limit_offsets=(30, 30),
    decimal_places=(2, 2),
    use_percentage=(True, True),
)

In [None]:
# Multi-level F1 vs. accuracy, both passed with use_percentage=True.
# NOTE(review): this title says "Amazon Review Dataset" while the other
# figure titles in this notebook say "Amazon Reviews Dataset" - confirm
# which spelling is intended.
plot_barcharts_for_dual_metrics(
    top_metrics_df,
    columns=("f1_5_level", "accuracy_5_level"),
    title="Amazon Review Dataset - Multi Level F1 and Accuracy Scores Across Models",
    ylabels=("F1 Score (%)", "Accuracy (%)"),
    y_limit_offsets=(30, 30),
    decimal_places=(2, 2),
    use_percentage=(True, True),
)

In [None]:
# Aggregate-level F1 (use_percentage=True) vs. `eval_time` (use_percentage=False).
plot_barcharts_for_dual_metrics(
    top_metrics_df,
    columns=("f1", "eval_time"),
    title="Amazon Reviews Dataset - Aggregate Level F1 and Mean Eval Time Across Models",
    ylabels=("F1 Score (%)", "Mean Eval Time (s)"),
    y_limit_offsets=(10, 1),
    decimal_places=(2, 3),
    use_percentage=(True, False),
)

In [None]:
# Multi-level F1 (use_percentage=True) vs. `eval_time` (use_percentage=False).
plot_barcharts_for_dual_metrics(
    top_metrics_df,
    columns=("f1_5_level", "eval_time"),
    title="Amazon Reviews Dataset - Best F1 and Mean Eval Time Across Models",
    ylabels=("F1 Score (%)", "Mean Eval Time (s)"),
    y_limit_offsets=(10, 1),
    decimal_places=(2, 3),
    use_percentage=(True, False),
)

In [None]:
# Inspect the DeepSeek-V3 rows.
# The model column was re-cased earlier in this notebook ("deepseek-v3" ->
# "DeepSeek-V3"), so a case-sensitive search for the lowercase spelling can
# never match and always produced an empty frame. Match case-insensitively,
# and treat missing model names as non-matches so the mask stays boolean.
metrics_df[metrics_df["model"].str.contains("deepseek-v3", case=False, na=False)]

In [None]:
# Aggregate `f1` (use_percentage=True) vs. `eval_speed` (use_percentage=False).
# NOTE(review): the title says "Best F1", which the eval-time chart in this
# notebook pairs with the `f1_5_level` column, but this call plots the
# aggregate `f1` column - confirm whether the title or the column is intended.
plot_barcharts_for_dual_metrics(
    top_metrics_df,
    columns=("f1", "eval_speed"),
    title="Amazon Reviews Dataset - Best F1 and Throughput Across Models",
    ylabels=("F1 Score (%)", "Throughput (token/s)"),
    y_limit_offsets=(10, 700),
    decimal_places=(2, 0),
    use_percentage=(True, False),
)

In [None]:
# Performance/efficiency figure for the top-results table with the helper's
# default titles; nothing is written to disk by an explicit savefig_file here.
perf_and_efficiency_analysis_plot(
    top_metrics_df,
    figsize=(15, 6),
    columns=("f1_5_level", "accuracy_5_level"),
)

In [None]:
# Zero-shot performance/efficiency figure, saved to
# results/perf_analysis_amazon_zero_shot.png.
perf_and_efficiency_analysis_plot(
    zero_shot_metrics_df,
    figsize=(15, 6),
    columns=("f1_5_level", "accuracy_5_level"),
    suptitle="Zero-shot Performance and Efficiency Analysis of LLMs on Amazon Reviews Dataset",
    title2="(b) Evaluation Time and Throughput Across Models",
    savefig_file="results/perf_analysis_amazon_zero_shot.png",
)

In [None]:
# Performance/efficiency figure for the top-results table on the multi-level
# metrics, saved to results/perf_analysis_amazon.png.
perf_and_efficiency_analysis_plot(
    top_metrics_df,
    figsize=(15, 6),
    columns=("f1_5_level", "accuracy_5_level"),
    suptitle="Performance and Efficiency Analysis of LLMs on Amazon Reviews Dataset Across Few-shot Configurations",
    savefig_file="results/perf_analysis_amazon.png",
)

In [None]:
# Performance/efficiency figure for the top-results table on the aggregate
# `f1` / `accuracy` columns, saved to results/perf_analysis_amazon_3_level.png.
perf_and_efficiency_analysis_plot(
    top_metrics_df,
    figsize=(15, 6),
    columns=("f1", "accuracy"),
    suptitle="Performance and Efficiency Analysis of LLMs on Amazon Reviews Dataset Across Few-shot Configurations",
    savefig_file="results/perf_analysis_amazon_3_level.png",
)