In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-loaded before each cell executes, so edits to local packages are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
from pathlib import Path

# Honour a `workding_dir` that already exists in this namespace (so re-running
# the cell does not climb another directory level); otherwise default to the
# parent of the current working directory.
try:
    workding_dir
except NameError:
    workding_dir = str(Path.cwd().parent)

# Run everything from that directory and make it importable.
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("workding dir:", workding_dir)

In [None]:
from dotenv import find_dotenv, load_dotenv

# Prefer a real .env file; when none is found (empty result), fall back to
# the .env.example template.
found_dotenv = find_dotenv(".env") or find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
# override=True: values from the file replace variables already set in the
# process environment.
load_dotenv(found_dotenv, override=True)

In [None]:
from llm_toolkit.data_utils import *

# Locations of the dataset and of the base results table used below.
data_path = "dataset/GoEmotions.csv"
results_path = "results/GoEmotions_results.csv"
# Raw NUM_CTX value from the environment: a string, or None when unset.
num_ctx = os.environ.get("NUM_CTX")
# Echo the configuration as the cell output.
(data_path, results_path, num_ctx)

In [None]:
import pandas as pd

# Load the base results table from `results_path`; columns from the other
# result files are copied into this frame in later cells.
df = pd.read_csv(results_path)
df.info()

In [None]:
# Position of the first result column: used below to slice the column lists
# of the extra result files and passed on to get_metrics.
result_col_start_idx = 2
# NOTE(review): not referenced by any other visible cell — confirm it is
# still needed.
selected_result_col_start_idx = 2

In [None]:
# Second results table; its result columns are copied into `df` below.
df2 = pd.read_csv("results/GoEmotions_api_results.csv")
df2.info()

In [None]:
# Every column of `df2` from position `result_col_start_idx` onward.
api_cols = df2.columns.tolist()[result_col_start_idx:]
api_cols

In [None]:
# Copy the selected columns from `df2` into `df`.
# NOTE(review): DataFrame-to-DataFrame assignment aligns on the row index, so
# this presumes both files list the same rows in the same order — confirm.
df[api_cols] = df2[api_cols]
df.info()

In [None]:
# Third results table; its result columns are copied into `df` below.
df3 = pd.read_csv("results/GoEmotions_openai_results.csv")
df3.info()

In [None]:
# Every column of `df3` from position `result_col_start_idx` onward.
openai_cols = df3.columns.tolist()[result_col_start_idx:]
openai_cols

In [None]:
# Copy the selected columns from `df3` into `df`.
# NOTE(review): DataFrame-to-DataFrame assignment aligns on the row index, so
# this presumes both files list the same rows in the same order — confirm.
df[openai_cols] = df3[openai_cols]
df.info()

In [None]:
def _short_column_name(name):
    """Return `name` with "_8k" removed, "-chat" -> "-v3" and "-reasoner" -> "-r1"."""
    # Replacements are applied in this fixed order, one after another.
    for old, new in (("_8k", ""), ("-chat", "-v3"), ("-reasoner", "-r1")):
        name = name.replace(old, new)
    return name


# Old-name -> new-name mapping for every column of `df`, applied in place.
columns = {original: _short_column_name(original) for original in df.columns}
df.rename(columns=columns, inplace=True)
df.info()

In [None]:
# Prepare the dataset from the CSV, passing "Text" and "Emotion" as the column
# names; the result is indexed with "train" when computing metrics below.
datasets = prepare_dataset(data_path, "Text", "Emotion")

In [None]:
# Compute the metrics table from the merged results in `df`, starting at
# column position `result_col_start_idx` and using "Emotion" as the label
# column. See llm_toolkit.data_utils.get_metrics for the exact meaning of the
# remaining options.
metrics = get_metrics(
    df,
    result_col_start_idx=result_col_start_idx,
    mean_eval_time=True,
    label_column="Emotion",
    label_column2="macro",
    train_dataset=datasets["train"],
    # debug=True,
)
metrics

In [None]:
def sort_model_name(model_name):
    """Return a numeric sort rank for a model name (lower sorts first).

    Ranking rules:
      * Fixed ranks for the untagged names: "deepseek-v3" -> 900,
        "deepseek-r1" -> 1000, "gpt-4o-mini" -> 1001, "gpt-4o" -> 1002.
      * Names of the form "<family>:<size>b..." rank by the number in front of
        the first "b" of the tag. Names containing "deepseek-r1:" use that
        size as-is; every other family uses size - 1, so it sorts just ahead
        of the deepseek-r1 model of the same size.
      * Fractional sizes such as "1.5b" are accepted (previously a ValueError).
      * Names without a ":" tag, or whose tag has no parsable size (for
        example "model:latest"), rank last instead of raising.

    Args:
        model_name: Model name string as it appears in the metrics table.

    Returns:
        int for the fixed ranks and for integral sizes, float for fractional
        sizes.
    """
    fixed_ranks = {
        "deepseek-v3": 900,
        "deepseek-r1": 1000,
        "gpt-4o-mini": 1001,
        "gpt-4o": 1002,
    }
    if model_name in fixed_ranks:
        return fixed_ranks[model_name]

    # Larger than every fixed rank, so unrecognised names sort to the end.
    unknown_rank = 10_000

    _, separator, tag = model_name.partition(":")
    if not separator:
        return unknown_rank

    # "70b-instruct" -> "70", "1.5b" -> "1.5", "7" -> "7"
    size_text = tag.split("b")[0]
    try:
        # int first: keeps the exact integer results the function always gave.
        size = int(size_text)
    except ValueError:
        try:
            size = float(size_text)
        except ValueError:
            return unknown_rank

    return size if "deepseek-r1:" in model_name else size - 1

In [None]:
# Order the rows by model rank (see sort_model_name).
# kind="stable" keeps rows that share a model in their existing relative
# order; the default quicksort gives no such guarantee for ties.
metrics = metrics.sort_values(
    by="model",
    key=lambda col: col.map(sort_model_name),
    ascending=True,
    ignore_index=True,
    kind="stable",
)
metrics

In [None]:
# Work on a copy so later changes do not touch `metrics`; `models` holds the
# distinct model names in order of first appearance (i.e. the sorted order).
metrics_df = metrics.copy()
models = metrics_df["model"].unique()
models

In [None]:
# Save next to the results file, i.e. results/GoEmotions_results_metrics.csv,
# without the index column.
metrics_df.to_csv(results_path.replace(".csv", "_metrics.csv"), index=False)

In [None]:
from llm_toolkit.data_utils import *

# Plot the "f1" column (labelled "F1 Score") for every model in `models`.
# NOTE(review): `markers` is not defined in this notebook; it is expected to
# come from the star import of llm_toolkit.data_utils — confirm.
plot_metrics_vs_shots(
    metrics_df,
    models,
    markers,
    ["f1"],
    ["F1 Score"],
    # ylimits=(0.6, 0.8),
    # log_scales=[False, True],
    bbox_to_anchor=(0.5, -0.4),
    ylimits_offset=0.0005,
)

In [None]:
from llm_toolkit.data_utils import *

# Zero-shot subset of the metrics, selected with respect to the "f1" column
# (presumably one row per model — see get_zero_shot_metrics_df).
zero_shot_metrics_df = get_zero_shot_metrics_df(metrics_df, col="f1")
zero_shot_metrics_df

In [None]:
# Top rows by "f1" as selected by get_top_metrics_df.
top_metrics_df = get_top_metrics_df(metrics_df, col="f1")
# Turn each model name into a two-line label: the name, then "(<shots>-shot)".
top_metrics_df["model"] = [
    name + f"\n({shots}-shot)"
    for name, shots in zip(top_metrics_df["model"], top_metrics_df["shots"])
]
top_metrics_df

In [None]:
from llm_toolkit.data_utils import plot_barcharts_for_dual_metrics

# Dual-metric bar chart of `top_metrics_df` using the function's default
# columns, labels and title; only the decimal places are overridden.
plot_barcharts_for_dual_metrics(top_metrics_df, decimal_places=(3, 0))

In [None]:
# Dual-metric bar chart pairing the "f1" column (labelled as a percentage)
# with the "eval_time" column (labelled in seconds).
plot_barcharts_for_dual_metrics(
    top_metrics_df,
    title="GoEmotions Dataset - F1 and Mean Eval Time Across Models",
    ylabels=("F1 Score (%)", "Mean Eval Time (s)"),
    columns=("f1", "eval_time"),
    use_percentage=(True, False),
    decimal_places=(2, 3),
    y_limit_offsets=(10, 1),
)

In [None]:
# Dual-metric bar chart pairing the "f1" column (labelled as a percentage)
# with the "eval_speed" column (labelled as throughput in token/s).
# The title names the GoEmotions dataset, which is what `top_metrics_df` was
# computed from in this notebook.
plot_barcharts_for_dual_metrics(
    top_metrics_df,
    title="GoEmotions Dataset - Best F1 and Throughput Across Models",
    ylabels=("F1 Score (%)", "Throughput (token/s)"),
    columns=("f1", "eval_speed"),
    use_percentage=(True, False),
    decimal_places=(2, 0),
    y_limit_offsets=(10, 300),
)

In [None]:
# Performance/efficiency plot of `top_metrics_df` with the function's default
# titles and output settings.
# NOTE(review): a later cell repeats this call with an explicit suptitle and
# savefig_file — confirm this default-argument call is still wanted.
perf_and_efficiency_analysis_plot(top_metrics_df, columns=("f1", "accuracy"))

In [None]:
# Same plot for the zero-shot subset, with its own titles; `savefig_file`
# presumably is the path the figure is written to — confirm in
# perf_and_efficiency_analysis_plot.
perf_and_efficiency_analysis_plot(
    zero_shot_metrics_df,
    columns=("f1", "accuracy"),
    suptitle="Zero-shot Performance and Efficiency Analysis of LLMs on GoEmotions Dataset",
    title2="(b) Evaluation Time and Throughput Across Models",
    savefig_file="results/perf_analysis_go_emotions_zero_shot.png",
    figsize=(15, 6),
)

In [None]:
# Same plot for the top configurations in `top_metrics_df`; `savefig_file`
# presumably is the path the figure is written to — confirm in
# perf_and_efficiency_analysis_plot.
perf_and_efficiency_analysis_plot(
    top_metrics_df,
    columns=("f1", "accuracy"),
    suptitle="Performance and Efficiency Analysis of Models on GoEmotions Dataset Across Few-shot Configurations",
    savefig_file="results/GoEmotions_metrics.png",
    figsize=(15, 6),
)