In [None]:
# Reload imported modules before each cell runs, so edits to project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
from pathlib import Path

# Resolve the project root only once. `workding_dir` (sic) may already be
# defined, e.g. by an earlier run of this cell in the same kernel; without this
# check a re-run would move one directory further up each time.
if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)

os.chdir(workding_dir)
# Guard the append so re-running the cell does not pile up duplicate
# sys.path entries.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("workding dir:", workding_dir)

In [None]:
from dotenv import find_dotenv, load_dotenv

# Prefer ".env"; when it cannot be located (empty result), fall back to
# ".env.example".
found_dotenv = find_dotenv(".env") or find_dotenv(".env.example")

print(f"loading env vars from: {found_dotenv}")
# override=True is passed so values from the file replace variables that are
# already set in the environment.
load_dotenv(found_dotenv, override=True)

In [None]:
from llm_toolkit.data_utils import *

# Input dataset and the results file analysed first (the "no_thinking" one,
# going by its file name).
data_path = "dataset/amazon_reviews.csv"
results_path = "results/qwen3-amazon_no_thinking.csv"

# Raw NUM_CTX value from the environment: a string, or None when it is unset.
num_ctx = os.environ.get("NUM_CTX")

# Echo the settings as the cell output.
(data_path, results_path, num_ctx)

In [None]:
import pandas as pd

# Load the results CSV at `results_path` and print a summary of its columns.
df = pd.read_csv(results_path)
df.info()

In [None]:
# Passed to get_metrics() as `result_col_start_idx`; presumably the position of
# the first result column in the results frame — confirm in llm_toolkit.
result_col_start_idx = 3
# NOTE(review): not referenced anywhere else in the visible cells — confirm it
# is still needed.
selected_result_col_start_idx = 3

In [None]:
def update_df_columns(df, thinking=True, suffix="_16k"):
    """Tag result columns with the run mode by rewriting part of their names.

    Every occurrence of ``suffix`` in a column name is replaced with ``"(T)"``
    when ``thinking`` is true and with ``"(N)"`` otherwise. Column labels that
    are not strings are left untouched.

    Args:
        df: DataFrame whose columns are renamed. It is modified in place.
        thinking: Selects the replacement tag: ``"(T)"`` if true, else ``"(N)"``.
        suffix: Substring to replace. Defaults to ``"_16k"``. An empty string
            is treated as "nothing to replace".

    Returns:
        The same ``df`` object, so ``df = update_df_columns(df)`` works.
    """
    # str.replace("", tag) would insert the tag between every character.
    if not suffix:
        return df

    tag = "(T)" if thinking else "(N)"
    columns = {
        col: col.replace(suffix, tag)
        for col in df.columns
        if isinstance(col, str)  # non-string labels have no .replace()
    }
    # Renaming stays in place on purpose: callers may rely on the mutation.
    df.rename(columns=columns, inplace=True)
    return df

In [None]:
# `df` was read from the "no_thinking" results file, so its columns get the
# "(N)" tag (thinking=False).
df = update_df_columns(df, thinking=False)
df.info()

In [None]:
# Build the dataset from the reviews CSV. "Text" and "Review-sentiment" are
# presumably the input and label column names — confirm against
# prepare_dataset. The result is later indexed by split name (datasets["train"]).
datasets = prepare_dataset(data_path, "Text", "Review-sentiment")

In [None]:
# Compute metrics for the "(N)" (no-thinking) results frame.
# NOTE(review): get_metrics is not defined in this notebook (presumably it
# comes from the star import); the exact effect of `mean_eval_time` and
# `train_dataset` is not visible here — confirm in llm_toolkit.
metrics = get_metrics(
    df,
    result_col_start_idx=result_col_start_idx,
    mean_eval_time=True,
    train_dataset=datasets["train"],
    # debug=True,
)
metrics

In [None]:
# Removing "_no" from `results_path` yields the sibling "..._thinking.csv"
# file (given the value assigned to results_path earlier); its columns get
# the "(T)" tag.
df2 = pd.read_csv(results_path.replace("_no", ""))
df2 = update_df_columns(df2, thinking=True)
df2.info()

In [None]:
# Same metrics computation as for the no-thinking frame, applied to the
# "(T)" (thinking) results in `df2`.
metrics2 = get_metrics(
    df2,
    result_col_start_idx=result_col_start_idx,
    mean_eval_time=True,
    train_dataset=datasets["train"],
    # debug=True,
)
metrics2

In [None]:
# Stack the no-thinking and thinking metrics row-wise. No ignore_index is
# passed, so each frame keeps its own index labels here.
metrics_df = pd.concat([metrics, metrics2], axis=0)
metrics_df

In [None]:
def sort_model_name(model_name, shots=None):
    """Return a numeric sort key for a model name and shot count.

    ``model_name`` must contain ``":<size>b"`` (e.g. ``"qwen3:8b(T)"``); the
    text between the first ``":"`` and the following ``"b"`` is parsed as the
    size. The key is ``size * 100 + shots``. Names containing ``"(N)"`` have 1
    subtracted from the size first, which places them ahead of the same-size
    name without that marker.

    Args:
        model_name: Model name string, or a ``(model_name, shots)`` tuple when
            ``shots`` is not given.
        shots: Shot count. When ``None`` it is taken from the tuple, or
            defaults to 0 for a plain string.

    Returns:
        An ``int`` for whole-number sizes (same values as before); a ``float``
        for fractional sizes such as ``"0.6b"``, which previously raised
        ``ValueError``.

    Note:
        The ``* 100`` weighting keeps models apart only while shot counts stay
        below 100 and sizes differ by at least 1; keys of closely spaced
        fractional sizes can interleave once shots are added.
    """
    if shots is None:
        if isinstance(model_name, tuple):
            # Called with a (model_name, shots) pair.
            model_name, shots = model_name[0], model_name[1]
        else:
            # Plain name only, e.g. when mapped over a single column.
            shots = 0

    size_text = model_name.split(":")[1].split("b")[0]
    try:
        size = int(size_text)
    except ValueError:
        # Fractional sizes ("0.6", "1.7") are not valid int literals.
        size = float(size_text)

    if "(N)" in model_name:
        size -= 1
    return size * 100 + int(shots)

In [None]:
# Order rows by the combined (model, shots) key from sort_model_name. The key
# is built once from both columns; sort_values calls the key function once per
# `by` column, and each call receives this same precomputed Series.
combined_sort_key = pd.Series(
    [
        sort_model_name(model_label, shot_count)
        for model_label, shot_count in zip(metrics_df["model"], metrics_df["shots"])
    ]
)

# Note: this rebinds `metrics`, which until here held the no-thinking metrics.
metrics = metrics_df.sort_values(
    by=["model", "shots"],
    key=lambda _column: combined_sort_key,
    ascending=True,
    ignore_index=True,
)
metrics

In [None]:
# Continue with a copy of the sorted frame under the name `metrics_df`, and
# collect the distinct model names in order of first appearance.
metrics_df = metrics.copy()
models = metrics_df["model"].unique()
models

In [None]:
# Write the combined, sorted metrics to disk without the index column.
metrics_df.to_csv("results/qwen3-amazon_metrics.csv", index=False)

In [None]:
from llm_toolkit.data_utils import *

# Plot the "f1_5_level" metric (labelled "F1 Score") for every model in
# `models`.
# NOTE(review): `markers` is not defined in the visible cells — presumably it
# is provided by the star import from llm_toolkit.data_utils; confirm.
plot_metrics_vs_shots(
    metrics_df,
    models,
    markers,
    ["f1_5_level"],
    ["F1 Score"],
    # ylimits=(0.6, 0.8),
    # log_scales=[False, True],
    bbox_to_anchor=(0.5, -0.45),
    ylimits_offset=0.0005,
)

In [None]:
# Visit every second entry of `models` and drop the "(...)" suffix to get the
# base model name, then plot that model's metrics.
for tagged_name in models[::2]:
    model = tagged_name.split("(")[0]
    print(model)
    plot_metrics_for_model(model, metrics_df)

In [None]:
# Same walk over every second entry of `models`, this time calling the plot
# helper with f1_5_level=False.
for tagged_name in models[::2]:
    model = tagged_name.split("(")[0]
    print(model)
    plot_metrics_for_model(model, metrics_df, f1_5_level=False)

In [None]:
# Select rows via get_top_metrics_df using the "f1_5_level" column.
top_metrics_df = get_top_metrics_df(metrics_df, col="f1_5_level")

# Extend each model label with its shot count on a second line.
top_metrics_df["model"] = top_metrics_df.apply(
    lambda row: f"{row['model']}\n({row['shots']}-shot)",
    axis=1,
)
top_metrics_df

In [None]:
from llm_toolkit.data_utils import plot_barcharts_for_dual_metrics

# Dual-metric bar chart with the helper's default columns and titles; only the
# decimal places are overridden (presumably one value per metric — confirm).
plot_barcharts_for_dual_metrics(top_metrics_df, decimal_places=(3, 0))

In [None]:
# Bar charts of "f1_5_level" and "eval_time" per model. The tuple arguments
# appear to be per-metric settings in the same order as `columns` — confirm
# against plot_barcharts_for_dual_metrics.
plot_barcharts_for_dual_metrics(
    top_metrics_df,
    title="Amazon Reviews Dataset - F1 and Mean Eval Time Across Models",
    ylabels=("F1 Score (%)", "Mean Eval Time (s)"),
    columns=("f1_5_level", "eval_time"),
    use_percentage=(True, False),
    decimal_places=(2, 3),
    y_limit_offsets=(10, 1),
)

In [None]:
# Bar charts of "f1_5_level" and "eval_speed" (labelled as throughput in
# token/s) per model.
plot_barcharts_for_dual_metrics(
    top_metrics_df,
    title="Amazon Reviews Dataset - Best F1 and Throughput Across Models",
    ylabels=("F1 Score (%)", "Throughput (token/s)"),
    columns=("f1_5_level", "eval_speed"),
    use_percentage=(True, False),
    decimal_places=(2, 0),
    y_limit_offsets=(10, 300),
)

In [None]:
# Quick look at the performance/efficiency figure for the 5-level metrics,
# using the helper's default title and size and no savefig_file argument.
perf_and_efficiency_analysis_plot(top_metrics_df, columns=("f1_5_level", "accuracy_5_level"))

In [None]:
# Same 5-level figure with an explicit title and size; `savefig_file` is set,
# so the helper presumably also writes the figure to that path — confirm.
perf_and_efficiency_analysis_plot(
    top_metrics_df,
    columns=("f1_5_level", "accuracy_5_level"),
    suptitle="Performance and Efficiency Analysis of Models on Amazon Reviews Dataset Across Few-shot Configurations",
    savefig_file="results/amazon_qwen3_metrics.png",
    figsize=(15, 6),
)

In [None]:
# Variant using the "f1" / "accuracy" columns (the 3-level metrics, going by
# the output file name).
# NOTE(review): the rows in top_metrics_df were selected on "f1_5_level", not
# on "f1", and the suptitle is identical to the 5-level figure — confirm both
# are intended.
perf_and_efficiency_analysis_plot(
    top_metrics_df,
    columns=("f1", "accuracy"),
    suptitle="Performance and Efficiency Analysis of Models on Amazon Reviews Dataset Across Few-shot Configurations",
    savefig_file="results/amazon_qwen3_metrics_3_level.png",
    figsize=(15, 6),
)