In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
from pathlib import Path

# Resolve the project root once. `workding_dir` may already exist in the
# namespace (for example from a previous run of this cell); only when it does
# not is it defaulted to the parent of the current working directory.
if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)

os.chdir(workding_dir)
# Re-running this cell must not stack duplicate entries onto sys.path.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("workding dir:", workding_dir)

In [None]:
from dotenv import find_dotenv, load_dotenv

# Locate the environment file: use ".env" when one is found, otherwise fall
# back to ".env.example".
found_dotenv = find_dotenv(".env") or find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
# Load the variables from the located file, passing override=True.
load_dotenv(found_dotenv, override=True)

In [None]:
from llm_toolkit.data_utils import *


# Paths are hard-coded for this analysis; reading them from the DATA_PATH /
# RESULTS_PATH environment variables is the (disabled) alternative.
data_path, results_path = (
    "dataset/imdb_reviews.csv",
    "results/imdb_reviews_results.csv",
)
# NUM_CTX comes from the environment and is None when the variable is unset.
num_ctx = os.environ.get("NUM_CTX")
(data_path, results_path, num_ctx)

In [None]:
import pandas as pd

# Load the results CSV and print a summary of the resulting frame.
df = pd.read_csv(results_path)
df.info()

In [None]:
# Column offsets into `df`: columns before `result_col_start_idx` are kept
# as-is when the frame is re-ordered, and columns from
# `selected_result_col_start_idx` onward are the candidates for result columns.
result_col_start_idx = 2
selected_result_col_start_idx = 2

In [None]:
# Map every existing column label to its normalised form: "_8k" is dropped,
# "-chat" becomes "-v3" and "-reasoner" becomes "-r1". The frame is renamed
# in place.
columns = dict(
    (
        name,
        name.replace("_8k", "").replace("-chat", "-v3").replace("-reasoner", "-r1"),
    )
    for name in df.columns
)
df.rename(columns=columns, inplace=True)

In [None]:
def get_key(model_name):
    """Build the sort key for a "<model>/<suffix>" column name.

    The part before the first "/" is looked up in `model_orders` and rendered
    as a zero-padded three-digit number, followed by "/" and the second
    "/"-separated part of `model_name`.

    Raises:
        KeyError: if the model prefix is not present in `model_orders`.
        IndexError: if `model_name` contains no "/".
    """
    segments = model_name.split("/")
    rank = model_orders[segments[0]]
    return "{:03d}/{}".format(rank, segments[1])


# Keep only the columns (from the selected offset onward) whose prefix before
# the first "/" is a known model in `model_orders`, ordered by get_key.
result_cols = sorted(
    (
        name
        for name in df.columns[selected_result_col_start_idx:].tolist()
        if name.split("/")[0] in model_orders
    ),
    key=get_key,
)
result_cols

In [None]:
# Keep the leading columns (those before result_col_start_idx) followed by the
# sorted model result columns.
df = df[df.columns[:result_col_start_idx].tolist() + result_cols]

In [None]:
# Summarise the frame again now that its columns have been re-selected.
df.info()

In [None]:
# Prepare the datasets object from data_path, passing "Text" and
# "Review-sentiment" as the column arguments; its "train" entry is used below.
# NOTE(review): later cells use "Review-basic-sentiment" as the label column —
# confirm "Review-sentiment" is the intended column here.
datasets = prepare_dataset(data_path, "Text", "Review-sentiment")

In [None]:
# Compute metrics from the results frame via get_metrics, with
# mean_eval_time enabled and the "train" dataset passed as train_dataset.
metrics = get_metrics(
    df,
    result_col_start_idx=result_col_start_idx,
    mean_eval_time=True,
    train_dataset=datasets["train"],  # , debug=True
)
metrics

In [None]:
# Work on a copy so later in-place edits to `metrics` do not affect
# `metrics_df`, then list the distinct values of its "model" column.
metrics_df = metrics.copy()
models = metrics_df["model"].unique()
models

In [None]:
from llm_toolkit.data_utils import *

# Plot the "f1" and "accuracy" metrics with plot_metrics_vs_shots, passing the
# model list and markers and requesting a synchronised y-axis.
plot_metrics_vs_shots(
    metrics_df,
    models,
    markers,
    ["f1", "accuracy"],
    ["F1 Score", "Accuracy"],
    # ylimits=(0.6, 0.8),
    # log_scales=[False, True],
    bbox_to_anchor=(0.5, -0.5),
    ylimits_offset=0.0005,
    sync_y_axis=True,
)

In [None]:
from llm_toolkit.data_utils import *

# Plot the "f1" and "eval_time" metrics with plot_metrics_vs_shots; the second
# series is labelled "Mean Eval Time (s)".
plot_metrics_vs_shots(
    metrics_df,
    models,
    markers,
    ["f1", "eval_time"],
    ["F1 Score", "Mean Eval Time (s)"],
    # ylimits=(0.6, 0.8),
    # log_scales=[False, True],
    ylimits_offset=0.0005,
    bbox_to_anchor=(0.5, -0.5),
)

In [None]:
from llm_toolkit.data_utils import *

# Plot the "f1" and "eval_speed" metrics with plot_metrics_vs_shots; the
# second series is labelled "Throughput (token/s)".
plot_metrics_vs_shots(
    metrics_df,
    models,
    markers,
    ["f1", "eval_speed"],
    ["F1 Score", "Throughput (token/s)"],
    # ylimits=(0.6, 0.8),
    # log_scales=[False, True],
    ylimits_offset=0.0005,
    bbox_to_anchor=(0.5, -0.5),
)

In [None]:
# Rewrite each row's model label as "<model>\n(<shots>-shot)". The suffix is
# only appended when it is not already present, so re-running this cell does
# not stack repeated "(n-shot)" lines onto labels that were already rewritten.
metrics["model"] = metrics.apply(
    lambda x: x["model"]
    if x["model"].endswith(f"\n({x['shots']}-shot)")
    else x["model"] + f"\n({x['shots']}-shot)",
    axis=1,
)

In [None]:
# Display the metrics frame with the rewritten model labels.
metrics

In [None]:
from llm_toolkit.data_utils import plot_barcharts_for_dual_metrics

# Draw the dual-metric bar charts for `metrics` using the helper's defaults.
plot_barcharts_for_dual_metrics(metrics)

In [None]:
# Draw the bar charts again for the "f1" and "accuracy" columns, with both
# flagged as percentages and two decimal places requested for each.
plot_barcharts_for_dual_metrics(
    metrics,
    title="Aggregate Level F1 and Accuracy Scores Across Models",
    ylabels=("F1 Score (%)", "Accuracy (%)"),
    columns=("f1", "accuracy"),
    use_percentage=(True, True),
    decimal_places=(2, 2),
    y_limit_offsets=(30, 30),
)

In [None]:
def _label_with_shots(row):
    """Return the row's model name followed by a "\\n(<shots>-shot)" line."""
    return row["model"] + f"\n({row['shots']}-shot)"


# Select the top rows by the "f1" column via get_top_metrics_df, then rewrite
# their model labels to include the shot count.
top_metrics_df = get_top_metrics_df(metrics_df, col="f1")
top_metrics_df["model"] = top_metrics_df.apply(_label_with_shots, axis=1)
top_metrics_df

In [None]:
from llm_toolkit.data_utils import plot_barcharts_for_dual_metrics

# Draw the dual-metric bar charts for the top rows, overriding only the
# decimal places (3 for the first metric, 0 for the second).
plot_barcharts_for_dual_metrics(top_metrics_df, decimal_places=(3, 0))

In [None]:
# Draw the bar charts for the top rows using the "f1" and "accuracy" columns,
# with both flagged as percentages and two decimal places requested for each.
plot_barcharts_for_dual_metrics(
    top_metrics_df,
    title="Multi Level F1 and Accuracy Scores Across Models",
    columns=("f1", "accuracy"),
    ylabels=("F1 Score (%)", "Accuracy (%)"),
    use_percentage=(True, True),
    decimal_places=(2, 2),
    y_limit_offsets=(30, 30),
)

In [None]:
# Write metrics_df next to the results file, swapping the ".csv" suffix for
# "_metrics.csv" and omitting the index column.
metrics_df.to_csv(results_path.replace(".csv", "_metrics.csv"), index=False)

In [None]:
# Count how many rows carry each value of the "Review-basic-sentiment" column.
df["Review-basic-sentiment"].value_counts()

In [None]:
# Class labels handed to plot_confusion_matrix in the next cell.
labels = ["Positive", "Negative"]

In [None]:
# Draw the confusion matrix via plot_confusion_matrix, passing the
# "Review-basic-sentiment" column, two model result column names and the
# class labels.
# NOTE(review): the two result column names embed a number in parentheses and
# presumably have to match columns of `df` exactly — confirm they exist in the
# current results file.
plot_confusion_matrix(
    df,
    "Review-basic-sentiment",
    "deepseek-r1/shots-05(47.208)",
    "gpt-4o/shots-40(2.641)",
    labels,
)

In [None]:
# Rebuild `datasets`, this time passing "Review-basic-sentiment" as the second
# column argument; this replaces the earlier `datasets` value.
datasets = prepare_dataset(data_path, "Text", "Review-basic-sentiment")

In [None]:
# Collect the confusion cases for the ("Positive", "Negative") pair.
# NOTE(review): which argument is the actual label and which the predicted one
# is defined by analyze_confusion_cases — confirm against the helper.
df2 = analyze_confusion_cases(df, "Positive", "Negative", datasets["train"])

In [None]:
# Collect the confusion cases for the reverse ("Negative", "Positive") pair.
df3 = analyze_confusion_cases(df, "Negative", "Positive", datasets["train"])

In [None]:
# Columns to keep when combining the confusion cases.
columns = [
    "Text",
    "Review-basic-sentiment",
    "DeepSeek-sentiment",
    "Gemini-sentiment",
    "GPT-4o-sentiment",
    "Content",
    "Reasoning-content",
    "Gemini-content",
    "Gemini-reasoning-content",
    "GPT-4o-Content",
]

# Stack the non-empty confusion-case frames (df2 and df3), restricted to the
# selected columns. The iteration variable is deliberately not named `df`: a
# `for df in ...` loop would rebind the notebook-level results frame `df` to a
# confusion-case frame and break any earlier cell that is re-run afterwards.
confusion_frames = [frame[columns] for frame in (df2, df3) if len(frame) > 0]
# Concatenate once instead of growing a frame inside a loop; the empty
# placeholder frame is only used when there is nothing to stack.
result = (
    pd.concat(confusion_frames)
    if confusion_frames
    else pd.DataFrame(columns=columns)
)
result

In [None]:
# Save the combined confusion cases next to the results file, swapping the
# ".csv" suffix for "_confusion_cases.csv" and omitting the index column.
result.to_csv(results_path.replace(".csv", "_confusion_cases.csv"), index=False)

In [None]:
# Read the exported confusion cases back from disk and print a summary of the
# loaded frame.
df_result = pd.read_csv(results_path.replace(".csv", "_confusion_cases.csv"))
df_result.info()