In [23]:
import json, pathlib, pandas as pd
from pprint import pprint
import os
from pathlib import Path
from collections import Counter
from io import StringIO
import numpy as np


pd.set_option("display.max_rows", None)  # show ALL rows
pd.set_option("display.max_columns", None)  # show ALL columns


# Time Conversion function
def format_time(seconds: float) -> str:
    seconds = int(seconds)
    hours, remainder = divmod(seconds, 3600)
    minutes = remainder // 60  # drop leftover seconds

    parts = []
    if hours > 0:
        parts.append(f"{hours}h")
    if minutes > 0 or not parts:  # if no hours and no minutes, show 0m
        parts.append(f"{minutes}m")

    return " ".join(parts)


def list_json_files(directory: str):
    """
    Reads all .json files in a given directory and returns 
    their full paths as a list.
    """
    json_files = []
    for file in os.listdir(directory):
        if file.endswith(".json"):
            full_path = os.path.join(directory, file)
            json_files.append(full_path)
    return json_files


def format_params(n: int) -> str:
    """
    Convert raw parameter count (int) into human-friendly string.
    Examples:
        6851947264 -> "7B"
        12500000000 -> "12.5B"
        560000000 -> "560M"
    """
    if n >= 1_000_000_000:  # billions
        val = n / 1_000_000_000
        if val.is_integer():
            return f"{int(val)}B"
        else:
            return f"{val:.1f}B"
    elif n >= 1_000_000:  # millions
        val = n / 1_000_000
        if val.is_integer():
            return f"{int(val)}M"
        else:
            return f"{val:.1f}M"
    elif n >= 1_000:  # thousands (rare for params, but included)
        val = n / 1_000
        if val.is_integer():
            return f"{int(val)}K"
        else:
            return f"{val:.1f}K"
    else:
        return str(n)


metric_map = {
    "mmlu":"acc,none" ,
    "hellaswag": "acc_norm,none",
    "arc_challenge": "acc_norm,none",   # prefer normalized accuracy
    "bbh": "exact_match,get-answer",
    "gsm8k":"exact_match,strict-match" ,
    "gpqa_main_zeroshot":"acc_norm,none",
    "anli_r1": "acc,none",
    "anli_r2": "acc,none",
    "anli_r3": "acc,none",
    "piqa":"acc_norm,none" ,
    "winogrande": "acc,none",
    "boolq": "acc,none",
    "truthfulqa_mc1":"acc,none" ,
    "truthfulqa_mc2":"acc,none" ,
    "drop": "f1,none",
    "nq_open":"exact_match,remove_whitespace" ,
    "openbookqa":"acc_norm,none" ,
    "sciq": "acc_norm,none",
    "triviaqa":"exact_match,remove_whitespace" ,
    "qnli":"acc,none" ,
}

# Tasks from most important to least important
# tasks = [mmlu, hellaswag, arc_challenge, bbh, gsm8k, gpqa_main_zeroshot, ANLI (r1, r2, r3), piqa, winogrande, boolq, TruthfulQA (mc1, mc2), drop, nq_open, openbookqa, sciq, triviaqa, qnli]

# Path list 
directory = "/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/results"
all_json_paths = list_json_files(directory)

def parse_results(json_path: str, metric_map: dict) -> pd.DataFrame:

    with open(json_path,'r') as f:
        data = json.load(f)

    # Extract core info
    model_name = data.get("model_name")
    model_name = model_name.split("/")[-1]
    total_time_raw = float(data.get("total_evaluation_time_seconds", 0))
    total_time = format_time(float(data.get("total_evaluation_time_seconds", 0)))
    batch_size = data["config"].get("batch_size")
    batch_sizes = data["config"].get("batch_sizes")
    parameters = format_params(data["config"].get("model_num_parameters"))
    parameters_raw = data["config"].get("model_num_parameters")
    

    rows = []
    for task, metric_key in metric_map.items():
        # Skip tasks not present in the results
        if task not in data["results"]:
            raise ValueError(f"'{task}' not in results! ")  
    
        metrics = data["results"][task]
    
        # If the metric_key isn't in this task's results, raise error
        if metric_key not in metrics:
            raise ValueError(
                f"Expected metric '{metric_key}' not found for task '{task}'. "
                f"Available keys: {list(metrics.keys())}"
            )
    
        acc = metrics[metric_key]
    
        row = {
            "model_name": model_name,
            # "task": task,
            "task": task + "(" + metric_key + ")",
            "score": acc,
            "total_time": total_time,
            "total_time_raw" : total_time_raw,
            "batch_size": batch_size,
            "batch_sizes": batch_sizes,
            "parameters": parameters,
            "parameters_raw": parameters_raw,
        }
        rows.append(row)

    # Convert to tidy dataframe
    return pd.DataFrame(rows)


dfs = [parse_results(path, metric_map) for path in all_json_paths]
master_df = pd.concat(dfs, ignore_index=True)


# display(master_df)


# Wide format: one row per model, columns = tasks
#Check for duplicate rows 
key_cols = ["task", 'score', 'model_name']
dups_mask = master_df.duplicated(key_cols, keep=False)
# dups = master_df.loc[dups_mask]
# display(dups)

if dups_mask.any():
    dups = master_df.loc[dups_mask, key_cols]
    raise ValueError(f"Duplicate rows found for keys:\n{dups}")

wide_df = master_df.pivot_table(
    index=["model_name", "parameters"],
    columns=["task"],
    values="score",
    aggfunc="mean"
).reset_index()

# select the metadata columns you want from the long df
meta_cols = [
    "model_name", 
    "parameters", 
    "parameters_raw",
    "total_time", 
    "total_time_raw", 
    "batch_size", 
    "batch_sizes", 
]

# drop duplicate rows by model_name + parameters
df_meta = master_df[meta_cols].drop_duplicates(subset=["model_name", "parameters"])

# merge the metadata back into your wide dataframe
df_wide_merged = df_meta.merge(wide_df, on=["model_name", "parameters"], how="left")


# display(df_wide_merged.drop(columns=["parameters_raw", "total_time_raw", "batch_sizes"]))



In [24]:
gpu_dir = Path("/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/gpu_usage")
gpu_files = list(gpu_dir.glob("*_gpu_usage.csv"))

def model_from_filename(p: Path) -> str:
    return p.stem.replace("_gpu_usage", "").strip()

model_names_gpu = [model_from_filename(x) for x in gpu_files]

# Check if match with result
set_gpu = set(model_names_gpu)
set_results = set(wide_df['model_name'])
extra_in_gpu = set_gpu - set_results
missing_in_gpu = set_results - set_gpu
# print("Extra models in GPU logs:", extra_in_gpu)
# print("Models in results with no GPU log:", missing_in_gpu)


# Check for Dulicates
def print_duplicates(name_list, label=""):
    counts = Counter(name_list)
    dups = [name for name, cnt in counts.items() if cnt > 1]
    if dups:
        print(f"Duplicates in {label}:")
        for name in dups:
            print(f"  {name}  (count = {counts[name]})")
    else:
        print(f"No duplicates found in {label}.")
# print_duplicates(model_names_gpu, "GPU logs")


def read_last_run_csv(path: Path) -> pd.DataFrame:
    """
    Return a DataFrame for only the *last* '==== New Run ... ====' block.
    Assumes next line after the marker is the CSV header.
    """
    lines = path.read_text(encoding="utf-8").splitlines()
    # locate all run markers
    run_idx = [i for i, line in enumerate(lines) if line.startswith("==== New Run:")]
    if not run_idx:
        raise ValueError(f"No '==== New Run' marker found in {path}")
    start = run_idx[-1] + 1  # header line index

    # slice from header to end and parse CSV
    block = "\n".join(lines[start:])
    df = pd.read_csv(StringIO(block))

    # optional cleanup: strip units and cast to numbers if these columns exist
    if " utilization.gpu [%]" in df.columns:
        df[" utilization.gpu [%]"] = (
            df[" utilization.gpu [%]"].astype(str).str.replace("%", "", regex=False).str.strip().astype("float")
        )
    if " memory.used [MiB]" in df.columns:
        df[" memory.used [MiB]"] = (
            df[" memory.used [MiB]"].astype(str).str.replace("MiB", "", regex=False).str.strip().astype("float")
        )
    # parse timestamp if desired
    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

    return df


def eq_full_util_time(df, util_col=" utilization.gpu [%]", interval_sec=60):
    # clip just in case and cast to float
    u = pd.to_numeric(df[util_col], errors="coerce")
    # u = pd.to_numeric(df[util_col], errors="coerce").fillna(0).clip(0, 100)
    eq_full_sec = float((u / 100 * interval_sec).sum())
    full_sec = float(len(u)*interval_sec)

    # pretty formatter
    h, rem = divmod(int(round(full_sec)), 3600)
    m, s = divmod(rem, 60)
    pretty_full = f"{h}h {m}m"
    h, rem = divmod(int(round(eq_full_sec)), 3600)
    m, s = divmod(rem, 60)
    pretty = f"{h}h {m}m"
    return pretty, pretty_full, eq_full_sec


gpu_df = [read_last_run_csv(df) for df in gpu_files]


results = []
for name, df in zip(model_names_gpu, gpu_df):
    pretty, pretty_full, full_sec_raw = eq_full_util_time(df)     # unpack values
    results.append((name, pretty, full_sec_raw, pretty_full))        # collect tuple

# Turn into DataFrame
gpu_util_df = pd.DataFrame(results, columns=["model_name", "gpu_util_time", "gpu_util_time_raw", 'full_time_from_gpu_log'])

result_gpu_merged = gpu_util_df.merge(df_wide_merged, on=["model_name"], how="left")


In [44]:

GROUPS = {
    "Reasoning & Math": [
        "gsm8k(exact_match,strict-match)", 
        "bbh(exact_match,get-answer)", 
        "arc_challenge(acc_norm,none)", 'anli_r1(acc,none)',
       'anli_r2(acc,none)', 'anli_r3(acc,none)',
        "gpqa_main_zeroshot(acc_norm,none)",
    ],
    "Commonsense & NLI": [
        "hellaswag(acc_norm,none)",
        "piqa(acc_norm,none)", "winogrande(acc,none)", "boolq(acc,none)",
        "openbookqa(acc_norm,none)", "sciq(acc_norm,none)", "qnli(acc,none)",
    ],
    "Knowledge & Reading": [
        "mmlu(acc,none)", "nq_open(exact_match,remove_whitespace)", "drop(f1,none)",
        "truthfulqa_mc1(acc,none)", 'truthfulqa_mc2(acc,none)','triviaqa(exact_match,remove_whitespace)',
    ],
}



def add_task_ranks(df, task_cols):
    df = df.copy()
    for col in task_cols:
        if col not in df.columns:  
            raise ValueError(f"No task: {col}")
        # rank: 1 = best; NaN scores get ranked at the bottom
        df[f"{col}_rank"] = df[col].rank(ascending=False, method="dense", na_option="bottom")
    return df

def add_group_ranks(df, groups):
    df = df.copy()
    for gname, cols in groups.items():
        # strip task name before "(" if any
        tasks = [c for c in cols]
        mean_col = f"{gname}_mean"
        rank_col = f"{gname}_rank"
        df[mean_col] = df[tasks].mean(axis=1)
        df[rank_col] = df[mean_col].rank(ascending=False, method="dense", na_option="bottom").astype(int)
    return df


def add_overall_rank(df, groups):
    df = df.copy()
    all_tasks = [c for cols in groups.values() for c in cols]

    # overall mean score across all tasks
    df["overall_mean"] = df[all_tasks].mean(axis=1, skipna=True)

    # higher = better → rank descending
    df["overall_rank"] = df["overall_mean"].rank(
        ascending=False, method="dense", na_option="bottom"
    ).astype(int)
    return df


all_task_cols = [c for cols in GROUPS.values() for c in cols]

df_task_ranked = add_task_ranks(result_gpu_merged, all_task_cols)
df_group_ranked = add_group_ranks(df_task_ranked, GROUPS)
leaderboard = add_overall_rank(df_group_ranked, GROUPS)


col = "overall_rank"   # the one you want first
cols = [col] + [c for c in leaderboard.columns if c != col]
df = leaderboard[cols]
df = df.sort_values(by=col, ascending=True).reset_index(drop=True)

# Add quantization marker
targets = ['Qwen_Qwen3-14B', 'Qwen_Qwen2.5-14B-Instruct']  # use hyphen
mask = df['model_name'].isin(targets)
df.loc[mask, 'model_name'] = df.loc[mask, 'model_name'] + ' (8bit)'

# display(df)

df_display = df.rename(columns={
    "overall_rank": "Overall Rank",
    "model_name": "Model Name",
    "gpu_util_time": "GPU Util Time",
    "total_time": "Total Time",
    "parameters": "Parameters",
    'Reasoning & Math_rank': 'Reasoning & Math Avg. Rank',
    'Commonsense & NLI_rank': 'Commonsense & NLI Avg. Rank',
    'Knowledge & Reading_rank': 'Knowledge & Reading Avg. Rank',
    'overall_mean': 'Mean Score',
    'Reasoning & Math_mean': 'Reasoning & Math Mean Score',
    'Commonsense & NLI_mean': 'Commonsense & NLI Mean Score',
    'Knowledge & Reading_mean': 'Knowledge & Reading Mean Score',
})

cols_to_round = ["Mean Score", "Reasoning & Math Mean Score", "Commonsense & NLI Mean Score", "Knowledge & Reading Mean Score"] 
df_display[cols_to_round] = df_display[cols_to_round].round(4)

display(df_display)
df.to_csv("/mnt/data8tb/Documents/project/benchmark_project/llm_benchmarks_master.csv")


    
# Total time calculation
def format_seconds(secs: int) -> str:
    days, rem = divmod(int(secs), 86400)  # 86400 sec = 1 day
    hours, rem = divmod(rem, 3600)        # 3600 sec = 1 hour
    minutes, _ = divmod(rem, 60)
    return f"{days}d {hours}h {minutes}m"

# Example usage with df_display
totals = {}
for col in ["total_time_raw", "gpu_util_time_raw"]:
    total_secs = df_display[col].sum()
    totals[col] = format_seconds(total_secs)

print(totals)

Unnamed: 0,Overall Rank,Model Name,GPU Util Time,gpu_util_time_raw,full_time_from_gpu_log,Parameters,parameters_raw,Total Time,total_time_raw,batch_size,batch_sizes,"anli_r1(acc,none)","anli_r2(acc,none)","anli_r3(acc,none)","arc_challenge(acc_norm,none)","bbh(exact_match,get-answer)","boolq(acc,none)","drop(f1,none)","gpqa_main_zeroshot(acc_norm,none)","gsm8k(exact_match,strict-match)","hellaswag(acc_norm,none)","mmlu(acc,none)","nq_open(exact_match,remove_whitespace)","openbookqa(acc_norm,none)","piqa(acc_norm,none)","qnli(acc,none)","sciq(acc_norm,none)","triviaqa(exact_match,remove_whitespace)","truthfulqa_mc1(acc,none)","truthfulqa_mc2(acc,none)","winogrande(acc,none)","gsm8k(exact_match,strict-match)_rank","bbh(exact_match,get-answer)_rank","arc_challenge(acc_norm,none)_rank","anli_r1(acc,none)_rank","anli_r2(acc,none)_rank","anli_r3(acc,none)_rank","gpqa_main_zeroshot(acc_norm,none)_rank","hellaswag(acc_norm,none)_rank","piqa(acc_norm,none)_rank","winogrande(acc,none)_rank","boolq(acc,none)_rank","openbookqa(acc_norm,none)_rank","sciq(acc_norm,none)_rank","qnli(acc,none)_rank","mmlu(acc,none)_rank","nq_open(exact_match,remove_whitespace)_rank","drop(f1,none)_rank","truthfulqa_mc1(acc,none)_rank","truthfulqa_mc2(acc,none)_rank","triviaqa(exact_match,remove_whitespace)_rank",Reasoning & Math Mean Score,Reasoning & Math Avg. Rank,Commonsense & NLI Mean Score,Commonsense & NLI Avg. Rank,Knowledge & Reading Mean Score,Knowledge & Reading Avg. Rank,Mean Score
0,1,google_gemma-3-12b-it,14h 8m,50906.4,15h 47m,12.2B,12187325040,15h 45m,56750.865892,auto,[2],0.603,0.56,0.595833,0.610922,0.801874,0.874618,0.139566,0.337054,0.87718,0.818761,0.716137,0.157064,0.498,0.78074,0.745744,0.954,0.275245,0.405141,0.581183,0.744278,3.0,1.0,2.0,5.0,3.0,2.0,9.0,3.0,19.0,4.0,3.0,1.0,6.0,8.0,6.0,8.0,8.0,8.0,8.0,23.0,0.6266,1,0.7737,3,0.3791,10,0.6038
1,2,Qwen_Qwen3-14B (8bit),17h 29m,62956.2,29h 46m,14.8B,14768307200,29h 45m,107151.802065,1,[],0.646,0.57,0.556667,0.600683,0.43296,0.891743,0.09041,0.397321,0.898408,0.787692,0.769477,0.092244,0.46,0.794886,0.844225,0.966,0.40749,0.406365,0.589404,0.7206,1.0,29.0,4.0,4.0,2.0,3.0,1.0,12.0,14.0,11.0,1.0,8.0,1.0,2.0,2.0,15.0,18.0,7.0,6.0,13.0,0.586,3,0.7807,2,0.3926,7,0.5961
2,3,openchat_openchat-3.6-8b-20240522,6h 59m,25150.8,7h 52m,8.0B,8030261248,7h 51m,28278.85947,3,[],0.556,0.513,0.48,0.603242,0.617877,0.872783,0.251569,0.332589,0.750569,0.797849,0.643071,0.170637,0.462,0.818281,0.730002,0.964,0.565927,0.352509,0.497601,0.76322,16.0,10.0,3.0,9.0,8.0,11.0,11.0,6.0,3.0,1.0,4.0,7.0,2.0,9.0,14.0,5.0,3.0,15.0,19.0,3.0,0.5505,6,0.7726,5,0.4136,2,0.5871
3,4,Qwen_Qwen3-8B,13h 44m,49497.0,15h 33m,8.2B,8190735360,15h 31m,55918.46786,auto,[1],0.669,0.542,0.555833,0.562287,0.797573,0.865749,0.109877,0.350446,0.872631,0.748656,0.728956,0.073684,0.418,0.775299,0.781805,0.958,0.320609,0.363525,0.54314,0.680347,4.0,2.0,11.0,3.0,5.0,4.0,5.0,24.0,21.0,22.0,6.0,20.0,4.0,7.0,3.0,16.0,13.0,13.0,13.0,18.0,0.6214,2,0.7468,8,0.3566,14,0.5859
4,5,Qwen_Qwen2.5-7B-Instruct,8h 33m,30831.6,9h 38m,7.6B,7615616512,9h 36m,34616.604248,3,[],0.685,0.549,0.5525,0.552901,0.448779,0.863303,0.071089,0.328125,0.762699,0.804919,0.71806,0.045706,0.486,0.803047,0.804503,0.937,0.325401,0.477356,0.648483,0.711918,12.0,27.0,12.0,2.0,4.0,5.0,12.0,5.0,10.0,14.0,7.0,2.0,12.0,4.0,4.0,23.0,27.0,2.0,2.0,17.0,0.5541,5,0.773,4,0.381,9,0.5788
5,6,Qwen_Qwen2.5-14B-Instruct (8bit),29h 32m,106374.6,52h 45m,14.8B,14770033664,52h 44m,189869.409404,1,[],0.721,0.634,0.6175,0.615188,0.106896,0.886239,0.071276,0.354911,0.792267,0.841964,0.783079,0.061496,0.476,0.817193,0.853926,0.929,0.039289,0.510404,0.683015,0.754538,9.0,41.0,1.0,1.0,1.0,1.0,4.0,1.0,4.0,2.0,2.0,4.0,17.0,1.0,1.0,19.0,25.0,1.0,1.0,35.0,0.5488,7,0.7941,1,0.3581,13,0.5775
6,7,01-ai_Yi-1.5-9B,10h 26m,37569.6,11h 44m,8.8B,8829407232,11h 43m,42212.112622,2,[],0.532,0.48,0.439167,0.546928,0.712026,0.858104,0.445686,0.294643,0.639121,0.778929,0.689289,0.153186,0.456,0.806311,0.508695,0.952,0.543803,0.321909,0.467572,0.726125,25.0,5.0,15.0,12.0,12.0,18.0,20.0,14.0,6.0,9.0,10.0,9.0,7.0,28.0,8.0,10.0,1.0,19.0,26.0,4.0,0.5206,16,0.7266,15,0.4369,1,0.5676
7,8,Qwen_Qwen2.5-7B-Instruct-1M,10h 10m,36621.0,11h 18m,7.6B,7615616512,11h 17m,40632.813397,auto,[1],0.585,0.533,0.556667,0.585324,0.277223,0.852599,0.057047,0.339286,0.795299,0.789982,0.716636,0.157618,0.48,0.816104,0.678199,0.95,0.420531,0.425949,0.600072,0.727703,8.0,38.0,7.0,6.0,6.0,3.0,8.0,11.0,5.0,8.0,12.0,3.0,8.0,11.0,5.0,7.0,29.0,3.0,3.0,12.0,0.5245,15,0.7564,7,0.3963,6,0.5672
8,9,meta-llama_Llama-3.1-8B-Instruct,10h 52m,39147.6,12h 20m,8.0B,8030261248,12h 19m,44363.24936,auto,[1],0.482,0.467,0.443333,0.550341,0.715558,0.84159,0.193729,0.34375,0.754359,0.792073,0.679319,0.177562,0.432,0.806311,0.501373,0.962,0.518168,0.365973,0.541154,0.738753,15.0,4.0,13.0,18.0,14.0,17.0,6.0,9.0,6.0,6.0,16.0,16.0,3.0,30.0,12.0,4.0,4.0,12.0,14.0,7.0,0.5366,12,0.7249,17,0.4127,3,0.5653
9,10,01-ai_Yi-1.5-9B-Chat,12h 15m,44120.4,13h 55m,8.8B,8829407232,13h 54m,50056.331345,2,[],0.535,0.509,0.525833,0.587031,0.610659,0.868196,0.125326,0.303571,0.708112,0.787293,0.684091,0.009418,0.436,0.803591,0.787662,0.954,0.338665,0.374541,0.547934,0.746646,18.0,12.0,6.0,11.0,9.0,6.0,18.0,13.0,9.0,3.0,5.0,14.0,6.0,6.0,9.0,36.0,9.0,10.0,11.0,15.0,0.5399,9,0.7691,6,0.3467,15,0.5621


{'total_time_raw': '18d 7h 55m', 'gpu_util_time_raw': '14d 23h 41m'}


In [42]:
column_map = {
    "overall": ["Model Name", "Total Time", 'Mean Score', "GPU Util Time", "Overall Rank"],
    "reasoning_and_math": ["Model Name", "Total Time", "GPU Util Time", 'Reasoning & Math Mean Score', "Reasoning & Math Avg. Rank"],
    "commonsense_and_nli": ["Model Name", "Total Time", "GPU Util Time", 'Commonsense & NLI Mean Score', "Commonsense & NLI Avg. Rank"],
    "knowledge_and_reading": ["Model Name", "Total Time", "GPU Util Time", 'Knowledge & Reading Mean Score', "Knowledge & Reading Avg. Rank"]
}



# Produce sub-dataframes and export them to csv and excel file.
with pd.ExcelWriter("/mnt/data8tb/Documents/project/benchmark_project/llm_benchmarks_all_results.xlsx") as writer:
    df_display.to_excel(writer, sheet_name="Master", index=False)
    
    for name, cols in column_map.items():
        sub_df = df_display[cols].copy()
        rank_col = [c for c in sub_df.columns if 'Rank' in c][0]
        sub_df = sub_df.sort_values(by=rank_col, ascending=True).reset_index(drop=True)
        sub_df.index = sub_df.index + 1
        print(name)
        if name == 'overall':
            overall_df = sub_df
        display(sub_df)
        
        sub_df.to_csv(f"/mnt/data8tb/Documents/project/benchmark_project/{name}_rank.csv")
        sub_df.to_excel(writer, sheet_name=name, index=False)

        table_md = sub_df.to_markdown(index=False)
        print(table_md)
        print()
        print()



overall


Unnamed: 0,Model Name,Total Time,Mean Score,GPU Util Time,Overall Rank
1,google_gemma-3-12b-it,15h 45m,0.6038,14h 8m,1
2,Qwen_Qwen3-14B (8bit),29h 45m,0.5961,17h 29m,2
3,openchat_openchat-3.6-8b-20240522,7h 51m,0.5871,6h 59m,3
4,Qwen_Qwen3-8B,15h 31m,0.5859,13h 44m,4
5,Qwen_Qwen2.5-7B-Instruct,9h 36m,0.5788,8h 33m,5
6,Qwen_Qwen2.5-14B-Instruct (8bit),52h 44m,0.5775,29h 32m,6
7,01-ai_Yi-1.5-9B,11h 43m,0.5676,10h 26m,7
8,Qwen_Qwen2.5-7B-Instruct-1M,11h 17m,0.5672,10h 10m,8
9,meta-llama_Llama-3.1-8B-Instruct,12h 19m,0.5653,10h 52m,9
10,01-ai_Yi-1.5-9B-Chat,13h 54m,0.5621,12h 15m,10


| Model Name                                | Total Time   |   Mean Score | GPU Util Time   |   Overall Rank |
|:------------------------------------------|:-------------|-------------:|:----------------|---------------:|
| google_gemma-3-12b-it                     | 15h 45m      |       0.6038 | 14h 8m          |              1 |
| Qwen_Qwen3-14B (8bit)                     | 29h 45m      |       0.5961 | 17h 29m         |              2 |
| openchat_openchat-3.6-8b-20240522         | 7h 51m       |       0.5871 | 6h 59m          |              3 |
| Qwen_Qwen3-8B                             | 15h 31m      |       0.5859 | 13h 44m         |              4 |
| Qwen_Qwen2.5-7B-Instruct                  | 9h 36m       |       0.5788 | 8h 33m          |              5 |
| Qwen_Qwen2.5-14B-Instruct (8bit)          | 52h 44m      |       0.5775 | 29h 32m         |              6 |
| 01-ai_Yi-1.5-9B                           | 11h 43m      |       0.5676 | 10h 26m         |              7 |
|

Unnamed: 0,Model Name,Total Time,GPU Util Time,Reasoning & Math Mean Score,Reasoning & Math Avg. Rank
1,google_gemma-3-12b-it,15h 45m,14h 8m,0.6266,1
2,Qwen_Qwen3-8B,15h 31m,13h 44m,0.6214,2
3,Qwen_Qwen3-14B (8bit),29h 45m,17h 29m,0.586,3
4,Qwen_Qwen3-4B,5h 51m,5h 3m,0.5712,4
5,Qwen_Qwen2.5-7B-Instruct,9h 36m,8h 33m,0.5541,5
6,openchat_openchat-3.6-8b-20240522,7h 51m,6h 59m,0.5505,6
7,Qwen_Qwen2.5-14B-Instruct (8bit),52h 44m,29h 32m,0.5488,7
8,mistralai_Ministral-8B-Instruct-2410,10h 46m,9h 27m,0.5446,8
9,01-ai_Yi-1.5-9B-Chat,13h 54m,12h 15m,0.5399,9
10,deepseek-ai_DeepSeek-R1-0528-Qwen3-8B,17h 57m,15h 30m,0.5387,10


| Model Name                                | Total Time   | GPU Util Time   |   Reasoning & Math Mean Score |   Reasoning & Math Avg. Rank |
|:------------------------------------------|:-------------|:----------------|------------------------------:|-----------------------------:|
| google_gemma-3-12b-it                     | 15h 45m      | 14h 8m          |                        0.6266 |                            1 |
| Qwen_Qwen3-8B                             | 15h 31m      | 13h 44m         |                        0.6214 |                            2 |
| Qwen_Qwen3-14B (8bit)                     | 29h 45m      | 17h 29m         |                        0.586  |                            3 |
| Qwen_Qwen3-4B                             | 5h 51m       | 5h 3m           |                        0.5712 |                            4 |
| Qwen_Qwen2.5-7B-Instruct                  | 9h 36m       | 8h 33m          |                        0.5541 |                            5 |
| open

Unnamed: 0,Model Name,Total Time,GPU Util Time,Commonsense & NLI Mean Score,Commonsense & NLI Avg. Rank
1,Qwen_Qwen2.5-14B-Instruct (8bit),52h 44m,29h 32m,0.7941,1
2,Qwen_Qwen3-14B (8bit),29h 45m,17h 29m,0.7807,2
3,google_gemma-3-12b-it,15h 45m,14h 8m,0.7737,3
4,Qwen_Qwen2.5-7B-Instruct,9h 36m,8h 33m,0.773,4
5,openchat_openchat-3.6-8b-20240522,7h 51m,6h 59m,0.7726,5
6,01-ai_Yi-1.5-9B-Chat,13h 54m,12h 15m,0.7691,6
7,Qwen_Qwen2.5-7B-Instruct-1M,11h 17m,10h 10m,0.7564,7
8,Qwen_Qwen3-8B,15h 31m,13h 44m,0.7468,8
9,mistralai_Mistral-7B-Instruct-v0.3,8h 38m,7h 41m,0.7403,9
10,01-ai_Yi-1.5-6B-Chat,8h 4m,7h 1m,0.7374,10


| Model Name                                | Total Time   | GPU Util Time   |   Commonsense & NLI Mean Score |   Commonsense & NLI Avg. Rank |
|:------------------------------------------|:-------------|:----------------|-------------------------------:|------------------------------:|
| Qwen_Qwen2.5-14B-Instruct (8bit)          | 52h 44m      | 29h 32m         |                         0.7941 |                             1 |
| Qwen_Qwen3-14B (8bit)                     | 29h 45m      | 17h 29m         |                         0.7807 |                             2 |
| google_gemma-3-12b-it                     | 15h 45m      | 14h 8m          |                         0.7737 |                             3 |
| Qwen_Qwen2.5-7B-Instruct                  | 9h 36m       | 8h 33m          |                         0.773  |                             4 |
| openchat_openchat-3.6-8b-20240522         | 7h 51m       | 6h 59m          |                         0.7726 |                         

Unnamed: 0,Model Name,Total Time,GPU Util Time,Knowledge & Reading Mean Score,Knowledge & Reading Avg. Rank
1,01-ai_Yi-1.5-9B,11h 43m,10h 26m,0.4369,1
2,openchat_openchat-3.6-8b-20240522,7h 51m,6h 59m,0.4136,2
3,meta-llama_Llama-3.1-8B-Instruct,12h 19m,10h 52m,0.4127,3
4,01-ai_Yi-1.5-6B,4h 28m,3h 54m,0.4063,4
5,mistralai_Mistral-7B-Instruct-v0.3,8h 38m,7h 41m,0.4045,5
6,Qwen_Qwen2.5-7B-Instruct-1M,11h 17m,10h 10m,0.3963,6
7,Qwen_Qwen3-14B (8bit),29h 45m,17h 29m,0.3926,7
8,meta-llama_Meta-Llama-3-8B-Instruct,6h 30m,5h 46m,0.3923,8
9,Qwen_Qwen2.5-7B-Instruct,9h 36m,8h 33m,0.381,9
10,google_gemma-3-12b-it,15h 45m,14h 8m,0.3791,10


| Model Name                                | Total Time   | GPU Util Time   |   Knowledge & Reading Mean Score |   Knowledge & Reading Avg. Rank |
|:------------------------------------------|:-------------|:----------------|---------------------------------:|--------------------------------:|
| 01-ai_Yi-1.5-9B                           | 11h 43m      | 10h 26m         |                           0.4369 |                               1 |
| openchat_openchat-3.6-8b-20240522         | 7h 51m       | 6h 59m          |                           0.4136 |                               2 |
| meta-llama_Llama-3.1-8B-Instruct          | 12h 19m      | 10h 52m         |                           0.4127 |                               3 |
| 01-ai_Yi-1.5-6B                           | 4h 28m       | 3h 54m          |                           0.4063 |                               4 |
| mistralai_Mistral-7B-Instruct-v0.3        | 8h 38m       | 7h 41m          |                           0.4045 

In [43]:
display(overall_df)
overall_df.to_html("overall.html", index=False)

Unnamed: 0,Model Name,Total Time,Mean Score,GPU Util Time,Overall Rank
1,google_gemma-3-12b-it,15h 45m,0.6038,14h 8m,1
2,Qwen_Qwen3-14B (8bit),29h 45m,0.5961,17h 29m,2
3,openchat_openchat-3.6-8b-20240522,7h 51m,0.5871,6h 59m,3
4,Qwen_Qwen3-8B,15h 31m,0.5859,13h 44m,4
5,Qwen_Qwen2.5-7B-Instruct,9h 36m,0.5788,8h 33m,5
6,Qwen_Qwen2.5-14B-Instruct (8bit),52h 44m,0.5775,29h 32m,6
7,01-ai_Yi-1.5-9B,11h 43m,0.5676,10h 26m,7
8,Qwen_Qwen2.5-7B-Instruct-1M,11h 17m,0.5672,10h 10m,8
9,meta-llama_Llama-3.1-8B-Instruct,12h 19m,0.5653,10h 52m,9
10,01-ai_Yi-1.5-9B-Chat,13h 54m,0.5621,12h 15m,10
