In [88]:
# Enable IPython's autoreload extension in mode 2 so edited project modules are
# re-imported automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
import os
import sys
from pathlib import Path

# Resolve the project root only once per kernel session: after the chdir below,
# Path.cwd() already *is* the root, so recomputing it on a re-run of this cell
# would climb one directory too far.
if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)

os.chdir(workding_dir)
# Keep the cell idempotent: re-running it must not stack duplicate entries on
# the import path.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("workding dir:", workding_dir)

workding dir: /Users/inflaton/code/engd/papers/maritime-incidents-ai-agents


In [90]:
from dotenv import find_dotenv, load_dotenv

# Prefer ".env"; find_dotenv returns an empty string when nothing is found, in
# which case ".env.example" is searched for instead.
found_dotenv = find_dotenv(".env") or find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=True)

loading env vars from: /Users/inflaton/code/engd/papers/maritime-incidents-ai-agents/.env


True

In [91]:
%%time

from llm_toolkit.eval_openai import *
from tqdm.notebook import tqdm

# Hook tqdm into pandas so progress-bar variants of apply are available.
tqdm.pandas()

# Both settings are read from the environment populated by the dotenv cell;
# os.getenv yields None for a variable that is not set.
data_path, num_ctx = (os.getenv(key) for key in ("DATA_PATH", "NUM_CTX"))
results_path = "paper/data/ollama_model_results_v3-A6000_top_metrics.csv"

(data_path, results_path, num_ctx)

CPU times: user 205 μs, sys: 39 μs, total: 244 μs
Wall time: 242 μs


('dataset/GMRID_v3.csv',
 'paper/data/ollama_model_results_v3-A6000_top_metrics.csv',
 '8192')

In [92]:
import pandas as pd
from llm_toolkit.llm_utils import *
from llm_toolkit.data_utils import *

# Load the results CSV named by `results_path` (the A6000 file set in an earlier
# cell) and print its schema as a quick sanity check.
df = pd.read_csv(results_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   model                   20 non-null     object 
 1   shots                   20 non-null     int64  
 2   eval_time               20 non-null     float64
 3   f1                      20 non-null     float64
 4   accuracy                20 non-null     float64
 5   f1_raw                  20 non-null     float64
 6   accuracy_raw            20 non-null     float64
 7   ratio_valid_categories  20 non-null     float64
 8   total_tokens            20 non-null     int64  
 9   eval_speed              20 non-null     float64
dtypes: float64(7), int64(2), object(1)
memory usage: 1.7+ KB


In [93]:
# One "top metrics" CSV per remaining platform, read in the same order as the
# frame names on the left-hand side.
df2, df3, df4, df5, df6 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "paper/data/ollama_model_results_v3-RTX4090_top_metrics.csv",
        "paper/data/ollama_model_results_v3-M3_Max_top_metrics.csv",
        "paper/data/ollama_model_results_v3-M4_Max_top_metrics.csv",
        "paper/data/ollama_model_results_v3-RTX4090_Laptop_top_metrics.csv",
        "paper/data/ollama_model_results_v3-Jetson_AGX_Orin_top_metrics.csv",
    )
)

In [94]:
# NOTE(review): `model_orders` is not defined in the visible cells; presumably it
# is provided by one of the llm_toolkit star imports -- confirm.
# Drop qwq (any casing), namespaced ("/"), fp16 and gpt entries, and strip the
# "-coder"/"-Coder" suffix from the remaining keys. Keys that become identical
# after stripping collapse into one entry, the later value winning.
model_orders = {
    name.replace("-coder", "").replace("-Coder", ""): rank
    for name, rank in model_orders.items()
    if "qwq" not in name.lower()
    and not any(token in name for token in ("/", "fp16", "gpt"))
}
model_orders

{'qwen2.5:0.5b': 0.5,
 'llama3.2:1b': 1,
 'qwen2.5:1.5b': 1.5,
 'llama3.2:3b': 3,
 'qwen2.5:3b': 4,
 'qwen2.5:7b': 12,
 'llama3.1:8b': 15,
 'llama3.2-vision': 21,
 'llama3.2-vision:11b': 21,
 'qwen2.5:14b': 22,
 'qwen2.5:32b': 23,
 'llama3.1:70b': 25,
 'llama3.3:70b': 30.1,
 'qwen2.5:72b': 30.2,
 'llama3.2-vision:90b': 31}

In [95]:
# Models present in the first results frame that also have an entry in
# `model_orders`, kept in the order they first appear in the frame.
all_models = list(filter(lambda name: name in model_orders, df["model"].unique()))
all_models

['qwen2.5:0.5b',
 'llama3.2:1b',
 'qwen2.5:1.5b',
 'llama3.2:3b',
 'qwen2.5:3b',
 'qwen2.5:7b',
 'llama3.1:8b',
 'llama3.2-vision:11b',
 'qwen2.5:14b',
 'qwen2.5:32b',
 'llama3.1:70b',
 'llama3.3:70b',
 'qwen2.5:72b',
 'llama3.2-vision:90b']

In [96]:
# Split the ordered model list into three groups by position:
# first five, next five, and whatever remains.
small_models, medium_models, large_models = (
    all_models[:5],
    all_models[5:10],
    all_models[10:],
)

In [97]:
# Restrict every platform's results to the models selected in `all_models`.
(
    metrics_df,
    metrics_df2,
    metrics_df3,
    metrics_df4,
    metrics_df5,
    metrics_df6,
) = (
    frame[frame["model"].isin(all_models)]
    for frame in (df, df2, df3, df4, df5, df6)
)

In [98]:
# Platform label -> filtered results frame; insertion order determines the row
# order of the generated tables.
metrics_map = dict(
    [
        ("RTX A6000", metrics_df),
        ("RTX 4090", metrics_df2),
        ("M3 Max", metrics_df3),
        ("M4 Max", metrics_df4),
        ("RTX 4090 Laptop", metrics_df5),
        ("Jetson AGX Orin", metrics_df6),
    ]
)

In [101]:
import pandas as pd
import numpy as np

def generate_performance_table(metrics_map, models):
    """Build a per-model, per-platform performance table with summary rows.

    For every model, one row is emitted per platform in ``metrics_map`` (in
    mapping order), followed by a ``mean`` and a ``std`` row computed over the
    platforms that actually reported that model.

    Args:
        metrics_map: Mapping of platform label -> DataFrame containing at least
            the columns ``model``, ``f1``, ``eval_speed`` and ``eval_time``.
            When a frame holds several rows for one model, only the first row
            is used.
        models: Iterable of model names. May be empty, in which case an empty
            table with the regular index and columns is returned.

    Returns:
        DataFrame indexed by ``("Model", "Platform/Stats")`` with the string
        columns ``"F1 (%)"`` (``f1`` scaled by 100, 2 decimals),
        ``"T-put (t/s)"`` (0 decimals) and ``"Time (s)"`` (3 decimals).
        Model/platform combinations without data hold ``"-"``. The ``std`` row
        uses NumPy's default population standard deviation (``ddof=0``).
    """
    placeholder = "-"
    index_columns = ["Model", "Platform/Stats"]
    # (output column, source column, scale factor, format spec)
    metric_specs = [
        ("F1 (%)", "f1", 100, ".2f"),
        ("T-put (t/s)", "eval_speed", 1, ".0f"),
        ("Time (s)", "eval_time", 1, ".3f"),
    ]

    rows = []
    for model in models:
        # Raw (unformatted) values per metric, used for the summary rows.
        samples = {label: [] for label, *_ in metric_specs}

        for device, device_df in metrics_map.items():
            model_data = device_df[device_df["model"] == model]
            row = {"Model": model, "Platform/Stats": device}
            for label, source, scale, spec in metric_specs:
                if model_data.empty:
                    # Platform never ran this model: show a placeholder and
                    # keep it out of the statistics.
                    row[label] = placeholder
                else:
                    value = model_data[source].values[0] * scale
                    samples[label].append(value)
                    row[label] = format(value, spec)
            rows.append(row)

        for stat_name, reducer in (("mean", np.mean), ("std", np.std)):
            row = {"Model": model, "Platform/Stats": stat_name}
            for label, _, _, spec in metric_specs:
                values = samples[label]
                row[label] = format(reducer(values), spec) if values else placeholder
            rows.append(row)

    # Passing the column list explicitly keeps set_index working when `rows`
    # is empty (no models requested).
    columns = index_columns + [label for label, *_ in metric_specs]
    return pd.DataFrame(rows, columns=columns).set_index(index_columns)

In [102]:
# Build one performance table per model size group.
small_models_perf, medium_models_perf, large_models_perf = (
    generate_performance_table(metrics_map, group)
    for group in (small_models, medium_models, large_models)
)

In [103]:
# Print each size group as a markdown section: heading first, then the table.
perf_sections = (
    ("## Small Models", small_models_perf),
    ("## Medium Models", medium_models_perf),
    ("## Large Models", large_models_perf),
)
for section_no, (heading, perf_table) in enumerate(perf_sections):
    if section_no:
        # Spacer between sections only (none after the last one).
        print("\n")
    print(heading)
    print(perf_table.to_markdown(index=True))

## Small Models
|                                     |   F1 (%) |   T-put (t/s) |   Time (s) |
|:------------------------------------|---------:|--------------:|-----------:|
| ('qwen2.5:0.5b', 'RTX A6000')       |    45.85 |           816 |      2.573 |
| ('qwen2.5:0.5b', 'RTX 4090')        |    48.22 |         13124 |      0.16  |
| ('qwen2.5:0.5b', 'M3 Max')          |    47.45 |          7178 |      0.258 |
| ('qwen2.5:0.5b', 'M4 Max')          |    46.08 |          9632 |      0.218 |
| ('qwen2.5:0.5b', 'RTX 4090 Laptop') |    46.58 |           776 |      2.707 |
| ('qwen2.5:0.5b', 'Jetson AGX Orin') |    46.65 |          1762 |      1.192 |
| ('qwen2.5:0.5b', 'mean')            |    46.8  |          5548 |      1.185 |
| ('qwen2.5:0.5b', 'std')             |     0.81 |          4765 |      1.087 |
| ('llama3.2:1b', 'RTX A6000')        |    63.14 |           401 |      2.698 |
| ('llama3.2:1b', 'RTX 4090')         |    63.51 |          5035 |      0.215 |
| ('llama3.2:1b', 'M3 Ma

In [104]:
# escape=True is required: the "F1 (%)" header contains "%", which LaTeX would
# otherwise read as the start of a comment and drop the rest of the header row.
print(small_models_perf.to_latex(index=True, multirow=True, escape=True))

\begin{tabular}{lllll}
\toprule
 &  & F1 (%) & T-put (t/s) & Time (s) \\
Model & Platform/Stats &  &  &  \\
\midrule
\multirow[t]{8}{*}{qwen2.5:0.5b} & RTX A6000 & 45.85 & 816 & 2.573 \\
 & RTX 4090 & 48.22 & 13124 & 0.160 \\
 & M3 Max & 47.45 & 7178 & 0.258 \\
 & M4 Max & 46.08 & 9632 & 0.218 \\
 & RTX 4090 Laptop & 46.58 & 776 & 2.707 \\
 & Jetson AGX Orin & 46.65 & 1762 & 1.192 \\
 & mean & 46.80 & 5548 & 1.185 \\
 & std & 0.81 & 4765 & 1.087 \\
\cline{1-5}
\multirow[t]{8}{*}{llama3.2:1b} & RTX A6000 & 63.14 & 401 & 2.698 \\
 & RTX 4090 & 63.51 & 5035 & 0.215 \\
 & M3 Max & 63.50 & 1899 & 0.570 \\
 & M4 Max & 63.34 & 2400 & 0.451 \\
 & RTX 4090 Laptop & 62.65 & 361 & 2.998 \\
 & Jetson AGX Orin & 63.41 & 352 & 3.074 \\
 & mean & 63.26 & 1741 & 1.668 \\
 & std & 0.30 & 1680 & 1.265 \\
\cline{1-5}
\multirow[t]{8}{*}{qwen2.5:1.5b} & RTX A6000 & 74.38 & 456 & 2.571 \\
 & RTX 4090 & 73.66 & 7419 & 0.158 \\
 & M3 Max & 75.13 & 2224 & 0.527 \\
 & M4 Max & 74.09 & 4055 & 0.289 \\
 & RTX 409

In [105]:
# escape=True is required: the "F1 (%)" header contains "%", which LaTeX would
# otherwise read as the start of a comment and drop the rest of the header row.
print(medium_models_perf.to_latex(index=True, multirow=True, escape=True))

\begin{tabular}{lllll}
\toprule
 &  & F1 (%) & T-put (t/s) & Time (s) \\
Model & Platform/Stats &  &  &  \\
\midrule
\multirow[t]{8}{*}{qwen2.5:7b} & RTX A6000 & 92.83 & 784 & 2.681 \\
 & RTX 4090 & 92.65 & 6607 & 0.318 \\
 & M3 Max & 92.50 & 1194 & 1.760 \\
 & M4 Max & 92.72 & 2041 & 1.030 \\
 & RTX 4090 Laptop & 92.46 & 705 & 2.983 \\
 & Jetson AGX Orin & 92.78 & 410 & 5.130 \\
 & mean & 92.66 & 1957 & 2.317 \\
 & std & 0.14 & 2143 & 1.552 \\
\cline{1-5}
\multirow[t]{8}{*}{llama3.1:8b} & RTX A6000 & 93.10 & 778 & 2.684 \\
 & RTX 4090 & 91.96 & 6761 & 0.309 \\
 & M3 Max & 92.21 & 1430 & 1.461 \\
 & M4 Max & 93.10 & 2269 & 0.921 \\
 & RTX 4090 Laptop & 93.19 & 706 & 2.959 \\
 & Jetson AGX Orin & 93.10 & 406 & 5.146 \\
 & mean & 92.78 & 2058 & 2.247 \\
 & std & 0.50 & 2189 & 1.593 \\
\cline{1-5}
\multirow[t]{8}{*}{llama3.2-vision:11b} & RTX A6000 & 92.63 & 780 & 2.680 \\
 & RTX 4090 & 92.53 & 6293 & 0.332 \\
 & M3 Max & 93.12 & 1232 & 1.696 \\
 & M4 Max & 92.64 & 2261 & 0.924 \\
 & RTX 

In [106]:
# escape=True is required: the "F1 (%)" header contains "%", which LaTeX would
# otherwise read as the start of a comment and drop the rest of the header row.
print(large_models_perf.to_latex(index=True, multirow=True, escape=True))

\begin{tabular}{lllll}
\toprule
 &  & F1 (%) & T-put (t/s) & Time (s) \\
Model & Platform/Stats &  &  &  \\
\midrule
\multirow[t]{8}{*}{llama3.1:70b} & RTX A6000 & 95.96 & 312 & 4.724 \\
 & RTX 4090 & 95.46 & 94 & 15.657 \\
 & M3 Max & 95.55 & 120 & 12.293 \\
 & M4 Max & - & - & - \\
 & RTX 4090 Laptop & - & - & - \\
 & Jetson AGX Orin & - & - & - \\
 & mean & 95.66 & 175 & 10.891 \\
 & std & 0.22 & 97 & 4.572 \\
\cline{1-5}
\multirow[t]{8}{*}{llama3.3:70b} & RTX A6000 & 95.87 & 426 & 4.322 \\
 & RTX 4090 & 95.73 & 73 & 20.241 \\
 & M3 Max & 95.90 & 37 & 39.523 \\
 & M4 Max & - & - & - \\
 & RTX 4090 Laptop & - & - & - \\
 & Jetson AGX Orin & - & - & - \\
 & mean & 95.83 & 179 & 21.362 \\
 & std & 0.07 & 176 & 14.393 \\
\cline{1-5}
\multirow[t]{8}{*}{qwen2.5:72b} & RTX A6000 & 97.08 & 314 & 5.911 \\
 & RTX 4090 & 96.85 & 109 & 17.072 \\
 & M3 Max & 96.59 & 116 & 16.055 \\
 & M4 Max & - & - & - \\
 & RTX 4090 Laptop & - & - & - \\
 & Jetson AGX Orin & - & - & - \\
 & mean & 96.84 & 179 