## Set Up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from pathlib import Path

if "workding_dir" not in globals():
    workding_dir = str(Path.cwd().parent)

os.chdir(workding_dir)
sys.path.append(workding_dir)
print("workding dir:", workding_dir)

from dotenv import find_dotenv, load_dotenv

found_dotenv = find_dotenv(".env")

if len(found_dotenv) == 0:
    found_dotenv = find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=True)

workding dir: /home/inflaton/code/maritime-incidents-ai-agents
loading env vars from: /home/inflaton/code/maritime-incidents-ai-agents/.env


True

In [3]:
%%time

from llm_toolkit.eval_openai import *
from tqdm.notebook import tqdm

tqdm.pandas()

data_path = os.getenv("DATA_PATH")
results_path = "paper/data/ollama_model_results_v3-RTX4090.csv"
num_ctx = os.getenv("NUM_CTX")
data_path, results_path, num_ctx

loading env vars from: /home/inflaton/code/maritime-incidents-ai-agents/.env
Adding /home/inflaton/code/maritime-incidents-ai-agents to sys.path
loading /home/inflaton/code/maritime-incidents-ai-agents/llm_toolkit/data_utils.py
CPU times: user 3.94 s, sys: 5.1 s, total: 9.04 s
Wall time: 5min 51s


('dataset/GMRID_v3.csv',
 'paper/data/ollama_model_results_v3-RTX4090.csv',
 '8192')

In [4]:
# run cells above before running anything below

## Evaluating 14 LLMs: 7 Llama3 + 7 Qwen2.5

### Evaluating Llama3 LLMs

In [5]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2:1b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: llama3.2:1b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:06<00:00,  4.65it/s]


*** Execution time for num_shots 0: 246.89 seconds
llama3.2:1b/shots-00(0.215) metrics: {'f1': 0.6255706762228095, 'accuracy': 0.5719267654751525}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:36<00:00,  3.41it/s]


*** Execution time for num_shots 1: 336.30 seconds
Error parsing json: {
  "category": "Worker Strike",
  "specific_tags": ["Nationwide", "Farmers', ",true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, tr
llama3.2:1b/shots-01(0.293) metrics: {'f1': 0.591252273844905, 'accuracy': 0.5945945945945946}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:01<00:00,  3.18it/s]


*** Execution time for num_shots 2: 361.04 seconds
llama3.2:1b/shots-02(0.315) metrics: {'f1': 0.6115285168098449, 'accuracy': 0.5963382737576286}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:43<00:00,  3.34it/s]


*** Execution time for num_shots 4: 343.45 seconds
llama3.2:1b/shots-04(0.299) metrics: {'f1': 0.5928588780258095, 'accuracy': 0.5797733217088056}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:31<00:00,  3.46it/s]

*** Execution time for num_shots 8: 331.15 seconds
llama3.2:1b/shots-08(0.289) metrics: {'f1': 0.5110964104820224, 'accuracy': 0.5274629468177855}
CPU times: user 3min 29s, sys: 3.56 s, total: 3min 32s
Wall time: 27min





In [4]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2:1b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: llama3.2:1b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:31<00:00,  3.46it/s]


*** Execution time for num_shots 10: 331.03 seconds
llama3.2:1b/shots-10(0.289) metrics: {'f1': 0.5854739532013948, 'accuracy': 0.6006974716652136}
CPU times: user 40.4 s, sys: 969 ms, total: 41.4 s
Wall time: 5min 33s


In [7]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2:3b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: llama3.2:3b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:35<00:00,  4.17it/s]


*** Execution time for num_shots 0: 275.23 seconds
category not in json: {}
llama3.2:3b/shots-00(0.240) metrics: {'f1': 0.6538105290240847, 'accuracy': 0.6591107236268526}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:00<00:00,  4.77it/s]


*** Execution time for num_shots 1: 240.28 seconds
llama3.2:3b/shots-01(0.209) metrics: {'f1': 0.7571035424067668, 'accuracy': 0.7471665213600698}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:48<00:00,  5.03it/s]


*** Execution time for num_shots 2: 228.03 seconds
llama3.2:3b/shots-02(0.199) metrics: {'f1': 0.773965881235158, 'accuracy': 0.7593722755013078}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:45<00:00,  5.08it/s]


*** Execution time for num_shots 4: 225.97 seconds
llama3.2:3b/shots-04(0.197) metrics: {'f1': 0.7909707674325613, 'accuracy': 0.7776809067131648}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:50<00:00,  4.97it/s]


*** Execution time for num_shots 8: 230.74 seconds
llama3.2:3b/shots-08(0.201) metrics: {'f1': 0.8186852619910396, 'accuracy': 0.8116826503923278}
CPU times: user 3min 23s, sys: 2.73 s, total: 3min 25s
Wall time: 20min 1s


In [5]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2:3b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: llama3.2:3b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:05<00:00,  4.68it/s]


*** Execution time for num_shots 10: 245.22 seconds
llama3.2:3b/shots-10(0.214) metrics: {'f1': 0.8249909032428336, 'accuracy': 0.8177855274629469}
CPU times: user 36.8 s, sys: 633 ms, total: 37.4 s
Wall time: 4min 6s


In [10]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.1:8b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: llama3.1:8b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:55<00:00,  3.23it/s]


*** Execution time for num_shots 0: 355.32 seconds
llama3.1:8b/shots-00(0.310) metrics: {'f1': 0.7281948307590247, 'accuracy': 0.7349607672188317}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:48<00:00,  3.97it/s]


*** Execution time for num_shots 1: 288.57 seconds
llama3.1:8b/shots-01(0.252) metrics: {'f1': 0.8915983835135687, 'accuracy': 0.8849171752397559}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:01<00:00,  3.80it/s]


*** Execution time for num_shots 2: 301.87 seconds
llama3.1:8b/shots-02(0.263) metrics: {'f1': 0.8733923468490629, 'accuracy': 0.8709677419354839}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:02<00:00,  3.79it/s]


*** Execution time for num_shots 4: 302.30 seconds
llama3.1:8b/shots-04(0.264) metrics: {'f1': 0.8949712659611697, 'accuracy': 0.8918918918918919}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:05<00:00,  3.75it/s]

*** Execution time for num_shots 8: 305.61 seconds
llama3.1:8b/shots-08(0.266) metrics: {'f1': 0.8993967314676187, 'accuracy': 0.8945074106364429}
CPU times: user 3min 13s, sys: 3.12 s, total: 3min 16s
Wall time: 25min 55s





In [6]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.1:8b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: llama3.1:8b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:54<00:00,  3.23it/s]

*** Execution time for num_shots 10: 354.68 seconds
llama3.1:8b/shots-10(0.309) metrics: {'f1': 0.9045562363131495, 'accuracy': 0.902353966870096}
CPU times: user 38.2 s, sys: 594 ms, total: 38.8 s
Wall time: 5min 55s





In [11]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2-vision:11b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: llama3.2-vision:11b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:46<00:00,  2.82it/s]


*** Execution time for num_shots 0: 406.18 seconds
llama3.2-vision:11b/shots-00(0.354) metrics: {'f1': 0.7294705267087005, 'accuracy': 0.7436791630340017}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:04<00:00,  3.77it/s]


*** Execution time for num_shots 1: 304.09 seconds
llama3.2-vision:11b/shots-01(0.265) metrics: {'f1': 0.8895416225805178, 'accuracy': 0.8857890148212729}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:12<00:00,  3.66it/s]


*** Execution time for num_shots 2: 312.98 seconds
llama3.2-vision:11b/shots-02(0.273) metrics: {'f1': 0.8582916202482987, 'accuracy': 0.8578901482127289}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:14<00:00,  3.65it/s]


*** Execution time for num_shots 4: 314.03 seconds
llama3.2-vision:11b/shots-04(0.274) metrics: {'f1': 0.9083711069265794, 'accuracy': 0.9067131647776809}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:18<00:00,  3.60it/s]

*** Execution time for num_shots 8: 318.58 seconds
llama3.2-vision:11b/shots-08(0.278) metrics: {'f1': 0.9077890471207182, 'accuracy': 0.9058413251961639}
CPU times: user 3min 19s, sys: 2.72 s, total: 3min 21s
Wall time: 27min 37s





In [7]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2-vision:11b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: llama3.2-vision:11b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:20<00:00,  3.01it/s]

*** Execution time for num_shots 10: 380.69 seconds
llama3.2-vision:11b/shots-10(0.332) metrics: {'f1': 0.9148318352710795, 'accuracy': 0.9136878814298169}
CPU times: user 37.6 s, sys: 717 ms, total: 38.3 s
Wall time: 6min 21s





In [10]:
%%time

evaluate_model_with_num_shots(
    "llama3.1:70b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: llama3.1:70b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:36:53<00:00, 14.48s/it] 


*** Execution time for num_shots 0: 16613.23 seconds
llama3.1:70b/shots-00(14.484) metrics: {'f1': 0.9296168632652474, 'accuracy': 0.925893635571055}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:12:46<00:00, 13.22s/it] 


*** Execution time for num_shots 1: 15166.41 seconds
llama3.1:70b/shots-01(13.223) metrics: {'f1': 0.9185472277146968, 'accuracy': 0.9145597210113339}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:36:17<00:00, 14.45s/it] 


*** Execution time for num_shots 2: 16577.91 seconds
llama3.1:70b/shots-02(14.453) metrics: {'f1': 0.9354249346715559, 'accuracy': 0.9337401918047079}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:59:19<00:00, 15.66s/it] 


*** Execution time for num_shots 4: 17959.08 seconds
llama3.1:70b/shots-04(15.657) metrics: {'f1': 0.9343945572910344, 'accuracy': 0.9354838709677419}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [5:49:42<00:00, 18.29s/it]    


*** Execution time for num_shots 8: 20982.15 seconds
llama3.1:70b/shots-08(18.293) metrics: {'f1': 0.9292136939042248, 'accuracy': 0.9293809938971229}
CPU times: user 3min 26s, sys: 3.95 s, total: 3min 30s
Wall time: 1d 15min 1s


In [8]:
%%time

evaluate_model_with_num_shots(
    "llama3.1:70b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: llama3.1:70b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [5:12:47<00:00, 16.36s/it] 


*** Execution time for num_shots 10: 18767.68 seconds
llama3.1:70b/shots-10(16.362) metrics: {'f1': 0.9330426320466346, 'accuracy': 0.9337401918047079}
CPU times: user 43.7 s, sys: 935 ms, total: 44.6 s
Wall time: 5h 12min 50s


In [12]:
%%time

evaluate_model_with_num_shots(
    "llama3.3:70b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: llama3.3:70b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [6:06:07<00:00, 19.15s/it]  


*** Execution time for num_shots 0: 21967.04 seconds
llama3.3:70b/shots-00(19.152) metrics: {'f1': 0.9315590100133763, 'accuracy': 0.9319965126416739}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [5:52:12<00:00, 18.42s/it]  


*** Execution time for num_shots 1: 21132.95 seconds
llama3.3:70b/shots-01(18.425) metrics: {'f1': 0.9317334399125254, 'accuracy': 0.93025283347864}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [6:07:54<00:00, 19.25s/it] 


*** Execution time for num_shots 2: 22074.94 seconds
llama3.3:70b/shots-02(19.246) metrics: {'f1': 0.9414254360088393, 'accuracy': 0.941586748038361}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [6:26:56<00:00, 20.24s/it]  


*** Execution time for num_shots 4: 23216.46 seconds
llama3.3:70b/shots-04(20.241) metrics: {'f1': 0.9347766596251843, 'accuracy': 0.9354838709677419}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [6:06:40<00:00, 19.18s/it] 


*** Execution time for num_shots 8: 22000.28 seconds
llama3.3:70b/shots-08(19.181) metrics: {'f1': 0.9317246222785865, 'accuracy': 0.9311246730601569}
CPU times: user 3min 40s, sys: 4.34 s, total: 3min 44s
Wall time: 1d 6h 39min 54s


In [9]:
%%time

evaluate_model_with_num_shots(
    "llama3.3:70b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: llama3.3:70b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [6:13:35<00:00, 19.54s/it] 


*** Execution time for num_shots 10: 22415.71 seconds
llama3.3:70b/shots-10(19.543) metrics: {'f1': 0.9345560589137629, 'accuracy': 0.934612031386225}
CPU times: user 44.6 s, sys: 1.05 s, total: 45.6 s
Wall time: 6h 13min 37s


In [None]:
%%time

evaluate_model_with_num_shots(
    "llama3.2-vision:90b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: llama3.2-vision:90b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [8:04:47<00:00, 25.36s/it]  


*** Execution time for num_shots 0: 29087.97 seconds
llama3.2-vision:90b/shots-00(25.360) metrics: {'f1': 0.9330832463249541, 'accuracy': 0.9319965126416739}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


 15%|█▍        | 167/1147 [1:05:32<5:02:53, 18.54s/it]

In [None]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2-vision:90b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[1, 2, 4, 8],
)

Evaluating model: llama3.2-vision:90b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [6:51:26<00:00, 21.52s/it]  


*** Execution time for num_shots 1: 24686.97 seconds
llama3.2-vision:90b/shots-01(21.523) metrics: {'f1': 0.9287381697733155, 'accuracy': 0.9241499564080209}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [7:52:36<00:00, 24.72s/it]  


*** Execution time for num_shots 2: 28356.78 seconds
llama3.2-vision:90b/shots-02(24.723) metrics: {'f1': 0.9371526231652012, 'accuracy': 0.9363557105492589}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [8:26:40<00:00, 26.50s/it]  


*** Execution time for num_shots 4: 30400.97 seconds
llama3.2-vision:90b/shots-04(26.505) metrics: {'f1': 0.9421389363150309, 'accuracy': 0.942458587619878}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


 25%|██▌       | 290/1147 [2:04:43<9:05:38, 38.20s/it] 

In [4]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2-vision:90b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[8],
)

Evaluating model: llama3.2-vision:90b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [8:58:22<00:00, 28.16s/it]  


*** Execution time for num_shots 8: 32302.94 seconds
llama3.2-vision:90b/shots-08(28.163) metrics: {'f1': 0.9311880425266882, 'accuracy': 0.93025283347864}
CPU times: user 47.6 s, sys: 1.69 s, total: 49.3 s
Wall time: 8h 58min 26s


In [10]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2-vision:90b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: llama3.2-vision:90b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [9:50:04<00:00, 30.87s/it]  


*** Execution time for num_shots 10: 35404.82 seconds
llama3.2-vision:90b/shots-10(30.867) metrics: {'f1': 0.9371262039721728, 'accuracy': 0.937227550130776}
CPU times: user 51.4 s, sys: 2.99 s, total: 54.4 s
Wall time: 9h 50min 9s


## Evaluating Qwen2.5 LLMs

In [11]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:0.5b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: qwen2.5:0.5b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [02:49<00:00,  6.75it/s]


*** Execution time for num_shots 0: 169.82 seconds
qwen2.5:0.5b/shots-00(0.148) metrics: {'f1': 0.42707683258051304, 'accuracy': 0.37053182214472535}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [02:34<00:00,  7.42it/s]


*** Execution time for num_shots 1: 154.60 seconds
qwen2.5:0.5b/shots-01(0.135) metrics: {'f1': 0.47788702827256546, 'accuracy': 0.44027898866608545}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [02:39<00:00,  7.18it/s]


*** Execution time for num_shots 2: 159.74 seconds
qwen2.5:0.5b/shots-02(0.139) metrics: {'f1': 0.4510605054122932, 'accuracy': 0.45161290322580644}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [02:37<00:00,  7.30it/s]


*** Execution time for num_shots 4: 157.18 seconds
qwen2.5:0.5b/shots-04(0.137) metrics: {'f1': 0.41343800705742506, 'accuracy': 0.44638186573670446}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [02:44<00:00,  6.96it/s]


*** Execution time for num_shots 8: 164.79 seconds
qwen2.5:0.5b/shots-08(0.144) metrics: {'f1': 0.4759601983564923, 'accuracy': 0.5091543156059285}
CPU times: user 3min 5s, sys: 3.2 s, total: 3min 8s
Wall time: 13min 28s


In [11]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:0.5b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: qwen2.5:0.5b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:03<00:00,  6.25it/s]


*** Execution time for num_shots 10: 183.60 seconds
qwen2.5:0.5b/shots-10(0.160) metrics: {'f1': 0.48198734788883857, 'accuracy': 0.5082824760244115}
CPU times: user 37 s, sys: 630 ms, total: 37.6 s
Wall time: 3min 5s


In [9]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:1.5b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: qwen2.5:1.5b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:20<00:00,  5.73it/s]


*** Execution time for num_shots 0: 200.25 seconds
qwen2.5:1.5b/shots-00(0.175) metrics: {'f1': 0.592991085561572, 'accuracy': 0.5152571926765476}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:05<00:00,  6.17it/s]


*** Execution time for num_shots 1: 185.82 seconds
qwen2.5:1.5b/shots-01(0.162) metrics: {'f1': 0.7329231942249087, 'accuracy': 0.6538796861377506}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:05<00:00,  6.20it/s]


*** Execution time for num_shots 2: 185.10 seconds
qwen2.5:1.5b/shots-02(0.161) metrics: {'f1': 0.7104085808217803, 'accuracy': 0.6129032258064516}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:06<00:00,  6.15it/s]


*** Execution time for num_shots 4: 186.50 seconds
qwen2.5:1.5b/shots-04(0.163) metrics: {'f1': 0.6798071114535961, 'accuracy': 0.5823888404533566}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:12<00:00,  5.97it/s]


*** Execution time for num_shots 8: 192.11 seconds
qwen2.5:1.5b/shots-08(0.167) metrics: {'f1': 0.6536118700778348, 'accuracy': 0.5527462946817786}
CPU times: user 3min 7s, sys: 3.08 s, total: 3min 10s
Wall time: 15min 51s


In [12]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:1.5b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: qwen2.5:1.5b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:39<00:00,  5.22it/s]


*** Execution time for num_shots 10: 219.69 seconds
qwen2.5:1.5b/shots-10(0.192) metrics: {'f1': 0.6812815500160857, 'accuracy': 0.5876198779424586}
CPU times: user 37.7 s, sys: 648 ms, total: 38.3 s
Wall time: 3min 41s


In [10]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:3b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: qwen2.5:3b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:24<00:00,  4.34it/s]


*** Execution time for num_shots 0: 264.21 seconds
qwen2.5:3b/shots-00(0.230) metrics: {'f1': 0.7269346386310314, 'accuracy': 0.7166521360069747}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:55<00:00,  4.87it/s]


*** Execution time for num_shots 1: 235.31 seconds
qwen2.5:3b/shots-01(0.205) metrics: {'f1': 0.88200986081474, 'accuracy': 0.8578901482127289}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:56<00:00,  4.84it/s]


*** Execution time for num_shots 2: 236.95 seconds
qwen2.5:3b/shots-02(0.207) metrics: {'f1': 0.877444204954895, 'accuracy': 0.8500435919790759}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:59<00:00,  4.80it/s]


*** Execution time for num_shots 4: 239.16 seconds
qwen2.5:3b/shots-04(0.209) metrics: {'f1': 0.8757238889897819, 'accuracy': 0.8491717523975588}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:05<00:00,  4.67it/s]


*** Execution time for num_shots 8: 245.66 seconds
qwen2.5:3b/shots-08(0.214) metrics: {'f1': 0.8981282023035045, 'accuracy': 0.8796861377506539}
CPU times: user 3min 21s, sys: 3.29 s, total: 3min 25s
Wall time: 20min 23s


In [12]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:3b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
    debug=True,
)

Evaluating model: qwen2.5:3b
loading train/test data files


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


  0%|          | 0/1147 [00:00<?, ?it/s]

Setting debug mode to: True
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Courts, schools and hospitals evacuated across Saint Petersberg due to anonymous threats Russian media sources are reporting that courts, schools, and hospitals across Saint Petersberg have been evacuated today due to anonymous threats. It is understood that people have been evacuated from Petrodvorets, Oktyabrsky, Kolpinsky, Petrogradsky, Kuibyshevsky and Sestroretsky district courts. Furthermore, the State University of the Sea and River Fleet, St. Petersburg State University of Railway Engineering, Higher School of Folk Arts, St. Petersburg State University of Telecommunications, and S.M. Military Medical Academy Kirov have all been evacuated. This is the fourth consecutive week of evacuations from public buildings due to such threats. It is not known when the situation will normalise."
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence >

  0%|          | 1/1147 [00:16<5:24:26, 16.99s/it]

[36;1m[1;3m[llm/end][0m [1m[chain:RunnableSequence > llm:ChatOllama] [16.92s] Exiting LLM run with output:
[0m[response]
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence] [16.92s] Exiting Chain run with output:
[0m[outputs]
Setting debug mode to: False


100%|██████████| 1147/1147 [04:27<00:00,  4.28it/s]


*** Execution time for num_shots 10: 267.82 seconds
qwen2.5:3b/shots-10(0.233) metrics: {'f1': 0.8912371574079978, 'accuracy': 0.8709677419354839}
CPU times: user 37.2 s, sys: 672 ms, total: 37.9 s
Wall time: 4min 29s


In [14]:
print(
    "System: Task: Classify Inputs into Predefined Categories\n\nYour primary objective is to analyze the given input and assign it to one of the predefined categories: ['Weather', 'Worker Strike', 'Administrative Issue', 'Human Error', 'Cyber Attack', 'Terrorism', 'Accident', 'Others']. Evaluate the content carefully and use the defining characteristics of each category to ensure an accurate classification.\n\nGuidelines:\n1. Understand the Categories:\nEach category has specific attributes that distinguish it. Familiarize yourself with these attributes by referring to the category descriptions provided in the JSON below. Use these details to guide your classification:\n\n{'Weather': ['Flooding', 'Severe Winds', 'Weather Advisory', 'Tropical Cyclone', 'Storm', 'Ice Storm', 'Earthquake', 'Tornado', 'Typhoon', 'Landslide', 'Water', 'Hurricane', 'Wildfire', 'Blizzard', 'Hail'], 'Worker Strike': ['Mine Workers Strike', 'Production Halt', 'Protest', 'Riot', 'Port Strike', 'General Strike', 'Civil Service Strike', 'Civil Unrest Advisory', 'Cargo Transportation Strike', 'Energy Sector Strike'], 'Administrative Issue': ['Port Congestion', 'Police Operations', 'Roadway Closure', 'Disruption', 'Cargo', 'Industrial Action', 'Port Disruption', 'Cargo Disruption', 'Power Outage', 'Port Closure', 'Maritime Advisory', 'Train Delays', 'Ground Transportation Advisory', 'Public Transportation Disruption', 'Trade Regulation', 'Customs Regulation', 'Regulatory Advisory', 'Industry Directives', 'Security Advisory', 'Public Holidays', 'Customs Delay', 'Public Health Advisory', 'Detention', 'Aviation Advisory', 'Waterway Closure', 'Plant Closure', 'Border Closure', 'Delay', 'Industrial zone shutdown', 'Trade Restrictions', 'Closure', 'Truck Driving Ban', 'Insolvency', 'Environmental Regulations', 'Postal Disruption', 'Travel Warning'], 'Human Error': ['Workplace Accident', 'Individuals in Focus', 'Military Operations', 'Flight Delays', 'Cancellations', 'Political Info', 'Political Event'], 'Cyber Attack': ['Network Disruption', 'Ransomware', 'Data breach', 'Phishing'], 'Terrorism': ['Bombing', 'Warehouse Theft', 'Public Safety', 'Security', 'Organized Crime', 'Piracy', 'Kidnap', 'Shooting', 'Robbery', 'Cargo theft', 'Bomb Detonation', 'Terror Attack', 'Outbreak Of War', 'Militant Action'], 'Accident': ['Hazmat Response', 'Maritime Accident', 'Vehicle Accident', 'Death', 'Injury', 'Non-industrial Fire', 'Chemical Spill', 'Industrial Fire', 'Fuel Disruption', 'Airline Incident', 'Crash', 'Explosion', 'Train Accident', 'Derailment', 'Sewage Disruption', 'Barge Accident', 'Bridge Collapse', 'Structure Collapse', 'Airport Accident', 'Force Majeure', 'Telecom Outage'], 'Others': ['Miscellaneous Events', 'Miscellaneous Strikes', 'Outbreak of disease']}\n\n2. Contextual Analysis:\nConsider the broader context of the input. If an input could potentially fit into multiple categories, select the one that most closely aligns with its primary intent or focus.\n3. Handling Ambiguity:\nFor ambiguous inputs or those that do not clearly align with any category, choose the category that most closely matches the content provided.\n4. Ensure Accuracy and Consistency:\nStrive for consistent and accurate classifications. Avoid arbitrary or random assignments.\n5. Provide Feedback:\nIf the input cannot be classified into any of the given categories, classify it as “Others.”\n\nInstructions for Output:\n1. Once the category is identified, provide “specific tags” by selecting from the list corresponding to the identified category, as defined in the JSON.\n2. Ensure the selected “specific tags” accurately reflect the details and context of the input.\n\nOutput Format:\n\nReturn your classification in the following JSON format:\n\n{\n  \"category\": \"<Selected Category>\",\n  \"specific_tags\": [\"<Selected Tag 1>\", \"<Selected Tag 2>\", ...]\n}\n\n\n\nExample Inputs and Outputs:\n\n- Input:\n\nLocal sources reported that operations at Pier 1 and 2 container terminals at the Port of Durban have suspended due to strong winds on December 27 from 18:50 (local time) and resumed at 23:10 on the same day. For Pier 2 terminal, operations stopped at 19:30 and resumed at 20:35 respectively.\n\n- Output:\n\n{\n  \"category\": \"Weather\",\n  \"specific_tags\": [\"Severe Winds\"]\n}\n\n- Input:\n\nInformation received states that emergency personnel are working to contain a blaze at Off Road Warehouse in commercial San Diego, on 17 November. It is detailed that the store is located at 7915 Balboa Avenue. Traffic maps show that Balboa Avenue is closed both ways between Mercury Street and Convoy Street. Travelers should use caution in the area and divert away from any encountered fire suppression operations.\n\n- Output:\n\n{\n  \"category\": \"Administrative Issue\",\n  \"specific_tags\": [\"Roadway Closure\", \"Public Safety Advisory\"]\n}\n\n- Input:\n\nProtests against climate change are anticipated nationwide on 29 November and 6 December as part of the ‘Fridays for Future’ global climate strike. Specific details of planned events have not been confirmed, but are likely to occur in major cities across the country. Previous climate strikes have seen large turnout in cities such as New York City, Philadelphia, and Washington, D.C.\n\n- Output:\n\n{\n  \"category\": \"Worker Strike\",\n  \"specific_tags\": [\"Protest\", \"Civil Unrest Advisory\"]\n}\n\n- Input:\n\nGovernment sources reported a fire at the Woolwich Dockyard, located near Belson Rd and Borgard Rd. No injuries were immediately reported. All rail lines from London towards Slade Green are running again. This incident is closed.\n\n- Output:\n\n{\n  \"category\": \"Accident\",\n  \"specific_tags\": [\"Non-industrial Fire\"]\n}\n\n- Input:\n\nLocal media sources indicated on November 30 that the Ekurhuleni Central Crime Intelligence Unit arrested 4 suspects and recovered computer printer equipment cargo from their November 21 truck theft at the corner of Main Reef Road and Ulysses Street in Cleveland. The truck was en route from Durban to Johannesburg when it was hijacked in Randfontein. The cargo was worth ZAR 5 million (EUR 309018.21; USD 352673.95), and some laptops are still missing. Distributors should be mindful of cargo theft risks in Randfontein and should plan accordingly.\n\n- Output:\n\n{\n  \"category\": \"Terrorism\",\n  \"specific_tags\": [\"Cargo Theft\", \"Organized Crime\"]\n}\n\n- Input:\n\nAnonymous sources have reported that a ransomware attack has disrupted network operations for a major logistics provider. The attack occurred on November 15, and data breaches were confirmed, exposing sensitive customer and shipment details. The company has stated that recovery is underway but advised customers to expect delays.\n\n- Output:\n\n{\n  \"category\": \"Cyber Attack\",\n  \"specific_tags\": [\"Ransomware\", \"Data Breach\"]\n}\n\n- Input:\n\nThe Selangor Health Department reported that two students of a Secondary School in Pandamaran Jaya in Port Klang had been infected with COVID-19 virus.\n\n- Output:\n\n{\n  \"category\": \"Others\",\n  \"specific_tags\": [\"Outbreak of Disease\"]\n}\n\n- Input:\n\nAn incident of workplace negligence was reported at a construction site in downtown Chicago on November 19, where an unfastened scaffolding collapsed, injuring two workers. Investigations are ongoing to determine accountability.\n\n- Output:\n\n{\n  \"category\": \"Human Error\",\n  \"specific_tags\": [\"Workplace Accident\"]\n}\n\n- Input:\n\nShipping delays were reported at the Port of Los Angeles on December 1 due to a customs system outage. Containers requiring clearance were delayed for up to 12 hours, affecting supply chains across the region.\n\n- Output:\n\n{\n  \"category\": \"Administrative Issue\",\n  \"specific_tags\": [\"Customs Delay\", \"Port Disruption\"]\n}\n\n- Input:\n\nRussian media sources are reporting that courts, schools, and hospitals across Saint Petersburg have been evacuated today due to anonymous threats. It is understood that people have been evacuated from Petrodvorets, Oktyabrsky, Kolpinsky, Petrogradsky, Kuibyshevsky, and Sestroretsky district courts. Furthermore, the State University of the Sea and River Fleet, St. Petersburg State University of Railway Engineering, Higher School of Folk Arts, St. Petersburg State University of Telecommunications, and S.M. Military Medical Academy Kirov have all been evacuated. This is the fourth consecutive week of evacuations from public buildings due to such threats. It is not known when the situation will normalize.\n\n- Output:\n\n{\n  \"category\": \"Terrorism\",\n  \"specific_tags\": [\"Bomb Threat\", \"Public Safety\"]\n}\n\n\nHuman: - Input:\n\nCourts, schools and hospitals evacuated across Saint Petersberg due to anonymous threats Russian media sources are reporting that courts, schools, and hospitals across Saint Petersberg have been evacuated today due to anonymous threats. It is understood that people have been evacuated from Petrodvorets, Oktyabrsky, Kolpinsky, Petrogradsky, Kuibyshevsky and Sestroretsky district courts. Furthermore, the State University of the Sea and River Fleet, St. Petersburg State University of Railway Engineering, Higher School of Folk Arts, St. Petersburg State University of Telecommunications, and S.M. Military Medical Academy Kirov have all been evacuated. This is the fourth consecutive week of evacuations from public buildings due to such threats. It is not known when the situation will normalise.\n\n- Output:"
)

System: Task: Classify Inputs into Predefined Categories

Your primary objective is to analyze the given input and assign it to one of the predefined categories: ['Weather', 'Worker Strike', 'Administrative Issue', 'Human Error', 'Cyber Attack', 'Terrorism', 'Accident', 'Others']. Evaluate the content carefully and use the defining characteristics of each category to ensure an accurate classification.

Guidelines:
1. Understand the Categories:
Each category has specific attributes that distinguish it. Familiarize yourself with these attributes by referring to the category descriptions provided in the JSON below. Use these details to guide your classification:


2. Contextual Analysis:
Consider the broader context of the input. If an input could potentially fit into multiple categories, select the one that most closely aligns with its primary intent or focus.
3. Handling Ambiguity:
For ambiguous inputs or those that do not clearly align with any category, choose the category that most c

In [11]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:7b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: qwen2.5:7b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:09<00:00,  3.11it/s]


*** Execution time for num_shots 0: 369.13 seconds
qwen2.5:7b/shots-00(0.322) metrics: {'f1': 0.8914163706983638, 'accuracy': 0.8875326939843069}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:18<00:00,  3.60it/s]


*** Execution time for num_shots 1: 318.95 seconds
qwen2.5:7b/shots-01(0.278) metrics: {'f1': 0.9000324085944773, 'accuracy': 0.8814298169136879}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:34<00:00,  3.43it/s]


*** Execution time for num_shots 2: 334.84 seconds
qwen2.5:7b/shots-02(0.292) metrics: {'f1': 0.9111955024756718, 'accuracy': 0.9032258064516129}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:42<00:00,  3.35it/s]


*** Execution time for num_shots 4: 342.61 seconds
qwen2.5:7b/shots-04(0.299) metrics: {'f1': 0.896465276359428, 'accuracy': 0.8866608544027899}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:43<00:00,  3.34it/s]

*** Execution time for num_shots 8: 343.35 seconds
qwen2.5:7b/shots-08(0.299) metrics: {'f1': 0.9023657593437988, 'accuracy': 0.8875326939843069}
CPU times: user 3min 31s, sys: 3.57 s, total: 3min 35s
Wall time: 28min 30s





In [13]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:7b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: qwen2.5:7b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:04<00:00,  3.14it/s]

*** Execution time for num_shots 10: 364.95 seconds
qwen2.5:7b/shots-10(0.318) metrics: {'f1': 0.9134490287756456, 'accuracy': 0.9032258064516129}
CPU times: user 38.9 s, sys: 713 ms, total: 39.6 s
Wall time: 6min 6s





In [12]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:14b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: qwen2.5:14b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:00<00:00,  1.91it/s]


*** Execution time for num_shots 0: 600.69 seconds
qwen2.5:14b/shots-00(0.524) metrics: {'f1': 0.8775560935306219, 'accuracy': 0.8753269398430689}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [08:41<00:00,  2.20it/s]


*** Execution time for num_shots 1: 521.51 seconds
qwen2.5:14b/shots-01(0.455) metrics: {'f1': 0.9328390837102657, 'accuracy': 0.927637314734089}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [08:37<00:00,  2.22it/s]


*** Execution time for num_shots 2: 517.08 seconds
qwen2.5:14b/shots-02(0.451) metrics: {'f1': 0.9266889485377222, 'accuracy': 0.9215344376634699}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [08:46<00:00,  2.18it/s]


*** Execution time for num_shots 4: 526.59 seconds
qwen2.5:14b/shots-04(0.459) metrics: {'f1': 0.9092368934605444, 'accuracy': 0.902353966870096}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [09:07<00:00,  2.10it/s]

*** Execution time for num_shots 8: 547.29 seconds
qwen2.5:14b/shots-08(0.477) metrics: {'f1': 0.9187035432915808, 'accuracy': 0.9136878814298169}
CPU times: user 3min 38s, sys: 3.89 s, total: 3min 42s
Wall time: 45min 15s





In [14]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:14b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: qwen2.5:14b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:02<00:00,  1.90it/s]


*** Execution time for num_shots 10: 602.82 seconds
qwen2.5:14b/shots-10(0.526) metrics: {'f1': 0.9249566623090326, 'accuracy': 0.918918918918919}
CPU times: user 40.3 s, sys: 727 ms, total: 41 s
Wall time: 10min 4s


In [13]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:32b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

loading /home/inflaton/code/maritime-incidents-ai-agents/llm_toolkit/data_utils.py
Evaluating model: qwen2.5:32b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [19:26<00:00,  1.02s/it] 


*** Execution time for num_shots 0: 1166.07 seconds
qwen2.5:32b/shots-00(1.017) metrics: {'f1': 0.9138651364813484, 'accuracy': 0.9110723626852659}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [16:52<00:00,  1.13it/s]


*** Execution time for num_shots 1: 1012.11 seconds
qwen2.5:32b/shots-01(0.882) metrics: {'f1': 0.9420573291137532, 'accuracy': 0.937227550130776}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [17:20<00:00,  1.10it/s]


*** Execution time for num_shots 2: 1040.37 seconds
qwen2.5:32b/shots-02(0.907) metrics: {'f1': 0.9382118835784461, 'accuracy': 0.9363557105492589}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [16:41<00:00,  1.15it/s]


*** Execution time for num_shots 4: 1001.71 seconds
qwen2.5:32b/shots-04(0.873) metrics: {'f1': 0.9385299403440788, 'accuracy': 0.9354838709677419}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [16:40<00:00,  1.15it/s]


*** Execution time for num_shots 8: 1000.47 seconds
qwen2.5:32b/shots-08(0.872) metrics: {'f1': 0.944298125306216, 'accuracy': 0.941586748038361}
CPU times: user 3min 43s, sys: 4.39 s, total: 3min 47s
Wall time: 1h 27min 2s


In [15]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:32b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: qwen2.5:32b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [19:44<00:00,  1.03s/it] 

*** Execution time for num_shots 10: 1184.91 seconds
qwen2.5:32b/shots-10(1.033) metrics: {'f1': 0.9415531738983712, 'accuracy': 0.9380993897122929}
CPU times: user 42 s, sys: 646 ms, total: 42.6 s
Wall time: 19min 46s





In [14]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:72b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8],
)

Evaluating model: qwen2.5:72b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [5:25:55<00:00, 17.05s/it] 


*** Execution time for num_shots 0: 19555.41 seconds
qwen2.5:72b/shots-00(17.049) metrics: {'f1': 0.9206032343145244, 'accuracy': 0.9110723626852659}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:44:04<00:00, 14.86s/it] 


*** Execution time for num_shots 1: 17044.93 seconds
qwen2.5:72b/shots-01(14.860) metrics: {'f1': 0.9350378402871492, 'accuracy': 0.9224062772449869}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [5:01:07<00:00, 15.75s/it] 


*** Execution time for num_shots 2: 18067.83 seconds
qwen2.5:72b/shots-02(15.752) metrics: {'f1': 0.9374460250802068, 'accuracy': 0.9293809938971229}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [5:00:32<00:00, 15.72s/it] 


*** Execution time for num_shots 4: 18032.29 seconds
qwen2.5:72b/shots-04(15.721) metrics: {'f1': 0.9426519730397988, 'accuracy': 0.9389712292938099}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [5:26:21<00:00, 17.07s/it] 


*** Execution time for num_shots 8: 19581.29 seconds
qwen2.5:72b/shots-08(17.072) metrics: {'f1': 0.9442709282880565, 'accuracy': 0.939843068875327}
CPU times: user 3min 38s, sys: 4.07 s, total: 3min 42s
Wall time: 1d 1h 38min 4s


In [16]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:72b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],
)

Evaluating model: qwen2.5:72b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [5:39:43<00:00, 17.77s/it] 


*** Execution time for num_shots 10: 20383.07 seconds
qwen2.5:72b/shots-10(17.771) metrics: {'f1': 0.9376545841356065, 'accuracy': 0.9319965126416739}
CPU times: user 45.1 s, sys: 1.15 s, total: 46.3 s
Wall time: 5h 39min 45s


In [4]:
!./scripts/ollama-pull-fp16.sh

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ 

In [None]:
all_models = [
    "qwen2.5:0.5b-instruct-fp16",
    "llama3.2:1b-instruct-fp16",
    "qwen2.5:1.5b-instruct-fp16",
    "llama3.2:3b-instruct-fp16",
    "qwen2.5:3b-instruct-fp16",
    "qwen2.5:7b-instruct-fp16",
    "llama3.1:8b-instruct-fp16",
    "llama3.2-vision:11b-instruct-fp16",
    "qwen2.5:14b-instruct-fp16",
]

for model in all_models:
    evaluate_model_with_num_shots(
        model,
        data_path,
        results_path=results_path,
        ollama=True,
        range_num_shots=[0, 1, 2, 4, 8, 10],
    )

Evaluating model: qwen2.5:0.5b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:04<00:00,  6.21it/s]


*** Execution time for num_shots 0: 184.77 seconds
qwen2.5:0.5b-instruct-fp16/shots-00(0.161) metrics: {'f1': 0.3784368384137439, 'accuracy': 0.3147340889276373}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [02:44<00:00,  6.96it/s]


*** Execution time for num_shots 1: 164.83 seconds
qwen2.5:0.5b-instruct-fp16/shots-01(0.144) metrics: {'f1': 0.3908166383485488, 'accuracy': 0.36617262423714037}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [02:50<00:00,  6.72it/s]


*** Execution time for num_shots 2: 170.59 seconds
qwen2.5:0.5b-instruct-fp16/shots-02(0.149) metrics: {'f1': 0.43601987300428463, 'accuracy': 0.4010462074978204}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [02:46<00:00,  6.90it/s]


*** Execution time for num_shots 4: 166.16 seconds
qwen2.5:0.5b-instruct-fp16/shots-04(0.145) metrics: {'f1': 0.4088610120655872, 'accuracy': 0.4115082824760244}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [02:51<00:00,  6.70it/s]


*** Execution time for num_shots 8: 171.14 seconds
qwen2.5:0.5b-instruct-fp16/shots-08(0.149) metrics: {'f1': 0.46714200514510845, 'accuracy': 0.48299912816041846}
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [02:53<00:00,  6.61it/s]


*** Execution time for num_shots 10: 173.44 seconds
qwen2.5:0.5b-instruct-fp16/shots-10(0.151) metrics: {'f1': 0.4652496267245179, 'accuracy': 0.4742807323452485}
Evaluating model: llama3.2:1b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Deta

100%|██████████| 1147/1147 [05:04<00:00,  3.77it/s]


*** Execution time for num_shots 0: 304.29 seconds
llama3.2:1b-instruct-fp16/shots-00(0.265) metrics: {'f1': 0.6350365677771576, 'accuracy': 0.5823888404533566}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:43<00:00,  2.84it/s]


*** Execution time for num_shots 1: 403.92 seconds
llama3.2:1b-instruct-fp16/shots-01(0.352) metrics: {'f1': 0.6014706784726838, 'accuracy': 0.6068003487358327}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [07:09<00:00,  2.67it/s]


*** Execution time for num_shots 2: 429.90 seconds
llama3.2:1b-instruct-fp16/shots-02(0.375) metrics: {'f1': 0.6250765501595553, 'accuracy': 0.6111595466434177}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [07:00<00:00,  2.73it/s]


*** Execution time for num_shots 4: 420.49 seconds
llama3.2:1b-instruct-fp16/shots-04(0.367) metrics: {'f1': 0.5984334574171082, 'accuracy': 0.5850043591979076}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:55<00:00,  2.76it/s]


*** Execution time for num_shots 8: 415.32 seconds
llama3.2:1b-instruct-fp16/shots-08(0.362) metrics: {'f1': 0.5202776199221165, 'accuracy': 0.5387968613775065}
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:48<00:00,  2.81it/s]


*** Execution time for num_shots 10: 408.11 seconds
llama3.2:1b-instruct-fp16/shots-10(0.356) metrics: {'f1': 0.61010144894781, 'accuracy': 0.6251089799476897}
Evaluating model: qwen2.5:1.5b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Detail

100%|██████████| 1147/1147 [04:12<00:00,  4.53it/s]


*** Execution time for num_shots 0: 253.00 seconds
qwen2.5:1.5b-instruct-fp16/shots-00(0.221) metrics: {'f1': 0.6450955098364489, 'accuracy': 0.5928509154315605}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:59<00:00,  4.79it/s]


*** Execution time for num_shots 1: 239.51 seconds
qwen2.5:1.5b-instruct-fp16/shots-01(0.209) metrics: {'f1': 0.8099975553031544, 'accuracy': 0.7663469921534438}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:01<00:00,  4.75it/s]


*** Execution time for num_shots 2: 241.58 seconds
qwen2.5:1.5b-instruct-fp16/shots-02(0.211) metrics: {'f1': 0.7673275420699172, 'accuracy': 0.7306015693112468}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:56<00:00,  4.85it/s]


*** Execution time for num_shots 4: 236.42 seconds
qwen2.5:1.5b-instruct-fp16/shots-04(0.206) metrics: {'f1': 0.7545788865283544, 'accuracy': 0.7131647776809067}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:01<00:00,  4.75it/s]


*** Execution time for num_shots 8: 241.33 seconds
qwen2.5:1.5b-instruct-fp16/shots-08(0.210) metrics: {'f1': 0.7148453176725256, 'accuracy': 0.6765475152571927}
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:05<00:00,  4.68it/s]


*** Execution time for num_shots 10: 245.17 seconds
qwen2.5:1.5b-instruct-fp16/shots-10(0.214) metrics: {'f1': 0.7486284539185515, 'accuracy': 0.7088055797733217}
Evaluating model: llama3.2:3b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Deta

100%|██████████| 1147/1147 [07:51<00:00,  2.43it/s]


*** Execution time for num_shots 0: 471.98 seconds
category not in json: {}
llama3.2:3b-instruct-fp16/shots-00(0.411) metrics: {'f1': 0.6239756200301722, 'accuracy': 0.6425457715780296}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:25<00:00,  2.97it/s]


*** Execution time for num_shots 1: 385.61 seconds
llama3.2:3b-instruct-fp16/shots-01(0.336) metrics: {'f1': 0.7776471870622037, 'accuracy': 0.7715780296425457}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:13<00:00,  3.07it/s]


*** Execution time for num_shots 2: 373.06 seconds
llama3.2:3b-instruct-fp16/shots-02(0.325) metrics: {'f1': 0.7877024345784488, 'accuracy': 0.7776809067131648}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:19<00:00,  3.02it/s]


*** Execution time for num_shots 4: 379.64 seconds
llama3.2:3b-instruct-fp16/shots-04(0.331) metrics: {'f1': 0.7763100865022129, 'accuracy': 0.7663469921534438}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:13<00:00,  3.07it/s]


*** Execution time for num_shots 8: 373.65 seconds
llama3.2:3b-instruct-fp16/shots-08(0.326) metrics: {'f1': 0.8222628761915279, 'accuracy': 0.8169136878814298}
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:13<00:00,  3.07it/s]


*** Execution time for num_shots 10: 373.61 seconds
llama3.2:3b-instruct-fp16/shots-10(0.326) metrics: {'f1': 0.8196191979883316, 'accuracy': 0.8160418482999128}
Evaluating model: qwen2.5:3b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Detail

100%|██████████| 1147/1147 [06:59<00:00,  2.73it/s]


*** Execution time for num_shots 0: 419.46 seconds
qwen2.5:3b-instruct-fp16/shots-00(0.366) metrics: {'f1': 0.7514006750577407, 'accuracy': 0.7497820401046208}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:52<00:00,  3.25it/s]


*** Execution time for num_shots 1: 352.45 seconds
qwen2.5:3b-instruct-fp16/shots-01(0.307) metrics: {'f1': 0.8883423151735911, 'accuracy': 0.8735832606800349}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:55<00:00,  3.22it/s]


*** Execution time for num_shots 2: 355.85 seconds
qwen2.5:3b-instruct-fp16/shots-02(0.310) metrics: {'f1': 0.8761850843323824, 'accuracy': 0.8561464690496948}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:57<00:00,  3.21it/s]


*** Execution time for num_shots 4: 357.08 seconds
qwen2.5:3b-instruct-fp16/shots-04(0.311) metrics: {'f1': 0.8811884288867157, 'accuracy': 0.8631211857018308}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:00<00:00,  3.18it/s]


*** Execution time for num_shots 8: 360.72 seconds
qwen2.5:3b-instruct-fp16/shots-08(0.314) metrics: {'f1': 0.8915492706959436, 'accuracy': 0.8735832606800349}
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [06:06<00:00,  3.13it/s]


*** Execution time for num_shots 10: 366.46 seconds
qwen2.5:3b-instruct-fp16/shots-10(0.319) metrics: {'f1': 0.9025388694132519, 'accuracy': 0.8875326939843069}
Evaluating model: qwen2.5:7b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details

100%|██████████| 1147/1147 [12:49<00:00,  1.49it/s] 


*** Execution time for num_shots 0: 769.83 seconds
qwen2.5:7b-instruct-fp16/shots-00(0.671) metrics: {'f1': 0.9110521307374398, 'accuracy': 0.9075850043591979}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [09:41<00:00,  1.97it/s]


*** Execution time for num_shots 1: 581.87 seconds
qwen2.5:7b-instruct-fp16/shots-01(0.507) metrics: {'f1': 0.9175814467223833, 'accuracy': 0.9040976460331299}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [09:52<00:00,  1.94it/s]


*** Execution time for num_shots 2: 592.51 seconds
qwen2.5:7b-instruct-fp16/shots-02(0.517) metrics: {'f1': 0.9299232251052649, 'accuracy': 0.9241499564080209}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [09:56<00:00,  1.92it/s]


*** Execution time for num_shots 4: 596.19 seconds
qwen2.5:7b-instruct-fp16/shots-04(0.520) metrics: {'f1': 0.9147275679252457, 'accuracy': 0.9058413251961639}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [09:50<00:00,  1.94it/s]


*** Execution time for num_shots 8: 590.17 seconds
qwen2.5:7b-instruct-fp16/shots-08(0.515) metrics: {'f1': 0.9172825376681377, 'accuracy': 0.9084568439407149}
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [09:58<00:00,  1.92it/s]


*** Execution time for num_shots 10: 598.42 seconds
qwen2.5:7b-instruct-fp16/shots-10(0.522) metrics: {'f1': 0.9281184960793053, 'accuracy': 0.9241499564080209}
Evaluating model: llama3.1:8b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Detail

100%|██████████| 1147/1147 [14:11<00:00,  1.35it/s] 


*** Execution time for num_shots 0: 851.23 seconds
llama3.1:8b-instruct-fp16/shots-00(0.742) metrics: {'f1': 0.7879278800093833, 'accuracy': 0.7933740191804708}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:06<00:00,  1.89it/s]


*** Execution time for num_shots 1: 607.00 seconds
llama3.1:8b-instruct-fp16/shots-01(0.529) metrics: {'f1': 0.9222101699108429, 'accuracy': 0.918918918918919}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:28<00:00,  1.82it/s]


*** Execution time for num_shots 2: 628.75 seconds
llama3.1:8b-instruct-fp16/shots-02(0.548) metrics: {'f1': 0.8979327889547895, 'accuracy': 0.8979947689625108}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:28<00:00,  1.82it/s]


*** Execution time for num_shots 4: 628.75 seconds
llama3.1:8b-instruct-fp16/shots-04(0.548) metrics: {'f1': 0.9387107317408934, 'accuracy': 0.9380993897122929}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:26<00:00,  1.83it/s]


*** Execution time for num_shots 8: 626.03 seconds
llama3.1:8b-instruct-fp16/shots-08(0.546) metrics: {'f1': 0.9368861455984167, 'accuracy': 0.9354838709677419}
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:29<00:00,  1.82it/s]


*** Execution time for num_shots 10: 629.99 seconds
llama3.1:8b-instruct-fp16/shots-10(0.549) metrics: {'f1': 0.9414668936152565, 'accuracy': 0.9407149084568439}
Evaluating model: llama3.2-vision:11b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headli

100%|██████████| 1147/1147 [1:46:30<00:00,  5.57s/it] 


*** Execution time for num_shots 0: 6390.45 seconds
llama3.2-vision:11b-instruct-fp16/shots-00(5.571) metrics: {'f1': 0.7866116241574936, 'accuracy': 0.7916303400174368}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [1:23:14<00:00,  4.35s/it]


*** Execution time for num_shots 1: 4994.86 seconds
llama3.2-vision:11b-instruct-fp16/shots-01(4.355) metrics: {'f1': 0.9227085958202175, 'accuracy': 0.9197907585004359}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [1:25:33<00:00,  4.48s/it]


*** Execution time for num_shots 2: 5133.51 seconds
llama3.2-vision:11b-instruct-fp16/shots-02(4.476) metrics: {'f1': 0.8902588452495396, 'accuracy': 0.8901482127288579}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [1:25:45<00:00,  4.49s/it]


*** Execution time for num_shots 4: 5146.00 seconds
llama3.2-vision:11b-instruct-fp16/shots-04(4.486) metrics: {'f1': 0.932990256760145, 'accuracy': 0.932868352223191}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [1:25:57<00:00,  4.50s/it]


*** Execution time for num_shots 8: 5157.03 seconds
llama3.2-vision:11b-instruct-fp16/shots-08(4.496) metrics: {'f1': 0.9232233831620001, 'accuracy': 0.9215344376634699}
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [1:25:22<00:00,  4.47s/it]


*** Execution time for num_shots 10: 5122.56 seconds
llama3.2-vision:11b-instruct-fp16/shots-10(4.466) metrics: {'f1': 0.9444460669324957, 'accuracy': 0.944202266782912}
Evaluating model: qwen2.5:14b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headli

  6%|▋         | 73/1147 [20:09<5:04:12, 16.99s/it] 

In [None]:
evaluate_model_with_num_shots(
    "qwen2.5:14b-instruct-fp16",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

Evaluating model: qwen2.5:14b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:26:00<00:00, 13.91s/it] 


*** Execution time for num_shots 0: 15960.26 seconds
qwen2.5:14b-instruct-fp16/shots-00(13.915) metrics: {'f1': 0.882887080820649, 'accuracy': 0.8814298169136879}
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:11:17<00:00, 13.14s/it] 


*** Execution time for num_shots 1: 15077.31 seconds
qwen2.5:14b-instruct-fp16/shots-01(13.145) metrics: {'f1': 0.9419874181269471, 'accuracy': 0.934612031386225}
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:08:25<00:00, 12.99s/it] 


*** Execution time for num_shots 2: 14905.26 seconds
qwen2.5:14b-instruct-fp16/shots-02(12.995) metrics: {'f1': 0.9338108116887441, 'accuracy': 0.9267654751525719}
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:11:51<00:00, 13.18s/it] 


*** Execution time for num_shots 4: 15111.99 seconds
qwen2.5:14b-instruct-fp16/shots-04(13.175) metrics: {'f1': 0.9231431628136756, 'accuracy': 0.9145597210113339}
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


 53%|█████▎    | 610/1147 [2:17:08<2:40:36, 17.95s/it]

In [5]:
evaluate_model_with_num_shots(
    "qwen2.5:14b-instruct-fp16",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[8, 10],
)

Evaluating model: qwen2.5:14b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:08:08<00:00, 12.98s/it] 


*** Execution time for num_shots 8: 14888.74 seconds
qwen2.5:14b-instruct-fp16/shots-08(12.981) metrics: {'f1': 0.9283390924894633, 'accuracy': 0.9250217959895379}
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:09:57<00:00, 13.08s/it] 


*** Execution time for num_shots 10: 14997.22 seconds
qwen2.5:14b-instruct-fp16/shots-10(13.075) metrics: {'f1': 0.9414869418415307, 'accuracy': 0.9354838709677419}


## System Info

In [6]:
!pip install -q psutil

In [7]:
import psutil

# Get the system memory information
memory_info = psutil.virtual_memory()

# Print the total, available, and used memory
print(f"Total memory: {memory_info.total / (1024 ** 3):.2f} GB")
print(f"Available memory: {memory_info.available / (1024 ** 3):.2f} GB")
print(f"Used memory: {memory_info.used / (1024 ** 3):.2f} GB")
print(f"Memory percentage: {memory_info.percent}%")

Total memory: 47.04 GB
Available memory: 43.00 GB
Used memory: 3.03 GB
Memory percentage: 8.6%


In [8]:
import platform


def get_os_info():
    os_info = {
        "System": platform.system(),
        "Node Name": platform.node(),
        "Release": platform.release(),
        "Version": platform.version(),
        "Machine": platform.machine(),
        "Processor": platform.processor(),
    }
    return os_info


os_info = get_os_info()
for key, value in os_info.items():
    print(f"{key}: {value}")

System: Linux
Node Name: Gen-AI
Release: 5.15.133.1-microsoft-standard-WSL2
Version: #1 SMP Thu Oct 5 21:02:42 UTC 2023
Machine: x86_64
Processor: x86_64


In [9]:
# printe current date & time
from datetime import datetime

now = datetime.now()
print("Current date/time:", now)

Current date/time: 2024-12-21 01:31:48.402926
