## Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Make the parent of the notebook directory the working directory and put it on
# sys.path, then load environment variables from a dotenv file.
import os
import sys
from pathlib import Path

# The globals() guard keeps the value computed on the first run, so re-running
# this cell (after the chdir below) does not walk up one more directory level.
# NOTE(review): the name is spelled "workding_dir"; it is left as-is because the
# guard suggests it may also be set from outside this cell — confirm before renaming.
if "workding_dir" not in globals():
    workding_dir = str(Path.cwd().parent)

os.chdir(workding_dir)
# Appended on every run of the cell, so repeated runs add duplicate entries.
sys.path.append(workding_dir)
print("workding dir:", workding_dir)

from dotenv import find_dotenv, load_dotenv

found_dotenv = find_dotenv(".env")

# No ".env" found: fall back to ".env.example".
if len(found_dotenv) == 0:
    found_dotenv = find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
# override=True lets values from the file replace variables already set in the environment.
load_dotenv(found_dotenv, override=True)

workding dir: /Users/inflaton/code/papers/maritime-incidents-ai-agents
loading env vars from: /Users/inflaton/code/papers/maritime-incidents-ai-agents/.env.example


True

In [3]:
%%time

# NOTE(review): helpers used later in this notebook without a local definition
# (print_row_details, evaluate_model_with_num_shots) presumably come from this
# star import — confirm, and prefer explicit imports once the names are known.
from llm_toolkit.eval_openai import *
from tqdm.notebook import tqdm

# Registers the progress_apply methods on pandas objects.
tqdm.pandas()

# DATA_PATH and NUM_CTX are read from the environment (populated by load_dotenv in
# the set-up cell); os.getenv returns a str, or None when the variable is unset.
data_path = os.getenv("DATA_PATH")
results_path = "paper/data/ollama_model_results_v3-M4_Max.csv"
num_ctx = os.getenv("NUM_CTX")
data_path, results_path, num_ctx

loading env vars from: /Users/inflaton/code/papers/maritime-incidents-ai-agents/.env.example
Adding /Users/inflaton/code/papers/maritime-incidents-ai-agents to sys.path
loading /Users/inflaton/code/papers/maritime-incidents-ai-agents/llm_toolkit/data_utils.py
CPU times: user 2.09 s, sys: 3.87 s, total: 5.96 s
Wall time: 1min 20s


('dataset/GMRID_v3.csv',
 'paper/data/ollama_model_results_v3-M4_Max.csv',
 '8192')

In [4]:
# run cells above before running anything below

## Creating GMRID_v3.csv

In [4]:
import pandas as pd

# Load the cleaned incident table that GMRID_v3 is built from.
df = pd.read_csv("dataset/cleaned_data.csv")
df.shape

(5780, 17)

In [5]:
# Keep only the first row for each distinct Details text (mutates df in place;
# the original index labels are kept, so the index is no longer contiguous).
df.drop_duplicates(subset=["Details"], inplace=True, keep="first")
df.shape

(5750, 17)

In [6]:
# Column dtypes and non-null counts after de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5750 entries, 0 to 5779
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  5750 non-null   int64  
 1   Headline            5750 non-null   object 
 2   Details             5750 non-null   object 
 3   Severity            5750 non-null   object 
 4   Category            5750 non-null   object 
 5   Region              5750 non-null   object 
 6   Datetime            5750 non-null   object 
 7   Year                5750 non-null   int64  
 8   lat                 3861 non-null   float64
 9   lon                 3861 non-null   float64
 10  maritime_label      5750 non-null   bool   
 11  found_ports         5748 non-null   object 
 12  contains_port_info  5748 non-null   float64
 13  if_labeled          5748 non-null   object 
 14  Month               5748 non-null   float64
 15  Week                5748 non-null   float64
 16  Headline_De

In [7]:
# Normalise column dtypes of the cleaned incident table (mutates df in place).
import ast

# convert Headline, Details to string
df["Headline"] = df["Headline"].astype(str)
df["Details"] = df["Details"].astype(str)

# convert Datetime to datetime
# NOTE(review): the recorded output shows a pandas warning for this line, and some
# parsed values look day/month-swapped (a report dated "December 7" in its text is
# parsed as 2020-07-12). Confirm the source format and pass format=/dayfirst= explicitly.
df["Datetime"] = pd.to_datetime(df["Datetime"])

# convert maritime_label, contains_port_info to bool
# NOTE(review): astype(bool) turns NaN into True, so rows with a missing
# contains_port_info end up True — confirm that this is intended.
df["maritime_label"] = df["maritime_label"].astype(bool)
df["contains_port_info"] = df["contains_port_info"].astype(bool)

# convert found_ports to list
# ast.literal_eval only accepts Python literals, so text from the CSV is parsed
# without being executed as code (unlike eval). Non-string cells (NaN) become [].
df["found_ports"] = df["found_ports"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else []
)

# convert id to int
df["id"] = df["id"].astype(int)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5750 entries, 0 to 5779
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   id                  5750 non-null   int64         
 1   Headline            5750 non-null   object        
 2   Details             5750 non-null   object        
 3   Severity            5750 non-null   object        
 4   Category            5750 non-null   object        
 5   Region              5750 non-null   object        
 6   Datetime            5750 non-null   datetime64[ns]
 7   Year                5750 non-null   int64         
 8   lat                 3861 non-null   float64       
 9   lon                 3861 non-null   float64       
 10  maritime_label      5750 non-null   bool          
 11  found_ports         5750 non-null   object        
 12  contains_port_info  5750 non-null   bool          
 13  if_labeled          5748 non-null   object        
 1

  df["Datetime"] = pd.to_datetime(df["Datetime"])


In [8]:
# Derive year, month and ISO week from Datetime so that rows whose stored
# Year / Month / Week columns disagree with the timestamp can be found.
datetime_parts = df["Datetime"].dt
df["year"] = datetime_parts.year.astype("int64")
df["month"] = datetime_parts.month.astype("int64")
df["week"] = datetime_parts.isocalendar().week.astype("int64")

In [9]:
# dtype and non-null count of the stored Year column (compared with the derived "year" below).
df["Year"].info()

<class 'pandas.core.series.Series'>
Index: 5750 entries, 0 to 5779
Series name: Year
Non-Null Count  Dtype
--------------  -----
5750 non-null   int64
dtypes: int64(1)
memory usage: 89.8 KB


In [10]:
# dtype and non-null count of the stored Month column (compared with the derived "month" below).
df["Month"].info()

<class 'pandas.core.series.Series'>
Index: 5750 entries, 0 to 5779
Series name: Month
Non-Null Count  Dtype  
--------------  -----  
5748 non-null   float64
dtypes: float64(1)
memory usage: 89.8 KB


In [11]:
# dtype and non-null count of the stored Week column (compared with the derived "week" below).
df["Week"].info()

<class 'pandas.core.series.Series'>
Index: 5750 entries, 0 to 5779
Series name: Week
Non-Null Count  Dtype  
--------------  -----  
5748 non-null   float64
dtypes: float64(1)
memory usage: 89.8 KB


In [12]:
# Series.equals requires matching dtypes as well as values, so a float64 column
# with missing values can never equal its int64 counterpart.
df["Year"].equals(df["year"]), df["Month"].equals(df["month"])

(True, False)

In [13]:
# Rows whose stored Year differs from the year derived from Datetime.
df[df["Year"] != df["year"]]

Unnamed: 0,id,Headline,Details,Severity,Category,Region,Datetime,Year,lat,lon,maritime_label,found_ports,contains_port_info,if_labeled,Month,Week,Headline_Details,year,month,week


In [14]:
# Rows whose stored Week differs from the derived ISO week. NaN never compares
# equal, so rows with a missing Week are listed as well.
df[df["Week"] != df["week"]]

Unnamed: 0,id,Headline,Details,Severity,Category,Region,Datetime,Year,lat,lon,maritime_label,found_ports,contains_port_info,if_labeled,Month,Week,Headline_Details,year,month,week
5078,5079,Daily COVID-19 roundup: Serum Institute sets p...,IHS Markit perspectiveImplicationsBahrain has ...,Minor,Regulatory Advisory,Bahrain,2020-07-12 14:44:00,2020,26.0667,50.5577,False,[],True,,,,,2020,7,28
5079,5080,Daily COVID-19 roundup: US’s COVAXX enters USD...,IHS Markit perspectiveImplicationsThe number o...,Minor,Regulatory Advisory,Argentina,2020-11-26 14:03:00,2020,52.13263,5.29127,False,[],True,,,,,2020,11,48


In [15]:
# Rows whose stored Month differs from the derived month. NaN never compares
# equal, so rows with a missing Month are listed as well.
df[df["Month"] != df["month"]]

Unnamed: 0,id,Headline,Details,Severity,Category,Region,Datetime,Year,lat,lon,maritime_label,found_ports,contains_port_info,if_labeled,Month,Week,Headline_Details,year,month,week
5078,5079,Daily COVID-19 roundup: Serum Institute sets p...,IHS Markit perspectiveImplicationsBahrain has ...,Minor,Regulatory Advisory,Bahrain,2020-07-12 14:44:00,2020,26.0667,50.5577,False,[],True,,,,,2020,7,28
5079,5080,Daily COVID-19 roundup: US’s COVAXX enters USD...,IHS Markit perspectiveImplicationsThe number o...,Minor,Regulatory Advisory,Argentina,2020-11-26 14:03:00,2020,52.13263,5.29127,False,[],True,,,,,2020,11,48


In [16]:
# Print the full text of one row.
# NOTE(review): print_row_details is not defined in this notebook (presumably it
# comes from the llm_toolkit star import), and whether 5053 is a position or an
# index label cannot be told from here — confirm against the helper.
print_row_details(df, [5053])

--------------------------------------------------
id: 5079
--------------------------------------------------
Headline: Daily COVID-19 roundup: Serum Institute sets price for Oxford COVID-19 vaccine at USD8 for private market, Bahrain grants regulatory approval to Pfizer’s COVID-19 vaccine
--------------------------------------------------
Details: IHS Markit perspectiveImplicationsBahrain has become the second country after the UK to grant emergency use authorisation for Pfizer (US) and BioNTech (Germany)’s coronavirus disease 2019 (COVID-19) vaccine. Pfizer also became the first pharmaceutical firm to seek emergency approval of a COVID-19 vaccine by the Drugs Controller General of India (DCGI); this was quickly followed by the Serum Institute of India (SII) that has also filed for emergency use authorisation in India of its domestically produced version of AstraZeneca (UK) and Oxford University’s COVID-19 vaccine, Covishield. World Health Organization (WHO) representatives have conf

In [17]:
# Load the v2 dataset (Details text plus cleaned text and labels).
df_v2 = pd.read_csv("dataset/global_incidents_v2.csv")
df_v2.shape

(5744, 4)

In [18]:
# Remove rows with a repeated Details text (in place) so Details can serve as a join key.
df_v2.drop_duplicates(subset=["Details"], inplace=True)
df_v2.shape

(5744, 4)

In [19]:
# Column dtypes and non-null counts of the v2 dataset.
df_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5744 entries, 0 to 5743
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Details           5744 non-null   object
 1   Details_cleaned   5744 non-null   object
 2   Category          5744 non-null   object
 3   Summarized_label  5744 non-null   object
dtypes: object(4)
memory usage: 179.6+ KB


In [20]:
# Inner-join the cleaned incidents (left) with the v2 labels (right) on the
# Details text; columns present in both frames get the _x / _y suffixes.
combined_df = df.merge(df_v2, how="inner", on="Details")

# Rows x columns of the joined frame
combined_df.shape

(5744, 23)

In [21]:
# Column dtypes and non-null counts of the joined frame.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5744 entries, 0 to 5743
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   id                  5744 non-null   int64         
 1   Headline            5744 non-null   object        
 2   Details             5744 non-null   object        
 3   Severity            5744 non-null   object        
 4   Category_x          5744 non-null   object        
 5   Region              5744 non-null   object        
 6   Datetime            5744 non-null   datetime64[ns]
 7   Year                5744 non-null   int64         
 8   lat                 3856 non-null   float64       
 9   lon                 3856 non-null   float64       
 10  maritime_label      5744 non-null   bool          
 11  found_ports         5744 non-null   object        
 12  contains_port_info  5744 non-null   bool          
 13  if_labeled          5742 non-null   object      

In [22]:
# dtype and non-null count of the derived "week" column in the joined frame.
combined_df["week"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 5744 entries, 0 to 5743
Series name: week
Non-Null Count  Dtype
--------------  -----
5744 non-null   int64
dtypes: int64(1)
memory usage: 45.0 KB


In [23]:
# Replace the stored calendar columns and the left-hand Category with the values
# derived from Datetime and the Category that came from df_v2. The drop must run
# before the rename, otherwise "Year"/"Month"/"Week" would exist twice.
superseded_columns = ["Category_x", "Year", "Month", "Week"]
canonical_names = {
    "Category_y": "Category",
    "year": "Year",
    "month": "Month",
    "week": "Week",
}

combined_df.drop(columns=superseded_columns, inplace=True)
combined_df.rename(columns=canonical_names, inplace=True)

combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5744 entries, 0 to 5743
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   id                  5744 non-null   int64         
 1   Headline            5744 non-null   object        
 2   Details             5744 non-null   object        
 3   Severity            5744 non-null   object        
 4   Region              5744 non-null   object        
 5   Datetime            5744 non-null   datetime64[ns]
 6   lat                 3856 non-null   float64       
 7   lon                 3856 non-null   float64       
 8   maritime_label      5744 non-null   bool          
 9   found_ports         5744 non-null   object        
 10  contains_port_info  5744 non-null   bool          
 11  if_labeled          5742 non-null   object        
 12  Headline_Details    5742 non-null   object        
 13  Year                5744 non-null   int64       

In [24]:
# Write the joined frame to the DATA_PATH file, without the index column.
combined_df.to_csv(data_path, index=False)

In [25]:
# df_v3 is a second name for the same DataFrame object as combined_df (no copy is made).
df_v3 = combined_df
df_v3

Unnamed: 0,id,Headline,Details,Severity,Region,Datetime,lat,lon,maritime_label,found_ports,contains_port_info,if_labeled,Headline_Details,Year,Month,Week,Details_cleaned,Category,Summarized_label
0,1,Grasberg Mine- Grasberg mine workers extend st...,Media sources indicate that workers at the Gra...,Moderate,Indonesia,2017-05-28 17:08:00,-4.05608,137.11302,False,[freeport],True,False,Grasberg Mine- Grasberg mine workers extend st...,2017,5,21,medium source indicate worker grasberg mine ex...,Mine Workers Strike,Worker Strike
1,2,Indonesia: Undersea internet cables damaged by...,News sources are stating that recent typhoons ...,Minor,Indonesia,2017-04-09 14:30:00,,,False,[hong kong],True,False,Indonesia: Undersea internet cables damaged by...,2017,4,14,news source stating recent typhoon impact hong...,Travel Warning,Weather
2,3,Shanghai port congestion impacts terminals in ...,The persisting port congestion at Shanghai’s Y...,Minor,China,2017-04-27 09:16:00,29.52000,121.33190,True,"[ningbo, qingdao, shanghai]",True,False,Shanghai port congestion impacts terminals in ...,2017,4,17,persisting port congestion shanghai ’ yangshan...,Port Congestion,Administrative Issue
3,4,UPDATE - Indonesia: Explosion at KP Terminal i...,Updated local media sources from Jakarta indic...,Extreme,Indonesia,2017-05-24 15:15:00,-6.22465,106.86700,True,[jakarta],True,False,UPDATE - Indonesia: Explosion at KP Terminal i...,2017,5,21,updated local medium source jakarta indicate e...,"Bombing, Police Operations",Terrorism
4,5,UPDATE - Indonesia: Police confirm two explosi...,"According to local police in Jakarta, two expl...",Extreme,Indonesia,2017-05-24 16:20:00,,,True,[jakarta],True,True,UPDATE - Indonesia: Police confirm two explosi...,2017,5,21,according local police jakarta two explosion c...,"Bombing, Police Operations",Terrorism
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5739,5776,Winter storm may bring inclement weather condi...,Intelligence received by Everstream Analytics ...,Moderate,United States,2020-12-16 02:47:00,38.90072,-77.05440,False,[new york],True,False,Winter storm may bring inclement weather condi...,2020,12,51,intelligence received everstream analytics ind...,Ice Storm,Weather
5740,5777,Winter weather expected to continue to impact ...,Meteorological sources indicate that a series ...,Minor,United States,2020-12-31 18:15:00,41.30357,-72.90561,False,"[new york, virginia]",True,True,Winter weather expected to continue to impact ...,2020,12,53,meteorological source indicate series winter s...,"Roadway Closure / Disruption, Ground Transport...",Weather
5741,5778,Workers of Svitzer Australia plan to strike on...,Industry sources report on December 7 that Svi...,Moderate,Australia,2020-07-12 06:16:00,-38.35169,145.25050,False,"[geelong, melbourne]",True,False,Workers of Svitzer Australia plan to strike on...,2020,7,28,industry source report december 7 svitzer aust...,Industrial Action,Worker Strike
5742,5779,Workers stage 24-hour stoppage at DP World Ter...,Industry sources indicate on December 14 that ...,Moderate,Australia,2020-12-14 16:52:00,,,True,[fremantle],True,True,Workers stage 24-hour stoppage at DP World Ter...,2020,12,51,industry source indicate december 14 worker dp...,Port Strike,Worker Strike


In [26]:
# Column names of the v2 dataset, used below to compare against df_v3.
df_v2.columns

Index(['Details', 'Details_cleaned', 'Category', 'Summarized_label'], dtype='object')

In [27]:
# Check that the v2 columns of df_v3 hold exactly the same values as df_v2.
df_v3[df_v2.columns].equals(df_v2)

True

In [28]:
# Print "<column>: <count>" for every column of df_v3 that has missing values.
missing_per_column = df_v3.isna().sum()
for col, nan_count in missing_per_column[missing_per_column > 0].items():
    print(f"{col}: {nan_count}")

lat: 1888
lon: 1888
if_labeled: 2
Headline_Details: 2


In [29]:
# Preview the headline together with the port, location, time and label columns.
preview_columns = [
    "Headline",
    "contains_port_info",
    "lat",
    "lon",
    "found_ports",
    "Region",
    "Datetime",
    "Summarized_label",
]
df_v3[preview_columns]

Unnamed: 0,Headline,contains_port_info,lat,lon,found_ports,Region,Datetime,Summarized_label
0,Grasberg Mine- Grasberg mine workers extend st...,True,-4.05608,137.11302,[freeport],Indonesia,2017-05-28 17:08:00,Worker Strike
1,Indonesia: Undersea internet cables damaged by...,True,,,[hong kong],Indonesia,2017-04-09 14:30:00,Weather
2,Shanghai port congestion impacts terminals in ...,True,29.52000,121.33190,"[ningbo, qingdao, shanghai]",China,2017-04-27 09:16:00,Administrative Issue
3,UPDATE - Indonesia: Explosion at KP Terminal i...,True,-6.22465,106.86700,[jakarta],Indonesia,2017-05-24 15:15:00,Terrorism
4,UPDATE - Indonesia: Police confirm two explosi...,True,,,[jakarta],Indonesia,2017-05-24 16:20:00,Terrorism
...,...,...,...,...,...,...,...,...
5739,Winter storm may bring inclement weather condi...,True,38.90072,-77.05440,[new york],United States,2020-12-16 02:47:00,Weather
5740,Winter weather expected to continue to impact ...,True,41.30357,-72.90561,"[new york, virginia]",United States,2020-12-31 18:15:00,Weather
5741,Workers of Svitzer Australia plan to strike on...,True,-38.35169,145.25050,"[geelong, melbourne]",Australia,2020-07-12 06:16:00,Worker Strike
5742,Workers stage 24-hour stoppage at DP World Ter...,True,,,[fremantle],Australia,2020-12-14 16:52:00,Worker Strike


In [30]:
# Rebuild the v2 train/test splits with the df_v3 columns and write them next to
# the DATA_PATH file as "<name>-train.csv" and "<name>-test.csv".
#
# Loop-local names are used on purpose: the loop variable no longer shadows the
# builtin `type`, and the global `combined_df` (the same object as df_v3) is no
# longer rebound to a per-split merge, so re-running the cell that writes
# combined_df to DATA_PATH cannot overwrite the full dataset with a split.
v2_dfs = {}
for split in ["train", "test"]:
    v2_split_df = pd.read_csv(f"dataset/global_incidents_v2-{split}.csv")
    v2_dfs[split] = v2_split_df

    # Inner join on the Details text; v2 rows without a match in df_v3 are dropped.
    merged = pd.merge(v2_split_df, df_v3, on="Details", how="inner")
    # Columns present in both frames are suffixed; keep the df_v3 (right-hand, _y) values.
    merged.rename(
        columns={
            "Category_y": "Category",
            "Details_cleaned_y": "Details_cleaned",
            "Summarized_label_y": "Summarized_label",
        },
        inplace=True,
    )
    # Row order follows the v2 split; column order follows df_v3.
    merged[df_v3.columns].to_csv(
        data_path.replace(".csv", f"-{split}.csv"), index=False
    )

In [31]:
# Re-read the written test split and check that its v2 columns equal the v2 test split.
df_v3_test = pd.read_csv(data_path.replace(".csv", "-test.csv"))
df_v3_test[v2_dfs["test"].columns].equals(v2_dfs["test"])

True

In [32]:
# Re-read the written train split and check that its v2 columns equal the v2 train split.
df_v3_train = pd.read_csv(data_path.replace(".csv", "-train.csv"))
df_v3_train[v2_dfs["train"].columns].equals(v2_dfs["train"])

False

In [33]:
# Sizes of the rebuilt train and test splits.
df_v3_train.shape, df_v3_test.shape

((4594, 19), (1147, 19))

In [34]:
# Sizes of the original v2 train and test splits, for comparison.
v2_dfs["train"].shape, v2_dfs["test"].shape

((4597, 4), (1147, 4))

In [35]:
# v2 train rows whose Details text does not appear in the rebuilt train split.
# The leftmost value shown for each row is its row label in the v2 train frame.
v2_dfs["train"][~v2_dfs["train"].Details.isin(df_v3_train.Details)]

Unnamed: 0,Details,Details_cleaned,Category,Summarized_label
465,Local media sources indicate on September 13 t...,local medium source indicate september 13 hurr...,Weather Advisory,Weather
1277,"In the Yellow Sea, north of the island of Taiw...",yellow sea north island taiwan people 's liber...,"Miscellaneous Events, Political Info / Event, ...",Human Error
1669,Shipping sources on April 25 indicate that adv...,shipping source april 25 indicate adverse weat...,Port Congestion,Weather


In [36]:
# v2 train rows whose Details text does not appear in the full v2 dataset either.
v2_dfs["train"][~v2_dfs["train"].Details.isin(df_v2.Details)]

Unnamed: 0,Details,Details_cleaned,Category,Summarized_label
465,Local media sources indicate on September 13 t...,local medium source indicate september 13 hurr...,Weather Advisory,Weather
1277,"In the Yellow Sea, north of the island of Taiw...",yellow sea north island taiwan people 's liber...,"Miscellaneous Events, Political Info / Event, ...",Human Error
1669,Shipping sources on April 25 indicate that adv...,shipping source april 25 indicate adverse weat...,Port Congestion,Weather


In [37]:
# Look up rows of the rebuilt train split whose id is 465, 1277 or 1669.
# NOTE(review): 465, 1277 and 1669 are the row labels of the unmatched rows in
# v2_dfs["train"], which is not the same thing as the `id` column — confirm the
# intended lookup key.
df_v3_train[df_v3_train.id.isin([465, 1277, 1669])]

Unnamed: 0,id,Headline,Details,Severity,Region,Datetime,lat,lon,maritime_label,found_ports,contains_port_info,if_labeled,Headline_Details,Year,Month,Week,Details_cleaned,Category,Summarized_label


In [38]:
# Look up rows of df_v3 whose id is 465, 1277 or 1669.
# NOTE(review): 465, 1277 and 1669 are the row labels of the unmatched rows in
# v2_dfs["train"], which is not the same thing as the `id` column — compare the
# Details text of the rows returned here with the unmatched rows before relying on them.
df_v3[df_v3.id.isin([465, 1277, 1669])]

Unnamed: 0,id,Headline,Details,Severity,Region,Datetime,lat,lon,maritime_label,found_ports,contains_port_info,if_labeled,Headline_Details,Year,Month,Week,Details_cleaned,Category,Summarized_label
462,465,Police Presence Bolstered in Zhuhai ahead of I...,"On Monday, media sources reported that police ...",Minor,China,2018-10-22 23:44:00,22.27073,113.57668,False,"[hong kong, macau, zhuhai]",True,False,Police Presence Bolstered in Zhuhai ahead of I...,2018,10,43,monday medium source reported police begun dep...,Miscellaneous Events,Administrative Issue
1267,1277,Canada: Snowfall warning in effect for Vancouv...,Environment Canada currently has a snowfall wa...,Minor,Canada,2019-02-22 15:40:00,49.28674,-123.13742,False,[vancouver],True,True,Canada: Snowfall warning in effect for Vancouv...,2019,2,8,environment canada currently snowfall warning ...,"Ground Transportation Advisory, Weather Advisory",Weather
1657,1669,Hydrofluoric acid leak triggers hazmat respons...,Updated sources indicate that the hydrofluoric...,Moderate,Australia,2019-08-05 15:02:00,-37.83597,144.79576,True,[victoria],True,True,Hydrofluoric acid leak triggers hazmat respons...,2019,8,32,updated source indicate hydrofluoric acid leak...,Hazmat Response,Accident


In [39]:
# Drop (in place) the df_v3 rows that belong to neither the train nor the test
# split, so that the full dataset is exactly the union of the two splits.
#
# Rows are selected by their Details text, the key the splits were joined on.
# Dropping id in [465, 1277, 1669] instead is wrong: those numbers are the row
# labels of the unmatched rows in v2_dfs["train"], not `id` values, so it removes
# three unrelated incidents and keeps the three rows that are in no split.
split_details = pd.concat([v2_df["Details"] for v2_df in v2_dfs.values()])
rows_outside_splits = df_v3.index[~df_v3["Details"].isin(split_details)]
df_v3.drop(rows_outside_splits, inplace=True)

In [40]:
# Check whether any rows with id 465, 1277 or 1669 remain in df_v3.
# NOTE(review): these numbers are row labels of the unmatched v2 train rows, which
# is not the same thing as the `id` column — confirm this is the right check.
df_v3[df_v3.id.isin([465, 1277, 1669])]

Unnamed: 0,id,Headline,Details,Severity,Region,Datetime,lat,lon,maritime_label,found_ports,contains_port_info,if_labeled,Headline_Details,Year,Month,Week,Details_cleaned,Category,Summarized_label


In [41]:
# Overwrite the DATA_PATH file with the reduced df_v3, without the index column.
df_v3.to_csv(data_path, index=False)

In [42]:
# Re-read the written file; from here on df_v3 holds the CSV round-tripped data.
df_v3 = pd.read_csv(data_path)
df_v3.shape

(5741, 19)

In [43]:
# Split sizes, to compare against the row count of the full dataset above.
df_v3_train.shape, df_v3_test.shape

((4594, 19), (1147, 19))

In [44]:
# Train-to-test row-count ratio.
df_v3_train.shape[0] / df_v3_test.shape[0]

4.005231037489102

In [45]:
# Compare the first 465 rows of the rebuilt train split with the first 465 rows of the v2 train split.
df_v3_train[v2_dfs["train"].columns][:465].equals(v2_dfs["train"][:465])

True

In [46]:
# Same comparison with one more row, i.e. including the v2 train row labelled 465.
df_v3_train[v2_dfs["train"].columns][:466].equals(v2_dfs["train"][:466])

False

In [47]:
# Check again that the v2 columns of the rebuilt test split equal the v2 test split.
df_v3_test[v2_dfs["test"].columns].equals(v2_dfs["test"])

True

## Debug Few-shot Prompting for JSON Outputs

In [6]:
%%time

# Debug run of the few-shot evaluation for a single model.
# NOTE(review): evaluate_model_with_num_shots is not defined in this notebook; the
# argument notes below are read off the names and the recorded output — confirm
# against llm_toolkit.eval_openai.
from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:0.5b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[10],  # recorded output: "Evaluating with num_shots: 10"
    max_entries=1,  # recorded output: "evaluating 1 entries"
    debug=True,  # recorded output: "Setting debug mode to: True"
)
# NOTE(review): the recorded output ends with "'float' object has no attribute
# 'size'" followed by "Error saving results" — check whether results are written.

Evaluating model: qwen2.5:0.5b
loading train/test data files
--- evaluating 1 entries
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1
    })
})
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


  0%|          | 0/1 [00:00<?, ?it/s]

Setting debug mode to: True
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Courts, schools and hospitals evacuated across Saint Petersberg due to anonymous threats Russian media sources are reporting that courts, schools, and hospitals across Saint Petersberg have been evacuated today due to anonymous threats. It is understood that people have been evacuated from Petrodvorets, Oktyabrsky, Kolpinsky, Petrogradsky, Kuibyshevsky and Sestroretsky district courts. Furthermore, the State University of the Sea and River Fleet, St. Petersburg State University of Railway Engineering, Higher School of Folk Arts, St. Petersburg State University of Telecommunications, and S.M. Military Medical Academy Kirov have all been evacuated. This is the fourth consecutive week of evacuations from public buildings due to such threats. It is not known when the situation will normalise."
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence >

100%|██████████| 1/1 [00:00<00:00,  3.34it/s]

[36;1m[1;3m[llm/end][0m [1m[chain:RunnableSequence > llm:ChatOllama] [261ms] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "{\n  \"category\": \"Terrorism\",\n  \"specific_tags\": [\"Bomb Threat\", \"Public Safety\"]\n}",
        "generation_info": {
          "model": "qwen2.5:0.5b",
          "created_at": "2024-12-17T15:38:39.644191Z",
          "done": true,
          "done_reason": "stop",
          "total_duration": 256028750,
          "load_duration": 34964959,
          "prompt_eval_count": 2081,
          "prompt_eval_duration": 75000000,
          "eval_count": 25,
          "eval_duration": 132000000,
          "message": {
            "role": "assistant",
            "content": "",
            "images": null,
            "tool_calls": null
          }
        },
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",





'float' object has no attribute 'size'
Error saving results:  ['{\n  "category": "Terrorism",\n  "specific_tags": ["Bomb Threat", "Public Safety"]\n}']
Dataset({
    features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
    num_rows: 1
})
CPU times: user 77 ms, sys: 17.2 ms, total: 94.2 ms
Wall time: 2.33 s


In [9]:
prompt = "System: Task: Classify Inputs into Predefined Categories\n\nYour primary objective is to analyze the given input and assign it to one of the predefined categories: ['Weather', 'Worker Strike', 'Administrative Issue', 'Human Error', 'Cyber Attack', 'Terrorism', 'Accident', 'Others']. Evaluate the content carefully and use the defining characteristics of each category to ensure an accurate classification.\n\nGuidelines:\n1. Understand the Categories:\nEach category has specific attributes that distinguish it. Familiarize yourself with these attributes by referring to the category descriptions provided in the JSON below. Use these details to guide your classification:\n\n{'Weather': ['Flooding', 'Severe Winds', 'Weather Advisory', 'Tropical Cyclone', 'Storm', 'Ice Storm', 'Earthquake', 'Tornado', 'Typhoon', 'Landslide', 'Water', 'Hurricane', 'Wildfire', 'Blizzard', 'Hail'], 'Worker Strike': ['Mine Workers Strike', 'Production Halt', 'Protest', 'Riot', 'Port Strike', 'General Strike', 'Civil Service Strike', 'Civil Unrest Advisory', 'Cargo Transportation Strike', 'Energy Sector Strike'], 'Administrative Issue': ['Port Congestion', 'Police Operations', 'Roadway Closure', 'Disruption', 'Cargo', 'Industrial Action', 'Port Disruption', 'Cargo Disruption', 'Power Outage', 'Port Closure', 'Maritime Advisory', 'Train Delays', 'Ground Transportation Advisory', 'Public Transportation Disruption', 'Trade Regulation', 'Customs Regulation', 'Regulatory Advisory', 'Industry Directives', 'Security Advisory', 'Public Holidays', 'Customs Delay', 'Public Health Advisory', 'Detention', 'Aviation Advisory', 'Waterway Closure', 'Plant Closure', 'Border Closure', 'Delay', 'Industrial zone shutdown', 'Trade Restrictions', 'Closure', 'Truck Driving Ban', 'Insolvency', 'Environmental Regulations', 'Postal Disruption', 'Travel Warning'], 'Human Error': ['Workplace Accident', 'Individuals in Focus', 'Military Operations', 'Flight Delays', 'Cancellations', 'Political Info', 'Political 
Event'], 'Cyber Attack': ['Network Disruption', 'Ransomware', 'Data breach', 'Phishing'], 'Terrorism': ['Bombing', 'Warehouse Theft', 'Public Safety', 'Security', 'Organized Crime', 'Piracy', 'Kidnap', 'Shooting', 'Robbery', 'Cargo theft', 'Bomb Detonation', 'Terror Attack', 'Outbreak Of War', 'Militant Action'], 'Accident': ['Hazmat Response', 'Maritime Accident', 'Vehicle Accident', 'Death', 'Injury', 'Non-industrial Fire', 'Chemical Spill', 'Industrial Fire', 'Fuel Disruption', 'Airline Incident', 'Crash', 'Explosion', 'Train Accident', 'Derailment', 'Sewage Disruption', 'Barge Accident', 'Bridge Collapse', 'Structure Collapse', 'Airport Accident', 'Force Majeure', 'Telecom Outage'], 'Others': ['Miscellaneous Events', 'Miscellaneous Strikes', 'Outbreak of disease']}\n\n2. Contextual Analysis:\nConsider the broader context of the input. If an input could potentially fit into multiple categories, select the one that most closely aligns with its primary intent or focus.\n3. Handling Ambiguity:\nFor ambiguous inputs or those that do not clearly align with any category, choose the category that most closely matches the content provided.\n4. Ensure Accuracy and Consistency:\nStrive for consistent and accurate classifications. Avoid arbitrary or random assignments.\n5. Provide Feedback:\nIf the input cannot be classified into any of the given categories, classify it as “Others.”\n\nInstructions for Output:\n1. Once the category is identified, provide “specific tags” by selecting from the list corresponding to the identified category, as defined in the JSON.\n2. 
Ensure the selected “specific tags” accurately reflect the details and context of the input.\n\nOutput Format:\n\nReturn your classification in the following JSON format:\n\n{\n  \"category\": \"<Selected Category>\",\n  \"specific_tags\": [\"<Selected Tag 1>\", \"<Selected Tag 2>\", ...]\n}\n\n\n\nExample Inputs and Outputs:\n\n- Input:\n\nLocal sources reported that operations at Pier 1 and 2 container terminals at the Port of Durban have suspended due to strong winds on December 27 from 18:50 (local time) and resumed at 23:10 on the same day. For Pier 2 terminal, operations stopped at 19:30 and resumed at 20:35 respectively.\n\n- Output:\n\n{\n  \"category\": \"Weather\",\n  \"specific_tags\": [\"Severe Winds\"]\n}\n\n- Input:\n\nInformation received states that emergency personnel are working to contain a blaze at Off Road Warehouse in commercial San Diego, on 17 November. It is detailed that the store is located at 7915 Balboa Avenue. Traffic maps show that Balboa Avenue is closed both ways between Mercury Street and Convoy Street. Travelers should use caution in the area and divert away from any encountered fire suppression operations.\n\n- Output:\n\n{\n  \"category\": \"Administrative Issue\",\n  \"specific_tags\": [\"Roadway Closure\", \"Public Safety Advisory\"]\n}\n\n- Input:\n\nProtests against climate change are anticipated nationwide on 29 November and 6 December as part of the ‘Fridays for Future’ global climate strike. Specific details of planned events have not been confirmed, but are likely to occur in major cities across the country. Previous climate strikes have seen large turnout in cities such as New York City, Philadelphia, and Washington, D.C.\n\n- Output:\n\n{\n  \"category\": \"Worker Strike\",\n  \"specific_tags\": [\"Protest\", \"Civil Unrest Advisory\"]\n}\n\n- Input:\n\nGovernment sources reported a fire at the Woolwich Dockyard, located near Belson Rd and Borgard Rd. No injuries were immediately reported. 
All rail lines from London towards Slade Green are running again. This incident is closed.\n\n- Output:\n\n{\n  \"category\": \"Accident\",\n  \"specific_tags\": [\"Non-industrial Fire\"]\n}\n\n- Input:\n\nLocal media sources indicated on November 30 that the Ekurhuleni Central Crime Intelligence Unit arrested 4 suspects and recovered computer printer equipment cargo from their November 21 truck theft at the corner of Main Reef Road and Ulysses Street in Cleveland. The truck was en route from Durban to Johannesburg when it was hijacked in Randfontein. The cargo was worth ZAR 5 million (EUR 309018.21; USD 352673.95), and some laptops are still missing. Distributors should be mindful of cargo theft risks in Randfontein and should plan accordingly.\n\n- Output:\n\n{\n  \"category\": \"Terrorism\",\n  \"specific_tags\": [\"Cargo Theft\", \"Organized Crime\"]\n}\n\n- Input:\n\nAnonymous sources have reported that a ransomware attack has disrupted network operations for a major logistics provider. The attack occurred on November 15, and data breaches were confirmed, exposing sensitive customer and shipment details. The company has stated that recovery is underway but advised customers to expect delays.\n\n- Output:\n\n{\n  \"category\": \"Cyber Attack\",\n  \"specific_tags\": [\"Ransomware\", \"Data Breach\"]\n}\n\n- Input:\n\nThe Selangor Health Department reported that two students of a Secondary School in Pandamaran Jaya in Port Klang had been infected with COVID-19 virus.\n\n- Output:\n\n{\n  \"category\": \"Others\",\n  \"specific_tags\": [\"Outbreak of Disease\"]\n}\n\n- Input:\n\nAn incident of workplace negligence was reported at a construction site in downtown Chicago on November 19, where an unfastened scaffolding collapsed, injuring two workers. 
Investigations are ongoing to determine accountability.\n\n- Output:\n\n{\n  \"category\": \"Human Error\",\n  \"specific_tags\": [\"Workplace Accident\"]\n}\n\n- Input:\n\nShipping delays were reported at the Port of Los Angeles on December 1 due to a customs system outage. Containers requiring clearance were delayed for up to 12 hours, affecting supply chains across the region.\n\n- Output:\n\n{\n  \"category\": \"Administrative Issue\",\n  \"specific_tags\": [\"Customs Delay\", \"Port Disruption\"]\n}\n\n- Input:\n\nRussian media sources are reporting that courts, schools, and hospitals across Saint Petersburg have been evacuated today due to anonymous threats. It is understood that people have been evacuated from Petrodvorets, Oktyabrsky, Kolpinsky, Petrogradsky, Kuibyshevsky, and Sestroretsky district courts. Furthermore, the State University of the Sea and River Fleet, St. Petersburg State University of Railway Engineering, Higher School of Folk Arts, St. Petersburg State University of Telecommunications, and S.M. Military Medical Academy Kirov have all been evacuated. This is the fourth consecutive week of evacuations from public buildings due to such threats. It is not known when the situation will normalize.\n\n- Output:\n\n{\n  \"category\": \"Terrorism\",\n  \"specific_tags\": [\"Bomb Threat\", \"Public Safety\"]\n}\n\n\nHuman: - Input:\n\nCourts, schools and hospitals evacuated across Saint Petersberg due to anonymous threats Russian media sources are reporting that courts, schools, and hospitals across Saint Petersberg have been evacuated today due to anonymous threats. It is understood that people have been evacuated from Petrodvorets, Oktyabrsky, Kolpinsky, Petrogradsky, Kuibyshevsky and Sestroretsky district courts. Furthermore, the State University of the Sea and River Fleet, St. Petersburg State University of Railway Engineering, Higher School of Folk Arts, St. Petersburg State University of Telecommunications, and S.M. 
Military Medical Academy Kirov have all been evacuated. This is the fourth consecutive week of evacuations from public buildings due to such threats. It is not known when the situation will normalise.\n\n- Output:"
print(prompt)

System: Task: Classify Inputs into Predefined Categories

Your primary objective is to analyze the given input and assign it to one of the predefined categories: ['Weather', 'Worker Strike', 'Administrative Issue', 'Human Error', 'Cyber Attack', 'Terrorism', 'Accident', 'Others']. Evaluate the content carefully and use the defining characteristics of each category to ensure an accurate classification.

Guidelines:
1. Understand the Categories:
Each category has specific attributes that distinguish it. Familiarize yourself with these attributes by referring to the category descriptions provided in the JSON below. Use these details to guide your classification:


2. Contextual Analysis:
Consider the broader context of the input. If an input could potentially fit into multiple categories, select the one that most closely aligns with its primary intent or focus.
3. Handling Ambiguity:
For ambiguous inputs or those that do not clearly align with any category, choose the category that most c

## Evaluating 12 LLMs: 6 Llama3 + 6 Qwen2.5

### Evaluating Smaller LLMs (<=8B): 3 Llama3 + 4 Qwen2.5

In [7]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:0.5b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

Evaluating model: qwen2.5:0.5b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:37<00:00,  5.28it/s]


*** Execution time for num_shots 0: 217.32 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:42<00:00,  5.16it/s]


*** Execution time for num_shots 1: 222.12 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:34<00:00,  5.35it/s]


*** Execution time for num_shots 2: 214.35 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:44<00:00,  5.10it/s]


*** Execution time for num_shots 4: 224.97 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [03:56<00:00,  4.86it/s]


*** Execution time for num_shots 8: 236.11 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:09<00:00,  4.59it/s]


*** Execution time for num_shots 10: 249.95 seconds
'float' object has no attribute 'size'
CPU times: user 4min 43s, sys: 12.6 s, total: 4min 55s
Wall time: 22min 49s


In [8]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2:1b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

Evaluating model: llama3.2:1b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [08:37<00:00,  2.22it/s]


*** Execution time for num_shots 0: 517.12 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [09:20<00:00,  2.05it/s]


*** Execution time for num_shots 1: 560.40 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [11:13<00:00,  1.70it/s]


*** Execution time for num_shots 2: 673.91 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [11:13<00:00,  1.70it/s]


*** Execution time for num_shots 4: 673.63 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:56<00:00,  1.75it/s]


*** Execution time for num_shots 8: 656.68 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [11:44<00:00,  1.63it/s]


*** Execution time for num_shots 10: 704.34 seconds
'float' object has no attribute 'size'
CPU times: user 6min 5s, sys: 20.3 s, total: 6min 25s
Wall time: 1h 3min 13s


In [9]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:1.5b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

Evaluating model: qwen2.5:1.5b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:08<00:00,  3.71it/s]


*** Execution time for num_shots 0: 308.82 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:31<00:00,  3.46it/s]


*** Execution time for num_shots 1: 331.11 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:31<00:00,  3.46it/s]


*** Execution time for num_shots 2: 331.86 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:30<00:00,  3.47it/s]


*** Execution time for num_shots 4: 330.78 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:43<00:00,  3.34it/s]


*** Execution time for num_shots 8: 343.77 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:55<00:00,  3.23it/s]


*** Execution time for num_shots 10: 355.29 seconds
'float' object has no attribute 'size'
CPU times: user 5min 46s, sys: 13.6 s, total: 6min
Wall time: 33min 26s


In [10]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2:3b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

Evaluating model: llama3.2:3b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:16<00:00,  1.86it/s]


*** Execution time for num_shots 0: 616.88 seconds
category not in json: {}
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [09:50<00:00,  1.94it/s]


*** Execution time for num_shots 1: 590.75 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:20<00:00,  1.85it/s]


*** Execution time for num_shots 2: 621.00 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:12<00:00,  1.87it/s]


*** Execution time for num_shots 4: 612.47 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:27<00:00,  1.83it/s]


*** Execution time for num_shots 8: 627.92 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [10:34<00:00,  1.81it/s]


*** Execution time for num_shots 10: 634.15 seconds
'float' object has no attribute 'size'
CPU times: user 6min 17s, sys: 15.9 s, total: 6min 32s
Wall time: 1h 1min 47s


In [24]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:3b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

loading /Users/inflaton/code/papers/maritime-incidents-ai-agents/llm_toolkit/data_utils.py
Evaluating model: qwen2.5:3b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [08:52<00:00,  2.15it/s]


*** Execution time for num_shots 0: 532.29 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [08:42<00:00,  2.19it/s]


*** Execution time for num_shots 1: 523.00 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [08:51<00:00,  2.16it/s]


*** Execution time for num_shots 2: 531.95 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [09:03<00:00,  2.11it/s]


*** Execution time for num_shots 4: 543.26 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [09:18<00:00,  2.05it/s]


*** Execution time for num_shots 8: 558.89 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [09:34<00:00,  2.00it/s]


*** Execution time for num_shots 10: 574.13 seconds
'float' object has no attribute 'size'
CPU times: user 6min 15s, sys: 16.1 s, total: 6min 31s
Wall time: 54min 31s


In [11]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "qwen2.5:7b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

Evaluating model: qwen2.5:7b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [18:00<00:00,  1.06it/s]


*** Execution time for num_shots 0: 1080.31 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [17:03<00:00,  1.12it/s]


*** Execution time for num_shots 1: 1023.36 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [18:00<00:00,  1.06it/s]


*** Execution time for num_shots 2: 1080.77 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [18:23<00:00,  1.04it/s]


*** Execution time for num_shots 4: 1103.48 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [19:20<00:00,  1.01s/it]


*** Execution time for num_shots 8: 1160.39 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [19:41<00:00,  1.03s/it]


*** Execution time for num_shots 10: 1181.05 seconds
'float' object has no attribute 'size'
CPU times: user 6min 24s, sys: 15.3 s, total: 6min 39s
Wall time: 1h 50min 33s


In [12]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.1:8b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

Evaluating model: llama3.1:8b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [18:41<00:00,  1.02it/s]


*** Execution time for num_shots 0: 1121.91 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [16:23<00:00,  1.17it/s]


*** Execution time for num_shots 1: 983.44 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [16:56<00:00,  1.13it/s]


*** Execution time for num_shots 2: 1016.40 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [17:03<00:00,  1.12it/s]


*** Execution time for num_shots 4: 1023.60 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [17:24<00:00,  1.10it/s]


*** Execution time for num_shots 8: 1044.12 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [17:36<00:00,  1.09it/s]


*** Execution time for num_shots 10: 1056.64 seconds
'float' object has no attribute 'size'
CPU times: user 6min 24s, sys: 15.8 s, total: 6min 40s
Wall time: 1h 44min 10s


### Evaluating Bigger LLMs (>=11B): 3 Llama3 + 2 Qwen2.5

In [13]:
%%time

from llm_toolkit.eval_openai import *

evaluate_model_with_num_shots(
    "llama3.2-vision:11b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

Evaluating model: llama3.2-vision:11b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [18:40<00:00,  1.02it/s]


*** Execution time for num_shots 0: 1120.07 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [16:19<00:00,  1.17it/s]


*** Execution time for num_shots 1: 979.91 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [16:32<00:00,  1.16it/s]


*** Execution time for num_shots 2: 992.37 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [16:00<00:00,  1.19it/s]


*** Execution time for num_shots 4: 960.12 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [16:09<00:00,  1.18it/s]


*** Execution time for num_shots 8: 969.08 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [17:40<00:00,  1.08it/s]


*** Execution time for num_shots 10: 1060.18 seconds
'float' object has no attribute 'size'
CPU times: user 6min 32s, sys: 18.4 s, total: 6min 50s
Wall time: 1h 41min 26s


In [14]:
%%time

evaluate_model_with_num_shots(
    "qwen2.5:14b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

Evaluating model: qwen2.5:14b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [34:55<00:00,  1.83s/it] 


*** Execution time for num_shots 0: 2095.02 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [34:30<00:00,  1.81s/it] 


*** Execution time for num_shots 1: 2070.40 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [34:41<00:00,  1.81s/it] 


*** Execution time for num_shots 2: 2081.53 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [35:36<00:00,  1.86s/it] 


*** Execution time for num_shots 4: 2136.65 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [37:36<00:00,  1.97s/it] 


*** Execution time for num_shots 8: 2256.40 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [38:54<00:00,  2.04s/it] 


*** Execution time for num_shots 10: 2334.58 seconds
'float' object has no attribute 'size'
CPU times: user 6min 36s, sys: 16.6 s, total: 6min 53s
Wall time: 3h 36min 19s


In [15]:
%%time

evaluate_model_with_num_shots(
    "qwen2.5:32b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

Evaluating model: qwen2.5:32b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [9:22:32<00:00, 29.43s/it]    


*** Execution time for num_shots 0: 33752.17 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [1:41:59<00:00,  5.34s/it] 


*** Execution time for num_shots 1: 6119.69 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [1:13:01<00:00,  3.82s/it]


*** Execution time for num_shots 2: 4381.54 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [8:45:21<00:00, 27.48s/it]    


*** Execution time for num_shots 4: 31521.06 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [4:08:24<00:00, 12.99s/it]   


*** Execution time for num_shots 8: 14904.96 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [1:58:36<00:00,  6.20s/it] 


*** Execution time for num_shots 10: 7116.68 seconds
'float' object has no attribute 'size'
CPU times: user 6min 59s, sys: 21.1 s, total: 7min 20s
Wall time: 1d 3h 10min


In [16]:
%%time

evaluate_model_with_num_shots(
    "llama3.1:70b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

loading /Users/inflaton/code/papers/maritime-incidents-ai-agents/llm_toolkit/data_utils.py
Evaluating model: llama3.1:70b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


  0%|          | 1/1147 [1:23:06<1587:22:39, 4986.53s/it]


KeyboardInterrupt: 

In [17]:
%%time

evaluate_model_with_num_shots(
    "llama3.3:70b",
    data_path,
    results_path=results_path,
    ollama=True,
    range_num_shots=[0, 1, 2, 4, 8, 10],
)

Evaluating model: llama3.3:70b
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


  0%|          | 3/1147 [1:24:52<539:27:01, 1697.57s/it]


KeyboardInterrupt: 

In [18]:
# %%time

# from llm_toolkit.eval_openai import *

# evaluate_model_with_num_shots(
#     "qwen2.5:72b",
#     data_path,
#     results_path=results_path,
#     ollama=True,
#     range_num_shots=[0, 1, 2, 4, 8, 10],
# )

In [19]:
# %%time

# from llm_toolkit.eval_openai import *

# evaluate_model_with_num_shots(
#     "llama3.2-vision:90b",
#     data_path,
#     results_path=results_path,
#     ollama=True,
#     range_num_shots=[0, 1, 2, 4, 8, 10],
# )

In [25]:
# Run the repository's shell script scripts/ollama-pull-fp16.sh. The relative
# path is resolved against the working directory set by os.chdir in the setup cell.
# NOTE(review): presumably this pulls the *-instruct-fp16 Ollama tags that the
# next cell evaluates — confirm against the script.
!./scripts/ollama-pull-fp16.sh

python(72797) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest 
pulling 6f96e01a3f55... 100% ▕████████████████▏ 994 MB                         
pulling 66b9ea09bd5b... 100% ▕████████████████▏   68 B                         
pulling eb4402837c78... 100% ▕████████████████▏ 1.5 KB                         
pulling 832dd9e00a68... 100% ▕████████████████▏  11 KB                         
pulling ff54cb6e5ade... 100% ▕████████████████▏  487 B                         
verifying sha256 digest 
writing manifest 
success [?25h
[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulli

In [26]:
# fp16 instruct tags to evaluate, ordered from smallest to largest model.
all_models = [
    f"{base_tag}-instruct-fp16"
    for base_tag in (
        "qwen2.5:0.5b",
        "llama3.2:1b",
        "qwen2.5:1.5b",
        "llama3.2:3b",
        "qwen2.5:3b",
        "qwen2.5:7b",
        "llama3.1:8b",
        "llama3.2-vision:11b",
        "qwen2.5:14b",
    )
]

# Evaluate each tag in turn with identical settings (ollama=True; 0, 1, 2, 4, 8
# and 10 shots). evaluate_model_with_num_shots, data_path and results_path come
# from the setup cell at the top of the notebook.
for model in all_models:
    evaluate_model_with_num_shots(
        model, data_path,
        range_num_shots=[0, 1, 2, 4, 8, 10],
        ollama=True,
        results_path=results_path,
    )

Evaluating model: qwen2.5:0.5b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:41<00:00,  4.07it/s]


*** Execution time for num_shots 0: 281.63 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:36<00:00,  4.16it/s]


*** Execution time for num_shots 1: 276.01 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:43<00:00,  4.04it/s]


*** Execution time for num_shots 2: 283.84 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:46<00:00,  4.00it/s]


*** Execution time for num_shots 4: 286.50 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [04:58<00:00,  3.84it/s]


*** Execution time for num_shots 8: 298.65 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [05:08<00:00,  3.71it/s]


*** Execution time for num_shots 10: 308.94 seconds
'float' object has no attribute 'size'
Evaluating model: llama3.2:1b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [11:14<00:00,  1.70it/s]


*** Execution time for num_shots 0: 674.95 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [13:09<00:00,  1.45it/s]


*** Execution time for num_shots 1: 789.58 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [14:03<00:00,  1.36it/s]


*** Execution time for num_shots 2: 843.99 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [14:01<00:00,  1.36it/s]


*** Execution time for num_shots 4: 841.41 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [13:52<00:00,  1.38it/s]


*** Execution time for num_shots 8: 832.57 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [15:25<00:00,  1.24it/s]


*** Execution time for num_shots 10: 925.65 seconds
'float' object has no attribute 'size'
Evaluating model: qwen2.5:1.5b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [07:01<00:00,  2.72it/s]


*** Execution time for num_shots 0: 421.83 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [07:15<00:00,  2.63it/s]


*** Execution time for num_shots 1: 435.66 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [07:20<00:00,  2.60it/s]


*** Execution time for num_shots 2: 440.98 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [07:19<00:00,  2.61it/s]


*** Execution time for num_shots 4: 439.76 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [07:36<00:00,  2.51it/s]


*** Execution time for num_shots 8: 456.26 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [07:51<00:00,  2.43it/s]


*** Execution time for num_shots 10: 471.72 seconds
'float' object has no attribute 'size'
Evaluating model: llama3.2:3b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [15:07<00:00,  1.26it/s]


*** Execution time for num_shots 0: 907.90 seconds
category not in json: {}
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [13:35<00:00,  1.41it/s]


*** Execution time for num_shots 1: 815.91 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [13:53<00:00,  1.38it/s]


*** Execution time for num_shots 2: 834.00 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [13:57<00:00,  1.37it/s]


*** Execution time for num_shots 4: 837.88 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [14:19<00:00,  1.33it/s]


*** Execution time for num_shots 8: 859.86 seconds
category not in json: {}
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [14:27<00:00,  1.32it/s]


*** Execution time for num_shots 10: 867.44 seconds
'float' object has no attribute 'size'
Evaluating model: qwen2.5:3b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [13:02<00:00,  1.47it/s]


*** Execution time for num_shots 0: 782.25 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [12:19<00:00,  1.55it/s]


*** Execution time for num_shots 1: 739.19 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [12:24<00:00,  1.54it/s]


*** Execution time for num_shots 2: 744.18 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [12:28<00:00,  1.53it/s]


*** Execution time for num_shots 4: 748.26 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [12:45<00:00,  1.50it/s]


*** Execution time for num_shots 8: 765.28 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [12:56<00:00,  1.48it/s]


*** Execution time for num_shots 10: 776.44 seconds
'float' object has no attribute 'size'
Evaluating model: qwen2.5:7b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [26:25<00:00,  1.38s/it]


*** Execution time for num_shots 0: 1585.92 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [41:04<00:00,  2.15s/it] 


*** Execution time for num_shots 1: 2464.91 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [1:19:26<00:00,  4.16s/it] 


*** Execution time for num_shots 2: 4766.45 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [25:55<00:00,  1.36s/it]


*** Execution time for num_shots 4: 1555.24 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [26:10<00:00,  1.37s/it]


*** Execution time for num_shots 8: 1570.16 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [26:36<00:00,  1.39s/it]


*** Execution time for num_shots 10: 1596.85 seconds
'float' object has no attribute 'size'
Evaluating model: llama3.1:8b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [28:45<00:00,  1.50s/it]


*** Execution time for num_shots 0: 1725.20 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [24:22<00:00,  1.27s/it]


*** Execution time for num_shots 1: 1462.17 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [25:18<00:00,  1.32s/it]


*** Execution time for num_shots 2: 1518.25 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [25:27<00:00,  1.33s/it]


*** Execution time for num_shots 4: 1527.42 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [25:41<00:00,  1.34s/it]


*** Execution time for num_shots 8: 1541.14 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [26:06<00:00,  1.37s/it]


*** Execution time for num_shots 10: 1566.67 seconds
'float' object has no attribute 'size'
Evaluating model: llama3.2-vision:11b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [30:07<00:00,  1.58s/it]


*** Execution time for num_shots 0: 1807.09 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [24:56<00:00,  1.30s/it]


*** Execution time for num_shots 1: 1496.35 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [25:52<00:00,  1.35s/it]


*** Execution time for num_shots 2: 1552.33 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [25:58<00:00,  1.36s/it]


*** Execution time for num_shots 4: 1558.72 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [26:27<00:00,  1.38s/it]


*** Execution time for num_shots 8: 1587.68 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [26:47<00:00,  1.40s/it]


*** Execution time for num_shots 10: 1607.12 seconds
'float' object has no attribute 'size'
Evaluating model: qwen2.5:14b-instruct-fp16
loading train/test data files
DatasetDict({
    train: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 4594
    })
    test: Dataset({
        features: ['id', 'Headline', 'Details', 'Severity', 'Region', 'Datetime', 'lat', 'lon', 'maritime_label', 'found_ports', 'contains_port_info', 'if_labeled', 'Headline_Details', 'Year', 'Month', 'Week', 'Details_cleaned', 'Category', 'Summarized_label', 'gpt-4o_label'],
        num_rows: 1147
    })
})
* Evaluating with num_shots: 0
Generating prompt templates for 0 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [56:59<00:00,  2.98s/it] 


*** Execution time for num_shots 0: 3419.68 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 1
Generating prompt templates for 1 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [53:32<00:00,  2.80s/it] 


*** Execution time for num_shots 1: 3212.56 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 2
Generating prompt templates for 2 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [1:38:08<00:00,  5.13s/it] 


*** Execution time for num_shots 2: 5888.03 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 4
Generating prompt templates for 4 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [2:21:27<00:00,  7.40s/it]   


*** Execution time for num_shots 4: 8487.94 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 8
Generating prompt templates for 8 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [58:48<00:00,  3.08s/it] 


*** Execution time for num_shots 8: 3528.64 seconds
'float' object has no attribute 'size'
* Evaluating with num_shots: 10
Generating prompt templates for 10 shots with Headline_Details and Summarized_label


100%|██████████| 1147/1147 [59:50<00:00,  3.13s/it] 


*** Execution time for num_shots 10: 3590.54 seconds
'float' object has no attribute 'size'


## System Info

In [27]:
!pip install -q psutil

loading env vars from: /Users/inflaton/code/papers/maritime-incidents-ai-agents/.env.example
Adding /Users/inflaton/code/papers/maritime-incidents-ai-agents to sys.path


python(13916) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [28]:
import psutil

# Take a snapshot of the system-wide memory statistics
memory_info = psutil.virtual_memory()

# Report total, available and used memory (each divided by 1024 ** 3) and the
# usage percentage; a single print with a newline separator emits one line per item
print(
    f"Total memory: {memory_info.total / (1024 ** 3):.2f} GB",
    f"Available memory: {memory_info.available / (1024 ** 3):.2f} GB",
    f"Used memory: {memory_info.used / (1024 ** 3):.2f} GB",
    f"Memory percentage: {memory_info.percent}%",
    sep="\n",
)

Total memory: 48.00 GB
Available memory: 13.33 GB
Used memory: 29.85 GB
Memory percentage: 72.2%


In [29]:
import platform


def get_os_info():
    """Collect basic operating-system and hardware identifiers.

    Returns:
        dict: Maps each display label ("System", "Node Name", "Release",
        "Version", "Machine", "Processor") to the value returned by the
        corresponding ``platform`` function, in that order.
    """
    # (label, zero-argument platform query) pairs, listed in display order
    probes = (
        ("System", platform.system),
        ("Node Name", platform.node),
        ("Release", platform.release),
        ("Version", platform.version),
        ("Machine", platform.machine),
        ("Processor", platform.processor),
    )
    return {label: query() for label, query in probes}


# Show every collected field as "label: value", one per line
os_info = get_os_info()
for key in os_info:
    value = os_info[key]
    print(f"{key}: {value}")

System: Darwin
Node Name: Donghaos-MacBook-Pro.local
Release: 24.1.0
Version: Darwin Kernel Version 24.1.0: Thu Oct 10 21:06:57 PDT 2024; root:xnu-11215.41.3~3/RELEASE_ARM64_T6041
Machine: arm64
Processor: arm


In [30]:
# Print the current date & time (datetime.now() without a tz argument returns naive local time)
from datetime import datetime

now = datetime.now()
print("Current date/time:", now)

Current date/time: 2024-12-20 22:06:23.400096
