# YOLO
The goal of this notebook is to do transfer learning of Fishial´s already implemented YOLO model. We do this by using Ultralytics

In [4]:
from ultralytics import YOLO
import os
import requests
import logging
from zipfile import ZipFile
import pandas as pd
import torch
import cv2
import matplotlib.pyplot as plt
import copy
from IPython.display import Image, display

from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

In [5]:
DATA_DIR = "../data_csv/"
timor_leste_data_path = os.path.join(DATA_DIR, "timor-leste.csv") # Annotation info for ground truth
images_path = "../data_images/"

In [6]:
import re

def extract_and_format_species(big_string):
    # Find all genus_species patterns (ignore numbers)
    matches = re.findall(r"\b([a-z]+_[a-z]+)\b", big_string)

    formatted = []
    for species in matches:
        genus, epithet = species.split("_")
        formatted.append(f"{genus.capitalize()} {epithet}")

    return formatted

big_string = """
acanthocybium_solandri	
29
alectis_ciliaris	
15
amblygaster_sirm	
1
aphareus_rutilans	
3
arius_arius	
1
aurigequula_fasciata	
70
carangoides_praeustus	
1
caranx_ignobilis	
41
caranx_lugubris	
1
caranx_melampygus	
9
caranx_sexfasciatus	
1
caranx_tille	
1
chirocentrus_dorab	
81
chirocentrus_nudus	
2
decapterus_macarellus	
52
decapterus_macrosoma	
4
elagatis_bipinnulata	
8
elagatos_bipinnulata	
1
epinephelus_maculatus	
19
epinephelus_radiatus	
6
etelis_carbunculus	
16
gazza_minuta	
59
gymnocranius_grandoculis	
38
katsuwonus_pelamis	
10
lethrinus_erythracanthus	
2
lethrinus_mahsena	
13
lethrinus_obsoletus	
1
lethrinus_ornatus	
1
lethrinus_reticulatus	
1
lujanus_gibbus	
51
lutjanus_bohar	
3
lutjanus_fulviflamma	
27
lutjanus_fulvus	
47
lutjanus_gibbus	
3
lutjanus_johnii	
68
lutjanus_kasmira	
10
lutjanus_rivulatus	
1
lutjanus_rufolineatus	
2
lutjanus_russellii	
7
monotaxis_grandoculis	
7
mulloidichtys_vanicolensis	
75
neoniphon_sammara	
15
parupeneus_heptacanthus	
24
parupeneus_indicus	
13
platax_boersii	
1
plectorhinchus_vittatus	
2
pomacanthus_annularis	
1
pomadasys_argenteus	
33
pomadasys_kaakan	
46
priacanthus_sagittarius	
1
pristigenys_niphonia	
15
psettodes_erumei	
3
pseudobalistes_flavimarginatus	
8
rastrelliger_kanagurta	
129
sardinella_albella	
16
scarus_quoyi	
2
scolopsis_vosmeri	
4
scomberoides_lysan	
1
scomberoides_tol	
106
scomberomorus_commerson	
28
seriola_dumerili	
1
siganus_spinus	
1
upeneus_vittatus	
77
variola_albimarginata	
7
"""

relevant_species = extract_and_format_species(big_string)

print(relevant_species)

['Acanthocybium solandri', 'Alectis ciliaris', 'Amblygaster sirm', 'Aphareus rutilans', 'Arius arius', 'Aurigequula fasciata', 'Carangoides praeustus', 'Caranx ignobilis', 'Caranx lugubris', 'Caranx melampygus', 'Caranx sexfasciatus', 'Caranx tille', 'Chirocentrus dorab', 'Chirocentrus nudus', 'Decapterus macarellus', 'Decapterus macrosoma', 'Elagatis bipinnulata', 'Elagatos bipinnulata', 'Epinephelus maculatus', 'Epinephelus radiatus', 'Etelis carbunculus', 'Gazza minuta', 'Gymnocranius grandoculis', 'Katsuwonus pelamis', 'Lethrinus erythracanthus', 'Lethrinus mahsena', 'Lethrinus obsoletus', 'Lethrinus ornatus', 'Lethrinus reticulatus', 'Lujanus gibbus', 'Lutjanus bohar', 'Lutjanus fulviflamma', 'Lutjanus fulvus', 'Lutjanus gibbus', 'Lutjanus johnii', 'Lutjanus kasmira', 'Lutjanus rivulatus', 'Lutjanus rufolineatus', 'Lutjanus russellii', 'Monotaxis grandoculis', 'Mulloidichtys vanicolensis', 'Neoniphon sammara', 'Parupeneus heptacanthus', 'Parupeneus indicus', 'Platax boersii', 'Ple

In [7]:
# Read annotation info (Timor-leste)
df_tl_ann = pd.read_csv(timor_leste_data_path, encoding="utf-8-sig", header=0, skiprows=1)
df_tl_ann = df_tl_ann[["image_file", "catch_name_en", "Species_name", "Family"]]

# Filter by species in relevant_species
df_filtered = df_tl_ann[df_tl_ann["Species_name"].isin(relevant_species)]

# Keep only images that actually exist
existing_files = set(os.listdir(images_path))

df_filtered = df_filtered[df_filtered["image_file"].isin(existing_files)]

# Convert to list of filenames
filtered_images= df_filtered["image_file"].tolist()

print("Total annotated images:", len(df_tl_ann))
print("Relevant images", len(df_filtered))
print("Unique final JPGs:", len(set(filtered_images)))


Total annotated images: 603
Relevant images 295
Unique final JPGs: 247


In [5]:
def on_train_epoch_end(trainer):
    print(f"Finished epoch {trainer.epoch}/{trainer.epochs}")

model = YOLO("yolo11n.pt")

model.add_callback("on_train_epoch_end", on_train_epoch_end)

# Train the model
train_results = model.train(
    data="full_dataset/data.yaml",
    epochs=300,
    imgsz=640,
    device="cuda",
    patience=30
)

# Evaluate the model's performance on the validation set
metrics = model.val()

New https://pypi.org/project/ultralytics/8.3.236 available  Update with 'pip install -U ultralytics'
Ultralytics 8.3.235  Python-3.12.10 torch-2.9.1+cu128 CUDA:0 (NVIDIA GeForce RTX 5070 Ti, 16303MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=full_dataset/data.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=300, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train6, nbs=64,

In [14]:
import pandas as pd
import pathlib
import ast
import re

# ----------------- USER ADJUSTABLE VARIABLES -----------------
# Provide GT (path or DataFrame). Example uses your variable name:
gt_input = pd.read_csv(timor_leste_data_path, encoding="utf-8-sig", header=0, skiprows=1)

# Predictions CSV path or DataFrame
preds_input = "predictions_with_groundtruth_matches.csv"

# Column names used in your files
gt_image_col = "image_file"
gt_species_col = "Species_name"
preds_image_col = "image_file"
preds_class_col = "pred_class"   # one-row-per-detection format (optional)
preds_list_col = "pred_list"     # one-row-per-image list column (optional)

# Choose Title case for predictions
PRED_CASE = "title"
# ------------------------------------------------------------

# ----------------- Helper functions -----------------
def title_case_name(s):
    """Return string in 'Genus species' style: Genus capitalized, rest lowercased."""
    if s is None:
        return ""
    s = str(s).strip()
    # collapse whitespace and separators
    s = re.sub(r"[ _\-\.\s]+", " ", s).strip()
    parts = s.split()
    if len(parts) >= 2:
        return parts[0].capitalize() + " " + " ".join(p.lower() for p in parts[1:])
    return s.capitalize()

def normalize_label_title(x):
    """Normalize input to Title Case for consistent comparison."""
    if pd.isna(x):
        return ""
    return title_case_name(x)

def snake_or_token_to_name(token: str, case="title"):
    """Convert tokens like 'scomberomorus_commerson' to Title Case or raw string."""
    if token is None:
        return ""
    s = str(token).strip().strip("'\"")
    s = re.sub(r"[ _\-\.\s]+", " ", s).strip()
    if case == "lower":
        return s.lower()
    if case == "title":
        return title_case_name(s)
    return s

def parse_pred_list_cell(val, case="title"):
    """
    Robust parse for pred list cells -> returns list of converted labels in chosen case.
    """
    if val is None or (isinstance(val, float) and pd.isna(val)):
        return []
    if isinstance(val, (list, tuple)):
        return [snake_or_token_to_name(x, case=case) for x in val]
    s = str(val).strip()
    # try safe literal eval
    if s.startswith("[") and s.endswith("]"):
        try:
            parsed = ast.literal_eval(s)
            if isinstance(parsed, (list, tuple)):
                return [snake_or_token_to_name(x, case=case) for x in parsed]
        except Exception:
            pass
    # separators
    if "|" in s:
        parts = [p.strip() for p in s.split("|") if p.strip()]
        return [snake_or_token_to_name(x, case=case) for x in parts]
    if "," in s:
        parts = [p.strip() for p in s.split(",") if p.strip()]
        return [snake_or_token_to_name(x, case=case) for x in parts]
    return [snake_or_token_to_name(s, case=case)]

# ----------------- Load inputs -----------------
def load_gt(gt_input):
    if isinstance(gt_input, pd.DataFrame):
        return gt_input.copy()
    if isinstance(gt_input, (str, pathlib.Path)):
        return pd.read_csv(str(gt_input))
    if hasattr(gt_input, "read") and callable(getattr(gt_input, "read")):
        return pd.read_csv(gt_input)
    raise TypeError("gt_input must be a DataFrame, filename (str/Path), or file-like object.")

def load_preds(preds_input):
    if isinstance(preds_input, pd.DataFrame):
        return preds_input.copy()
    if isinstance(preds_input, (str, pathlib.Path)):
        return pd.read_csv(str(preds_input))
    if hasattr(preds_input, "read") and callable(getattr(preds_input, "read")):
        return pd.read_csv(preds_input)
    raise TypeError("preds_input must be a DataFrame, filename (str/Path), or file-like object.")

gt = load_gt(gt_input)
preds_df = load_preds(preds_input)

# Normalize image filename columns
gt[gt_image_col] = gt[gt_image_col].astype(str).str.strip()
preds_df[preds_image_col] = preds_df[preds_image_col].astype(str).str.strip()

# ----------------- Prepare ground-truth lists (Title Case) -----------------
def split_gt_species(s):
    if pd.isna(s):
        return []
    s = str(s)
    for sep in [" & ", "&", "/", ";", ",", "|"]:
        s = s.replace(sep, " | ")
    parts = [p.strip() for p in s.split("|") if p.strip()]
    return parts

gt["gt_list"] = gt[gt_species_col].apply(split_gt_species)
# Normalize GT list to Title Case
gt["gt_norm_list"] = gt["gt_list"].apply(lambda lst: [normalize_label_title(x) for x in lst])

# ----------------- Prepare predictions grouped per image (converted to Title Case) -----------------
if preds_class_col in preds_df.columns:
    # convert token labels in pred_class directly to Title Case, then group
    preds_df["_pred_converted"] = preds_df[preds_class_col].apply(lambda v: snake_or_token_to_name(v, case=PRED_CASE))
    preds_grouped = preds_df.groupby(preds_image_col)["_pred_converted"].apply(list).reset_index()
    preds_grouped.columns = [preds_image_col, "predicted_list"]
elif preds_list_col in preds_df.columns:
    preds_grouped = preds_df[[preds_image_col, preds_list_col]].copy()
    preds_grouped["predicted_list"] = preds_grouped[preds_list_col].apply(lambda v: parse_pred_list_cell(v, case=PRED_CASE))
    preds_grouped = preds_grouped[[preds_image_col, "predicted_list"]]
else:
    # fallback search for commonly named columns
    fallback_names = ["predicted_list", "predictions", "preds"]
    found = False
    for name in fallback_names:
        if name in preds_df.columns:
            preds_grouped = preds_df[[preds_image_col, name]].copy()
            preds_grouped["predicted_list"] = preds_grouped[name].apply(lambda v: parse_pred_list_cell(v, case=PRED_CASE))
            preds_grouped = preds_grouped[[preds_image_col, "predicted_list"]]
            found = True
            break
    if not found:
        raise ValueError(f"Predictions must contain either '{preds_class_col}' or '{preds_list_col}' or one of {fallback_names}.")

# ----------------- Merge and compute matches -----------------
merged = pd.merge(
    gt[[gt_image_col, gt_species_col, "gt_list", "gt_norm_list"]],
    preds_grouped,
    how="left",
    left_on=gt_image_col,
    right_on=preds_image_col
)

# ensure predicted_list is a real list
merged["predicted_list"] = merged["predicted_list"].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else list(x)))
# Normalize predicted labels to Title Case for preds_norm (ensures Title)
merged["preds_norm"] = merged["predicted_list"].apply(lambda lst: [normalize_label_title(p) for p in lst])

def any_match(gt_norm_list, preds_norm_list):
    if not gt_norm_list:
        return False
    return any((g == p) for g in gt_norm_list for p in preds_norm_list)

merged["match_any"] = merged.apply(lambda r: any_match(r["gt_norm_list"], r["preds_norm"]), axis=1)
merged["n_predictions"] = merged["preds_norm"].apply(len)
merged["multi_predictions"] = merged["n_predictions"] > 1
merged["zero_predictions"] = merged["n_predictions"] == 0

def first_matching_pred(gt_norm_list, preds_norm_list, preds_orig_list):
    for p_norm, p_orig in zip(preds_norm_list, preds_orig_list):
        if any(p_norm == g for g in gt_norm_list):
            return p_orig
    return None

merged["matching_pred"] = merged.apply(lambda r: first_matching_pred(r["gt_norm_list"], r["preds_norm"], r["predicted_list"]), axis=1)

# ----------------- Outputs & summary -----------------
match_flag_series = merged[[gt_image_col, "match_any"]].copy()
multi_pred_filenames = merged.loc[merged["multi_predictions"], gt_image_col].tolist()
zero_pred_filenames = merged.loc[merged["zero_predictions"], gt_image_col].tolist()

total = len(merged)
num_matches = int(merged["match_any"].sum())
pct = num_matches / total * 100 if total > 0 else 0.0

print(f"Total GT rows: {total}")
print(f"Images with >=1 matching prediction: {num_matches} ({pct:.2f}%)")
print(f"Images with multiple predictions (>1): {merged['multi_predictions'].sum()}")
print(f"Images with zero predictions: {merged['zero_predictions'].sum()}")

# quick sanity-check: show a few rows and their normalized columns
print("\nExample rows (image_file, gt_norm_list, predicted_list, preds_norm, match_any):")
print(merged[[gt_image_col, "gt_norm_list", "predicted_list", "preds_norm", "match_any"]].head(10).to_string(index=False))

# Save full analysis for inspection
out_csv = "gt_vs_preds_analysis_converted_title.csv"
merged.to_csv(out_csv, index=False)
print(f"\nSaved analysis to: {out_csv}")

# Expose variables for further use: merged, match_flag_series, multi_pred_filenames, zero_pred_filenames



Total GT rows: 603
Images with >=1 matching prediction: 13 (2.16%)
Images with multiple predictions (>1): 142
Images with zero predictions: 284

Example rows (image_file, gt_norm_list, predicted_list, preds_norm, match_any):
       image_file           gt_norm_list        predicted_list            preds_norm  match_any
1689120590238.jpg  [Caranx sexfasciatus]                 [Nan]                 [Nan]      False
1689254343092.jpg         [Gazza minuta]            [Nan, Nan]            [Nan, Nan]      False
1689254343092.jpg     [Upeneus vittatus]            [Nan, Nan]            [Nan, Nan]      False
1689379821579.jpg           [Decapterus]                    []                    []      False
1689388044927.jpg      [Caranx lugubris] [Pomadasys argenteus] [Pomadasys argenteus]      False
1689388326411.jpg      [Caranx lugubris]  [Parupeneus indicus]  [Parupeneus indicus]      False
1689421223571.jpg [Rastrelliger faughni]                    []                    []      False
1689556

In [None]:
import pandas as pd
import pathlib
import ast
import re

# ----------------- USER ADJUSTABLE VARIABLES -----------------
# Provide GT (path or DataFrame). Example uses your variable name:
gt_input = pd.read_csv(timor_leste_data_path, encoding="utf-8-sig", header=0, skiprows=1)

# Predictions CSV path or DataFrame
preds_input = "predictions_with_groundtruth_matches.csv"

# Column names used in your files
gt_image_col = "image_file"
gt_species_col = "Species_name"
preds_image_col = "image_file"
preds_class_col = "pred_class"   # one-row-per-detection format (optional)
preds_list_col = "pred_list"     # one-row-per-image list column (optional)

# Choose Title case for predictions
PRED_CASE = "title"
# ------------------------------------------------------------

# ----------------- Helper functions -----------------
def title_case_name(s):
    """Return string in 'Genus species' style: Genus capitalized, rest lowercased."""
    if s is None:
        return ""
    s = str(s).strip()
    # collapse whitespace and separators
    s = re.sub(r"[ _\-\.\s]+", " ", s).strip()
    parts = s.split()
    if len(parts) >= 2:
        return parts[0].capitalize() + " " + " ".join(p.lower() for p in parts[1:])
    return s.capitalize()

def normalize_label_title(x):
    """Normalize input to Title Case for consistent comparison."""
    if pd.isna(x):
        return ""
    return title_case_name(x)

def snake_or_token_to_name(token: str, case="title"):
    """Convert tokens like 'scomberomorus_commerson' to Title Case or raw string."""
    if token is None:
        return ""
    s = str(token).strip().strip("'\"")
    s = re.sub(r"[ _\-\.\s]+", " ", s).strip()
    if case == "lower":
        return s.lower()
    if case == "title":
        return title_case_name(s)
    return s

def parse_pred_list_cell(val, case="title"):
    """
    Robust parse for pred list cells -> returns list of converted labels in chosen case.
    """
    if val is None or (isinstance(val, float) and pd.isna(val)):
        return []
    if isinstance(val, (list, tuple)):
        return [snake_or_token_to_name(x, case=case) for x in val]
    s = str(val).strip()
    # try safe literal eval
    if s.startswith("[") and s.endswith("]"):
        try:
            parsed = ast.literal_eval(s)
            if isinstance(parsed, (list, tuple)):
                return [snake_or_token_to_name(x, case=case) for x in parsed]
        except Exception:
            pass
    # separators
    if "|" in s:
        parts = [p.strip() for p in s.split("|") if p.strip()]
        return [snake_or_token_to_name(x, case=case) for x in parts]
    if "," in s:
        parts = [p.strip() for p in s.split(",") if p.strip()]
        return [snake_or_token_to_name(x, case=case) for x in parts]
    return [snake_or_token_to_name(s, case=case)]

# ----------------- Load inputs -----------------
def load_gt(gt_input):
    if isinstance(gt_input, pd.DataFrame):
        return gt_input.copy()
    if isinstance(gt_input, (str, pathlib.Path)):
        return pd.read_csv(str(gt_input))
    if hasattr(gt_input, "read") and callable(getattr(gt_input, "read")):
        return pd.read_csv(gt_input)
    raise TypeError("gt_input must be a DataFrame, filename (str/Path), or file-like object.")

def load_preds(preds_input):
    if isinstance(preds_input, pd.DataFrame):
        return preds_input.copy()
    if isinstance(preds_input, (str, pathlib.Path)):
        return pd.read_csv(str(preds_input))
    if hasattr(preds_input, "read") and callable(getattr(preds_input, "read")):
        return pd.read_csv(preds_input)
    raise TypeError("preds_input must be a DataFrame, filename (str/Path), or file-like object.")

gt = load_gt(gt_input)
preds_df = load_preds(preds_input)

# Normalize image filename columns
gt[gt_image_col] = gt[gt_image_col].astype(str).str.strip()
preds_df[preds_image_col] = preds_df[preds_image_col].astype(str).str.strip()

# ----------------- Prepare ground-truth lists (Title Case) -----------------
def split_gt_species(s):
    if pd.isna(s):
        return []
    s = str(s)
    for sep in [" & ", "&", "/", ";", ",", "|"]:
        s = s.replace(sep, " | ")
    parts = [p.strip() for p in s.split("|") if p.strip()]
    return parts

gt["gt_list"] = gt[gt_species_col].apply(split_gt_species)
# Normalize GT list to Title Case
gt["gt_norm_list"] = gt["gt_list"].apply(lambda lst: [normalize_label_title(x) for x in lst])

# ----------------- Prepare predictions grouped per image (converted to Title Case) -----------------
if preds_class_col in preds_df.columns:
    # convert token labels in pred_class directly to Title Case, then group
    preds_df["_pred_converted"] = preds_df[preds_class_col].apply(lambda v: snake_or_token_to_name(v, case=PRED_CASE))
    preds_grouped = preds_df.groupby(preds_image_col)["_pred_converted"].apply(list).reset_index()
    preds_grouped.columns = [preds_image_col, "predicted_list"]
elif preds_list_col in preds_df.columns:
    preds_grouped = preds_df[[preds_image_col, preds_list_col]].copy()
    preds_grouped["predicted_list"] = preds_grouped[preds_list_col].apply(lambda v: parse_pred_list_cell(v, case=PRED_CASE))
    preds_grouped = preds_grouped[[preds_image_col, "predicted_list"]]
else:
    # fallback search for commonly named columns
    fallback_names = ["predicted_list", "predictions", "preds"]
    found = False
    for name in fallback_names:
        if name in preds_df.columns:
            preds_grouped = preds_df[[preds_image_col, name]].copy()
            preds_grouped["predicted_list"] = preds_grouped[name].apply(lambda v: parse_pred_list_cell(v, case=PRED_CASE))
            preds_grouped = preds_grouped[[preds_image_col, "predicted_list"]]
            found = True
            break
    if not found:
        raise ValueError(f"Predictions must contain either '{preds_class_col}' or '{preds_list_col}' or one of {fallback_names}.")

# ----------------- Merge and compute matches -----------------
merged = pd.merge(
    gt[[gt_image_col, gt_species_col, "gt_list", "gt_norm_list"]],
    preds_grouped,
    how="left",
    left_on=gt_image_col,
    right_on=preds_image_col
)

# ensure predicted_list is a real list
merged["predicted_list"] = merged["predicted_list"].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else list(x)))
# Normalize predicted labels to Title Case for preds_norm (ensures Title)
merged["preds_norm"] = merged["predicted_list"].apply(lambda lst: [normalize_label_title(p) for p in lst])

def any_match(gt_norm_list, preds_norm_list):
    if not gt_norm_list:
        return False
    return any((g == p) for g in gt_norm_list for p in preds_norm_list)

merged["match_any"] = merged.apply(lambda r: any_match(r["gt_norm_list"], r["preds_norm"]), axis=1)
merged["n_predictions"] = merged["preds_norm"].apply(len)
merged["multi_predictions"] = merged["n_predictions"] > 1
merged["zero_predictions"] = merged["n_predictions"] == 0

def first_matching_pred(gt_norm_list, preds_norm_list, preds_orig_list):
    for p_norm, p_orig in zip(preds_norm_list, preds_orig_list):
        if any(p_norm == g for g in gt_norm_list):
            return p_orig
    return None

merged["matching_pred"] = merged.apply(lambda r: first_matching_pred(r["gt_norm_list"], r["preds_norm"], r["predicted_list"]), axis=1)

# ----------------- Outputs & summary -----------------
match_flag_series = merged[[gt_image_col, "match_any"]].copy()
multi_pred_filenames = merged.loc[merged["multi_predictions"], gt_image_col].tolist()
zero_pred_filenames = merged.loc[merged["zero_predictions"], gt_image_col].tolist()

total = len(merged)
num_matches = int(merged["match_any"].sum())
pct = num_matches / total * 100 if total > 0 else 0.0

print(f"Total GT rows: {total}")
print(f"Images with >=1 matching prediction: {num_matches} ({pct:.2f}%)")
print(f"Images with multiple predictions (>1): {merged['multi_predictions'].sum()}")
print(f"Images with zero predictions: {merged['zero_predictions'].sum()}")

# quick sanity-check: show a few rows and their normalized columns
print("\nExample rows (image_file, gt_norm_list, predicted_list, preds_norm, match_any):")
print(merged[[gt_image_col, "gt_norm_list", "predicted_list", "preds_norm", "match_any"]].head(10).to_string(index=False))

# Save full analysis for inspection
out_csv = "gt_vs_preds_analysis_converted_title.csv"
merged.to_csv(out_csv, index=False)
print(f"\nSaved analysis to: {out_csv}")

# Expose variables for further use: merged, match_flag_series, multi_pred_filenames, zero_pred_filenames



Total GT rows: 603
Images with >=1 matching prediction: 13 (2.16%)
Images with multiple predictions (>1): 142
Images with zero predictions: 284

Example rows (image_file, gt_norm_list, predicted_list, preds_norm, match_any):
       image_file           gt_norm_list        predicted_list            preds_norm  match_any
1689120590238.jpg  [Caranx sexfasciatus]                 [Nan]                 [Nan]      False
1689254343092.jpg         [Gazza minuta]            [Nan, Nan]            [Nan, Nan]      False
1689254343092.jpg     [Upeneus vittatus]            [Nan, Nan]            [Nan, Nan]      False
1689379821579.jpg           [Decapterus]                    []                    []      False
1689388044927.jpg      [Caranx lugubris] [Pomadasys argenteus] [Pomadasys argenteus]      False
1689388326411.jpg      [Caranx lugubris]  [Parupeneus indicus]  [Parupeneus indicus]      False
1689421223571.jpg [Rastrelliger faughni]                    []                    []      False
1689556

In [None]:
display(Image(filename="runs/detect/train6/results.png"))
display(Image(filename="runs/detect/val3/confusion_matrix.png"))

In [9]:
import os
import pandas as pd
from ultralytics import YOLO

# ---------- config ----------
MODEL_PATH = "runs/detect/train6/weights/best.pt"
IMAGE_PATHS = [os.path.join("../data_images/", f) for f in filtered_images]  # filtered_images must exist
CSV_OUT = "predictions_with_groundtruth_matches.csv"
GT_FILENAME_COL = "image_file"          # column in df_tl_ann to join on
GT_SPECIES_COL = "Species_name"       # ground truth species column name in df_tl_ann
# ----------------------------

# Load model
model = YOLO(MODEL_PATH)

# Run predictions (you can tune imgsz, conf, etc.)
results = model.predict(source=IMAGE_PATHS, imgsz=480)

# Build predictions dataframe
pred_rows = []
for r in results:
    image_name = os.path.basename(r.path)
    boxes = getattr(r, "boxes", None)

    if boxes is None or len(boxes) == 0:
        pred_rows.append({
            "image_file": image_name,
            "pred_class": None,
            "pred_conf": None,
            "pred_x1": None,
            "pred_y1": None,
            "pred_x2": None,
            "pred_y2": None,
        })
        continue

    xyxy = boxes.xyxy.cpu().numpy()
    scores = boxes.conf.cpu().numpy()
    classes = boxes.cls.cpu().numpy()

    for (x1, y1, x2, y2), conf, cls in zip(xyxy, scores, classes):
        pred_rows.append({
            "image_file": image_name,
            "pred_class": model.names[int(cls)] if cls is not None else None,
            "pred_conf": float(conf),
            "pred_x1": int(x1),
            "pred_y1": int(y1),
            "pred_x2": int(x2),
            "pred_y2": int(y2),
        })

df_preds = pd.DataFrame(pred_rows)

# ---------- load ground truth ----------
# If df_tl_ann is not already loaded, load it here, e.g.:
# df_tl_ann = pd.read_csv("timor_leste_annotations.csv")
# Ensure it contains the expected columns
if GT_FILENAME_COL not in df_tl_ann.columns:
    raise KeyError(f"Ground truth dataframe must contain column '{GT_FILENAME_COL}'")
if GT_SPECIES_COL not in df_tl_ann.columns:
    raise KeyError(f"Ground truth dataframe must contain species column '{GT_SPECIES_COL}'")

# If ground truth has multiple rows per filename (multiple labeled fish per image),
# aggregate species into a list per filename to allow matching any of them.
df_gt_grouped = (
    df_tl_ann
    .groupby(GT_FILENAME_COL)[GT_SPECIES_COL]
    .apply(lambda s: list(s.dropna().astype(str)))
    .reset_index()
    .rename(columns={GT_SPECIES_COL: "gt_species_list"})
)

# Merge predictions with grouped ground truth (left join keeps predictions even if no GT)
df_merged = df_preds.merge(df_gt_grouped, on="image_file", how="left")

# ---------- helper for normalization ----------
def normalize_name(s):
    if pd.isna(s) or s is None:
        return ""
    # lowercase, strip whitespace, replace underscores, collapse multiple spaces
    s2 = str(s).lower().strip().replace("_", " ")
    s2 = " ".join(s2.split())
    return s2

# Normalize GT species lists & prediction class for comparison
df_merged["pred_class_norm"] = df_merged["pred_class"].apply(normalize_name)
df_merged["gt_species_norm_list"] = df_merged["gt_species_list"].apply(
    lambda lst: [normalize_name(x) for x in lst] if isinstance(lst, list) else []
)

# ---------- compute match column ----------
def compare_row(row):
    pred = row["pred_class_norm"]
    gt_list = row["gt_species_norm_list"]

    if (pred == "" or pred is None) and (not gt_list):
        # no detection and no gt
        return "NO_DETECTION_AND_NO_GT"
    if pred == "" or pred is None:
        # no detection but GT exists
        return "NO_DETECTION"
    if not gt_list:
        # prediction exists but no ground truth
        return "NO_GT"
    # if any exact match in normalized strings:
    if pred in gt_list:
        return "MATCH"
    # optionally: fuzzy matching can be done here (e.g., substring/in operator)
    # try substring match (pred contained in any gt or vice versa)
    for g in gt_list:
        if pred in g or g in pred:
            return "MATCH_SUBSTRING"
    return "MISMATCH"

df_merged["match"] = df_merged.apply(compare_row, axis=1)

summary = df_merged["match"].value_counts(dropna=False)
print("Match summary:\n", summary)

# Save to CSV
df_merged.to_csv(CSV_OUT, index=False)
print(f"\nSaved merged file with matches to: {CSV_OUT}")


0: 480x480 (no detections), 0.8ms
1: 480x480 (no detections), 0.8ms
2: 480x480 (no detections), 0.8ms
3: 480x480 1 pomadasys_argenteus, 0.8ms
4: 480x480 1 parupeneus_indicus, 0.8ms
5: 480x480 7 chirocentrus_dorabs, 0.8ms
6: 480x480 1 chirocentrus_dorab, 0.8ms
7: 480x480 1 chirocentrus_dorab, 0.8ms
8: 480x480 (no detections), 0.8ms
9: 480x480 (no detections), 0.8ms
10: 480x480 (no detections), 0.8ms
11: 480x480 1 acanthocybium_solandri, 1 scomberomorus_commerson, 0.8ms
12: 480x480 1 acanthocybium_solandri, 0.8ms
13: 480x480 1 alectis_ciliaris, 0.8ms
14: 480x480 2 chirocentrus_dorabs, 0.8ms
15: 480x480 (no detections), 0.8ms
16: 480x480 1 chirocentrus_dorab, 0.8ms
17: 480x480 1 chirocentrus_dorab, 0.8ms
18: 480x480 1 chirocentrus_dorab, 0.8ms
19: 480x480 1 chirocentrus_dorab, 0.8ms
20: 480x480 1 caranx_ignobilis, 0.8ms
21: 480x480 1 chirocentrus_dorab, 0.8ms
22: 480x480 1 caranx_ignobilis, 0.8ms
23: 480x480 1 scomberomorus_commerson, 0.8ms
24: 480x480 1 scomberomorus_commerson, 0.8ms
25