In [4]:
import os
import pandas as pd

In [13]:
# Locations of the CSV annotations and the image files, relative to this notebook.
DATA_DIR = "../data_csv/"
IMAGES_DIR = "../data_images/"

# Ground-truth annotation file for Timor-Leste.
timor_leste_data_path = os.path.join(DATA_DIR, "timor-leste.csv")

# Load the annotations and keep only the columns used below.
# skiprows=1 drops the first physical line of the file, so the header row
# (header=0) is read from the second line.
df_tl_ann = pd.read_csv(
    timor_leste_data_path,
    encoding="utf-8-sig",
    header=0,
    skiprows=1,
)[["image_file", "catch_name_en", "Species_name", "Family"]]


In [12]:
# Species of interest: only annotations whose Species_name is in this list are kept.
relevant_species = [
    'Alectis ciliaris', 'Aphareus rutilans',
    'Caranx ignobilis', 'Caranx lugubris', 'Caranx melampygus', 'Caranx sexfasciatus',
    'Chirocentrus dorab', 'Chirocentrus nudus',
    'Decapterus macrosoma', 'Elagatis bipinnulata',
    'Epinephelus maculatus', 'Epinephelus radiatus',
    'Etelis carbunculus', 'Gymnocranius grandoculis', 'Katsuwonus pelamis',
    'Lethrinus atkinsoni', 'Lethrinus erythracanthus', 'Lethrinus obsoletus', 'Lethrinus ornatus',
    'Lutjanus bohar', 'Lutjanus fulviflamma', 'Lutjanus fulvus', 'Lutjanus gibbus',
    'Lutjanus johnii', 'Lutjanus kasmira', 'Lutjanus rivulatus', 'Lutjanus russellii',
    'Lutjanus timoriensis',
    'Monotaxis grandoculis', 'Psettodes erumei', 'Rastrelliger kanagurta',
    'Sardinella albella', 'Scolopsis lineata', 'Scolopsis vosmeri',
    'Scomberoides lysan', 'Scomberomorus commerson', 'Seriola dumerili',
    'Variola albimarginata',
]

# Reload the Timor-Leste annotations so this cell starts from the unfiltered frame.
df_tl_ann = pd.read_csv(
    timor_leste_data_path,
    encoding="utf-8-sig",
    header=0,
    skiprows=1,
)[["image_file", "catch_name_en", "Species_name", "Family"]]

# Row mask 1: the annotated species is one of the relevant ones.
is_relevant = df_tl_ann["Species_name"].isin(relevant_species)

# Row mask 2: the referenced image file is actually present in IMAGES_DIR.
existing_files = set(os.listdir(IMAGES_DIR))
is_on_disk = df_tl_ann["image_file"].isin(existing_files)

df_filtered = df_tl_ann[is_relevant & is_on_disk]

# Filenames of the surviving rows (one entry per row, so a file can appear
# more than once; compare the last two counts printed below).
filtered_images = df_filtered["image_file"].to_list()

print("Total annotated images:", df_tl_ann.shape[0])
print("Relevant images", df_filtered.shape[0])
print("Unique final JPGs:", len(set(filtered_images)))


Total annotated images: 603
Relevant images 248
Unique final JPGs: 217


In [14]:
# Paste one class name per line between the triple quotes.
raw = """

"""

# Trim every line and discard the ones that end up empty.
classes = [entry for entry in map(str.strip, raw.splitlines()) if entry]

# 2) Format: "Genus species" (first word capitalized, others lowercase)
def format_name(name: str) -> str:
    """Turn an underscore-separated name into a space-separated one.

    The token before the first underscore is capitalized (first letter upper,
    rest lower); every following token is lowercased. Names that contain no
    underscore (codes, single tokens) are returned unchanged.
    """
    genus, underscore, remainder = name.partition("_")
    if not underscore:
        # No separator at all: leave codes / single tokens untouched.
        return name

    tokens = [genus.capitalize()]
    tokens.extend(token.lower() for token in remainder.split("_"))
    return " ".join(tokens)

# Apply format_name to every parsed class name, preserving order.
input_species = list(map(format_name, classes))

In [15]:
# Distinct English catch names in the annotations: missing values dropped,
# values cast to str and stripped of surrounding whitespace.
registered_species = set(
    df_tl_ann["catch_name_en"]
    .dropna()
    .astype(str)
    .str.strip()
    .unique()
)

# Common names to look up in the dataset.
input_species = [
    "Blacksaddle filefish",
    "Blue streak cleaner wrasse",
    "Butterfly fish",
    "Checkerboard wrasse",
    "Clark's anemonefish",
    "Commerson's frogfish",
    "Lionfish",
    "Longfin bannerfish",
    "Manybar goatfish",
    "Moorish idol",
    "Pyramid butterflyfish",
    "Redtoothed triggerfish",
    "Reticulate dascyllus",
    "Ribboned sweetlips",
    "Sea goldie",
    "Sergeant major fish",
    "Teira batfish",
    "Weber's chromis",
]

# Normalise whitespace and collapse to a set for the comparisons below.
input_species = {candidate.strip() for candidate in input_species}

# Exact-string set comparison between the two name collections.
intersection = registered_species & input_species
only_in_input = input_species.difference(registered_species)
only_in_registered = registered_species.difference(input_species)

# Report the three partitions.
print("Species found in BOTH input and dataset:")
print(sorted(intersection), "\n")

print("Species in INPUT but NOT in dataset:")
print(sorted(only_in_input), "\n")

print("Species in dataset but NOT in input list:")
print(f"(Showing first 20) {sorted(only_in_registered)[:20]}")


Species found in BOTH input and dataset:
[] 

Species in INPUT but NOT in dataset:
['Blacksaddle filefish', 'Blue streak cleaner wrasse', 'Butterfly fish', 'Checkerboard wrasse', "Clark's anemonefish", "Commerson's frogfish", 'Lionfish', 'Longfin bannerfish', 'Manybar goatfish', 'Moorish idol', 'Pyramid butterflyfish', 'Redtoothed triggerfish', 'Reticulate dascyllus', 'Ribboned sweetlips', 'Sea goldie', 'Sergeant major fish', 'Teira batfish', "Weber's chromis"] 

Species in dataset but NOT in input list:
(Showing first 20) ['Barracuda & Jacks/Trevally/Other Scad', 'Barracuda & Jacks/Trevally/Other Scad & Jacks/Trevally/Other Scad', 'Barracuda & Jacks/Trevally/Other Scad & Snapper/seaperch', 'Bream & Emperor & Snapper/seaperch & Unknown & Unknown', 'Bream & Grouper & Unknown', 'Bream & Jacks/Trevally/Other Scad', 'Bream & Jacks/Trevally/Other Scad & Stingrays', 'Bream & Snapper/seaperch', 'Emperor', 'Emperor & Goatfish & Spinefoot & Tuna/Bonito/Other Mackerel', 'Emperor & Jacks/Trevally