In [1]:
import os
import pandas as pd

In [2]:
# Directories holding the annotation CSVs and the image files, relative to
# the notebook's working directory.
DATA_DIR = "../data_csv/"
IMAGES_DIR = "../data_images/"

# CSV with the ground-truth annotation info for Timor-Leste.
timor_leste_data_path = os.path.join(DATA_DIR, "timor-leste.csv")

# Load the annotations and keep only the columns used downstream.
# skiprows=1 drops the first line of the file, so header=0 takes the column
# names from the line that follows it; "utf-8-sig" tolerates a leading BOM.
df_tl_ann = pd.read_csv(
    timor_leste_data_path,
    encoding="utf-8-sig",
    header=0,
    skiprows=1,
)[["image_file", "catch_name_en", "Species_name", "Family"]]


In [3]:
# Keep only the annotation rows that (a) belong to a species of interest and
# (b) point at an image file that is actually present in IMAGES_DIR.
relevant_species = [
    'Alectis ciliaris', 'Aphareus rutilans', 'Caranx ignobilis', 'Caranx lugubris',
    'Caranx melampygus', 'Caranx sexfasciatus', 'Chirocentrus dorab', 'Chirocentrus nudus',
    'Decapterus macrosoma', 'Elagatis bipinnulata', 'Epinephelus maculatus', 'Epinephelus radiatus',
    'Etelis carbunculus', 'Gymnocranius grandoculis', 'Katsuwonus pelamis', 'Lethrinus atkinsoni',
    'Lethrinus erythracanthus', 'Lethrinus obsoletus', 'Lethrinus ornatus', 'Lutjanus bohar',
    'Lutjanus fulviflamma', 'Lutjanus fulvus', 'Lutjanus gibbus', 'Lutjanus johnii',
    'Lutjanus kasmira', 'Lutjanus rivulatus', 'Lutjanus russellii', 'Lutjanus timoriensis',
    'Monotaxis grandoculis', 'Psettodes erumei', 'Rastrelliger kanagurta', 'Sardinella albella',
    'Scolopsis lineata', 'Scolopsis vosmeri', 'Scomberoides lysan', 'Scomberomorus commerson',
    'Seriola dumerili', 'Variola albimarginata',
]

# df_tl_ann is the annotation table loaded in the previous cell; it is reused
# here instead of reading and sub-setting the same CSV a second time.

# Match on whitespace-trimmed names, the same normalisation the species
# comparison cell applies, so a padded "Genus species " entry is not silently
# dropped. astype(str) turns missing values into "nan", which never matches.
is_relevant = df_tl_ann["Species_name"].astype(str).str.strip().isin(relevant_species)

# Keep only images that actually exist on disk.
existing_files = set(os.listdir(IMAGES_DIR))
has_image = df_tl_ann["image_file"].isin(existing_files)

# Both conditions are row-wise masks over df_tl_ann, so they can be combined
# in one selection; the original row index is preserved.
df_filtered = df_tl_ann[is_relevant & has_image]

# Convert to list of filenames (one entry per annotation row, so a file that
# is annotated several times appears several times).
filtered_images = df_filtered["image_file"].tolist()

print("Total annotated images:", len(df_tl_ann))
# NOTE: this is a row count, so it can exceed the unique-file count below.
print("Relevant images", len(df_filtered))
print("Unique final JPGs:", len(set(filtered_images)))


Total annotated images: 603
Relevant images 248
Unique final JPGs: 217


In [4]:
# Class labels, one per line, written as "Genus_species". The list also
# contains the single-token placeholder "Unlabeled".
raw = """
Ablabys_taenianotus
Abudefduf_notatus
Abudefduf_sexfasciatus
Abudefduf_sordidus
Abudefduf_troschelii
Abudefduf_vaigiensis
Acanthurus_blochii
Acanthurus_triostegus
Antennatus_dorehensis
Arothron_hispidus
Atherinomorus_lacunosus
Bathygobius_cotticeps
Blenniella_bilitonensis
Blenniella_paula
Chaetodon_auriga
Chaetodon_ephippium
Chaetodon_lunula
Chaetodon_melannotus
Cheilinus_chlorourus
Cheilinus_trilobatus
Chromis_margaritifer
Chrysiptera_biocellata
Chrysiptera_brownriggii
Chrysiptera_unimaculata
Corythoichthys_haematopterus
Echidna_nebulosa
Encrasicholina_punctifer
Entomacrodus_decussatus
Entomacrodus_striatus
Epinephelus_merra
Epinephelus_quoyanus
Grammistes_sexlineatus
Gymnothorax_fimbriatus
Gymnothorax_flavimarginatus
Gymnothorax_gracilicauda
Gymnothorax_javanicus
Gymnothorax_kidako
Gymnothorax_undulatus
Halichoeres_argus
Halichoeres_marginatus
Halichoeres_nebulosus
Halichoeres_trimaculatus
Helcogramma_vulcanum
Hemigymnus_melapterus
Istiblennius_dussumieri
Istiblennius_edentulus
Istiblennius_meleagris
Istigobius_ornatus
Lethrinus_ornatus
Lutjanus_ehrenbergii
Myrichthys_maculosus
Myripristis_hexagona
Nectamia_fusca
Neoniphon_sammara
Novaculichthys_taeniourus
Omobranchus_elongatus
Ostorhinchus_cookii
Ostorhinchus_novemfasciatus
Parapercis_millepunctata
Parascorpaena_mossambica
Parupeneus_barberinoides
Parupeneus_barberinus
Parupeneus_indicus
Platax_orbicularis
Plectorhinchus_gibbosus
Plectroglyphidodon_leucozonus
Plesiops_coeruleolineatus
Pseudochromis_marshallensis
Pterois_volitans
Salarias_fasciatus
Salarias_guttatus
Sargocentron_rubrum
Scolopsis_bilineata
Scorpaenodes_guamensis
Siganus_fuscescens
Soleichthys_heterorhinos
Stethojulis_bandanensis
Stethojulis_strigiventer
Stethojulis_terina
Synodus_binotatus
Thalassoma_hardwicke
Unlabeled
Valenciennea_longipinnis
"""

# Trim every line and discard the ones that end up empty (the literal starts
# and ends with a newline).
classes = [label for label in map(str.strip, raw.splitlines()) if label]

# 2) Format: "Genus species" (first word capitalized, others lowercase)
def format_name(name: str) -> str:
    """Turn an underscore-separated label into a space-separated name.

    The text before the first underscore is capitalised (first character
    upper-case, the rest lower-case); every following underscore-separated
    token is lower-cased; the tokens are then joined with single spaces.

    Args:
        name: Label such as "Genus_species".

    Returns:
        The reformatted name, e.g. "Genus species". A label that contains no
        underscore is returned unchanged, including its original casing.
    """
    genus, separator, remainder = name.partition("_")

    # No underscore at all: treat the label as a code / single token.
    if not separator:
        return name

    lowered_rest = (token.lower() for token in remainder.split("_"))
    return " ".join([genus.capitalize(), *lowered_rest])

# Class labels rewritten as "Genus species", in the same order as `classes`.
input_species = list(map(format_name, classes))

In [6]:
# Distinct species names in the annotation table: missing values dropped,
# values cast to str and stripped of surrounding whitespace.
registered_species = set(
    df_tl_ann["Species_name"].dropna().astype(str).str.strip()
)

# Rebind input_species to a set of stripped names so set algebra can be used.
input_species = {name.strip() for name in input_species}

# Overlap and the two one-sided differences between the lists.
intersection = registered_species & input_species
only_in_input = input_species - registered_species
only_in_registered = registered_species - input_species

# Report, sorted alphabetically for stable output.
print("Species found in BOTH input and dataset:")
print(sorted(intersection), "\n")

print("Species in INPUT but NOT in dataset:")
print(sorted(only_in_input), "\n")

print("Species in dataset but NOT in input list:")
print(f"(Showing first 20) {sorted(only_in_registered)[:20]}")


Species found in BOTH input and dataset:
['Lethrinus ornatus', 'Neoniphon sammara', 'Parupeneus indicus', 'Siganus fuscescens'] 

Species in INPUT but NOT in dataset:
['Ablabys taenianotus', 'Abudefduf notatus', 'Abudefduf sexfasciatus', 'Abudefduf sordidus', 'Abudefduf troschelii', 'Abudefduf vaigiensis', 'Acanthurus blochii', 'Acanthurus triostegus', 'Antennatus dorehensis', 'Arothron hispidus', 'Atherinomorus lacunosus', 'Bathygobius cotticeps', 'Blenniella bilitonensis', 'Blenniella paula', 'Chaetodon auriga', 'Chaetodon ephippium', 'Chaetodon lunula', 'Chaetodon melannotus', 'Cheilinus chlorourus', 'Cheilinus trilobatus', 'Chromis margaritifer', 'Chrysiptera biocellata', 'Chrysiptera brownriggii', 'Chrysiptera unimaculata', 'Corythoichthys haematopterus', 'Echidna nebulosa', 'Encrasicholina punctifer', 'Entomacrodus decussatus', 'Entomacrodus striatus', 'Epinephelus merra', 'Epinephelus quoyanus', 'Grammistes sexlineatus', 'Gymnothorax fimbriatus', 'Gymnothorax flavimarginatus', '