In [1]:
!pip install -q mutagen

### Clean the dataset

In [19]:
from pathlib import Path
import os
import re
import pandas as pd
import math
import statistics

ROOT = Path("./all_data").resolve()

# Define allowed audio file extensions
ALLOWED_EXTS = {".wav", ".mp3", ".flac", ".m4a", ".ogg", ".aac", ".csv"}

print("Scanning root:", ROOT)

Scanning root: /Users/jago/Desktop/naturalis/data_code/all_data


### Remove wrong file extension names

In [4]:
pattern = re.compile(r"(\.(wav|mp3|flac|m4a|ogg|aac))\?.*?$", re.IGNORECASE)

renamed = []

for f in ROOT.rglob("*"):
    if f.is_file():
        match = pattern.search(f.name)
        if match:
            new_name = re.sub(r"\?.*$", "", f.name)
            new_path = f.with_name(new_name)
            if not new_path.exists():
                f.rename(new_path)
                renamed.append((f.name, new_name))

print(f"Renamed {len(renamed)} files")
if renamed:
    print("Example renames:")
    for old, new in renamed[:10]:
        print(" ", old, "→", new)


Renamed 3804 files
Example renames:
  1076423.mp3?1717453221 → 1076423.mp3
  1581050.wav?1753435098 → 1581050.wav
  1551847.m4a?1751483931 → 1551847.m4a
  1151253.wav?1722685643 → 1151253.wav
  1042369.mp3?1715920450 → 1042369.mp3
  1077567.m4a?1717506924 → 1077567.m4a
  451895.mp3?1653594683 → 451895.mp3
  1075749.m4a?1717423713 → 1075749.m4a
  71729.m4a?1587307780 → 71729.m4a
  488976.m4a?1656808456 → 488976.m4a


In [9]:
to_delete = []

for f in ROOT.rglob("*"):
    if f.is_file():
        ext = f.suffix.lower()
        if ext not in ALLOWED_EXTS:
            to_delete.append(f)

print(f"Found {len(to_delete)} non-audio files")

# Uncomment this block to actually delete
for f in to_delete:
    try:
        # print(f) to check first, if you are sure, then do the next line
        os.remove(f)
    except Exception as e:
        print("Error deleting", f, e)

print("Deleted non-audio files ✅")


Found 648 non-audio files
Deleted non-audio files ✅


In [4]:
# Try to import mutagen for duration reading (no decoding required)
try:
    from mutagen import File as MutagenFile
    HAVE_MUTAGEN = True
except Exception:
    HAVE_MUTAGEN = False

HAVE_MUTAGEN

# Get combined durations of recordings
def is_hidden(p: Path) -> bool:
    return p.name.startswith(".")

def hms(seconds: float) -> str:
    """Convert seconds → HH:MM:SS (rounded)."""
    if seconds is None or (isinstance(seconds, float) and math.isnan(seconds)):
        return "00:00:00"
    seconds = int(round(seconds))
    h = seconds // 3600
    m = (seconds % 3600) // 60
    s = seconds % 60
    return f"{h:02d}:{m:02d}:{s:02d}"

def get_duration_seconds(path: Path):
    """Return duration using mutagen if available, else None."""
    if not HAVE_MUTAGEN:
        return None
    try:
        m = MutagenFile(path.as_posix())
        if m is None:
            return None
        dur = getattr(m.info, "length", None)
        return float(dur) if dur is not None else None
    except Exception:
        return None

In [13]:
records = []  # (rel_path, class, ext, duration_seconds)

species_dirs = sorted([p for p in ROOT.iterdir() if p.is_dir() and not is_hidden(p)])
len(species_dirs), [d.name for d in species_dirs]  # quick peek

(54,
 ['Alytes_almogavarii',
  'Alytes_cisternasii',
  'Alytes_dickhilleni',
  'Alytes_muletensis',
  'Alytes_obstetricans',
  'Bombina_bombina',
  'Bombina_variegata',
  'Bufo_bufo',
  'Bufo_spinosus',
  'Bufotes_balearicus',
  'Bufotes_boulengeri',
  'Bufotes_cypriensis',
  'Bufotes_oblongus',
  'Bufotes_viridis',
  'Epidalea_calamita',
  'Hyla',
  'Hyla_arborea',
  'Hyla_intermedia',
  'Hyla_meridionalis',
  'Hyla_molleri',
  'Hyla_orientalis',
  'Hyla_sarda',
  'Hyla_savignyi',
  'Hyperolius_concolor',
  'Lithobates_catesbeianus',
  'Mystery_mystery',
  'Pelobates_cultripes',
  'Pelobates_fuscus',
  'Pelobates_vespertinus',
  'Pelodytes_atlanticus',
  'Pelodytes_hespericus',
  'Pelodytes_ibericus',
  'Pelodytes_punctatus',
  'Pelophylax',
  "Pelophylax_'esculentus'",
  "Pelophylax_'grafi'",
  'Pelophylax_bedriagae',
  'Pelophylax_bergeri',
  'Pelophylax_cretensis',
  'Pelophylax_cypriensis',
  'Pelophylax_epeiroticus',
  'Pelophylax_esculentus',
  'Pelophylax_kurtmuelleri',
  'Pelo

In [14]:
for species_dir in species_dirs:
    cls = species_dir.name  # species label == folder name
    # scan recursively under each species folder
    for f in species_dir.rglob("*"):
        if f.is_file() and (f.suffix.lower() in ALLOWED_EXTS) and not is_hidden(f):
            rel = f.relative_to(ROOT).as_posix()
            ext = f.suffix.lower()
            dur = get_duration_seconds(f)
            records.append((rel, cls, ext, dur))

len(records)

6824

In [17]:
df = pd.DataFrame(records, columns=["path", "class", "ext", "duration_seconds"])
display(df.head())

Unnamed: 0,path,class,ext,duration_seconds
0,Alytes_almogavarii/396614.m4a,Alytes_almogavarii,.m4a,10.820499
1,Alytes_almogavarii/198483.m4a,Alytes_almogavarii,.m4a,15.627029
2,Alytes_almogavarii/707099.m4a,Alytes_almogavarii,.m4a,10.1239
3,Alytes_almogavarii/XC971001-ALYALM030125MixPre...,Alytes_almogavarii,.wav,52.343946
4,Alytes_almogavarii/781699.m4a,Alytes_almogavarii,.m4a,18.947483


In [21]:
summary = (
    df.groupby("class")["duration_seconds"]
      .agg(
          n_recordings="count",
          total_seconds=lambda x: sum(d for d in x if d is not None and not math.isnan(d)),
          mean_seconds=lambda x: statistics.mean(d for d in x if d is not None and not math.isnan(d)),
          median_seconds=lambda x: statistics.median(d for d in x if d is not None and not math.isnan(d))
      )
      .reset_index()
)

summary["total_hhmmss"] = summary["total_seconds"].apply(hms)
summary = summary[["class", "n_recordings", "total_seconds", "total_hhmmss", "mean_seconds", "median_seconds"]]

display(summary.head(10))


Unnamed: 0,class,n_recordings,total_seconds,total_hhmmss,mean_seconds,median_seconds
0,Alytes_almogavarii,18,663.132847,00:11:03,36.840714,26.248998
1,Alytes_cisternasii,24,1895.613227,00:31:36,78.983884,46.550292
2,Alytes_dickhilleni,10,1448.538571,00:24:09,144.853857,135.454649
3,Alytes_muletensis,2,192.828396,00:03:13,96.414198,96.414198
4,Alytes_obstetricans,428,13938.665894,03:52:19,32.566976,23.182367
5,Bombina_bombina,597,16789.168814,04:39:49,28.122561,19.147687
6,Bombina_variegata,139,5633.808328,01:33:54,40.530995,17.242177
7,Bufo_bufo,250,35236.839721,09:47:17,140.947359,14.79565
8,Bufo_spinosus,18,477.444849,00:07:57,26.524714,17.701833
9,Bufotes_balearicus,1,47.256,00:00:47,47.256,47.256


In [23]:
# Display just species, number of recordings, and total seconds
print(summary[["class", "n_recordings", "total_seconds"]].to_string(index=False))

# Save the 3-column summary to a CSV file
short_summary = summary[["class", "n_recordings", "total_seconds"]]
out_path = ROOT / "species_recording_counts.csv"
short_summary.to_csv(out_path, index=False)

print(f"Saved summary to: {out_path}")


                  class  n_recordings  total_seconds
     Alytes_almogavarii            18     663.132847
     Alytes_cisternasii            24    1895.613227
     Alytes_dickhilleni            10    1448.538571
      Alytes_muletensis             2     192.828396
    Alytes_obstetricans           428   13938.665894
        Bombina_bombina           597   16789.168814
      Bombina_variegata           139    5633.808328
              Bufo_bufo           250   35236.839721
          Bufo_spinosus            18     477.444849
     Bufotes_balearicus             1      47.256000
     Bufotes_boulengeri             1      99.516644
     Bufotes_cypriensis             3      50.536010
       Bufotes_oblongus             1      14.604250
        Bufotes_viridis           407   11980.044464
      Epidalea_calamita           485   18448.420615
                   Hyla             2     109.056000
           Hyla_arborea           853   27901.142677
        Hyla_intermedia           182    3919.

### Read the cleaned dataset and continue

In [2]:
from pathlib import Path
import os
import re
import pandas as pd
import math
import statistics

ROOT = Path("./all_data").resolve()
print("Scanning root:", ROOT)

Scanning root: /Users/jago/Desktop/naturalis/data_code/all_data


In [3]:
records = []  # (rel_path, class, ext, duration_seconds)

species_dirs = sorted([p for p in ROOT.iterdir() if p.is_dir()])
len(species_dirs), [d.name for d in species_dirs]  # quick peek

(54,
 ['Alytes_almogavarii',
  'Alytes_cisternasii',
  'Alytes_dickhilleni',
  'Alytes_muletensis',
  'Alytes_obstetricans',
  'Bombina_bombina',
  'Bombina_variegata',
  'Bufo_bufo',
  'Bufo_spinosus',
  'Bufotes_balearicus',
  'Bufotes_boulengeri',
  'Bufotes_cypriensis',
  'Bufotes_oblongus',
  'Bufotes_viridis',
  'Epidalea_calamita',
  'Hyla',
  'Hyla_arborea',
  'Hyla_intermedia',
  'Hyla_meridionalis',
  'Hyla_molleri',
  'Hyla_orientalis',
  'Hyla_sarda',
  'Hyla_savignyi',
  'Hyperolius_concolor',
  'Lithobates_catesbeianus',
  'Mystery_mystery',
  'Pelobates_cultripes',
  'Pelobates_fuscus',
  'Pelobates_vespertinus',
  'Pelodytes_atlanticus',
  'Pelodytes_hespericus',
  'Pelodytes_ibericus',
  'Pelodytes_punctatus',
  'Pelophylax',
  "Pelophylax_'esculentus'",
  "Pelophylax_'grafi'",
  'Pelophylax_bedriagae',
  'Pelophylax_bergeri',
  'Pelophylax_cretensis',
  'Pelophylax_cypriensis',
  'Pelophylax_epeiroticus',
  'Pelophylax_esculentus',
  'Pelophylax_kurtmuelleri',
  'Pelo

In [5]:
for species_dir in species_dirs:
    cls = species_dir.name  # species label == folder name
    # scan recursively under each species folder
    for f in species_dir.rglob("*"):
        if f.is_file():
            rel = f.relative_to(ROOT).as_posix()
            ext = f.suffix.lower()
            dur = get_duration_seconds(f)
            records.append((rel, cls, ext, dur))

len(records)

6824

In [6]:
df = pd.DataFrame(records, columns=["path", "class", "ext", "duration_seconds"])
display(df.head())

Unnamed: 0,path,class,ext,duration_seconds
0,Alytes_almogavarii/396614.m4a,Alytes_almogavarii,.m4a,10.820499
1,Alytes_almogavarii/198483.m4a,Alytes_almogavarii,.m4a,15.627029
2,Alytes_almogavarii/707099.m4a,Alytes_almogavarii,.m4a,10.1239
3,Alytes_almogavarii/XC971001-ALYALM030125MixPre...,Alytes_almogavarii,.wav,52.343946
4,Alytes_almogavarii/781699.m4a,Alytes_almogavarii,.m4a,18.947483


### Convert m4a to mp3

In [None]:
count_m4a = df['ext'].eq('.m4a').sum()
print("m4a count:", count_m4a)

m4a count: 1885
0       .m4a
1       .m4a
2       .m4a
3       .wav
4       .m4a
        ... 
6819    .wav
6820    .mp3
6821    .wav
6822    .wav
6823    .mp3
Name: ext, Length: 6824, dtype: object


In [10]:
!pip install pydub



In [13]:
from pydub import AudioSegment

converted = 0
for f in ROOT.rglob("*.m4a"):
    mp3_path = f.with_suffix(".mp3")
    if not mp3_path.exists():  # skip if already converted
        sound = AudioSegment.from_file(f, format="m4a")
        sound.export(mp3_path, format="mp3", bitrate="320k")
        converted += 1

print(f"Converted {converted} files to MP3 ✅")

Converted 1885 files to MP3 ✅


In [None]:
for f in ROOT.rglob("*.m4a"):
    f.unlink() # delete original m4a file

m4a count: 1885


### Mono vs stereo sound

count occurences of mono vs stereo

In [19]:
from mutagen import File as MutagenFile
from pathlib import Path

ROOT = Path("./all_data").resolve()
mono, stereo = 0, 0

for f in ROOT.rglob("*.*"):
    try:
        m = MutagenFile(f)
        ch = getattr(m.info, "channels", None)
        if ch == 1:
            mono += 1
        elif ch == 2:
            stereo += 1
    except Exception:
        pass

print(f"Mono: {mono}  |  Stereo: {stereo}")


Mono: 3846  |  Stereo: 2890


convert stereo -> mono

In [20]:
for f in ROOT.rglob("*"):
    if f.is_file():
        try:
            audio = AudioSegment.from_file(f)
            if audio.channels == 2:  # stereo
                print(f"Converting to mono → {f.name}")
                mono_audio = audio.set_channels(1)
                # overwrite original file safely
                mono_audio.export(f, format=f.suffix.lower().replace('.', ''), bitrate="320k")
                converted += 1
        except Exception as e:
            print("Error with", f, ":", e)

print(f"\n✅ Converted {converted} stereo files to mono")

Error with /Users/jago/Desktop/naturalis/data_code/all_data/species_recording_counts.csv : Decoding failed. ffmpeg returned error code: 183

Output from ffmpeg/avlib:

ffmpeg version 8.0.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.6)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/8.0.1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontcon

### Deduplication

In [22]:
import hashlib
def md5_hash(path, chunk_size=8192):
    """Compute MD5 hash of a file in chunks."""
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            md5.update(chunk)
    return md5.hexdigest()

records = []
for f in ROOT.rglob("*"):
    if f.is_file():
        try:
            h = md5_hash(f)
            records.append((f.relative_to(ROOT).as_posix(), h))
        except Exception as e:
            print("Error reading:", f, e)

df_md5 = pd.DataFrame(records, columns=["path", "md5"])
df_md5.head()

Unnamed: 0,path,md5
0,species_recording_counts.csv,bf4a9fd7caf68e01c9b5c88446b6f098
1,species_counts.csv,49cba9f47ecb2d49cacd51cfd00b5b7b
2,combined_list_files.csv,6efccdcfbc86c0846878b0a7809c8264
3,species_counts_by_source.csv,1e0702270a2fbf14c57d73fb2cbd0eec
4,species_counts_pivot.csv,292e9d9d6866e47bc8827a8a6a31ac94


In [23]:
# Group by hash and find those with >1 occurrence
dupes = df_md5[df_md5.duplicated("md5", keep=False)].sort_values("md5")

if not dupes.empty:
    print(f"Found {dupes['md5'].nunique()} duplicate groups ({len(dupes)} files total).")
else:
    print("No exact duplicates found.")

dupes.head(10)

Found 178 duplicate groups (365 files total).


Unnamed: 0,path,md5
4660,Pelophylax_lessonae/213335.mp3,01ff2d0c75f9b3f34e86ab95605a0c0b
4655,Pelophylax_lessonae/213334.mp3,01ff2d0c75f9b3f34e86ab95605a0c0b
6324,Epidalea_calamita/236357.mp3,024408162b5bee7065968b4848e5b06e
6509,Epidalea_calamita/236359.mp3,024408162b5bee7065968b4848e5b06e
6507,Epidalea_calamita/236358.mp3,024408162b5bee7065968b4848e5b06e
3240,Bombina_bombina/1442984.wav,026bd6e05ccef9eb57fb20bbfa21e7c6
3295,Bombina_bombina/1442983.wav,026bd6e05ccef9eb57fb20bbfa21e7c6
6389,Epidalea_calamita/241373.wav,027e50192b2beade46d50d442ac1ad37
6325,Epidalea_calamita/241374.wav,027e50192b2beade46d50d442ac1ad37
3347,Bombina_bombina/698104.mp3,047a10d92ee37cdf1f446eff1463837b


In [24]:
# report duplicated to a file
out_path = ROOT / "duplicate_files_by_md5.csv"
dupes.to_csv(out_path, index=False)
print(f"Duplicate report saved to: {out_path}")

Duplicate report saved to: /Users/jago/Desktop/naturalis/data_code/all_data/duplicate_files_by_md5.csv


In [25]:
# remove duplicates
# Make sure df_md5 exists (with columns path, md5)
assert "df_md5" in locals(), "Please run the MD5 hashing step first."

# Find duplicates
duplicates = df_md5[df_md5.duplicated("md5", keep=False)]
to_delete = []

for md5_hash, group in duplicates.groupby("md5"):
    files = group["path"].tolist()
    if len(files) > 1:
        # Keep the first file and delete the rest
        keep = files[0]
        remove = files[1:]
        for rel_path in remove:
            f = ROOT / rel_path
            if f.exists():
                to_delete.append(f)

print(f"Found {len(to_delete)} duplicate files to delete.")

# Uncomment to preview instead of deleting
for f in to_delete[:10]:
    print("Would delete:", f)

# --- Actually delete duplicates ---
confirm = input("Type 'yes' to confirm deletion of duplicates: ")
if confirm.lower() == "yes":
    for f in to_delete:
        try:
            f.unlink()
        except Exception as e:
            print("Error deleting", f, ":", e)
    print(f"✅ Deleted {len(to_delete)} duplicate files.")
else:
    print("❌ Deletion cancelled.")

Found 187 duplicate files to delete.
Would delete: /Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax_lessonae/213335.mp3
Would delete: /Users/jago/Desktop/naturalis/data_code/all_data/Epidalea_calamita/236358.mp3
Would delete: /Users/jago/Desktop/naturalis/data_code/all_data/Epidalea_calamita/236359.mp3
Would delete: /Users/jago/Desktop/naturalis/data_code/all_data/Bombina_bombina/1442983.wav
Would delete: /Users/jago/Desktop/naturalis/data_code/all_data/Epidalea_calamita/241373.wav
Would delete: /Users/jago/Desktop/naturalis/data_code/all_data/Bufotes_viridis/698106.mp3
Would delete: /Users/jago/Desktop/naturalis/data_code/all_data/Hyla_orientalis/31622.mp3
Would delete: /Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax_perezi/290922.mp3
Would delete: /Users/jago/Desktop/naturalis/data_code/all_data/Hyla_meridionalis/XC893219-Meridionale-240521.wav
Would delete: /Users/jago/Desktop/naturalis/data_code/all_data/Pelobates_fuscus/967763.mp3
✅ Deleted 187 duplicate fil

### Serial recordings

In [59]:
from datetime import timedelta
import pandas as pd
import numpy as np

# Adjust these paths to your actual folder structure:
BASE = Path(".").resolve()

INAT_OCC = BASE / "iNaturalist" / "occurrence.txt"
OBS_OCC  = BASE / "observation_org" / "occurrence.txt"
XC_META  = BASE / "xeno_canto" / "metadata.csv"

MIN_RECORDS = 50  # species threshold for all three sources

In [67]:
def load_darwin_occ(occ_path: Path, source_name: str) -> pd.DataFrame:
    """
    Load a Darwin Core occurrence.txt file and return a DataFrame with:
      class, recordedBy, decimalLatitude, decimalLongitude, timestamp, source
    """
    occ = pd.read_csv(occ_path, sep="\t", dtype=str, low_memory=False)
    
    # Make sure expected columns exist
    for col in ["genus", "specificEpithet", "recordedBy",
                "decimalLatitude", "decimalLongitude", "eventDate"]:
        if col not in occ.columns:
            occ[col] = np.nan

    df = pd.DataFrame({
        "gbifID": occ["gbifID"],
        "genus": occ["genus"].astype(str).str.strip(),
        "specificEpithet": occ["specificEpithet"].astype(str).str.strip(),
        "recordedBy": occ["recordedBy"].astype(str).str.strip(),
        "decimalLatitude": pd.to_numeric(occ["decimalLatitude"], errors="coerce"),
        "decimalLongitude": pd.to_numeric(occ["decimalLongitude"], errors="coerce"),
        "eventDate_raw": occ["eventDate"],
    })
    
    # Species label: Genus_species (or just Genus if species missing)
    df["class"] = np.where(
        df["specificEpithet"].notna() & (df["specificEpithet"] != ""),
        df["genus"] + "_" + df["specificEpithet"],
        df["genus"],
    )
    
    # Parse timestamp (UTC); Darwin Core eventDate can be date or datetime
    df["timestamp"] = pd.to_datetime(df["eventDate_raw"], errors="coerce", utc=True)
    
    df["source"] = source_name
    
    # Drop rows with no species at all
    df = df[df["class"].notna() & (df["class"] != "")]
    
    return df[["gbifID", "class", "recordedBy", "decimalLatitude", "decimalLongitude", "timestamp", "source"]]


In [47]:
def load_xc_metadata(csv_path: Path) -> pd.DataFrame:
    """
    Load Xeno-canto metadata.csv and return:
      class, recordedBy, decimalLatitude, decimalLongitude, timestamp, source
    """
    xc = pd.read_csv(csv_path, dtype=str, low_memory=False)

    # Make sure expected columns exist
    for col in ["gen", "sp", "rec", "lat", "lon", "date", "time", "uploaded"]:
        if col not in xc.columns:
            xc[col] = np.nan

    def combine_datetime(row):
        # Prefer date + time, then date, then uploaded
        if pd.notna(row["date"]) and pd.notna(row["time"]):
            dt = pd.to_datetime(row["date"] + " " + row["time"], errors="coerce", utc=True)
        elif pd.notna(row["date"]):
            dt = pd.to_datetime(row["date"], errors="coerce", utc=True)
        else:
            dt = pd.to_datetime(row["uploaded"], errors="coerce", utc=True)
        return dt

    df = pd.DataFrame({
        "genus": xc["gen"].astype(str).str.strip(),
        "specificEpithet": xc["sp"].astype(str).str.strip(),
        "recordedBy": xc["rec"].astype(str).str.strip(),
        "decimalLatitude": pd.to_numeric(xc["lat"], errors="coerce"),
        "decimalLongitude": pd.to_numeric(xc["lon"], errors="coerce"),
    })
    df["timestamp"] = xc.apply(combine_datetime, axis=1)
    
    df["class"] = np.where(
        df["specificEpithet"].notna() & (df["specificEpithet"] != ""),
        df["genus"] + "_" + df["specificEpithet"],
        df["genus"],
    )

    df["source"] = "xeno_canto"
    df = df[df["class"].notna() & (df["class"] != "")]
    
    return df[["class", "recordedBy", "decimalLatitude", "decimalLongitude", "timestamp", "source"]]


In [80]:
def detect_serial_for_source(df_source: pd.DataFrame,
                             min_records: int = 100,
                             loc_round_decimals: int = 3) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    df_source must have columns:
      class, recordedBy, decimalLatitude, decimalLongitude, timestamp, source

    Returns:
      df_marked  - original df_source with columns added:
                   lat_round, lon_round, loc_key, keep_flag, serial_flag
      stats      - per-species summary: total, kept_valid, serial_removed
    """
    df = df_source.copy()

    # Round location and build loc_key
    df["lat_round"] = df["decimalLatitude"].round(loc_round_decimals)
    df["lon_round"] = df["decimalLongitude"].round(loc_round_decimals)
    df["loc_key"] = np.where(
        df["lat_round"].notna() & df["lon_round"].notna(),
        df["lat_round"].astype(str) + "," + df["lon_round"].astype(str),
        np.nan,
    )

    # Species counts in this source
    counts = df.groupby("class").size().rename("n_total").reset_index()
    big_species = counts[counts["n_total"] > min_records]["class"]
    big_species_set = set(big_species)

    # Filter to big species
    df["is_big_species"] = df["class"].isin(big_species_set)

    # Among big species, we can only dedupe when user+loc+time are all present
    mask_valid = (
        df["is_big_species"]
        & df["recordedBy"].notna()
        & df["loc_key"].notna()
        & df["timestamp"].notna()
    )
    df["keep_flag"] = True            # default: keep
    df["serial_flag"] = False         # default: not serial

    def mark_group(g: pd.DataFrame) -> pd.DataFrame:
        """Sliding 1-hour window per (class, user, loc_key) group."""
        g = g.sort_values("timestamp").copy()
        last_kept = None
        keep = []
        for _, row in g.iterrows():
            t = row["timestamp"]
            if pd.isna(t):
                keep.append(True)
                continue
            if last_kept is None or (t - last_kept) >= timedelta(hours=1):
                keep.append(True)
                last_kept = t
            else:
                keep.append(False)
        g["keep_flag"] = keep
        g["serial_flag"] = ~g["keep_flag"]
        return g

    # Apply only on valid subset
    df_valid = df[mask_valid]
    df_others = df[~mask_valid]

    if not df_valid.empty:
        df_marked_valid = (
            df_valid
            .groupby(["class", "recordedBy", "loc_key"], group_keys=False)
            .apply(mark_group)
        )
        # Recombine
        df_marked = pd.concat([df_marked_valid, df_others], ignore_index=True)
    else:
        df_marked = df.copy()

    # Build summary stats for big species
    serial_valid = df_marked[mask_valid & df_marked["serial_flag"]]
    kept_valid = df_marked[mask_valid & df_marked["keep_flag"]]

    removed_counts = serial_valid.groupby("class").size().rename("serial_removed").reset_index()
    kept_counts = kept_valid.groupby("class").size().rename("kept_valid").reset_index()

    stats = counts.merge(kept_counts, on="class", how="left").merge(removed_counts, on="class", how="left")
    stats["kept_valid"] = stats["kept_valid"].fillna(0).astype(int)
    stats["serial_removed"] = stats["serial_removed"].fillna(0).astype(int)

    return df_marked, stats


In [54]:
df_xc = load_xc_metadata(XC_META)
df_xc_marked, stats_xc = detect_serial_for_source(df_xc, min_records=MIN_RECORDS)

print("Xeno-canto shape:", df_xc_marked.shape)
stats_xc[stats_xc["class"].isin(stats_xc[stats_xc["n_total"] > MIN_RECORDS]["class"])]

# Species where at least 1 serial recording was detected
serial_species_obs = stats_xc[stats_xc["serial_removed"] > 0]["class"].tolist()
serial_species_obs


Xeno-canto shape: (1405, 12)


[]

In [81]:
df_inat_raw = load_darwin_occ(INAT_OCC, source_name="iNaturalist")
df_inat_marked, stats_inat = detect_serial_for_source(df_inat_raw, min_records=MIN_RECORDS)

print("iNaturalist shape:", df_inat_marked.shape)
stats_inat[stats_inat["class"].isin(stats_inat[stats_inat["n_total"] > MIN_RECORDS]["class"])]

# Species where at least 1 serial recording was detected
serial_species_inat = stats_inat[stats_inat["serial_removed"] > 0]["class"].tolist()
serial_species_inat


iNaturalist shape: (3517, 13)


  .apply(mark_group)


['Alytes_obstetricans',
 'Bombina_bombina',
 'Bufo_bufo',
 'Epidalea_calamita',
 'Hyla_arborea',
 'Pelophylax_esculentus',
 'Pelophylax_nan',
 'Pelophylax_ridibundus']

In [82]:
df_obs_raw = load_darwin_occ(OBS_OCC, source_name="observation_org")
df_obs_marked, stats_obs = detect_serial_for_source(df_obs_raw, min_records=MIN_RECORDS)

print("Observation.org shape:", df_obs_marked.shape)
stats_obs[stats_obs["class"].isin(stats_obs[stats_obs["n_total"] > MIN_RECORDS]["class"])]

# Species where at least 1 serial recording was detected
serial_species_obs = stats_obs[stats_obs["serial_removed"] > 0]["class"].tolist()
serial_species_obs


Observation.org shape: (1460, 13)


  .apply(mark_group)


['Alytes_obstetricans',
 'Epidalea_calamita',
 'Hyla_arborea',
 'Pelophylax_lessonae',
 'Pelophylax_nan',
 'Pelophylax_ridibundus',
 'Rana_arvalis']

In [83]:
serial_obs = df_obs_marked[
    (df_obs_marked["class"].isin(serial_species_obs)) &
    (df_obs_marked["serial_flag"])
].copy()

print(serial_obs.shape)
serial_obs.head()

(68, 13)


Unnamed: 0,gbifID,class,recordedBy,decimalLatitude,decimalLongitude,timestamp,source,lat_round,lon_round,loc_key,is_big_species,keep_flag,serial_flag
28,3720390039,Alytes_obstetricans,User 22575,52.1,5.35,2021-06-05 00:00:00+00:00,observation_org,52.1,5.35,"52.1,5.35",True,False,True
114,3714945639,Alytes_obstetricans,User 86447,41.7063,2.4018,2014-04-27 00:00:00+00:00,observation_org,41.706,2.402,"41.706,2.402",True,False,True
174,3719173218,Epidalea_calamita,User 123903,37.203507,-6.170717,2020-12-12 00:00:00+00:00,observation_org,37.204,-6.171,"37.204,-6.171",True,False,True
189,4882072480,Epidalea_calamita,User 1338,52.3,5.0,2024-05-23 00:00:00+00:00,observation_org,52.3,5.0,"52.3,5.0",True,False,True
202,3725504100,Epidalea_calamita,User 142165,52.1,4.95,2020-05-09 00:00:00+00:00,observation_org,52.1,4.95,"52.1,4.95",True,False,True


In [84]:
serial_inat = df_inat_marked[
    (df_inat_marked["class"].isin(serial_species_inat)) &
    (df_inat_marked["serial_flag"])
].copy()

print(serial_inat.shape)
serial_inat.head()

(30, 13)


Unnamed: 0,gbifID,class,recordedBy,decimalLatitude,decimalLongitude,timestamp,source,lat_round,lon_round,loc_key,is_big_species,keep_flag,serial_flag
16,4145552338,Alytes_obstetricans,Gloria Arias,41.483051,2.074496,2021-06-06 21:16:54+00:00,iNaturalist,41.483,2.074,"41.483,2.074",True,False,True
75,5087101769,Alytes_obstetricans,nico1111,45.00951,2.007085,2024-08-15 22:54:02+00:00,iNaturalist,45.01,2.007,"45.01,2.007",True,False,True
157,4133960301,Bombina_bombina,Philipp Byzov,54.858979,37.594146,2023-06-08 13:27:31+00:00,iNaturalist,54.859,37.594,"54.859,37.594",True,False,True
185,4901047071,Bombina_bombina,aistespo,55.114031,25.403895,2024-05-18 14:54:55+00:00,iNaturalist,55.114,25.404,"55.114,25.404",True,False,True
202,5153759368,Bombina_bombina,dfriemel,51.158175,14.282224,2025-05-10 19:09:53+00:00,iNaturalist,51.158,14.282,"51.158,14.282",True,False,True


In [93]:
# get the file names based on gbifID
INAT_mm = BASE / "iNaturalist" / "multimedia.txt"
OBS_mm  = BASE / "observation_org" / "multimedia.txt"

mm_obs = pd.read_csv(OBS_mm, sep="\t", dtype=str, low_memory=False)
mm_inat = pd.read_csv(INAT_mm, sep="\t", dtype=str, low_memory=False)

# Keep only gbifID and identifier URL
mm_obs = mm_obs[["gbifID", "identifier"]].copy()
mm_inat = mm_inat[["gbifID", "identifier"]].copy()

# Clean filename: extract last element of URL and remove query strings
mm_obs["filename"] = (
    mm_obs["identifier"]
      .astype(str)
      .str.split("/").str[-1]
      .str.replace(r"\?.*$", "", regex=True)
)

mm_inat["filename"] = (
    mm_inat["identifier"]
      .astype(str)
      .str.split("/").str[-1]
      .str.replace(r"\?.*$", "", regex=True)
)

serial_obs = serial_obs.merge(mm_obs, on="gbifID", how="left")
serial_inat = serial_inat.merge(mm_inat, on="gbifID", how="left")


In [94]:
serial_obs.columns

Index(['gbifID', 'class', 'recordedBy', 'decimalLatitude', 'decimalLongitude',
       'timestamp', 'source', 'lat_round', 'lon_round', 'loc_key',
       'is_big_species', 'keep_flag', 'serial_flag', 'identifier', 'filename'],
      dtype='object')

In [95]:
serial_inal_fns = (
    serial_inat["filename"]
    .dropna()
    .astype(str)
    .str.strip()
    .str.replace(r"\?.*$", "", regex=True)
)

serial_obs_fns = (
    serial_obs["filename"]
    .dropna()
    .astype(str)
    .str.strip()
    .str.replace(r"\?.*$", "", regex=True)
)

serial_filenames = set(serial_inal_fns) | set(serial_obs_fns)
len(serial_filenames)

103

In [96]:
ROOT = Path("all_data").resolve()

to_delete_paths = []

for f in ROOT.rglob("*"):
    if f.is_file():
        name = f.name
        if name in serial_filenames:
            to_delete_paths.append(f)

len(to_delete_paths)

74

In [97]:
for p in to_delete_paths[:20]:
    print(p)

/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/45841.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/173808.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/305500.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/1499912.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/324871.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/1499928.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/94935.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/251493.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/303326.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/299226.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/106750.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/106744.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/1499925.mp3
/Users/jago/Desktop/naturalis/data_code/all_data/Pelophylax/1499924.mp3
/Use

In [98]:
print(f"About to delete {len(to_delete_paths)} files from {ROOT}")

confirm = input("Type 'yes' to confirm deletion: ")

if confirm.lower() == "yes":
    deleted = 0
    errors = 0
    for p in to_delete_paths:
        try:
            p.unlink()
            deleted += 1
        except Exception as e:
            print("Error deleting", p, ":", e)
            errors += 1
    print(f"✅ Deleted {deleted} files. Errors: {errors}")
else:
    print("❌ Deletion cancelled.")

About to delete 74 files from /Users/jago/Desktop/naturalis/data_code/all_data
✅ Deleted 74 files. Errors: 0


### Split the data

In [99]:
import random
from pathlib import Path
import shutil

# Root folder with species subfolders
root = Path("all_data").resolve()          # adjust if needed

# Where to put the splits
out_root = Path("dataset_splits").resolve()
train_dir = out_root / "train"
val_dir   = out_root / "val"
test_dir  = out_root / "test"

for d in [train_dir, val_dir, test_dir]:
    d.mkdir(parents=True, exist_ok=True)

# Only these species
target_species = {
    "Rana_latastei",
    "Pelodytes_atlanticus",
    "Bufo_spinosus",
    "Lithobates_catesbeianus",
    "Hyla_savignyi",
    "Alytes_almogavarii",
    "Pelophylax_bedriagae",
    "Pelophylax_kurtmuelleri",
    "Alytes_dickhilleni",
    "Hyla_molleri",
    "Hyla_sarda",
    "Pelophylax_esculentus",
    "Alytes_cisternasii",
    "Pelobates_fuscus",
    "Hyla_intermedia",
    "Rana_dalmatina",
    "Pelophylax_'esculentus'",
    "Bombina_variegata",
    "Pelodytes_punctatus",
    "Pelodytes_ibericus",
    "Pelophylax_lessonae",
    "Rana_arvalis",
    "Bufotes_viridis",
    "Hyla_meridionalis",
    "Alytes_obstetricans",
    "Bombina_bombina",
    "Epidalea_calamita",
    "Pelophylax_ridibundus",
    "Hyla_arborea",
    "Pelophylax_perezi",
    "Hyla_orientalis",
    "Bufo_bufo",
    "Rana_temporaria",
}

# Ratios
train_ratio = 0.60
val_ratio   = 0.20
test_ratio  = 0.20

AUDIO_EXTS = {".wav", ".mp3"}

total_files_copied = 0

for species in sorted(target_species):
    species_folder = root / species
    if not species_folder.exists() or not species_folder.is_dir():
        print(f"[WARN] Species folder not found, skipping: {species}")
        continue

    # collect files (audio only, just in case)
    files = [f for f in species_folder.iterdir()
             if f.is_file() and f.suffix.lower() in AUDIO_EXTS]

    n = len(files)
    if n == 0:
        print(f"[INFO] No files for {species}, skipping.")
        continue

    random.shuffle(files)

    # Special case: exactly 4 recordings → 2 / 1 / 1
    if n == 4:
        n_train, n_val, n_test = 2, 1, 1
    else:
        n_train = int(n * train_ratio)
        n_val   = int(n * val_ratio)
        n_test  = n - n_train - n_val

    train_files = files[:n_train]
    val_files   = files[n_train:n_train + n_val]
    test_files  = files[n_train + n_val:]

    # Make sure species folders exist in output dirs
    (train_dir / species).mkdir(parents=True, exist_ok=True)
    (val_dir / species).mkdir(parents=True, exist_ok=True)
    (test_dir / species).mkdir(parents=True, exist_ok=True)

    # COPY files (use shutil.move if you want to move instead)
    for f in train_files:
        shutil.copy2(f, train_dir / species / f.name)
    for f in val_files:
        shutil.copy2(f, val_dir / species / f.name)
    for f in test_files:
        shutil.copy2(f, test_dir / species / f.name)

    total_files_copied += n
    print(f"{species}: total={n}, train={len(train_files)}, val={len(val_files)}, test={len(test_files)}")

print("\nDone.")
print("Total files copied:", total_files_copied)
print("Output root:", out_root)


Alytes_almogavarii: total=18, train=10, val=3, test=5
Alytes_cisternasii: total=24, train=14, val=4, test=6
Alytes_dickhilleni: total=10, train=6, val=2, test=2
Alytes_obstetricans: total=418, train=250, val=83, test=85
Bombina_bombina: total=585, train=351, val=117, test=117
Bombina_variegata: total=139, train=83, val=27, test=29
Bufo_bufo: total=246, train=147, val=49, test=50
Bufo_spinosus: total=18, train=10, val=3, test=5
Bufotes_viridis: total=399, train=239, val=79, test=81
Epidalea_calamita: total=453, train=271, val=90, test=92
Hyla_arborea: total=826, train=495, val=165, test=166
Hyla_intermedia: total=183, train=109, val=36, test=38
Hyla_meridionalis: total=314, train=188, val=62, test=64
Hyla_molleri: total=44, train=26, val=8, test=10
Hyla_orientalis: total=55, train=33, val=11, test=11
Hyla_sarda: total=40, train=24, val=8, test=8
Hyla_savignyi: total=12, train=7, val=2, test=3
Lithobates_catesbeianus: total=18, train=10, val=3, test=5
Pelobates_fuscus: total=45, train=27