In [1]:
from pathlib import Path
import re
import json

In [8]:
# Input folder holding, per recording, the merged csv, the wav and the
# raw-signal csv that the cells below look up by file name.
# Relative path: it is resolved against the kernel's working directory.
# NOTE(review): originally described as the WSL view of C:\Sparrows\merchados;
# confirm the kernel is started from the folder that contains "merchados".
MERCH_DIR = Path("merchados")
# Destination of the generated task list (one JSON object per line).
OUT_JSONL = Path("Sparrows/tasks_merchados.jsonl")

In [9]:
# Prefix put in front of every file name when building the task URLs.
# NOTE(review): presumably Label Studio's local-files serving URL, with
# "merchados/" relative to its configured document root — confirm against
# the Label Studio local storage settings.
LS_PREFIX = "/data/local-files/?d=merchados/"

In [10]:
# Splits a file stem of the form "<bird>_<YYYY-MM-DD>" into the named groups
# "bird" and "date". The bird part is greedy and the pattern is anchored at
# both ends, so a bird id may itself contain underscores; only the trailing
# "_<date>" is taken as the date.
base_re = re.compile(r"^(?P<bird>.+)_(?P<date>\d{4}-\d{2}-\d{2})$")

In [11]:
# NOTE(review): identical to the base_re compiled in the previous cell; this
# re-definition is redundant and one of the two cells can be deleted.
base_re = re.compile(r"^(?P<bird>.+)_(?P<date>\d{4}-\d{2}-\d{2})$")

In [12]:
# ---- INDEX FILES ----
# Names (without the directory part) of the regular files located directly
# inside MERCH_DIR; sub-folders are not descended into. Stored as a set so
# the "is this companion file present?" checks below are cheap.
all_files = {p.name for p in MERCH_DIR.iterdir() if p.is_file()}



In [13]:
# Base names are the "BIRD_DATE" stems of the merged csvs (BIRD_DATE.csv);
# *_raw_sig.csv files are ignored when collecting them.
# Start from an empty list here; the following cell populates it.
bases = []

In [14]:
# Collect every "BIRD_DATE" stem that has a merged csv (BIRD_DATE.csv).
# *_raw_sig.csv files are companions of a merged csv, not bases themselves.
# The list is rebuilt from `all_files` on every run so that re-executing this
# cell never carries over stems left in `bases` by an earlier run.
bases = sorted({
    fn[:-4]  # drop the ".csv" extension
    for fn in all_files
    if fn.endswith(".csv")
    and not fn.endswith("_raw_sig.csv")
    and base_re.match(fn[:-4])
})

tasks = []
# Per file kind: number of bases for which that file was not found.
missing = {"wav": 0, "raw": 0, "merged": 0}

for base in bases:
    m = base_re.match(base)
    bird_id = m.group("bird")
    date = m.group("date")

    merged_fn = f"{base}.csv"
    wav_fn    = f"{base}.wav"
    raw_fn    = f"{base}_raw_sig.csv"

    # A task is only emitted when all three files exist; every absent kind is
    # counted (a single base can therefore bump more than one counter).
    required = {"merged": merged_fn, "wav": wav_fn, "raw": raw_fn}
    absent = [kind for kind, name in required.items() if name not in all_files]
    for kind in absent:
        missing[kind] += 1
    if absent:
        continue

    tasks.append({
        "data": {
            "audio":  f"{LS_PREFIX}{wav_fn}",
            "merged": f"{LS_PREFIX}{merged_fn}",
            "raw":    f"{LS_PREFIX}{raw_fn}",
            "bird_id": bird_id,
            "date": date,
            "base_id": base
        }
    })

In [15]:
# ---- WRITE JSONL ----
# Make sure the destination folder exists, then emit one JSON document per
# line. ensure_ascii=False keeps non-ASCII characters readable in the file.
OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)
with OUT_JSONL.open("w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in tasks
    )

# Short run summary.
print("MERCH_DIR:", MERCH_DIR)
print("Wrote JSONL:", OUT_JSONL)
print("Tasks written:", len(tasks))
print("Skipped because missing files:", missing)

#note

MERCH_DIR: merchados
Wrote JSONL: Sparrows\tasks_merchados.jsonl
Tasks written: 2946
Skipped because missing files: {'wav': 0, 'raw': 0, 'merged': 0}


In [18]:
import pandas as pd
from pathlib import Path

MERCH_DIR = Path("merchados")  # adjust if needed

# A file is skipped (left untouched on disk) when more than this fraction of
# its "ts" values cannot be parsed as datetimes.
MAX_UNPARSED_FRACTION = 0.5


def normalize_ts(df):
    """Return a copy of ``df`` with "ts" reformatted and rows in time order.

    Every parseable "ts" value is rewritten as "YYYY-mm-dd HH:MM:SS.mmm"
    (fractional seconds truncated to 3 digits). Values that cannot be parsed
    keep their original text and are placed after all parsed rows, so no
    data is blanked out. Rows are ordered by the full-resolution parsed
    timestamp with a stable sort, so rows falling in the same millisecond
    stay in true time order (and exact ties keep their input order).

    The input frame is not modified.

    Returns:
        The normalized DataFrame, or None when ``df`` has no "ts" column or
        when more than MAX_UNPARSED_FRACTION of its "ts" values fail to parse.
    """
    if "ts" not in df.columns:
        return None

    # Parse to datetime (handles nanoseconds if present); failures become NaT.
    parsed = pd.to_datetime(df["ts"], errors="coerce")
    unparsed = parsed.isna()
    if unparsed.mean() > MAX_UNPARSED_FRACTION:
        return None

    # %f always yields 6 digits, so dropping the last 3 leaves milliseconds.
    formatted = parsed.dt.strftime("%Y-%m-%d %H:%M:%S.%f").str[:-3]
    if unparsed.any():
        # strftime yields NaN for NaT; restore the original text instead.
        formatted = formatted.where(~unparsed, df["ts"])

    # Positional order of the rows by parsed time; mergesort is stable and
    # NaT (unparseable) rows go last.
    order = (
        parsed.reset_index(drop=True)
        .sort_values(kind="mergesort")
        .index.to_numpy()
    )
    return df.assign(ts=formatted).iloc[order]


raw_files = sorted(MERCH_DIR.glob("*_raw_sig.csv"))
print("Found raw files:", len(raw_files))

fixed = 0
bad = 0

for p in raw_files:
    normalized = normalize_ts(pd.read_csv(p))
    if normalized is None:
        bad += 1
        continue

    # IMPORTANT: rows are already sorted by time for Label Studio TimeSeries.
    # The source file is overwritten in place.
    normalized.to_csv(p, index=False)
    fixed += 1

print("Fixed files:", fixed)
print("Skipped/Bad files:", bad)

Found raw files: 2946


  ts = pd.to_datetime(df["ts"], errors="coerce")


Fixed files: 2946
Skipped/Bad files: 0


In [19]:
# Spot-check: print the first few "ts" values of the alphabetically first
# raw-signal file to confirm the millisecond format ended up on disk.
# Uses MERCH_DIR so the check follows the configured folder.
raw_sig_files = sorted(MERCH_DIR.glob("*_raw_sig.csv"))
if raw_sig_files:
    p = raw_sig_files[0]
    df = pd.read_csv(p)
    print(df["ts"].head().tolist())
else:
    # Nothing to inspect; say so instead of failing with an IndexError.
    print("No *_raw_sig.csv files found in", MERCH_DIR)

['2020-01-17 08:40:57.293', '2020-01-17 08:41:16.591', '2020-01-17 08:41:35.888', '2020-01-17 08:41:55.186', '2020-01-17 08:42:14.484']
