In [1]:
import pyreadr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
from scipy.io import wavfile
from IPython.display import Audio


In [2]:
# Read the R-serialised (.rds) dataset from disk with pyreadr.
result = pyreadr.read_r("data/atsp_spring.rds")

In [3]:
# read_r returned a mapping of data frames; take the first one
# (presumably the only one for an .rds file — TODO confirm).
df = list(result.values())[0]

In [4]:
# Inspect which columns the loaded detections table provides.
df.columns

Index(['runID', 'ts', 'tsCorrected', 'sig', 'sigsd', 'noise', 'freq', 'freqsd',
       'motusTagID', 'ambigID', 'port', 'runLen', 'motusFilter', 'mfgID',
       'tagModel', 'tagLifespan', 'tagBI', 'tagDeployID', 'markerNumber',
       'tagDeployStart', 'tagDeployEnd', 'tagDepLat', 'tagDepLon', 'tagDepAlt',
       'recvDeployLat', 'recvDeployLon', 'recvDeployName', 'antBearing',
       'ts_5min', 'date', 'year', 'yday', 'month', 'day', 'minute', 'hour',
       'sunrise', 'sunset', 'tagYear', 'Age', 'Sex', 'Location', 'BandNumber',
       'Wing', 'Fat', 'Mass', 'Lost', 'Isotope_value', 'captureDate',
       'capture.yday'],
      dtype='object')

In [5]:
# Parse the timestamp columns into pandas datetimes.
# NOTE(review): if 'ts' is stored as numeric epoch seconds, pd.to_datetime needs
# unit='s' (numbers default to nanoseconds) — confirm the column's dtype.
df['ts'] = pd.to_datetime(df['ts'])
df['captureDate'] = pd.to_datetime(df['captureDate'])

In [6]:
# Build one key per bird: "<BandNumber>_<motusTagID>" (e.g. "2221-38875_43401").
# NOTE(review): astype(int) raises if motusTagID contains NaN — presumably it
# never does in this dataset; confirm.
df['bird_id'] = df['BandNumber'].astype(str) + "_" + df['motusTagID'].astype(int).astype(str)

In [7]:
# Check that the new 'bird_id' column is now part of the table.
df.columns

Index(['runID', 'ts', 'tsCorrected', 'sig', 'sigsd', 'noise', 'freq', 'freqsd',
       'motusTagID', 'ambigID', 'port', 'runLen', 'motusFilter', 'mfgID',
       'tagModel', 'tagLifespan', 'tagBI', 'tagDeployID', 'markerNumber',
       'tagDeployStart', 'tagDeployEnd', 'tagDepLat', 'tagDepLon', 'tagDepAlt',
       'recvDeployLat', 'recvDeployLon', 'recvDeployName', 'antBearing',
       'ts_5min', 'date', 'year', 'yday', 'month', 'day', 'minute', 'hour',
       'sunrise', 'sunset', 'tagYear', 'Age', 'Sex', 'Location', 'BandNumber',
       'Wing', 'Fat', 'Mass', 'Lost', 'Isotope_value', 'captureDate',
       'capture.yday', 'bird_id'],
      dtype='object')

In [8]:
# Count detections for every (bird, calendar day) combination and expose the
# count as a regular column named 'n_detections'.
detections_per_day = (
    df.groupby(['bird_id', 'date'])
      .size()
      .rename('n_detections')
      .reset_index()
)

In [9]:
# Preview the per-bird, per-day detection counts.
detections_per_day

Unnamed: 0,bird_id,date,n_detections
0,2221-38875_43401,2020-01-17,1705
1,2221-38875_43401,2020-01-18,2988
2,2221-38875_43401,2020-01-19,4337
3,2221-38875_43401,2020-01-20,3432
4,2221-38875_43401,2020-01-21,3318
...,...,...,...
2941,UNK2_43376,2020-03-27,975
2942,UNK2_43376,2020-03-28,2007
2943,UNK2_43376,2020-03-29,626
2944,UNK2_43376,2020-03-30,48


In [10]:

# Normalise 'date' to plain datetime.date objects (this also handles string input).
df['date'] = pd.to_datetime(df['date']).dt.date

# Minute of the day (expected range 0–1439)
df['minute_of_day'] = df['hour'] * 60 + df['minute']

In [11]:
def make_merged_for_day(df, bird_id, day):
    """
    Build the per-minute summary table for one bird on one calendar day.

    Parameters
    ----------
    df : pandas.DataFrame
        Detections table. Columns read here: 'bird_id', 'date', 'ts', 'sig',
        'freq', 'freqsd', 'noise', 'recvDeployName', 'port', and either
        'minute_of_day' or both 'hour' and 'minute'. 'date' may hold
        ``datetime.date`` objects or be a datetime64 column.
    bird_id : str
        Value matched against ``df['bird_id']``.
    day : str, datetime.date or datetime-like
        Calendar day to extract; parsed with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when the bird has no detections on that day. Otherwise one
        row per minute 0..1439 with the columns minute_of_day, activity,
        sig_med, freq_mean, freqsd_mean, noise_mean, n_dets, bird_id, date.
        Minutes without detections hold 0 in every numeric column.
    """
    day = pd.to_datetime(day).date()

    # Narrow to the bird first so any date conversion below touches few rows.
    d = df[df['bird_id'] == bird_id]

    # A datetime64 column never compares equal to a datetime.date in recent
    # pandas, which would silently make every day look empty. Compare on
    # calendar dates in that case.
    date_col = d['date']
    if pd.api.types.is_datetime64_any_dtype(date_col):
        date_col = date_col.dt.date

    d = d[date_col == day].copy()
    if d.empty:
        return None  # no detections that day

    # Ensure minute_of_day exists
    if 'minute_of_day' not in d.columns:
        d['minute_of_day'] = d['hour'] * 60 + d['minute']

    # Unique antenna ID (station + port)
    d['antenna_id'] = (
        d['recvDeployName'].astype(str)
        + "_P" + d['port'].astype(int).astype(str)
    )

    # Sort by time, then take the signal change between consecutive
    # detections on the same antenna (first detection per antenna is NaN).
    d = d.sort_values('ts')
    d['delta_sig'] = d.groupby('antenna_id')['sig'].diff()
    d['activity_inst'] = np.sqrt(np.abs(d['delta_sig']))  # Morbey-style

    # Aggregate per minute_of_day
    agg = (
        d.groupby('minute_of_day')
         .agg(
            activity=('activity_inst', 'median'),
            sig_med=('sig', 'median'),
            freq_mean=('freq', 'mean'),
            freqsd_mean=('freqsd', 'mean'),
            noise_mean=('noise', 'mean'),
            n_dets=('sig', 'size')
         )
    )

    # Base 0–1439 minutes, so every minute of the day gets a row
    base = pd.DataFrame({'minute_of_day': np.arange(1440)})

    merged = base.merge(agg, on='minute_of_day', how='left')

    # Minutes without detections (and minutes whose activity is undefined)
    # are reported as 0 rather than NaN.
    num_cols = ['activity', 'sig_med', 'freq_mean', 'freqsd_mean', 'noise_mean', 'n_dets']
    merged[num_cols] = merged[num_cols].fillna(0)

    # Add identifiers
    merged['bird_id'] = bird_id
    merged['date'] = day

    return merged

In [12]:
def merged_to_tone_audio(merged, wav_path,
                         duration_sec=60,
                         sample_rate=16000,
                         column='activity',
                         carrier_freq=440):
    """
    Sonify one day: use ``merged[column]`` as the amplitude envelope of a
    sine tone (higher value => louder tone) and write it as a 16-bit WAV.

    Parameters
    ----------
    merged : pandas.DataFrame
        Must contain 'minute_of_day' and ``column``.
    wav_path : path or file-like
        Destination handed to ``scipy.io.wavfile.write``.
    duration_sec : float
        Length of the generated clip; the range of 'minute_of_day' is
        stretched over this duration.
    sample_rate : int
        Samples per second of the output.
    column : str
        Column used as the envelope.
    carrier_freq : float
        Frequency of the sine carrier in Hz.

    Returns
    -------
    (sample_rate, audio_float) : tuple
        ``audio_float`` is the unquantised signal (envelope * carrier).
    """

    # Envelope source values; NaNs count as silence.
    x = merged[column].values.astype(float)
    x = np.nan_to_num(x, nan=0.0)

    # Scale so the largest value is ~1; an all-zero (or empty) series stays silent.
    peak = x.max() if x.size else 0.0
    if peak > 0:
        x = x / (peak + 1e-9)
    else:
        x = np.zeros_like(x)

    # Audio time axis. Samples sit exactly 1/sample_rate apart so the carrier
    # plays at carrier_freq (an endpoint-inclusive linspace would space them
    # duration/(n-1) apart and detune the tone slightly).
    n_samples = int(duration_sec * sample_rate)
    t_audio = np.arange(n_samples) / sample_rate

    # Map minutes -> audio seconds
    t_minutes = merged['minute_of_day'].values.astype(float)
    if x.size == 0:
        # Nothing to interpolate: pure silence.
        env = np.zeros(n_samples)
    else:
        minute_span = t_minutes.max()
        if minute_span > 0:
            t_minutes_norm = t_minutes / minute_span * duration_sec
        else:
            # Only minute 0 present: avoid 0/0 and hold a constant envelope.
            t_minutes_norm = np.zeros_like(t_minutes)
        env = np.interp(t_audio, t_minutes_norm, x)

    # Carrier tone
    carrier = np.sin(2 * np.pi * carrier_freq * t_audio)

    # Final audio, quantised to int16 for the WAV file
    audio_float = env * carrier
    audio_int16 = np.int16(np.clip(audio_float, -1, 1) * 32767)

    wavfile.write(wav_path, sample_rate, audio_int16)
    return sample_rate, audio_float

In [13]:
from pathlib import Path

# Root for all generated files: the current working directory.
# Point base_dir elsewhere to relocate every output folder at once.
base_dir = Path(".")

# Per-minute CSV tables go to merchados/, sonified WAV files to aud/.
merch_dir = base_dir.joinpath("merchados")
audio_dir = base_dir.joinpath("aud")

# Create both folders; already-existing ones are left as they are.
merch_dir.mkdir(exist_ok=True)
audio_dir.mkdir(exist_ok=True)

(merch_dir, audio_dir)

(WindowsPath('merchados'), WindowsPath('aud'))

In [14]:
# Overview of every (bird, day) combination that has at least one detection.
pairs = (
    df[['bird_id', 'date']]
    .dropna()
    .drop_duplicates()
    .sort_values(['bird_id', 'date'])
)

# Group once and hand each day's rows to make_merged_for_day, instead of
# re-scanning the whole detections table for every pair. groupby yields the
# same combinations in the same (bird_id, date) order and skips missing keys.
for (bird, day_key), day_rows in df.groupby(['bird_id', 'date']):
    # Normalise the key so file names always use the YYYY-MM-DD form.
    day = pd.to_datetime(day_key).date()

    merged = make_merged_for_day(day_rows, bird, day)
    if merged is None:
        continue

    # 1) Save merged CSV in merchados/
    csv_name = f"{bird}_{day}.csv"
    merged_path = merch_dir / csv_name
    merged.to_csv(merged_path, index=False)

    # 2) Create audio WAV in aud/
    wav_name = f"{bird}_{day}.wav"
    wav_path = audio_dir / wav_name
    merged_to_tone_audio(merged, wav_path)

    print("Processed:", bird, day)

Processed: 2221-38875_43401 2020-01-17
Processed: 2221-38875_43401 2020-01-18
Processed: 2221-38875_43401 2020-01-19
Processed: 2221-38875_43401 2020-01-20
Processed: 2221-38875_43401 2020-01-21
Processed: 2221-38875_43401 2020-01-22
Processed: 2221-38875_43401 2020-01-23
Processed: 2221-38875_43401 2020-01-24
Processed: 2221-38875_43401 2020-01-25
Processed: 2221-38875_43401 2020-01-26
Processed: 2221-38875_43401 2020-01-27
Processed: 2221-38875_43401 2020-01-28
Processed: 2221-38875_43401 2020-01-29
Processed: 2221-38875_43401 2020-01-30
Processed: 2221-38875_43401 2020-01-31
Processed: 2221-38875_43401 2020-02-01
Processed: 2221-38875_43401 2020-02-02
Processed: 2221-38875_43401 2020-02-03
Processed: 2221-38875_43401 2020-02-04
Processed: 2221-38875_43401 2020-02-05
Processed: 2221-38875_43401 2020-02-06
Processed: 2221-38875_43401 2020-02-07
Processed: 2221-38875_43401 2020-02-08
Processed: 2221-38875_43401 2020-02-09
Processed: 2221-38875_43401 2020-02-10
Processed: 2221-38875_434

In [15]:
# Spot check: take the first CSV and the first WAV (in sorted path order)
# found in the output folders.
example_csv = sorted(merch_dir.glob("*.csv"))[0]
example_wav = sorted(audio_dir.glob("*.wav"))[0]

print("Example CSV:", example_csv)
print("Example WAV:", example_wav)

# Show the last rows (final minutes of the day) of the example table.
pd.read_csv(example_csv).tail()

Example CSV: merchados\2221-38875_43401_2020-01-17.csv
Example WAV: aud\2221-38875_43401_2020-01-17.wav


Unnamed: 0,minute_of_day,activity,sig_med,freq_mean,freqsd_mean,noise_mean,n_dets,bird_id,date
1435,1435,0.0,0.0,0.0,0.0,0.0,0.0,2221-38875_43401,2020-01-17
1436,1436,0.0,0.0,0.0,0.0,0.0,0.0,2221-38875_43401,2020-01-17
1437,1437,0.0,0.0,0.0,0.0,0.0,0.0,2221-38875_43401,2020-01-17
1438,1438,0.0,0.0,0.0,0.0,0.0,0.0,2221-38875_43401,2020-01-17
1439,1439,0.0,0.0,0.0,0.0,0.0,0.0,2221-38875_43401,2020-01-17


In [24]:
# Render an inline audio player for the example WAV.
Audio(filename=str(example_wav))

In [16]:
# Folder for the per-day raw signal CSV exports; created if it does not exist yet.
raw_dir = base_dir / "raw_sig"
raw_dir.mkdir(exist_ok=True)

raw_dir

WindowsPath('raw_sig')

In [17]:
# One Label Studio task (a dict) per processed bird-day.
tasks = []

# Overview of every (bird, day) combination that has at least one detection.
pairs = (
    df[['bird_id', 'date']]
    .dropna()
    .drop_duplicates()
    .sort_values(['bird_id', 'date'])
)

# Group once: each iteration receives exactly the rows of one bird on one day,
# so neither the per-minute table nor the raw export has to re-scan the whole
# detections table. Order and skipped (missing-key) rows match `pairs`.
for (bird, day_key), day_rows in df.groupby(['bird_id', 'date']):
    # Normalise the key so file names always use the YYYY-MM-DD form.
    day = pd.to_datetime(day_key).date()

    # ---- 1) merged 1440-minute table ----
    merged = make_merged_for_day(day_rows, bird, day)
    if merged is None:
        continue

    csv_name = f"{bird}_{day}.csv"
    merged_path = merch_dir / csv_name
    merged.to_csv(merged_path, index=False)

    # ---- 2) audio WAV ----
    wav_name = f"{bird}_{day}.wav"
    wav_path = audio_dir / wav_name
    merged_to_tone_audio(merged, wav_path)

    # ---- 3) raw per-antenna detections, signal only (+ minimal context) ----
    # day_rows is never empty here, so no emptiness check is needed after the
    # files above have already been written.
    raw_min = day_rows[['ts', 'minute_of_day', 'recvDeployName', 'port', 'sig']].copy()
    raw_name = f"{bird}_{day}_raw_sig.csv"
    raw_path = raw_dir / raw_name
    raw_min.to_csv(raw_path, index=False)

    # ---- 4) Add the entry for the Label Studio JSONL ----
    task = {
        "data": {
            # NOTE: these paths assume the project folder will be mounted in
            # Label Studio and served through /data/local-files/?d=...
            "audio":  f"/data/local-files/?d=aud/{wav_name}",
            "merged": f"/data/local-files/?d=merchados/{csv_name}",
            "raw":    f"/data/local-files/?d=raw_sig/{raw_name}",
            "bird_id": str(bird),
            "date":    str(day)
        }
    }
    tasks.append(task)

    print("Processed:", bird, day)

Processed: 2221-38875_43401 2020-01-17
Processed: 2221-38875_43401 2020-01-18
Processed: 2221-38875_43401 2020-01-19
Processed: 2221-38875_43401 2020-01-20
Processed: 2221-38875_43401 2020-01-21
Processed: 2221-38875_43401 2020-01-22
Processed: 2221-38875_43401 2020-01-23
Processed: 2221-38875_43401 2020-01-24
Processed: 2221-38875_43401 2020-01-25
Processed: 2221-38875_43401 2020-01-26
Processed: 2221-38875_43401 2020-01-27
Processed: 2221-38875_43401 2020-01-28
Processed: 2221-38875_43401 2020-01-29
Processed: 2221-38875_43401 2020-01-30
Processed: 2221-38875_43401 2020-01-31
Processed: 2221-38875_43401 2020-02-01
Processed: 2221-38875_43401 2020-02-02
Processed: 2221-38875_43401 2020-02-03
Processed: 2221-38875_43401 2020-02-04
Processed: 2221-38875_43401 2020-02-05
Processed: 2221-38875_43401 2020-02-06
Processed: 2221-38875_43401 2020-02-07
Processed: 2221-38875_43401 2020-02-08
Processed: 2221-38875_43401 2020-02-09
Processed: 2221-38875_43401 2020-02-10
Processed: 2221-38875_434

In [20]:
from pathlib import Path
import json

In [21]:
# Destination of the Label Studio task list (JSON Lines: one task per line).
jsonl_path = base_dir / "labelstudio_tasks_audio_merged_rawsig.jsonl"

# Serialise every task as one JSON document followed by a newline.
with jsonl_path.open("w", encoding="utf-8") as f:
    for t in tasks:
        print(json.dumps(t), file=f)