# generate a dataframe of sparse multi-hot clip labels for BirdSet XCL Train dataset

In [19]:
from opensoundscape import Audio, Spectrogram, CNN, BoxedAnnotations
import opensoundscape as opso
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path

from matplotlib import pyplot as plt
def figsize(w, h):
    """Set matplotlib's default figure size for all subsequently created figures.

    Writes ``[w, h]`` into ``plt.rcParams['figure.figsize']``; nothing is returned.
    """
    plt.rcParams.update({'figure.figsize': [w, h]})
# Use a wide default figure size for every plot made later in the notebook.
figsize(15,5) #for big visuals
# IPython magic: request the 'retina' figure format from the inline backend.
%config InlineBackend.figure_format = 'retina'
# NOTE(review): fonttype 42 is understood to select TrueType (Type 42) font
# embedding so text in exported PDF/PS files stays editable — confirm against
# the matplotlib rcParams documentation.
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

import datasets
from tqdm.autonotebook import tqdm



Prepare and load the dataset

If this is your first time using it, it will download all of xeno-canto! (Consider using smaller datasets or XC subset)

For subsequent uses, just make sure to specify the same cache_dir so that it uses downloaded files

In [None]:
# Directory where the downloaded dataset is cached. Reuse the same value on
# later runs so the already-downloaded files are found instead of re-fetched.
cache_dir = "/home/kitzeslab/data/data_birdset/"

# Load the "XCL" configuration of BirdSet.
# NOTE(review): trust_remote_code=True presumably lets the dataset's own
# loading script execute locally — only enable it for sources you trust.
ds = datasets.load_dataset(
    path="DBD-research-group/BirdSet",
    name="XCL",
    cache_dir=cache_dir,
    trust_remote_code=True,
)

# `t` is the training split; its length is the number of entries in that split.
t = ds["train"]
len(t)

528422

how to create the training table depends on how we want to sample clips from XC files

For this example, we create one clip for each detected event. If a file has no detected events, we use the beginning of the audio file. We use at most the first 5 events per file (a random sample of events may be better). We use the start time of each event as the start time of the audio clip; centering the clip on the event or using a random offset may be better.

In [None]:
# Length of every training clip.
# NOTE(review): presumably seconds, matching the units of `detected_events` — confirm.
clip_duration = 3
# Upper bound on how many detected events are turned into clips per file.
max_events = 5

# Build one record (file, clip start time, class label) per selected event.
records = []
for i in tqdm(range(len(t))):
    file = t[i]
    # optionally, filter by quality rating or other metadata
    # if file['quality'] not in ['A','B']:
    #     continue
    detected_events = file["detected_events"]

    if detected_events is None or len(detected_events) < 1:
        # no detections (missing or empty): use beginning of audio file
        detected_events = [[0, clip_duration]]
    else:
        # keep only the first `max_events` events; slicing returns a new list
        # (and is a no-op for shorter lists), so the dataset row is not mutated
        detected_events = detected_events[:max_events]

    # only the event start is used; the clip end is implied by `clip_duration`
    for start, _end in detected_events:
        records.append(
            {
                "file": file["filepath"],
                "start_time": start,
                "annotation": file["ebird_code"],
            }
        )
df = pd.DataFrame(records)

# convert integer annotation to list of one annotation per row
# this is the format used by annotations.categorical_to_multi_hot
df["annotation_list"] = [[x] for x in df["annotation"]]

make a sparse multi-hot label array of clip x class presence (1) / absence (0) (mostly 0s, sparse array efficiently stores 1s)

In [None]:
from opensoundscape import annotations

# Encode the per-clip label lists as a sparse multi-hot matrix. The call
# returns a pair: the label matrix and the class identifiers for its columns.
multihot_result = annotations.categorical_to_multi_hot(
    df["annotation_list"],
    sparse=True,
)
multihot_labels_sparse, classes = multihot_result

convert integer labels to ebird codes

In [None]:
# Look the label feature up once; its `int2str` turns an integer class id into
# the corresponding eBird code string.
label_feature = t.info.features["ebird_code_multilabel"].feature
ebird_classes = [label_feature.int2str(class_id) for class_id in classes]
ebird_classes[0]

'buwtea'

make a sparse dataframe and save it to a pickle file

In [49]:
# Each matrix row is one clip, identified by its source file and start time.
clip_index = pd.MultiIndex.from_frame(df[["file", "start_time"]])

# Wrap the sparse matrix in a DataFrame with one column per eBird code.
labels = pd.DataFrame.sparse.from_spmatrix(
    multihot_labels_sparse, index=clip_index, columns=ebird_classes
)

# saved pickle is 103 MB, not bad for df of shape (1991469, 9734) with file paths
labels_path = f"{cache_dir}/xcl_train_sparse_multihot_labels.pkl"
labels.to_pickle(labels_path)

to use the labels in OpenSoundscape later: load the pickle and add the "end_time" index

In [None]:
# Reload the saved (file, start_time)-indexed sparse multi-hot label table.
# Requires `cache_dir` and `clip_duration` to hold the same values used when
# the table was created.
labels = pd.read_pickle(f"{cache_dir}/xcl_train_sparse_multihot_labels.pkl")

# add in "end time" to the index, which is simply start time + clip duration in our case
# Only the index is rebuilt: inserting a temporary "end_time" column into this
# very wide sparse frame and round-tripping through reset_index()/set_index()
# would copy every class column, can trigger pandas' "highly fragmented"
# PerformanceWarning, and would overwrite a class column named "end_time".
index_frame = labels.index.to_frame(index=False)
index_frame["end_time"] = index_frame["start_time"] + clip_duration
labels.index = pd.MultiIndex.from_frame(
    index_frame[["file", "start_time", "end_time"]]
)
labels.shape