### Spectrogram Mapping Function

In [22]:
from torchaudio.compliance.kaldi import fbank
import torch 
import numpy as np
from PIL import Image

def prepare_dataset(batch, label_key="human_labels", sample_frequency=32_000):
    """Turn a batch of audio samples into filter-bank spectrogram images.

    Args:
        batch: Batch mapping. ``batch["audio"]`` must be iterable and yield
            items whose ``"array"`` entry is a NumPy array holding one
            waveform; ``batch[label_key]`` holds the labels.
        label_key: Name of the column that is copied to ``"label"``.
        sample_frequency: Sampling rate handed to ``fbank``.

    Returns:
        The same ``batch`` object, extended in place with
        ``"input_values"`` (one ``PIL.Image`` per sample) and ``"label"``.
    """
    spectrograms = []
    for sample in batch["audio"]:
        waveform = torch.from_numpy(sample["array"])
        # fbank expects a leading channel axis, hence unsqueeze(0).
        mel = fbank(
            waveform.unsqueeze(0),
            htk_compat=True,
            sample_frequency=sample_frequency,
            use_energy=False,
            window_type='hanning',
            num_mel_bins=128,
            dither=0.0,
            frame_shift=10
        )
        # torchaudio documents the fbank output as (frames, num_mel_bins);
        # the transpose puts the mel-bin axis first before building the image.
        spectrograms.append(Image.fromarray(mel.T.numpy()))

    batch['input_values'] = spectrograms
    batch["label"] = batch[label_key]

    return batch

def get_prepare_dataset_fn(label_key="human_labels", sample_frequency=32_000):
    """Build a one-argument wrapper around ``prepare_dataset``.

    Args:
        label_key: Forwarded to ``prepare_dataset`` as its label column name.
        sample_frequency: Forwarded to ``prepare_dataset`` as the sampling rate.

    Returns:
        A callable taking only ``batch`` and returning
        ``prepare_dataset(batch, label_key, sample_frequency)``.
    """
    def _prepare(batch):
        return prepare_dataset(batch, label_key, sample_frequency)

    return _prepare

### AudioSet - Balanced

1) Prepare the dataset 

In [23]:
import json
from datasets import Audio, load_dataset, Sequence, ClassLabel


# Load the AudioSet dataset (cached in the project data directory) and
# declare the "audio" column as Audio with a 32 kHz sampling rate.
dataset = load_dataset(
    "agkphysics/AudioSet",
    cache_dir="/home/lrauch/projects/birdMAE/data/audioset_balanced",
).cast_column("audio", Audio(sampling_rate=32_000))

def _one_hot_encode(batch): # mapping does not work here for some reason 
    label_list = [y for y in batch["human_labels"]]
    
    class_one_hot_matrix = np.zeros((len(label_list), 527), dtype=np.float32)
    
    for class_idx, indices in enumerate(label_list):
        class_one_hot_matrix[class_idx, indices] = 1.0
    
    return {"human_labels": class_one_hot_matrix}

# Read the custom ontology JSON; its keys are used as the class names.
with open("/home/lrauch/projects/birdMAE/data/audioset_ontology_custom527.json", "r") as f:
    ontology = json.load(f)

label_names = list(ontology.keys())
num_classes = len(ontology)
class_label = Sequence(ClassLabel(num_classes=num_classes, names=label_names))

# Cast the label column to a sequence of ClassLabel values, then replace it
# with the matrix produced by _one_hot_encode, 1000 rows per batch.
dataset = dataset.cast_column("human_labels", class_label).map(
    _one_hot_encode,
    batched=True,
    batch_size=1000,
    load_from_cache_file=True,
)

# Drop the rows marked as corrupted from each split. The numbers are row
# positions in the splits as they exist at this point of the notebook.
# NOTE: this cell is not idempotent - running it again on the already
# filtered dataset shifts positions and would remove different, valid rows.
corrupted_rows = {
    "train": [15_759, 17_532],
    "test": [6_182],
}

for split_name, rows_to_remove in corrupted_rows.items():
    rows_to_remove = set(rows_to_remove)  # constant-time membership checks
    indices_to_keep = [
        i for i in range(len(dataset[split_name])) if i not in rows_to_remove
    ]
    dataset[split_name] = dataset[split_name].select(indices_to_keep)

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

In [24]:
from tqdm import tqdm 
# Speed check: read the audio array of the first 500 training rows. The value
# is discarded; only the access time reported by tqdm is of interest.
for i in tqdm(range(500)):
    dataset["train"][i]["audio"]["array"] # measured: 56 it/s, 8 seconds for 500 rows

100%|██████████| 500/500 [00:08<00:00, 55.96it/s]


2) Create spectrograms 

In [25]:
# Run the spectrogram mapping over every split in batches of 500 using
# 5 worker processes. All columns present in the train split are removed, so
# only the columns returned by the mapping function remain.
dataset = dataset.map(
    get_prepare_dataset_fn(label_key="human_labels", sample_frequency=32_000),
    batched=True,
    batch_size=500,
    num_proc=5,
    remove_columns=dataset["train"].column_names,
)

# Persist the prepared dataset.
# size: 18GB, if audiofiles: 47GB
dataset.save_to_disk("../data/audioset_balanced/audioset_balanced_prepared_32")

Map (num_proc=5):   0%|          | 0/18683 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/17141 [00:00<?, ? examples/s]

Saving the dataset (0/20 shards):   0%|          | 0/18683 [00:00<?, ? examples/s]

Saving the dataset (0/18 shards):   0%|          | 0/17141 [00:00<?, ? examples/s]

In [26]:
from tqdm import tqdm 
# Speed check: convert the stored "input_values" of the first 500 training
# rows to NumPy arrays. The result is discarded; only the timing matters.
for i in tqdm(range(500)):
    np.array(dataset["train"][i]["input_values"]) # measured: 2719 it/s, 0 seconds for 500 rows

100%|██████████| 500/500 [00:00<00:00, 2719.08it/s]


In [14]:
from datasets import load_from_disk
# Reload the prepared dataset from the directory it was saved to earlier in
# this notebook, replacing the in-memory `dataset`.
# NOTE(review): this cell's execution count (14) is lower than that of the
# cells before it, so it was run out of order - confirm the notebook still
# works with Restart & Run All.
dataset = load_from_disk("../data/audioset_balanced/audioset_balanced_prepared_32")


Loading dataset from disk:   0%|          | 0/39 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/35 [00:00<?, ?it/s]