### Spectrogram Mapping Function

In [3]:
from torchaudio.compliance.kaldi import fbank
import torch 
import numpy as np
from PIL import Image

def prepare_dataset(batch, label_key="human_labels", sample_frequency=32_000):
    """Replace a batch's decoded audio with filterbank images and copy its labels.

    Args:
        batch: Batched mapping whose "audio" entry is a list of dicts, each
            holding the waveform as a NumPy array under "array"
            (assumed 1-D / mono, since a leading axis is added before
            calling fbank -- TODO confirm).
        label_key: Name of the column copied into "label".
        sample_frequency: Sampling rate forwarded to fbank.

    Returns:
        The same ``batch`` object with two added entries: "input_values"
        (one PIL image per example) and "label".
    """
    spectrograms = []
    for entry in batch["audio"]:
        waveform = torch.from_numpy(entry["array"])
        mel = fbank(
            waveform.unsqueeze(0),
            htk_compat=True,
            sample_frequency=sample_frequency,
            use_energy=False,
            window_type='hanning',
            num_mel_bins=128,
            dither=0.0,
            frame_shift=10
        )
        # Transposed so the 128 mel bins run along the array rows; PIL then
        # reports the image as (number of frames) x 128.
        spectrograms.append(Image.fromarray(mel.T.numpy()))

    batch['input_values'] = spectrograms
    batch["label"] = batch[label_key]

    return batch

def get_prepare_dataset_fn(label_key="human_labels", sample_frequency=32_000):
    """Build a single-argument callable suitable for ``Dataset.map``.

    The returned function takes a batch and forwards it, together with the
    ``label_key`` and ``sample_frequency`` captured here, to ``prepare_dataset``.
    """
    def _apply(batch):
        return prepare_dataset(batch, label_key, sample_frequency)

    return _apply

In [11]:
from datasets import load_dataset
from datasets import Audio, load_dataset, Sequence, ClassLabel

# Load the "agkphysics/AudioSet" dataset, caching it under the project data directory.
# NOTE(review): the cache path is an absolute, user-specific path.
dataset = load_dataset(
    "agkphysics/AudioSet", 
    cache_dir="/home/lrauch/projects/birdMAE/data/audioset_balanced")

# decode=False: presumably leaves the audio column undecoded so that only the
# stored file reference is read (the following cell inspects "path") -- confirm.
dataset = dataset.cast_column("audio", Audio(sampling_rate=32_000, decode=False))

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

In [8]:
# Inspect the stored path of the first training example's audio entry.
dataset["train"][0]["audio"]["path"]

'audio/bal_train/--PJHxphWEs.flac'

### AudioSet - Balanced

1) Prepare the dataset 

In [23]:
import json
from datasets import Audio, load_dataset, Sequence, ClassLabel


# Reload AudioSet from the same cache directory; this rebinds `dataset`.
dataset = load_dataset(
    "agkphysics/AudioSet", 
    cache_dir="/home/lrauch/projects/birdMAE/data/audioset_balanced")

# Cast the audio column to a 32 kHz Audio feature; unlike the earlier cell,
# `decode` is left at its default here.
dataset = dataset.cast_column("audio", Audio(sampling_rate=32_000))

def _one_hot_encode(batch): # mapping does not work here for some reason 
    label_list = [y for y in batch["human_labels"]]
    
    class_one_hot_matrix = np.zeros((len(label_list), 527), dtype=np.float32)
    
    for class_idx, indices in enumerate(label_list):
        class_one_hot_matrix[class_idx, indices] = 1.0
    
    return {"human_labels": class_one_hot_matrix}

# Read the custom ontology JSON; its top-level keys are used as the class names.
with open("/home/lrauch/projects/birdMAE/data/audioset_ontology_custom527.json", "r") as f:
    ontology = json.load(f)

num_classes = len(ontology)
label_names = list(ontology.keys())
# Cast "human_labels" to a sequence of ClassLabel -- presumably so label names
# are stored as integer class ids that _one_hot_encode can use as column
# indices (confirm).
class_label = Sequence(ClassLabel(num_classes=num_classes, names=label_names))
dataset = dataset.cast_column("human_labels", class_label)
# NOTE(review): _one_hot_encode produces 527-column rows as called here, so the
# ontology file is expected to list 527 classes -- confirm num_classes == 527.
dataset = dataset.map(_one_hot_encode, batched=True, batch_size=1000, load_from_cache_file=True)

# Drop rows flagged as corrupted from each split. The indices are row
# positions in the splits as they exist when this cell runs; `select`
# presumably returns a re-indexed dataset, so re-running this cell on its own
# output would drop different rows -- run it once per fresh load.
corrupted_rows = {"train": [15_759, 17_532], "test": [6_182]}
for split_name, rows_to_remove in corrupted_rows.items():
    excluded = set(rows_to_remove)  # set lookup instead of scanning a list per row
    indices_to_keep = [i for i in range(len(dataset[split_name])) if i not in excluded]
    dataset[split_name] = dataset[split_name].select(indices_to_keep)

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

In [24]:
from tqdm import tqdm 
# Time access to the audio array of the first 500 training rows.
for i in tqdm(range(500)):
    dataset["train"][i]["audio"]["array"] # 56 it/s, 8 seconds

100%|██████████| 500/500 [00:08<00:00, 55.96it/s]


2) Create spectrograms 

In [25]:
# Apply the spectrogram mapping to every split in batches of 500 using 5 worker
# processes. All original columns are removed, so only what the mapping
# function returns is kept.
dataset= dataset.map(
    get_prepare_dataset_fn(
        label_key="human_labels",
        sample_frequency=32_000
    ),
    remove_columns=dataset["train"].column_names,
    batched=True,
    batch_size=500,
    num_proc=5)

# NOTE(review): relative path -- resolves against the kernel's working directory.
dataset.save_to_disk("../data/audioset_balanced/audioset_balanced_prepared_32") 
# size: 18GB, if audiofiles: 47GB

Map (num_proc=5):   0%|          | 0/18683 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/17141 [00:00<?, ? examples/s]

Saving the dataset (0/20 shards):   0%|          | 0/18683 [00:00<?, ? examples/s]

Saving the dataset (0/18 shards):   0%|          | 0/17141 [00:00<?, ? examples/s]

In [26]:
from tqdm import tqdm 
# Time access to the stored spectrogram images of the first 500 training rows.
for i in tqdm(range(500)):
    np.array(dataset["train"][i]["input_values"]) # 2719 it/s, 0 seconds

100%|██████████| 500/500 [00:00<00:00, 2719.08it/s]


In [1]:
from datasets import load_from_disk
# NOTE(review): the recorded run of this cell raised FileNotFoundError. The
# path is relative, so it depends on the kernel's working directory -- confirm
# the cwd or switch to an absolute/configurable data directory.
dataset = load_from_disk("../data/audioset_balanced/audioset_balanced_prepared_32")


FileNotFoundError: Directory ../data/audioset_balanced/audioset_balanced_prepared_32 not found

### Prepare XC BirdSet

In [40]:
from datasets import load_from_disk
# Load the processed XCM dataset from disk (rebinds `dataset`); the HSN
# alternative is kept commented out below.
dataset = load_from_disk("/home/lrauch/projects/birdMAE/data/XCM/XCM_processed_42_004252813e2e3003")
#dataset = load_from_disk("/home/lrauch/projects/birdMAE/data/HSN/HSN_processed_42_8fc39823e3ef9314")

In [41]:
# Show the splits, columns and row counts of the loaded DatasetDict.
dataset

DatasetDict({
    train: Dataset({
        features: ['filepath', 'labels', 'detected_events', 'start_time', 'end_time'],
        num_rows: 184565
    })
    valid: Dataset({
        features: ['filepath', 'labels', 'detected_events', 'start_time', 'end_time'],
        num_rows: 4560
    })
})

In [2]:
# Peek at the first training row.
dataset["train"][0]

{'filepath': '/home/lrauch/projects/birdMAE/data/XCM/downloads/extracted/31d12b771a7fcbaaa492e8d786a5c0fe2aea66b3ff8a803977e9827be6ede33f/data/xeno-canto/europe/parus_major/XC654011.ogg',
 'labels': array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [63]:
# Take the first 10,000 training rows as a smaller working subset.
dataset_t = dataset["train"].select(range(10000))

In [7]:
# Save the subset to disk.
# NOTE(review): the recorded output reports 1000 examples although the
# preceding cell selects 10000 -- the output looks stale (execution counts are
# out of order).
dataset_t.save_to_disk("../data/XCM/XCM_test_noimage")

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [84]:
# columns = ["filepath", "labels", "start_time", "end_time"]
# dataset_t.set_format("numpy", columns=columns, output_all_columns=True)
#test.set_format("python", columns=["detected_events"], output_all_columns=True)

In [61]:
from torchaudio.compliance.kaldi import fbank
import torch 
import numpy as np
from PIL import Image
from birdset.datamodule.components.event_decoding import EventDecoding

# Module-level decoder used inside prepare_dataset below. min_len/max_len are
# presumably clip lengths in seconds (i.e. fixed 5-second clips) -- confirm
# against EventDecoding.
event_decoder = EventDecoding(min_len=5, max_len=5, sampling_rate=32_000)

def prepare_dataset(batch, label_key="labels", sample_frequency=32_000):
    """Decode a batch's audio events and turn them into filterbank images.

    Uses the module-level ``event_decoder`` to obtain the audio for each row.

    Args:
        batch: Batched mapping passed in by ``Dataset.map``; must contain
            ``label_key`` and whatever columns ``event_decoder`` reads.
        label_key: Name of the column copied into "label".
        sample_frequency: Sampling rate forwarded to fbank.

    Returns:
        A new dict with exactly two entries: "input_values" (one PIL image
        per example) and "label" (the labels read from the input batch).
    """
    # Read the labels before the decoder touches the batch.
    labels = batch[label_key]

    # Called for its side effect only: the waveforms are read from
    # batch["audio"] afterwards, so the decoder is expected to add that entry
    # to `batch` in place (the dataset's own columns do not include "audio").
    # TODO confirm against EventDecoding.
    event_decoder(batch)
    waveforms = [torch.from_numpy(audio["array"]) for audio in batch["audio"]]

    imgs = []
    for waveform in waveforms:
        mel = fbank(
            waveform.unsqueeze(0),
            htk_compat=True,
            sample_frequency=sample_frequency,
            use_energy=False,
            window_type='hanning',
            num_mel_bins=128,
            dither=0.0,
            frame_shift=10
        )
        # Transposed so the 128 mel bins run along the array rows; PIL then
        # reports the image as (number of frames) x 128.
        imgs.append(Image.fromarray(mel.T.numpy()))

    # Return a fresh dict instead of rebinding the `batch` parameter.
    return {'input_values': imgs, "label": labels}

def get_prepare_dataset_fn(label_key="labels", sample_frequency=32_000):
    """Build a single-argument callable suitable for ``Dataset.map``.

    The returned function takes a batch and forwards it, together with the
    ``label_key`` and ``sample_frequency`` captured here, to ``prepare_dataset``.
    """
    def _apply(batch):
        return prepare_dataset(batch, label_key, sample_frequency)

    return _apply

In [64]:
# Apply the spectrogram mapping to the subset in batches of 100 using 8 worker
# processes, bypassing the cache. The listed input columns are dropped.
# NOTE(review): the recorded run of this cell ended in a TimeoutError.
dataset_t = dataset_t.map(
    get_prepare_dataset_fn(
        label_key="labels",
        sample_frequency=32_000
    ),
    remove_columns=["filepath", "labels", "detected_events", "start_time", "end_time"],
    batched=True,
    batch_size=100,
    num_proc=8,
    load_from_cache_file=False)

Map (num_proc=8):   0%|          | 0/10000 [00:00<?, ? examples/s]

TimeoutError: 

In [17]:
# Switch the subset's output format to "python" (mutates dataset_t in place).
dataset_t.set_format("python")


In [18]:
# Save the subset with spectrogram images to disk (relative path).
dataset_t.save_to_disk("../data/XCM/XCM_test_image_")

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [1]:
import datasets 
# Inspect the datasets library's IN_MEMORY_MAX_SIZE setting.
datasets.config.IN_MEMORY_MAX_SIZE

0.0

In [2]:
from datasets import load_from_disk
# Load the image-free subset, passing keep_in_memory=None explicitly.
ds = load_from_disk("../data/XCM/XCM_test_noimage", keep_in_memory=None)

In [5]:
from datasets import load_from_disk
# Load the image-free subset whose audio files are read in the loop below.
ds = load_from_disk("../data/XCM/XCM_test_noimage", keep_in_memory=None)

import soundfile as sf
from tqdm import tqdm
# Time reading the first (up to) 5 seconds of every file straight from disk.
for i in tqdm(range(len(ds))):
    file_info = sf.info(ds[i]["filepath"])
    sr, total_duration = file_info.samplerate, file_info.duration

    # Clip the read window to the file length, then convert seconds to frames.
    start = 0
    end = min(total_duration, 5)
    start, end = int(start * sr), int(end * sr)
    audio, sr = sf.read(ds[i]["filepath"], start=start, stop=end)


    # on the server in a notebook: 16 seconds, 60 it/s

100%|██████████| 1000/1000 [00:04<00:00, 215.34it/s]


In [6]:
import psutil 

def print_memory_usage():
    """Print the resident set size (RSS) of the current process.

    The byte count is divided by 1024 ** 2 and shown with two decimals.
    """
    rss_bytes = psutil.Process().memory_info().rss
    bytes_per_mb = 1024 ** 2
    print(f"Memory Usage: {rss_bytes / bytes_per_mb:.2f} MB")

# Call the function to print memory usage
print_memory_usage()

Memory Usage: 205.73 MB


In [7]:
import numpy as np
from tqdm import tqdm
from datasets import load_from_disk

# Load the subset with stored spectrogram images (not kept in memory) and time
# row-by-row access to "input_values".
ds = load_from_disk("../data/XCM/XCM_test_image_", keep_in_memory=False)
for i in tqdm(range(len(ds))):
    ds[i]["input_values"]

100%|██████████| 1000/1000 [00:00<00:00, 4044.59it/s]


In [8]:
# NOTE(review): the recorded run raised NameError -- dataset_t is not defined
# in this kernel session; it is created in an earlier cell that must be run first.
dataset_t["input_values"][0]

NameError: name 'dataset_t' is not defined

In [89]:
# Switch the subset's output format to "python" before inspecting a row.
dataset_t.set_format("python")


In [90]:
# Show the first stored spectrogram; the recorded output is a mode-F image of size 498x128.
dataset_t[0]["input_values"]

<PIL.TiffImagePlugin.TiffImageFile image mode=F size=498x128>

In [10]:
from datasets import load_dataset
# Load a small demo dataset (rebinds `dataset`).
# NOTE(review): the recorded run was interrupted (KeyboardInterrupt).
dataset = load_dataset("lhoestq/demo1")

KeyboardInterrupt: 

In [2]:
from transformers import AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
# Text example: tokenize the "sentence1" column of GLUE/MRPC with a
# bert-base-uncased tokenizer via a batched map (rebinds `dataset`).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = load_dataset('glue', 'mrpc')
encoded_dataset = dataset.map(lambda examples: tokenizer(examples["sentence1"]), batched=True)



In [4]:
# Time row-by-row access to "input_ids" over the whole encoded training split.
for i in tqdm(range(len(encoded_dataset["train"]))):
    encoded_dataset["train"][i]["input_ids"]

100%|██████████| 3668/3668 [00:00<00:00, 27160.20it/s]


In [1]:
import psutil 

def print_memory_usage():
    """Print the resident set size (RSS) of the current process.

    The byte count is divided by 1024 ** 2 and shown with two decimals.
    """
    rss_bytes = psutil.Process().memory_info().rss
    bytes_per_mb = 1024 ** 2
    print(f"Memory Usage: {rss_bytes / bytes_per_mb:.2f} MB")

# Call the function to print memory usage
print_memory_usage()

Memory Usage: 61.85 MB
