## Example of using the dataset

Resources on building the dataset
- https://huggingface.co/datasets/AILAB-VNUHCM/vivos/blob/main/vivos.py
- https://huggingface.co/docs/datasets/en/audio_dataset#loading-script
- https://huggingface.co/docs/datasets/en/stream

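Whenever changes are made to the dataset repo, clear the local Hugging Face datasets cache so that the updated version is fetched on the next load. In PowerShell, run `Remove-Item -Recurse -Force ~/.cache/huggingface/datasets/` from the terminal.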

In [1]:
# Paths of the audio clips to export. A streamed sample is written to disk
# only when its sample["audio"]["path"] exactly equals one of these entries.
file_paths = [
    "waves/3000-1_11.wav",
    "waves/3000-1_28.wav",
    "waves/3000-1_52.wav",
    "waves/3001-1_101.wav",
    "waves/3001-1_105.wav",
    "waves/3001-1_11.wav",
    "waves/3001-1_110.wav",
    "waves/3001-1_113.wav",
    "waves/3001-1_127.wav",
    "waves/3001-1_13.wav",
    "waves/3001-1_130.wav",
    "waves/3001-1_142.wav",
    "waves/3001-1_197.wav",
    "waves/3001-1_75.wav",
    "waves/3001-1_83.wav",
    "waves/3001-1_96.wav"
]

In [2]:
# Upper bound on how many examples are taken from the start of the streamed
# test split; only these leading examples are searched for the wanted clips.
num_test_examples = 80

In [3]:
from datasets import load_dataset
from IPython.display import Audio
import soundfile as sf
from torch.utils.data import IterableDataset
import os 
import numpy as np
from itertools import islice
# Hugging Face Hub dataset repository to load the test split from.
dataset_repo = "johnlohjy/imda_nsc_p3_test_noiseaugmented"
# streaming=True yields examples lazily instead of downloading the whole split.
# trust_remote_code=True lets `datasets` execute the repo's own loading script,
# so it should only be enabled for repositories you trust.
dataset_test = load_dataset(dataset_repo, split='test', streaming=True, trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

In [4]:
class SlicedDataset(IterableDataset):
    """Iterable dataset exposing only the first ``num_examples`` items of ``dataset``.

    Args:
        dataset: Any iterable to read examples from.
        num_examples: Maximum number of leading examples to yield.
    """

    def __init__(self, dataset, num_examples):
        self.num_examples = num_examples
        self.dataset = dataset

    def __iter__(self):
        """Return a new iterator over at most ``num_examples`` leading items."""
        # iter() is called on every pass, so each iteration starts over
        # whenever the wrapped dataset itself can be iterated repeatedly.
        source = iter(self.dataset)
        return islice(source, self.num_examples)

    def __len__(self):
        """Return the requested cap, even if the wrapped dataset holds fewer items."""
        return self.num_examples

# Restrict the streamed test split to its first `num_test_examples` examples.
dataset_test_reduced = SlicedDataset(dataset=dataset_test, num_examples=num_test_examples)

In [5]:
# Export every requested clip found in the reduced test stream as an audio file.
output_dir = "audio_samples_find_ideal_context_len"
os.makedirs(output_dir, exist_ok=True)

for sample in dataset_test_reduced:
    audio = sample["audio"]
    audio_path = audio["path"]
    if audio_path not in file_paths:
        # Not one of the requested clips; move on to the next sample.
        continue

    audio_array = audio["array"].astype(np.float32)
    sample_rate = audio["sampling_rate"]
    # Output is flat: only the base name of the source path is kept.
    output_file = os.path.join(output_dir, os.path.basename(audio_path))
    sf.write(output_file, audio_array, sample_rate)