In [3]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's helper
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Using helper functions in src/data/load/data_loader

In [1]:
import random
from data.load.data_loader import get_codecfake_audio_id_list, load_audio_data

In [2]:
# Ask the data_loader helper for the codecfake audio IDs.
audio_ids = get_codecfake_audio_id_list()
# Bare last expression: the notebook displays how many IDs were returned.
len(audio_ids)

35373

In [11]:
# Streaming mode: it won't download the data to local disk. Instead, it streams
# the data directly from HuggingFace, holding only one sample in memory at each
# iteration.

# Seed the RNG so the sampled IDs (and therefore the printed output) are the
# same on every Restart & Run All. Any fixed value works.
random.seed(0)
random_audio_ids = random.sample(audio_ids, 5)
iterable_dataset = load_audio_data(
    audio_ids=random_audio_ids,
    dataset='codecfake'
)

for sample in iterable_dataset:
    print(sample)

{'audio': {'path': 'F01_p310_044.flac', 'array': array([-0.00521851, -0.00527954, -0.00531006, ..., -0.00335693,
       -0.00335693, -0.00332642]), 'sampling_rate': 16000}, 'audio_id': 'p310_044', 'real_or_fake': 'F01'}
{'audio': {'path': 'F05_p310_044.flac', 'array': array([ 0.00112915,  0.00125122,  0.00125122, ..., -0.00280762,
       -0.00265503, -0.00286865]), 'sampling_rate': 48000}, 'audio_id': 'p310_044', 'real_or_fake': 'F05'}
{'audio': {'path': 'F02_p310_044.flac', 'array': array([ 1.06811523e-03,  1.70898438e-03,  1.28173828e-03, ...,
        5.49316406e-04,  6.10351562e-05, -1.22070312e-04]), 'sampling_rate': 16000}, 'audio_id': 'p310_044', 'real_or_fake': 'F02'}
{'audio': {'path': 'F06_p310_044.flac', 'array': array([0.00421143, 0.00418091, 0.00424194, ..., 0.00863647, 0.00878906,
       0.00888062]), 'sampling_rate': 16000}, 'audio_id': 'p310_044', 'real_or_fake': 'F06'}
{'audio': {'path': 'F03_p310_044.flac', 'array': array([0.00421143, 0.00418091, 0.00424194, ..., 0.008

In [12]:
# Cached mode: with cache_dir set, the data is downloaded to the local disk and
# cached in the specified cache directory.
# The next time an audio ID from any already-cached parquet file is requested,
# it is retrieved from the local cache folder instead.
# Note: the total space consumed by all 381 parquet files is around 20GB.
# Iteration still holds only one sample in memory at a time.

# Fixed seed so the same five audio IDs are sampled on every run.
random.seed(5)
random_audio_ids = random.sample(audio_ids, 5)
iterable_dataset = load_audio_data(
    audio_ids=random_audio_ids,
    dataset='codecfake',
    cache_dir='./cache'
)

for sample in iterable_dataset:
    print(sample)

{'audio': {'path': 'R_p278_119.flac', 'array': array([-4.27246094e-04, -9.15527344e-05, -4.27246094e-04, ...,
       -8.85009766e-04, -7.93457031e-04, -7.93457031e-04]), 'sampling_rate': 48000}, 'audio_id': 'p278_119', 'real_or_fake': 'R'}
{'audio': {'path': 'F04_p278_119.flac', 'array': array([ 5.18798828e-04, -9.15527344e-05,  1.52587891e-04, ...,
       -9.76562500e-04, -9.76562500e-04, -9.76562500e-04]), 'sampling_rate': 24000}, 'audio_id': 'p278_119', 'real_or_fake': 'F04'}
{'audio': {'path': 'F03_p278_119.flac', 'array': array([0.00167847, 0.00219727, 0.00231934, ..., 0.00686646, 0.00668335,
       0.0067749 ]), 'sampling_rate': 16000}, 'audio_id': 'p278_119', 'real_or_fake': 'F03'}
{'audio': {'path': 'F02_p278_119.flac', 'array': array([ 2.13623047e-04,  0.00000000e+00, -2.44140625e-04, ...,
        3.35693359e-04,  9.15527344e-05, -1.83105469e-04]), 'sampling_rate': 16000}, 'audio_id': 'p278_119', 'real_or_fake': 'F02'}
{'audio': {'path': 'F06_p278_119.flac', 'array': array([0.

### Code to load codecfake data from HuggingFace (without helper functions in src module)

In [1]:
import os
import time
import random

import json
import requests

import numpy as np
import matplotlib.pyplot as plt

import librosa

from datasets import load_dataset, Audio
from collections import defaultdict

In [2]:
from utils.config import load_config

# Load the project configuration and read the codecfake 'cache_files' entry
# from its 'data_paths' section.
config      = load_config()
cache_files = config['data_paths']['codecfake']['cache_files']

In [3]:
# HuggingFace dataset repository ("<username>/<dataset_name>") and the URL of
# the audio_id_to_file_map.json file stored under its resolve/main path.
username      = "ajaykarthick"
dataset_name  = "codecfake-audio"
repo_id       = f"{username}/{dataset_name}"
json_file_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/audio_id_to_file_map.json"

In [4]:
# Get the Audio ID to Parquet File Map.
# requests applies no timeout by default, so without one a stalled connection
# would hang this cell indefinitely; fail after 30 seconds instead.
response = requests.get(json_file_url, timeout=30)
# Raise on HTTP error statuses rather than trying to parse an error page as JSON.
response.raise_for_status()
audio_id_to_file_map = response.json()

### Stream from HuggingFace Datasets (without downloading the datasets)

In [6]:
def get_dataset(audio_id):
    """Stream the examples of a single audio ID from its parquet file.

    The parquet file is looked up in ``audio_id_to_file_map``, opened with
    ``load_dataset(..., streaming=True)`` and filtered down to the rows whose
    ``'audio_id'`` field equals ``audio_id``.

    Note: later cells in this notebook define other functions with the same
    name; whichever definition was executed last is the one in effect.

    Args:
        audio_id: Key of ``audio_id_to_file_map`` identifying the audio.

    Returns:
        The filtered streaming dataset produced by ``.filter(...)``.

    Raises:
        KeyError: If ``audio_id`` is not present in ``audio_id_to_file_map``.
    """
    def _is_requested(row):
        # Keep only the rows that belong to the requested audio ID.
        return row['audio_id'] == audio_id

    source_file = audio_id_to_file_map[audio_id]
    stream = load_dataset(
        "parquet",
        data_files=source_file,
        split="train",
        streaming=True,
    )
    return stream.filter(_is_requested)

# Use the last audio ID in the map as the example to stream.
audio_ids = list(audio_id_to_file_map.keys())
audio_id = audio_ids[-1]

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()

for example in get_dataset(audio_id):
    print(example)

current_time = time.perf_counter()
print(f"Time elapsed: {current_time - start_time} seconds")

{'audio': {'path': 'R_p376_295.flac', 'array': array([-0.00546265, -0.0085144 , -0.00708008, ..., -0.01385498,
       -0.01327515, -0.01489258]), 'sampling_rate': 48000}, 'audio_id': 'p376_295', 'real_or_fake': 'R'}
{'audio': {'path': 'F04_p376_295.flac', 'array': array([-0.00265503, -0.00244141, -0.0022583 , ..., -0.01498413,
       -0.0149231 , -0.01489258]), 'sampling_rate': 24000}, 'audio_id': 'p376_295', 'real_or_fake': 'F04'}
{'audio': {'path': 'F03_p376_295.flac', 'array': array([ 0.00158691,  0.00146484,  0.00158691, ..., -0.01025391,
       -0.01049805, -0.01074219]), 'sampling_rate': 16000}, 'audio_id': 'p376_295', 'real_or_fake': 'F03'}
{'audio': {'path': 'F02_p376_295.flac', 'array': array([-0.00448608, -0.00445557, -0.00537109, ...,  0.00067139,
        0.00033569,  0.00042725]), 'sampling_rate': 16000}, 'audio_id': 'p376_295', 'real_or_fake': 'F02'}
{'audio': {'path': 'F05_p376_295.flac', 'array': array([-0.00708008, -0.00720215, -0.00750732, ..., -0.0055542 ,
       -0.0

### Stream more than one audio ID from a single parquet file

In [7]:
from datasets import load_dataset

def get_dataset(audio_ids):
    """Stream the examples of one or more audio IDs stored in one parquet file.

    Args:
        audio_ids: A single audio ID string, or an iterable of audio IDs.
            Every ID must map to the same entry in ``audio_id_to_file_map``.

    Returns:
        The streaming dataset opened with ``load_dataset(..., streaming=True)``
        and filtered to the rows whose ``'audio_id'`` is one of ``audio_ids``.

    Raises:
        ValueError: If ``audio_ids`` is empty, or if the IDs do not all belong
            to the same parquet file.
        KeyError: If an ID is not present in ``audio_id_to_file_map``.
    """
    if isinstance(audio_ids, str):
        audio_ids = [audio_ids]

    # Materialise a private copy: this accepts any iterable (tuple, generator, ...)
    # and keeps the lazily evaluated filter independent of later caller mutations.
    audio_ids = list(audio_ids)
    if not audio_ids:
        # Previously this surfaced as a bare IndexError from audio_ids[0].
        raise ValueError("audio_ids must contain at least one audio ID")

    # Check if all audio_ids belong to the same parquet file
    parquet_file = audio_id_to_file_map[audio_ids[0]]
    for audio_id in audio_ids:
        if audio_id_to_file_map[audio_id] != parquet_file:
            raise ValueError("Not all audio_ids belong to the same parquet file")

    # The IDs were just used as dict keys, so they are hashable; a frozenset
    # gives O(1) membership tests for the per-row filter below.
    wanted_ids = frozenset(audio_ids)

    iterable_ds = load_dataset("parquet", data_files=parquet_file, split="train", streaming=True)

    # Filter the dataset to include only the desired audio_ids
    return iterable_ds.filter(lambda example: example['audio_id'] in wanted_ids)


# Take the first 10 audio IDs from the map. get_dataset raises ValueError
# unless all of them map to the same parquet file.
audio_ids = list(audio_id_to_file_map.keys())[:10]

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()

dataset = get_dataset(audio_ids)
for example in dataset:
    print(example)

current_time = time.perf_counter()
print(f"Total time elapsed: {current_time - start_time} seconds")


{'audio': {'path': 'R_p225_002.flac', 'array': array([-0.0045166 , -0.00665283, -0.006073  , ...,  0.00723267,
        0.00668335,  0.00775146]), 'sampling_rate': 48000}, 'audio_id': 'p225_002', 'real_or_fake': 'R'}
{'audio': {'path': 'F03_p225_002.flac', 'array': array([0.00195312, 0.0015564 , 0.00164795, ..., 0.00265503, 0.00244141,
       0.00262451]), 'sampling_rate': 16000}, 'audio_id': 'p225_002', 'real_or_fake': 'F03'}
{'audio': {'path': 'F04_p225_002.flac', 'array': array([-0.0012207 , -0.00112915, -0.0010376 , ...,  0.00564575,
        0.00564575,  0.00558472]), 'sampling_rate': 24000}, 'audio_id': 'p225_002', 'real_or_fake': 'F04'}
{'audio': {'path': 'F01_p225_002.flac', 'array': array([ 0.00186157,  0.00192261,  0.00195312, ..., -0.00888062,
       -0.00888062, -0.00894165]), 'sampling_rate': 16000}, 'audio_id': 'p225_002', 'real_or_fake': 'F01'}
{'audio': {'path': 'F05_p225_002.flac', 'array': array([-0.00335693, -0.0032959 , -0.00320435, ...,  0.00796509,
        0.0077209

### Stream multiple audio files from multiple parquet files

In [5]:
def get_dataset(audio_ids):
    """Stream the examples of audio IDs that may live in different parquet files.

    The IDs are grouped by the ``'train'`` parquet file recorded for them in
    ``audio_id_to_file_map``. Each file is then opened once in streaming mode
    and filtered to the IDs requested from it.

    Args:
        audio_ids: A single audio ID string, or an iterable of audio IDs.

    Returns:
        A one-shot generator yielding the matching examples, file by file in
        the order in which each file was first requested.

    Raises:
        KeyError: Raised immediately (before any streaming starts) if an ID is
            not present in ``audio_id_to_file_map``.
    """
    if isinstance(audio_ids, str):
        audio_ids = [audio_ids]

    # Map each parquet file to the IDs wanted from it. Sets drop duplicate IDs
    # and give O(1) membership tests in the per-row filter below.
    parquet_to_audio_ids = defaultdict(set)
    for audio_id in audio_ids:
        parquet_file = audio_id_to_file_map[audio_id]['train']
        parquet_to_audio_ids[parquet_file].add(audio_id)

    # Create a generator to yield filtered examples from each parquet file
    def dataset_generator():
        for parquet_file, ids in parquet_to_audio_ids.items():
            iterable_ds = load_dataset("parquet", data_files={'train': parquet_file}, split="train", streaming=True)
            # The filtered stream is fully consumed inside this iteration, so
            # the closure over `ids` always sees the current file's IDs.
            yield from iterable_ds.filter(lambda example: example['audio_id'] in ids)

    return dataset_generator()

audio_ids = list(audio_id_to_file_map.keys())
# Seed the RNG so the same 10 IDs (and the same output) are produced on every
# Restart & Run All. Any fixed value works.
random.seed(0)
random_audio_ids = random.sample(audio_ids, 10)

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()

dataset_gen = get_dataset(random_audio_ids)

# Iterate over the generator to process each example
for example in dataset_gen:
    print(example)

current_time = time.perf_counter()
print(f"Total time elapsed: {current_time - start_time} seconds")


{'audio': {'path': 'R_p236_083.flac', 'array': array([ 0.03491211,  0.04995728,  0.04464722, ..., -0.06677246,
       -0.06228638, -0.06896973]), 'sampling_rate': 48000}, 'audio_id': 'p236_083', 'real_or_fake': 'R'}
{'audio': {'path': 'F03_p236_083.flac', 'array': array([ 0.02581787,  0.02758789,  0.02987671, ..., -0.06774902,
       -0.06851196, -0.06954956]), 'sampling_rate': 16000}, 'audio_id': 'p236_083', 'real_or_fake': 'F03'}
{'audio': {'path': 'F04_p236_083.flac', 'array': array([0.03515625, 0.03408813, 0.03442383, ..., 0.03271484, 0.03311157,
       0.03344727]), 'sampling_rate': 24000}, 'audio_id': 'p236_083', 'real_or_fake': 'F04'}
{'audio': {'path': 'F05_p236_083.flac', 'array': array([ 0.01300049,  0.00997925,  0.00726318, ..., -0.02612305,
       -0.02603149, -0.02661133]), 'sampling_rate': 48000}, 'audio_id': 'p236_083', 'real_or_fake': 'F05'}
{'audio': {'path': 'F01_p236_083.flac', 'array': array([ 0.0480957 ,  0.04959106,  0.05072021, ..., -0.01605225,
       -0.0133056