In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Using helper functions in src/data/load/data_loader

In [2]:
import random
from data.load.data_loader import get_wavefake_audio_id_list, load_audio_data

In [3]:
# Ask the data_loader helper for the WaveFake audio IDs; the bare `len(...)`
# expression is what the cell displays.
audio_ids = get_wavefake_audio_id_list()
len(audio_ids)

13100

In [4]:
# Nothing is downloaded to local disk here: the data is streamed directly from
# HuggingFace, so only one sample is held in memory at each iteration.

random_audio_ids = random.sample(audio_ids, k=5)
iterable_dataset = load_audio_data(dataset='wavefake', audio_ids=random_audio_ids)

for sample in iterable_dataset:
    print(sample)

{'audio': {'path': 'LJ022-0185_gen.wav', 'array': array([0.00091553, 0.00100708, 0.00100708, ..., 0.00082397, 0.00082397,
       0.00079346]), 'sampling_rate': 22050}, 'audio_id': 'LJ022-0185', 'real_or_fake': 'WF1'}
{'audio': {'path': 'LJ022-0185_gen.wav', 'array': array([-2.47192383e-03, -2.62451172e-03, -2.59399414e-03, ...,
       -1.52587891e-04, -6.10351562e-05,  3.35693359e-04]), 'sampling_rate': 22050}, 'audio_id': 'LJ022-0185', 'real_or_fake': 'WF2'}
{'audio': {'path': 'LJ022-0185_generated.wav', 'array': array([-0.00064087, -0.00036621, -0.00033569, ..., -0.00061035,
       -0.00054932, -0.00018311]), 'sampling_rate': 22050}, 'audio_id': 'LJ022-0185', 'real_or_fake': 'WF3'}
{'audio': {'path': 'LJ022-0185_gen.wav', 'array': array([-0.00698853, -0.00689697, -0.00686646, ..., -0.00057983,
       -0.0007019 , -0.00067139]), 'sampling_rate': 22050}, 'audio_id': 'LJ022-0185', 'real_or_fake': 'WF4'}
{'audio': {'path': 'LJ022-0185_gen.wav', 'array': array([-6.40869141e-04, -6.1035156

### Code to load wavefake data from HuggingFace (without helper functions in src module)

In [5]:
import os
import time
import random

import json
import requests

import numpy as np
import matplotlib.pyplot as plt

import librosa

from datasets import load_dataset, Audio
from collections import defaultdict

In [6]:
from utils.config import load_config

# Load the project configuration and read the WaveFake cache-file setting from it.
# NOTE(review): `cache_files` is not referenced again in the visible cells — confirm it is still needed.
config      = load_config()
cache_files = config['data_paths']['wavefake']['cache_files']

In [7]:
# Coordinates of the dataset repository on the HuggingFace Hub.
username      = "ajaykarthick"
dataset_name  = "wavefake-audio"
repo_id       = f"{username}/{dataset_name}"
# URL of the JSON file that the next cell downloads and parses into `audio_id_to_file_map`.
json_file_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/audio_id_to_file_map.json"

In [8]:
# Get the Audio ID to Parquet File Map.
# An explicit timeout is passed because `requests` does not time out by default,
# so a stalled connection would otherwise hang this cell indefinitely.
response = requests.get(json_file_url, timeout=30)
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error body
audio_id_to_file_map = response.json()

### Stream from HuggingFace Datasets (without Downloading datasets)

In [9]:
def get_dataset(audio_id):
    """Stream the rows belonging to a single audio ID.

    Looks up the parquet file for ``audio_id`` in ``audio_id_to_file_map``, opens it
    as a streaming ``train`` split, and keeps only the rows whose ``audio_id`` field
    equals the requested ID.

    Args:
        audio_id: Key into ``audio_id_to_file_map``.

    Returns:
        The filtered streaming dataset produced by ``.filter``.

    Raises:
        KeyError: If ``audio_id`` is not present in ``audio_id_to_file_map``.
    """
    def _is_requested(row):
        return row['audio_id'] == audio_id

    source_file = audio_id_to_file_map[audio_id]
    stream = load_dataset("parquet", data_files=source_file, split="train", streaming=True)
    return stream.filter(_is_requested)

# Use the last audio ID in the map as the example to stream and time.
audio_ids = list(audio_id_to_file_map)
audio_id = audio_ids[-1]

start_time = time.time()

# The timed region covers opening the dataset and iterating over every matching row.
for example in get_dataset(audio_id):
    print(example)

current_time = time.time()
print(f"Time elapsed: {current_time - start_time} seconds")

{'audio': {'path': 'LJ050-0278_gen.wav', 'array': array([0.00048828, 0.00027466, 0.        , ..., 0.0010376 , 0.00112915,
       0.0012207 ]), 'sampling_rate': 22050}, 'audio_id': 'LJ050-0278', 'real_or_fake': 'WF1'}
{'audio': {'path': 'LJ050-0278_gen.wav', 'array': array([-0.00048828, -0.0005188 , -0.00039673, ...,  0.00064087,
        0.00128174,  0.00216675]), 'sampling_rate': 22050}, 'audio_id': 'LJ050-0278', 'real_or_fake': 'WF2'}
{'audio': {'path': 'LJ050-0278_generated.wav', 'array': array([-0.00042725,  0.00030518, -0.00042725, ..., -0.00018311,
       -0.00036621, -0.00024414]), 'sampling_rate': 22050}, 'audio_id': 'LJ050-0278', 'real_or_fake': 'WF3'}
{'audio': {'path': 'LJ050-0278_gen.wav', 'array': array([-2.16674805e-03, -2.16674805e-03, -2.19726562e-03, ...,
        1.52587891e-04,  3.05175781e-05,  9.15527344e-05]), 'sampling_rate': 22050}, 'audio_id': 'LJ050-0278', 'real_or_fake': 'WF4'}
{'audio': {'path': 'LJ050-0278_gen.wav', 'array': array([ 6.40869141e-04,  7.9345703

### Stream more than one audio ID from single parquet file

In [10]:
from datasets import load_dataset

def get_dataset(audio_ids):
    """Stream the rows for one or more audio IDs stored in a single parquet file.

    Args:
        audio_ids: One audio ID (``str``) or an iterable of audio IDs. Every ID
            must map to the same entry in ``audio_id_to_file_map``.

    Returns:
        The streaming ``train`` split of that parquet file, filtered down to the
        rows whose ``audio_id`` is one of the requested IDs.

    Raises:
        ValueError: If ``audio_ids`` is empty, or if the IDs do not all map to
            the same parquet file.
        KeyError: If an ID is missing from ``audio_id_to_file_map``.
    """
    if isinstance(audio_ids, str):
        audio_ids = [audio_ids]

    # Snapshot the IDs once: this accepts any iterable (not just indexable ones)
    # and stops later mutation of the caller's list from changing the lazy filter.
    requested_ids = list(audio_ids)
    if not requested_ids:
        raise ValueError("audio_ids must contain at least one audio ID")

    # Check if all audio_ids belong to the same parquet file
    parquet_file = audio_id_to_file_map[requested_ids[0]]
    if any(audio_id_to_file_map[audio_id] != parquet_file for audio_id in requested_ids):
        raise ValueError("Not all audio_ids belong to the same parquet file")

    iterable_ds = load_dataset("parquet", data_files=parquet_file, split="train", streaming=True)

    # Filter the dataset to include only the desired audio_ids. The predicate runs
    # once per streamed row, so membership is tested against a frozenset rather
    # than scanning a list each time.
    wanted_ids = frozenset(requested_ids)
    return iterable_ds.filter(lambda example: example['audio_id'] in wanted_ids)


# Take the first ten audio IDs in the map; get_dataset raises ValueError unless
# all of them map to the same parquet file.
audio_ids = list(audio_id_to_file_map)[:10]

start_time = time.time()

dataset = get_dataset(audio_ids)
for example in dataset:
    print(example)

current_time = time.time()
print(f"Total time elapsed: {current_time - start_time} seconds")


{'audio': {'path': 'LJ001-0001_gen.wav', 'array': array([-9.15527344e-05, -9.15527344e-05, -9.15527344e-05, ...,
        7.01904297e-04,  9.46044922e-04,  1.00708008e-03]), 'sampling_rate': 22050}, 'audio_id': 'LJ001-0001', 'real_or_fake': 'WF1'}
{'audio': {'path': 'LJ001-0001_gen.wav', 'array': array([ 0.00097656,  0.00091553,  0.00100708, ..., -0.00042725,
        0.00030518,  0.00073242]), 'sampling_rate': 22050}, 'audio_id': 'LJ001-0001', 'real_or_fake': 'WF2'}
{'audio': {'path': 'LJ001-0001_generated.wav', 'array': array([ 0.00073242,  0.00067139, -0.00018311, ...,  0.00061035,
        0.00045776,  0.00039673]), 'sampling_rate': 22050}, 'audio_id': 'LJ001-0001', 'real_or_fake': 'WF3'}
{'audio': {'path': 'LJ001-0001_gen.wav', 'array': array([-0.00024414, -0.00021362, -0.00021362, ...,  0.00042725,
        0.00042725,  0.00057983]), 'sampling_rate': 22050}, 'audio_id': 'LJ001-0001', 'real_or_fake': 'WF4'}
{'audio': {'path': 'LJ001-0001_gen.wav', 'array': array([-9.15527344e-05, -1.5

Total time elapsed: 12.488763332366943 seconds


### Stream multiple Audio files from Multiple Parquet files

In [None]:
def get_dataset(audio_ids):
    """Stream the rows for any collection of audio IDs, across several parquet files.

    The requested IDs are grouped by the parquet file recorded under the
    ``'train'`` key of their ``audio_id_to_file_map`` entry. Each file is then
    opened once as a streaming ``train`` split and filtered down to its own IDs.

    Args:
        audio_ids: One audio ID (``str``) or an iterable of audio IDs.

    Returns:
        A generator yielding the matching rows one parquet file at a time, with
        files visited in the order they are first encountered in ``audio_ids``.
        An empty ``audio_ids`` gives an empty generator.

    Raises:
        KeyError: If an ID is missing from ``audio_id_to_file_map``. The grouping
            runs eagerly, so this is raised by the call itself rather than on
            first iteration.
    """
    if isinstance(audio_ids, str):
        audio_ids = [audio_ids]

    # Create a dictionary to map parquet files to audio IDs. Sets are used because
    # the filter predicate runs once per streamed row and only needs membership.
    parquet_to_audio_ids = defaultdict(set)
    for audio_id in audio_ids:
        parquet_file = audio_id_to_file_map[audio_id]['train']
        parquet_to_audio_ids[parquet_file].add(audio_id)

    def _matching_rows(parquet_file, wanted_ids):
        # A separate scope per file pins `wanted_ids` for the lazy predicate,
        # rather than closing over a loop variable that is rebound each iteration.
        iterable_ds = load_dataset("parquet", data_files={'train': parquet_file}, split="train", streaming=True)
        return iterable_ds.filter(lambda example: example['audio_id'] in wanted_ids)

    # Create a generator to yield filtered examples from each parquet file
    def dataset_generator():
        for parquet_file, ids in parquet_to_audio_ids.items():
            yield from _matching_rows(parquet_file, ids)

    return dataset_generator()

# Draw ten audio IDs at random from the whole map.
audio_ids = list(audio_id_to_file_map)
random_audio_ids = random.sample(audio_ids, k=10)

start_time = time.time()

dataset_gen = get_dataset(random_audio_ids)

# Consume the generator, printing each example as it is yielded
for example in dataset_gen:
    print(example)

current_time = time.time()
print(f"Total time elapsed: {current_time - start_time} seconds")


{'audio': {'path': 'LJ003-0317_gen.wav', 'array': array([9.15527344e-05, 3.05175781e-05, 3.05175781e-05, ...,
       1.06811523e-03, 1.06811523e-03, 1.09863281e-03]), 'sampling_rate': 22050}, 'audio_id': 'LJ003-0317', 'real_or_fake': 'WF1'}
{'audio': {'path': 'LJ003-0317_gen.wav', 'array': array([0.00018311, 0.00015259, 0.00018311, ..., 0.00033569, 0.00048828,
       0.00085449]), 'sampling_rate': 22050}, 'audio_id': 'LJ003-0317', 'real_or_fake': 'WF2'}
{'audio': {'path': 'LJ003-0317_generated.wav', 'array': array([-6.40869141e-04, -4.27246094e-04, -9.15527344e-05, ...,
       -1.09863281e-03, -1.25122070e-03, -1.25122070e-03]), 'sampling_rate': 22050}, 'audio_id': 'LJ003-0317', 'real_or_fake': 'WF3'}
{'audio': {'path': 'LJ003-0317_gen.wav', 'array': array([-0.0010376 , -0.00097656, -0.0010376 , ...,  0.00030518,
        0.00021362,  0.00030518]), 'sampling_rate': 22050}, 'audio_id': 'LJ003-0317', 'real_or_fake': 'WF4'}
{'audio': {'path': 'LJ003-0317_gen.wav', 'array': array([ 9.155273

{'audio': {'path': 'LJ025-0074_gen.wav', 'array': array([0.00054932, 0.00088501, 0.00057983, ..., 0.00054932, 0.00054932,
       0.00048828]), 'sampling_rate': 22050}, 'audio_id': 'LJ025-0074', 'real_or_fake': 'WF1'}
{'audio': {'path': 'LJ025-0074_gen.wav', 'array': array([-0.00045776, -0.00039673,  0.00061035, ...,  0.00067139,
        0.00109863,  0.00140381]), 'sampling_rate': 22050}, 'audio_id': 'LJ025-0074', 'real_or_fake': 'WF2'}
{'audio': {'path': 'LJ025-0074_generated.wav', 'array': array([ 0.00036621, -0.0007019 , -0.00140381, ..., -0.00057983,
       -0.00079346, -0.00039673]), 'sampling_rate': 22050}, 'audio_id': 'LJ025-0074', 'real_or_fake': 'WF3'}
{'audio': {'path': 'LJ025-0074_gen.wav', 'array': array([-0.00054932, -0.00057983, -0.00027466, ...,  0.00018311,
        0.00015259,  0.00027466]), 'sampling_rate': 22050}, 'audio_id': 'LJ025-0074', 'real_or_fake': 'WF4'}
{'audio': {'path': 'LJ025-0074_gen.wav', 'array': array([-1.09863281e-03, -1.64794922e-03, -2.04467773e-03, 