# 2. Create `webdataset` Tarballs

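This notebook packs the Common Voice `.flac` files and their sibling `.json` metadata files into fixed-size `webdataset` tarball shards, written in parallel.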

In [1]:
import tarfile
import IPython.display as ipd
from pathlib import Path
from utils import (
    multicore_thread_process,
    SAMPLE_RATE,
    split_to_chunks_of_size,
    split_to_n_chunks,
)
import numpy as np
import json
import soundfile as sf
import io
from tqdm import tqdm

In [2]:
def tar_single_chunk(
    tarpath_chunk_flacfiles: tuple[Path, list[Path]],
    dataset_directory: Path,
    label_to_ix: dict,
):
    """Write one chunk of FLAC files and their JSON metadata into a tarball.

    Each FLAC file is added to the archive immediately followed by the
    ``.json`` file that shares its stem.

    Args:
        tarpath_chunk_flacfiles: ``(tarpath, chunk_flacfiles)`` pair — the path
            of the tar archive to create and the FLAC files to store in it.
        dataset_directory: Directory holding the FLAC/JSON files. Archive
            member names are made relative to its *parent*, so they keep the
            dataset directory name as a prefix.
        label_to_ix: Not used by this function; kept so the signature stays
            compatible with existing callers.

    Raises:
        AssertionError: If a FLAC file has no sibling ``.json`` metadata file.
    """
    tarpath, chunk_flacfiles = tarpath_chunk_flacfiles
    archive_root = dataset_directory.parent
    # The context manager releases the file handle even when the metadata
    # check below (or any I/O error) aborts the loop part-way through.
    with tarfile.open(name=tarpath, mode="w") as tar:
        for flacfile in chunk_flacfiles:
            metadata_file = flacfile.with_suffix(".json")
            assert metadata_file.exists(), f"{str(metadata_file)} does not exist!"
            tar.add(flacfile, arcname=flacfile.relative_to(archive_root))
            tar.add(metadata_file, arcname=metadata_file.relative_to(archive_root))

In [3]:
# Source directory that is globbed (non-recursively) for the `*.flac` files to pack.
dataset_directory = Path("./data/common_voice/")

In [4]:
# Sort first so the seeded shuffle below always starts from the same order.
flacfiles = sorted(dataset_directory.glob("*.flac"))
# Fail here with a clear message rather than further down on an empty chunk list
# (e.g. when the notebook is run from the wrong working directory).
assert flacfiles, f"No .flac files found in {dataset_directory}"
np.random.seed(42)  # deterministically shuffle the flacfiles
np.random.shuffle(flacfiles)

In [5]:
# Output directory that receives the generated tarballs.
webdataset_directory = Path("./data/webdataset_small_chunk/")
# parents=True creates a missing ./data directory instead of raising
# FileNotFoundError; exist_ok=True keeps the cell safe to re-run.
webdataset_directory.mkdir(parents=True, exist_ok=True)

In [6]:
# Number of FLAC files per chunk; each chunk is later paired with one tarball path.
chunk_size = 64
# Alternative setting (disabled): chunk_size = 2048
# `flacfiles` was already shuffled with a fixed seed, so shuffle=False is passed here.
# NOTE(review): split_to_chunks_of_size comes from utils and is not visible here —
# presumably it yields consecutive slices of at most chunk_size items; confirm.
chunked_flacfiles = list(
    split_to_chunks_of_size(flacfiles, chunk_size=chunk_size, shuffle=False)
)

In [7]:
# Load the label -> index mapping that is passed along to the tarball workers.
# JSON is decoded as UTF-8 explicitly so the result does not depend on the
# platform's locale encoding (which matters for non-ASCII labels).
with open("label_to_ix.json", encoding="utf-8") as f:
    label_to_ix = json.load(f)

In [8]:
# Sanity check: report the size of the final chunk, which may be smaller than chunk_size.
print(f"Last chunk has {len(chunked_flacfiles[-1])} samples")

Last chunk has 46 samples


In [10]:
# Total number of chunks produced by the split.
len(chunked_flacfiles)

26729

In [11]:
# One tarball path is generated per chunk, so this is the number of output files.
print(f"Will create {len(chunked_flacfiles)} tarballs")

Will create 836 tarballs


In [11]:
# One output path per chunk, numbered with a zero-padded, five-digit index.
tarpaths = [
    webdataset_directory.joinpath(f"common_voice_{i:05}.tar")
    for i, _chunk in enumerate(chunked_flacfiles)
]

In [13]:
# Number of partitions the work list is split into; also passed as `num_workers`
# to multicore_thread_process.
num_workers = 4

In [14]:
# Pair every tarball path with its chunk of FLAC files, then partition the
# resulting work list into `num_workers` pieces.
args = list(
    split_to_n_chunks(
        [
            (tarpath, flac_chunk)
            for tarpath, flac_chunk in zip(tarpaths, chunked_flacfiles)
        ],
        n=num_workers,
    )
)

In [15]:
# Build all tarballs by running tar_single_chunk over the partitioned
# (tarpath, chunk_flacfiles) pairs in `args`. The return value is discarded.
# NOTE(review): multicore_thread_process comes from utils and is not visible here —
# presumably each of the num_workers processes handles one partition of `args` with
# num_threads threads, and the extra keyword arguments are forwarded to `fn`; confirm.
_ = multicore_thread_process(
    num_workers=num_workers,
    num_threads=4,
    fn=tar_single_chunk,
    chunked_args=args,
    dataset_directory=dataset_directory,
    label_to_ix=label_to_ix,
)


[2023-12-13_21-40-12]	
+----------------------------------+
|Beginning multicore_thread_process|
+----------------------------------+


Worker: 0: 100%|██████████████████████████████████████████████████████████████████████| 6683/6683 [23:50<00:00,  4.67it/s]
Worker: 3: 100%|██████████████████████████████████████████████████████████████████████| 6680/6680 [23:53<00:00,  4.66it/s]
Worker: 1: 100%|██████████████████████████████████████████████████████████████████████| 6683/6683 [24:02<00:00,  4.63it/s]
Worker: 2: 100%|██████████████████████████████████████████████████████████████████████| 6683/6683 [24:01<00:00,  4.64it/s]


[2023-12-13_22-04-22]	
+-------------------------------------------+
|multicore_thread_process took 24.16 minutes|
+-------------------------------------------+

