In [2]:
from pathlib import Path

# Root of the project checkout; every path below is derived from it.
ROOT = Path("/workspaces/SpeakerVerification")
DATA = ROOT.joinpath("data")

# One directory per corpus.
VOXCELEB1 = DATA.joinpath("voxceleb1")
VOXCELEB2 = DATA.joinpath("voxceleb2")

# Deep Lake dataset directories (the input side of the conversion).
VOXCELEB1_DEEPLAKE = VOXCELEB1.joinpath("deeplake", "VoxCeleb1")
VOXCELEB2_DEEPLAKE = VOXCELEB2.joinpath("deeplake", "VoxCeleb2")

# HDF5 files (the output side of the conversion).
VOXCELEB1_H5 = VOXCELEB1.joinpath("VoxCeleb1.h5")
VOXCELEB2_H5 = VOXCELEB2.joinpath("VoxCeleb2.h5")

In [3]:
import deeplake
import h5py
import numpy as np

def convert_audio(audio):
    """Turn one audio sample into a 1-D NumPy array.

    Args:
        audio: Object exposing ``.numpy()`` that returns an array.

    Returns:
        The array reshaped to ``(shape[0],)``. The reshape only succeeds when
        the total element count equals the first dimension, i.e. every
        trailing dimension has size 1.
    """
    samples = audio.numpy()
    frame_count = samples.shape[0]
    return samples.reshape(frame_count)

def convert_text(text):
    """Return the ``str()`` representation of one text sample."""
    return str(text)

def convert_class_label(class_label):
    """Extract one class-label sample as a plain Python ``int``.

    Args:
        class_label: Object exposing ``.numpy()`` that returns an indexable
            array; only its first element is used.

    Returns:
        ``int`` built from the first element of the array.
    """
    label_values = class_label.numpy()
    first_label = label_values[0]
    return int(first_label)

def _plan_tensor_conversion(tensor):
    """Return the ``(h5_dtype, converter)`` pair for a tensor, or ``None`` if its htype is unsupported."""
    if tensor.htype == "audio":
        # Clips differ in length, so each row is a variable-length float32 array.
        return h5py.vlen_dtype(np.dtype('float32')), convert_audio
    if tensor.htype == "text":
        return h5py.string_dtype(encoding='utf-8', length=None), convert_text
    if tensor.htype == "class_label":
        # Keep the class names in the file by storing labels as an HDF5 enum.
        label_map = {class_name: i for i, class_name in enumerate(tensor.info["class_names"])}
        return h5py.enum_dtype(label_map, basetype='i'), convert_class_label
    return None


def convert(deeplake_path, h5_path, skip_tensors=("Video",)):
    """Copy a Deep Lake dataset into a single HDF5 file, one 1-D dataset per tensor.

    The dataset summary is printed, any existing file at ``h5_path`` is
    deleted, and every sample is streamed through a batch-size-1, unshuffled
    loader so that sample ``i`` is written to row ``i`` of each HDF5 dataset.

    Args:
        deeplake_path: Location passed to ``deeplake.load``.
        h5_path: ``pathlib.Path`` of the HDF5 file to (re)create.
        skip_tensors: Collection of tensor names to leave out of the output.
            Defaults to ``("Video",)``.

    Raises:
        ValueError: If a tensor that is not skipped has an htype other than
            ``"audio"``, ``"text"`` or ``"class_label"``. Raised before the
            output file is touched.
    """
    deeplake_dataset = deeplake.load(deeplake_path)
    deeplake_dataset.summary()

    # Work out every column's dtype and converter up front. Keeping them
    # together in one record guarantees a column can never be written with
    # another column's converter, and an unsupported tensor fails here rather
    # than after the previous output file has already been removed.
    plan = []
    for name in deeplake_dataset.tensors:
        if name in skip_tensors:
            continue
        tensor = deeplake_dataset.tensors[name]
        spec = _plan_tensor_conversion(tensor)
        if spec is None:
            raise ValueError(
                f"Unsupported htype {tensor.htype!r} for tensor {name!r}; "
                "add the tensor to skip_tensors or add a converter for it."
            )
        dtype, converter = spec
        plan.append((name, tensor.shape[0], dtype, converter))

    h5_path.unlink(missing_ok=True)

    with h5py.File(h5_path, 'w') as h5_file:
        targets = [
            (h5_file.create_dataset(name, shape=(length,), dtype=dtype), converter)
            for name, length, dtype, converter in plan
        ]
        # A dict transform selects which tensors are loaded; None leaves the
        # loaded value untransformed, conversion happens in the loop below.
        loader = deeplake_dataset.pytorch(
            transform={name: None for name, _, _, _ in plan},
            num_workers=1,
            shuffle=False,
            batch_size=1,
            pin_memory=False,
            prefetch_factor=10,
            progressbar=True,
        )
        for i, sample in enumerate(loader):
            # Values in `sample` are matched to `targets` by position, so this
            # relies on the loader yielding them in the transform dict's order.
            for (dataset, converter), data in zip(targets, sample):
                # batch_size=1, so element 0 is the only sample in the batch.
                dataset[i] = converter(data[0])

# Full VoxCeleb1 conversion; the recorded run below took about 54 minutes for 153516 samples.
convert(deeplake_path=VOXCELEB1_DEEPLAKE, h5_path=VOXCELEB1_H5)

/workspaces/SpeakerVerification/data/voxceleb1/deeplake/VoxCeleb1 loaded successfully.
Dataset(path='/workspaces/SpeakerVerification/data/voxceleb1/deeplake/VoxCeleb1', tensors=['Audio', 'Gender', 'Nationality', 'Sample Name', 'Set', 'Speaker ID', 'VGGFace1 ID', 'Video'])

   tensor        htype               shape              dtype  compression
   -------      -------             -------            -------  ------- 
    Audio        audio     (153516, 63361:2318721, 1)   None      wav   
   Gender     class_label         (153516, 1)          uint32    None   
 Nationality  class_label         (153516, 1)          uint32    None   
 Sample Name     text             (153516, 1)            str     None   
     Set      class_label         (153516, 1)          uint32    None   
 Speaker ID   class_label         (153516, 1)          uint32    None   
 VGGFace1 ID  class_label         (153516, 1)          uint32    None   
    Video     class_label         (153516, 1)          uint32    No

/workspaces/SpeakerVerification/data/voxceleb1/deeplake/VoxCeleb1: 100%|██████████| 153516/153516 [53:50<00:00, 47.52it/s] 
