# 3. Prepare `deeplake` dataset

In [9]:
from pathlib import Path
import deeplake
import numpy as np
import json
import shutil
from utils import time_me

In [2]:
# Location (relative to the notebook's working directory) of the Deep Lake dataset.
deeplake_directory = Path("data") / "deeplake"

In [3]:
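# Uncomment to delete a previously built dataset before recreating it below.
# WARNING: this permanently removes everything under deeplake_directory.
# shutil.rmtree(deeplake_directory)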

In [4]:
# Directory that is searched for the Common Voice *.flac files.
dataset_directory = Path("./data/common_voice/")

# label_to_ix.json holds, per category name, a table mapping label -> index.
with open("label_to_ix.json") as f:
    label_to_ix = json.load(f)

# Reverse each per-category table so an index can be mapped back to its label.
ix_to_label = {
    category: {index: label for label, index in table.items()}
    for category, table in label_to_ix.items()
}

In [5]:
# Create the (initially empty) Deep Lake dataset rooted at deeplake_directory;
# tensors are defined and samples appended in the cells that follow.
ds = deeplake.empty(deeplake_directory)

data/deeplake loaded successfully.




In [6]:
with ds:
    # Audio clip (stored FLAC-compressed), its transcript, and its duration.
    ds.create_tensor("audio", htype="audio", sample_compression="flac")
    ds.create_tensor("sentence", htype="text")
    ds.create_tensor("duration", htype="generic", dtype=np.float32)

    # One class_label tensor per categorical field. class_names is ordered so
    # that class_names[i] is the label whose index is i in label_to_ix.
    for category in ("gender", "age", "accent"):
        ds.create_tensor(
            category,
            htype="class_label",
            class_names=[
                ix_to_label[category][i] for i in range(len(label_to_ix[category]))
            ],
        )

    # flac_to_deeplake appends a "client" value (cast to np.uint32) for every
    # sample, so the dataset needs a matching tensor to receive it.
    # NOTE(review): stored as a plain integer id; switch to class_label if
    # label_to_ix also carries client names — confirm.
    ds.create_tensor("client", htype="generic", dtype=np.uint32)

    ds.info.update(description="Deep Lake dataset for Mozilla Common Voice")



In [7]:
# Path.glob yields matches in arbitrary, filesystem-dependent order; sort so
# the list of input files is in the same order on every run and machine.
flacfiles = sorted(dataset_directory.glob("*.flac"))

In [8]:
@deeplake.compute
def flac_to_deeplake(flacfile, sample_out):
    """Append one audio clip and its metadata to the output sample.

    As a ``deeplake.compute`` function, the first argument is one element of
    the input iterable and the second is the dataset sample to append to.

    Args:
        flacfile: Path object pointing at a ``.flac`` file. The metadata is
            read from the file with the same name and a ``.json`` suffix.
        sample_out: Output sample that receives the appended values.

    Returns:
        ``sample_out``, after the new values have been appended.
    """
    # The metadata lives in a JSON sidecar next to the audio file.
    with open(flacfile.with_suffix(".json")) as metadata_fp:
        clip_info = json.load(metadata_fp)

    record = {
        "audio": deeplake.read(str(flacfile)),
        "sentence": clip_info["sentence"],
        "duration": np.float32(clip_info["duration"]),
    }
    # The remaining fields are stored as unsigned 32-bit integers.
    # NOTE(review): each key appended here needs a tensor of the same name in
    # the target dataset — confirm that a "client" tensor exists.
    for field in ("accent", "age", "gender", "client"):
        record[field] = np.uint32(clip_info[field])

    sample_out.append(record)
    return sample_out

In [11]:
@time_me
def commit_files_to_ds(flacfiles, ds, num_workers=8):
    """Run ``flac_to_deeplake`` over all input files, writing into ``ds``.

    Args:
        flacfiles: Iterable of ``.flac`` paths; each element is passed to
            ``flac_to_deeplake`` as its first argument.
        ds: Target Deep Lake dataset that receives the appended samples.
        num_workers: Worker count forwarded to ``eval``. Defaults to 8, the
            value that was previously hard-coded.
    """
    with ds:
        # Transform every input file and append the result to the dataset.
        flac_to_deeplake().eval(flacfiles, ds, num_workers=num_workers)

In [13]:
# Run the full ingestion. In the recorded run below this processed 1,710,638
# files and took about 50 minutes, so avoid re-running it unnecessarily.
commit_files_to_ds(flacfiles=flacfiles, ds=ds)


[2023-12-13_21-27-02]	
+----------------------------+
|Beginning commit_files_to_ds|
+----------------------------+




Evaluating flac_to_deeplake: 100%|██████████████████████████████████████████████████████████| 1710638/1710638 [45:31<00:00

Synchronizing class labels...





Evaluating class_label_sync: 0%|                                                                      | 0/1710638 [00:00<?[A
Evaluating flac_to_deeplake: 100%|██████████████████████████████████████████████████████████| 1710638/1710638 [45:41<00:00[A
Evaluating class_label_sync: 13%|███████▉                                                    | 226964/1710638 [00:10<01:08[A
Evaluating class_label_sync: 21%|████████████▍                                               | 353697/1710638 [00:15<00:58[A
Evaluating class_label_sync: 28%|████████████████▉                                           | 481969/1710638 [00:20<00:50[A
Evaluating class_label_sync: 35%|█████████████████████▎                                      | 607133/1710638 [00:25<00:45[A
Evaluating class_label_sync: 43%|█████████████████████████▌                                  | 729732/1710638 [00:30<00:40[A
Evaluating class_label_sync: 50%|█████████████████████████████▉                              | 852971/1710638 [00:35<

Synchronizing class labels...





Evaluating class_label_sync: 0%|                                                                      | 0/1710638 [00:00<?[A
Evaluating class_label_sync: 7%|████▎                                                        | 120075/1710638 [00:05<01:11[A
Evaluating class_label_sync: 15%|████████▋                                                   | 249283/1710638 [00:10<01:00[A
Evaluating class_label_sync: 21%|████████████▊                                               | 365283/1710638 [00:15<00:56[A
Evaluating class_label_sync: 29%|█████████████████▌                                          | 498990/1710638 [00:20<00:48[A
Evaluating class_label_sync: 37%|██████████████████████▏                                     | 632564/1710638 [00:25<00:42[A
Evaluating class_label_sync: 45%|██████████████████████████▊                                 | 764655/1710638 [00:30<00:36[A
Evaluating class_label_sync: 52%|███████████████████████████████▍                            | 897876/1710638 [00:35<

Synchronizing class labels...




Evaluating class_label_sync: 0%|                                                                      | 0/1710638 [00:00<?[A
Evaluating class_label_sync: 8%|████▋                                                        | 131212/1710638 [00:05<01:04[A
Evaluating class_label_sync: 15%|█████████▏                                                  | 262404/1710638 [00:10<00:56[A
Evaluating class_label_sync: 23%|█████████████▊                                              | 395051/1710638 [00:15<00:50[A
Evaluating class_label_sync: 30%|█████████████████▊                                          | 508683/1710638 [00:20<00:48[A
Evaluating class_label_sync: 37%|██████████████████████▍                                     | 639955/1710638 [00:25<00:42[A
Evaluating class_label_sync: 45%|███████████████████████████                                 | 771245/1710638 [00:30<00:36[A
Evaluating class_label_sync: 53%|███████████████████████████████▋                            | 902697/1710638 [00:35

[2023-12-13_22-17-27]	
+-------------------------------------+
|commit_files_to_ds took 50.42 minutes|
+-------------------------------------+



