# 1. Restructure Dataset

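Notes: this notebook filters the Common Voice English `validated.tsv` metadata, merges the accent labels into a fixed set of groups, builds integer label mappings, and exports every clip as a `.flac` file with a matching `.json` metadata file.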

In [2]:
from pathlib import Path
import json
from IPython import display as ipd
import torch

# Restrict PyTorch to a single CPU thread in this process; this is done before torchaudio is imported.
# NOTE(review): presumably to avoid CPU oversubscription once the export step fans out over many workers — confirm.
torch.set_num_threads(1)
import torchaudio
from torchaudio import functional as F
from utils import time_me, split_to_n_chunks, multicore_thread_process, SAMPLE_RATE
import pandas as pd
from pandarallel import pandarallel

# Start pandarallel with a progress bar and 8 workers; this provides the `parallel_apply` call used further down.
pandarallel.initialize(progress_bar=True, nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Download and extract the dataset

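Goals:
- Convert the mp3 files into the more commonly used FLAC file format.
- Resample the audio data from 48 kHz to 16 kHz (the most common rate for audio ML, and the one Wav2Vec2 uses).
- Dump each transcript, together with its speaker labels, as a `.json` file next to the `.flac` file (as opposed to keeping them in the pandas DataFrame).
- End up with one `.flac` / `.json` pair per clip in the export directory.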

In [3]:
# Input: root of the extracted Common Voice release. Output: folder that receives the converted clips.
raw_dataset_directory = Path("./data/cv-corpus-15.0-2023-09-08/")
export_directory = Path("./data/common_voice/")
# Fail fast, with an actionable message, when the corpus has not been downloaded and extracted yet.
assert raw_dataset_directory.is_dir(), (
    f"Raw dataset directory not found: {raw_dataset_directory.resolve()} "
    "(download and extract the corpus first)"
)

In [4]:
# Load the metadata table of the validated English clips (a tab-separated file inside the corpus).
# NOTE(review): sentences may contain literal quote characters; check whether `quoting=csv.QUOTE_NONE` is needed — confirm.
df = pd.read_csv(raw_dataset_directory.joinpath("en", "validated.tsv"), sep="\t")

In [5]:
# Peek at a few random rows; the fixed seed keeps the displayed sample stable across re-runs.
df.sample(n=5, random_state=0)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,variant,locale,segment
985188,bd631b28464f4d8218a5e395adefdd436730e0b5f85795...,common_voice_en_17261270.mp3,"If you want a sandwich, don't hesitate grabbin...",2,0,fifties,male,Canadian English,,en,
1564729,ab43bbde8dd98948b7e6b8393d966519e92744fd961723...,common_voice_en_37986974.mp3,It is available in white and black.,2,0,thirties,female,"Southern African (South Africa, Zimbabwe, Nami...",,en,
563224,0a4bc8c21a1c2387ae8f88276b21cf0f167fa73a511c20...,common_voice_en_31391747.mp3,Ferrara was prolific and wrote numerous books ...,4,0,,female,England English,,en,
1724655,372293e65cdab88771e028a4351651ab2eff64438ddafc...,common_voice_en_22396422.mp3,"He returned, pale under his pit-dirt with fury.",4,0,fourties,male,"German English,Non native speaker",,en,
561979,de24a8a2d67d096344ea3b4c2691edbff9488aa117a600...,common_voice_en_24846455.mp3,He also fought against the corrupting influenc...,2,0,,,United States English,,en,


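Filtering and NaN values: missing `accents`, `age` and `gender` entries are filled with "unknown", the accent labels are merged into a fixed list of groups, and rows whose accent matches none of the groups are discarded.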

In [6]:
# Remember the row count before any filtering so the retained share can be reported afterwards.
len_df_before_filtering = df.shape[0]
print(f"Before filtering, len(df) = {len_df_before_filtering:,}")

Before filtering, len(df) = 1,752,025


In [7]:
# Every row starts as "keep"; the accent-merging step flips this flag for rows it cannot map.
df["discard"] = False
# Missing speaker attributes get an explicit "unknown" label instead of NaN.
df[["accents", "age", "gender"]] = df[["accents", "age", "gender"]].fillna("unknown")

In [8]:
# Canonical accent groups. `merge_accent_groups` scans this list in order and keeps the
# FIRST entry that occurs as a substring of a row's accent text, so the order is
# significant. Each group appears exactly once (a repeated entry could never match).
merge_groups = [
    "Irish",
    "United States",
    "England",
    "German",
    "India and South Asia",
    "Canadian",
    "Australian",
    "Scottish",
    "Southern African",
    "New Zealand",
    "Filipino",
    "Hong Kong",
    "Singaporean",
    "Malaysian",
    "Turkish",
    "unknown",
]

In [9]:
def merge_accent_groups(row):
    """Collapse a row's free-form accent text into one of the `merge_groups` labels.

    The first entry of `merge_groups` that occurs as a substring of
    `str(row.accents)` replaces `row.accents`. When no entry matches, the
    accent is left untouched and `row.discard` is set to True.

    Returns the (modified) row.
    """
    accent_text = str(row.accents)
    matched_group = next(
        (group for group in merge_groups if group in accent_text), None
    )
    if matched_group is None:
        row.discard = True
    else:
        row.accents = matched_group
    return row

In [10]:
# Merge accents row by row (in parallel), keep only rows that were not flagged for
# discarding, renumber the index and drop the columns that are no longer needed.
df = (
    df.parallel_apply(merge_accent_groups, axis=1)
    .loc[lambda frame: frame["discard"] == False]
    .reset_index(drop=True)
    .drop(
        columns=["up_votes", "down_votes", "variant", "locale", "segment", "discard"]
    )
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=219004), Label(value='0 / 219004')…

In [11]:
# Row count once the rows with unmapped accents have been removed.
len_df_after_filtering = df.shape[0]

In [12]:
# Report how many rows survived the filtering, both absolute and relative to the original size.
filtering_summary = f"After filtering, len(df) = {len_df_after_filtering:,} ({len_df_after_filtering/len_df_before_filtering:.1%} of originals)"
print(filtering_summary)

After filtering, len(df) = 1,744,074 (99.5% of originals)


In [12]:
# Distinct age labels that remain after filling missing values.
df["age"].unique()

array(['unknown', 'twenties', 'fourties', 'thirties', 'seventies',
       'teens', 'sixties', 'fifties', 'eighties', 'nineties'],
      dtype=object)

In [13]:
# Distinct gender labels that remain after filling missing values.
df["gender"].unique()

array(['unknown', 'male', 'female', 'other'], dtype=object)

In [14]:
# Number of clips per merged accent group (NaN would be listed too, if any were left).
df["accents"].value_counts(dropna=False)

accents
unknown                 802576
United States           428252
England                 152052
India and South Asia    106625
Canadian                 69444
Australian               53542
German                   49298
Irish                    20371
Scottish                 17349
Southern African         15493
New Zealand              13445
Filipino                  5599
Hong Kong                 4439
Singaporean               3484
Malaysian                 2102
Turkish                      3
Name: count, dtype: int64

In [15]:
# Hand-written label -> index tables for gender and age; "unknown" maps to 0 in both.
gender_to_ix = {
    label: ix for ix, label in enumerate(["unknown", "female", "male", "other"])
}
age_to_ix = {
    label: ix
    for ix, label in enumerate(
        [
            "unknown",
            "teens",
            "twenties",
            "thirties",
            "fourties",
            "fifties",
            "sixties",
            "seventies",
            "eighties",
            "nineties",
        ]
    )
}
# Data-driven tables: labels are numbered in `value_counts()` order, i.e. the most
# frequent accent / client receives index 0.
accent_to_ix = {
    label: ix for ix, label in enumerate(df.accents.value_counts().index)
}
client_to_ix = {
    label: ix for ix, label in enumerate(df.client_id.value_counts().index)
}

In [16]:
# Bundle the four label -> index tables under the names used when writing clip metadata.
label_to_ix = dict(
    gender=gender_to_ix,
    age=age_to_ix,
    accent=accent_to_ix,
    client=client_to_ix,
)

In [17]:
# Persist the label tables next to the notebook so the integer labels can be decoded later.
Path("label_to_ix.json").write_text(json.dumps(label_to_ix))

In [18]:
def process_single_row(
    row, raw_dataset_directory: Path, export_directory: Path, label_to_ix: dict
):
    """Convert one clip to FLAC at `SAMPLE_RATE` and write its metadata as JSON.

    Args:
        row: Mapping with the keys "path", "sentence", "age", "gender",
            "accents" and "client_id" describing one clip.
        raw_dataset_directory: Corpus root; the mp3 is read from
            `<raw_dataset_directory>/en/clips/<row["path"]>`.
        export_directory: Existing folder that receives `<stem>.flac` and
            `<stem>.json`.
        label_to_ix: Nested mapping with the tables "age", "gender", "accent"
            and "client", each translating a label to an integer index.

    Returns:
        None. The clip is skipped (nothing is left on disk for it) when it is
        longer than 10 seconds, when its sample rate is below `SAMPLE_RATE`,
        or when the encoded FLAC file exceeds 300,000 bytes.

    Raises:
        KeyError: If a label of `row` is missing from `label_to_ix`. The
            lookup happens before anything is written, so a failed lookup
            cannot leave a `.flac` file without its `.json` companion.
    """
    max_duration_seconds = 10
    max_flac_bytes = 300_000

    mp3_path = raw_dataset_directory / "en" / "clips" / row["path"]
    flac_path = export_directory / f"{mp3_path.stem}.flac"
    metadata_path = flac_path.with_suffix(".json")

    audio, sr = torchaudio.load(mp3_path)
    # Assumes the second tensor axis is time (torchaudio's default channels-first layout).
    duration = audio.shape[1] / sr
    if duration > max_duration_seconds:
        # No clip should be this long; most likely an erroneous recording.
        return
    if sr < SAMPLE_RATE:
        # Clips below the target rate are skipped rather than upsampled.
        return

    # Resolve every label up front so an unknown label fails before any file is written.
    metadata = {
        "sentence": row["sentence"],
        "age": label_to_ix["age"][row["age"]],
        "gender": label_to_ix["gender"][row["gender"]],
        "accent": label_to_ix["accent"][row["accents"]],
        "client": label_to_ix["client"][row["client_id"]],
        "duration": duration,
    }

    if sr > SAMPLE_RATE:
        audio = F.resample(audio, orig_freq=sr, new_freq=SAMPLE_RATE)
    torchaudio.save(flac_path, audio, sample_rate=SAMPLE_RATE)
    if flac_path.stat().st_size > max_flac_bytes:
        # Most FLAC files above 300 kB are corrupted and slow down the forced
        # alignment process; this is far less than 1% of the data.
        flac_path.unlink()
        return
    metadata_path.write_text(json.dumps(metadata))

In [None]:
# The output folder must exist before any worker starts writing into it.
export_directory.mkdir(exist_ok=True)

# One plain dict per DataFrame row, so that rows can be handed to the workers.
df_list = [record._asdict() for record in df.itertuples()]

num_workers = 16
# NOTE(review): presumably one chunk per worker, each worker running 2 threads —
# confirm against `multicore_thread_process` in utils.
row_chunks = list(split_to_n_chunks(df_list, n=num_workers))
_ = multicore_thread_process(
    num_workers=num_workers,
    num_threads=2,
    chunked_args=row_chunks,
    fn=process_single_row,
    raw_dataset_directory=raw_dataset_directory,
    export_directory=export_directory,
    label_to_ix=label_to_ix,
)


[2023-12-13_20-25-44]	
+----------------------------------+
|Beginning multicore_thread_process|
+----------------------------------+


Worker: 7:  41%|████████████████████████████████████████████████████▉                                                                             | 44371/109005 [07:45<15:34, 69.20it/s]

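Let's sanity-check the final dataset by listening to one exported clip and printing its metadata. (TODO: also report the size of the final dataset in GB.)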

In [13]:
# Take the first exported FLAC file the directory listing yields (no particular order).
exported_flac_files = export_directory.glob("*.flac")
sample_flac_path = next(exported_flac_files)

In [14]:
# Render an inline audio player for the sampled clip.
ipd.Audio(sample_flac_path)

In [15]:
# Show the metadata that was written next to the sampled clip.
sample_metadata_path = sample_flac_path.with_suffix(".json")
print(json.loads(sample_metadata_path.read_text()))

{'sentence': 'Two children playing in the snow near a stack of logs.', 'age': 2, 'gender': 2, 'accent': 10, 'client': 14389, 'duration': 3.624}
