In [1]:
import os
import librosa
import numpy as np
import torchaudio
import torchaudio.transforms as T
import torch
import json
from pathlib import Path
from datasets import Audio
from tqdm import tqdm
from datasets import load_dataset



In [2]:
# make sure, torch.audio can handle mp3 data
# torchaudio.set_audio_backend("sox_io")  # oder "ffmpeg" falls installiert


In [4]:
dataset = load_dataset("mozilla-foundation/common_voice_17_0", "yo")

# or load the separate splits if the dataset has train/validation/test splits
train_dataset = load_dataset("mozilla-foundation/common_voice_17_0", "yo", split="train")
valid_dataset = load_dataset("mozilla-foundation/common_voice_17_0", "yo", split="validation")
test_dataset  = load_dataset("mozilla-foundation/common_voice_17_0", "yo", split="test")

In [5]:
# test
# ds = load_dataset("mozilla-foundation/common_voice_17_0", "yo")
# printing the structures of the train / valid / test sets
print(train_dataset)
print(valid_dataset)
print(test_dataset)



Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 1213
})
Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 863
})
Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 999
})


In [6]:
# where do I find the right sample rate???
SAMPLE_RATE = 16000
N_MFCC = 13

In [7]:
def extract_mfcc_from_dataset_item(example, sample_rate=SAMPLE_RATE, n_mfcc=N_MFCC):
    # Hole die Audiodaten aus dem Beispiel
    audio = example["audio"]
    waveform = torch.tensor(audio["array"], dtype=torch.float64).unsqueeze(0)  # [1, time]
    sr = audio["sampling_rate"]

    # Resample falls notwendig
    if sr != sample_rate:
        resampler = T.Resample(orig_freq=sr, new_freq=sample_rate)
        waveform = resampler(waveform)

    # optional: in float32 for further (torchaudio-Transforms akzeptieren float32 besser)
    waveform = waveform.to(torch.float32)

    # MFCC-Berechnung
    mfcc_transform = T.MFCC(
        sample_rate=sample_rate,
        n_mfcc=n_mfcc,
        melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 40}
    )
    mfcc = mfcc_transform(waveform)

    # Delta-Features
    delta = torchaudio.functional.compute_deltas(mfcc)
    delta2 = torchaudio.functional.compute_deltas(delta)

    # Kombinieren
    combined = torch.cat([mfcc, delta, delta2], dim=0).squeeze(0)  # [39, time]

    return {"mfcc": combined.T}  # [time_steps, 39]


In [8]:
dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))
dataset = dataset.map(extract_mfcc_from_dataset_item)

In [9]:
from torch.utils.data import DataLoader

# convertingn to a torch compatible list
class MFCCDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        return {
            "mfcc": item["mfcc"],
            "text": item["sentence"]
        }

# dataLoader
torch_dataset = MFCCDataset(dataset["train"])
dataloader = DataLoader(torch_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: x)


NameError: name 'dataset_n' is not defined

In [11]:
# test
print(type(dataset["train"]))

# train_data = dataset["train"]
train_data_n = dataset_n["train"]
# sample = train_data[0]
sample_n = train_data_n[0]


print(sample_n["sentence"])
print(torch.tensor(sample_n["mfcc"]).shape)



<class 'datasets.arrow_dataset.Dataset'>


In [14]:
# torch combatible dataset
# torch_dataset = MFCCDataset(dataset)

# local path!!! .py-Datei
output_path = Path("/home/aaron/automated_speech_recognition/X-lingual_IPA_ASR/data/mfcc_data_export.py")

# extract all data (warning: can be RAM intensiv)
# to remember what torch_dataset looks like: torch_dataset = MFCCDataset(dataset["train"])

export_data = []
for item in torch_dataset:
    export_data.append({
        "mfcc": item["mfcc"], 
        "text": item["text"]
    })



In [15]:
# makes dictionary to a valid python text string
py_code = "data = " + json.dumps(export_data, indent=2)

# write data
output_path.write_text(py_code, encoding="utf-8")
print(f"data saved as python file: {output_path.resolve()}")


data saved as .py data: /home/aaron/automated_speech_recognition/X-lingual_IPA_ASR/data/mfcc_data_export.py


In [11]:
# has to be done before the export!!
# extraction only mfcc´s + sentence (deleting other informations like client_id, age, gender...)
dataset_n = dataset.remove_columns([
    col for col in dataset.column_names["train"] 
    if col not in ["mfcc", "sentence"]
])

dataset_n.save_to_disk("yoruba_mfcc_dataset")  # save as arrow format


Saving the dataset (0/1 shards):   0%|          | 0/1213 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/863 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/999 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1113 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/229 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3077 [00:00<?, ? examples/s]

In [16]:
# example first 3 sentences with mfcc extractions
for i in range(3):
    sample = dataset["train"][i]
    mfcc = torch.tensor(sample["mfcc"])
    
    print(f"Sample {i+1}:")
    print("sentence:", sample["sentence"])
    print("MFCC shape:", mfcc.shape)
    print("MFCC (first row):", mfcc[0])
    print("-" * 40)


Sample 1:
sentence: Ọmọ ẹgbẹ́ òkùnkùn dèrò àtìmọ́lé torí nílùú Ìbàdàn.
MFCC shape: torch.Size([623, 13, 3])
MFCC (first row): tensor([[-4.8120e+02,  6.1035e-06, -9.0949e-14],
        [ 2.7145e-05,  0.0000e+00,  0.0000e+00],
        [-3.5849e-05, -7.2760e-13,  1.0842e-20],
        [ 3.4594e-05,  0.0000e+00,  0.0000e+00],
        [-1.9379e-05,  3.6380e-13, -5.4210e-21],
        [ 2.9969e-06,  0.0000e+00,  0.0000e+00],
        [-2.2608e-05,  0.0000e+00,  0.0000e+00],
        [-1.9617e-06,  0.0000e+00,  0.0000e+00],
        [-1.4746e-06,  0.0000e+00,  0.0000e+00],
        [-2.7256e-05,  3.6380e-13, -5.4210e-21],
        [ 1.5318e-04,  0.0000e+00,  0.0000e+00],
        [-8.7926e-05,  1.4552e-12, -2.1684e-20],
        [-2.7204e-05,  3.6380e-13, -5.4210e-21]])
----------------------------------------
Sample 2:
sentence: Ìyàwó àwọn ọlọ́pàá tó kú lásìkò ìwọ́de tó kọjá ti bẹ̀bẹ̀ fún ìrànwọ́
MFCC shape: torch.Size([620, 13, 3])
MFCC (first row): tensor([[-4.8199e+02,  6.1035e-06, -9.0949e-1

In [17]:
# shape of the dataset, splits etc. 
print(dataset_n)


DatasetDict({
    train: Dataset({
        features: ['sentence', 'mfcc'],
        num_rows: 1213
    })
    validation: Dataset({
        features: ['sentence', 'mfcc'],
        num_rows: 863
    })
    test: Dataset({
        features: ['sentence', 'mfcc'],
        num_rows: 999
    })
    other: Dataset({
        features: ['sentence', 'mfcc'],
        num_rows: 1113
    })
    invalidated: Dataset({
        features: ['sentence', 'mfcc'],
        num_rows: 229
    })
    validated: Dataset({
        features: ['sentence', 'mfcc'],
        num_rows: 3077
    })
})
