In [4]:
import os
import csv
from tqdm import tqdm
import numpy as np
import soundfile as sf
from datasets import load_dataset, config as hf_config

# Use Hugging Face's default cache directory
hf_cache_root = hf_config.HF_DATASETS_CACHE or os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "datasets")

# We'll store outputs in a clean subdirectory here
output_dir = os.path.join(hf_cache_root, "fleurs-belebele-output")
audio_dir = os.path.join(output_dir, "saved_audios")
os.makedirs(audio_dir, exist_ok=True)

# CSV metadata path
csv_path = os.path.join(output_dir, "audio_metadata.csv")

# Load dataset (dataset itself will go into the HF-managed cache)
dataset = load_dataset("wuenlp/fleurs-belebele", "pes_Arab", trust_remote_code=True)

# CSV headers
csv_headers = [
    "audio_path", "sentence", "gender", "raw_transcription",
    "seamlessm4t_asr", "seamlessm4t_asr_translation",
    "transcription", "whisper_asr", "whisper_asr_translation"
]

# Write CSV metadata
with open(csv_path, mode="w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_headers)
    writer.writeheader()

    # Count total items for progress bar
    total_items = sum(len(item["audio"]) for sample in dataset["test"] for item in sample["sentence_data"])
    pbar = tqdm(total=total_items, desc="Processing audio")

    for sample in dataset["test"]:
        for item in sample["sentence_data"]:
            for i, audio in enumerate(item["audio"]):
                # Use original filename
                audio_filename = audio["path"]
                audio_abs_path = os.path.join(audio_dir, audio_filename)

                # Save audio
                sf.write(audio_abs_path, audio["array"], audio["sampling_rate"])

                # Write metadata
                writer.writerow({
                    "audio_path": audio_abs_path,
                    "sentence": item.get("sentence", ""),
                    "gender": item.get("gender", [""])[i] if "gender" in item else "",
                    "raw_transcription": item.get("raw_transcription", ""),
                    "seamlessm4t_asr": item.get("seamlessm4t_asr", [""])[i] if "seamlessm4t_asr" in item else "",
                    "seamlessm4t_asr_translation": item.get("seamlessm4t_asr_translation", [""])[i] if "seamlessm4t_asr_translation" in item else "",
                    "transcription": item.get("transcription", ""),
                    "whisper_asr": item.get("whisper_asr", [""])[i] if "whisper_asr" in item else "",
                    "whisper_asr_translation": item.get("whisper_asr_translation", [""])[i] if "whisper_asr_translation" in item else "",
                })

                pbar.update(1)

    pbar.close()


Processing audio: 100%|██████████| 5688/5688 [00:38<00:00, 146.63it/s]


In [17]:
from datasets import Dataset, Audio, Value, Features, load_dataset, DatasetDict
import os
import pandas as pd

# Define paths
hf_cache_root = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "datasets")
output_dir = os.path.join(hf_cache_root, "fleurs-belebele-output")
csv_path = os.path.join(output_dir, "audio_metadata.csv")

# Load CSV into pandas
df = pd.read_csv(csv_path)

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Cast audio column
dataset = dataset.cast_column("audio_path", Audio())

# (Optional) Set features explicitly for clarity
features = Features({
    "audio_path": Audio(),
    "sentence": Value("string"),
    "gender": Value("string"),
    "raw_transcription": Value("string"),
    "seamlessm4t_asr": Value("string"),
    "seamlessm4t_asr_translation": Value("string"),
    "transcription": Value("string"),
    "whisper_asr": Value("string"),
    "whisper_asr_translation": Value("string")
})
dataset = dataset.cast(features)
dataset = dataset.rename_column("audio_path", "audio")
dataset = DatasetDict({"test": dataset})

Casting the dataset:   0%|          | 0/5688 [00:00<?, ? examples/s]

In [19]:
dataset["test"][0]

{'audio': {'path': '/home/jovyan/.cache/huggingface/datasets/fleurs-belebele-output/saved_audios/1441824098773732742.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00, -3.05175781e-05, -6.10351562e-05]),
  'sampling_rate': 16000},
 'sentence': 'بسیاری، ریتم\u200cهای فیزیولوژی و رفتار را غالباً به صورت جدی به وجود چرخه درون\u200cزا و تولید آن\u200cها را به ساعت\u200cهای بیولوژیکی مرتبط می\u200cدانند.',
 'gender': 'FEMALE',
 'raw_transcription': 'بسیاری، ریتم\u200cهای فیزیولوژی و رفتار را غالباً به صورت جدی به وجود چرخه درون\u200cزا و تولید آن\u200cها را به ساعت\u200cهای بیولوژیکی مرتبط می\u200cدانند.',
 'seamlessm4t_asr': 'بسیاری از ریتم های فیزیولوژیکی و رفتار را غالبا به صورت جدی به صورت چرخه درونزا و تولید آنها را به ساعت های بیولوژیکی مرتبط می دانند.',
 'seamlessm4t_asr_translation': 'Many physiological and behavioral rhythms are often seriously associated with the internal cycle and their production in biological clocks.',
 'tr

In [20]:
dataset.save_to_disk("/home/jovyan/.cache/huggingface/datasets/fleurs-belebele")

Saving the dataset (0/6 shards):   0%|          | 0/5688 [00:00<?, ? examples/s]