In [1]:
import os
import torch
import librosa
import torchaudio
import numpy as np
import soundfile as sf
from datasets import load_dataset, Dataset, Audio

In [2]:
# Load every split, drop the unused "file_name" column, then keep only the
# first 100 training examples so the resampling experiments stay quick.
dataset = (
    load_dataset("hsekhalilian/persian-youtube", num_proc=32)
    .remove_columns("file_name")["train"]
    .select(range(100))
)

Resolving data files:   0%|          | 0/46 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/46 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/59 [00:00<?, ?it/s]

In [3]:
# Show the first record of the subset to check its fields before resampling.
dataset[0]

{'sentence': 'تا حالا عربی رقصیدی؟',
 'audio': {'path': '0053700001.mp3',
  'array': array([-0.01711661, -0.02069014, -0.01701352, ...,  0.06120613,
          0.08319854,  0.06572199]),
  'sampling_rate': 44100},
 'normalized_transcription': 'تا حالا عربی رقصیدی؟'}

# librosa resampling

In [4]:
# Sampling rate that every clip is resampled to before it is written out.
target_sr = 16000

# Destination folder for the converted FLAC files; created if it is missing.
output_dir = "/home/jovyan/.cache/datasets/test/audio_files"
os.makedirs(output_dir, exist_ok=True)

def process(example):
    """Resample one example to ``target_sr`` with librosa and save it as FLAC.

    Reads the decoded waveform and its sampling rate from ``example["audio"]``,
    resamples it to the notebook-level ``target_sr`` and writes the result into
    ``output_dir`` under the source file's base name with a ``.flac`` extension.

    Args:
        example: A dataset row holding an ``"audio"`` dict (with ``"array"``,
            ``"sampling_rate"`` and ``"path"``) plus the ``"sentence"`` and
            ``"normalized_transcription"`` fields.

    Returns:
        dict: ``"audio"`` set to the path of the written FLAC file, with the
        two text fields passed through unchanged.
    """
    audio = example["audio"]
    resampled = librosa.resample(
        np.asarray(audio["array"]),  # asarray avoids copying an existing ndarray
        orig_sr=audio["sampling_rate"],
        target_sr=target_sr,
    )

    # Keep only the base name: os.path.join drops output_dir entirely when the
    # second argument is absolute, which would write the FLAC next to the
    # source file instead of into output_dir.
    stem = os.path.splitext(os.path.basename(audio["path"]))[0]
    output_path = os.path.join(output_dir, stem + ".flac")
    sf.write(output_path, resampled, target_sr, format="FLAC")

    return {
        "audio": output_path,
        "sentence": example["sentence"],
        "normalized_transcription": example["normalized_transcription"],
    }

# Run the conversion over every row, then declare the "audio" column (now a
# file path) as an Audio feature at the target sampling rate.
new_dataset = dataset.map(process).cast_column(
    "audio", Audio(sampling_rate=target_sr)
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
# Show the first processed record to check the new audio path and sampling rate.
new_dataset[0]

{'sentence': 'تا حالا عربی رقصیدی؟',
 'audio': {'path': '/home/jovyan/.cache/datasets/test/audio_files/0053700001.flac',
  'array': array([-0.01202393, -0.02008057, -0.01922607, ...,  0.02316284,
          0.07275391,  0.        ]),
  'sampling_rate': 16000},
 'normalized_transcription': 'تا حالا عربی رقصیدی؟'}

# torchaudio resampling (GPU)

In [10]:
# Output sampling rate shared by the resampler and the FLAC writer.
target_sr = 16000

# Converted files land here; the folder is created if it is missing.
output_dir = "/home/jovyan/.cache/datasets/test/audio_files"
os.makedirs(output_dir, exist_ok=True)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Cache of Resample transforms, keyed by source sampling rate.
resampler_dict = dict()

def process(example):
    """Resample one example to ``target_sr`` with torchaudio and save it as FLAC.

    The waveform is moved to ``device`` as a float32 tensor of shape ``[1, T]``.
    When its sampling rate differs from ``target_sr`` it is passed through a
    ``torchaudio.transforms.Resample`` transform; transforms are cached in
    ``resampler_dict`` (keyed by ``str(sampling_rate)``) so each source rate
    builds its transform only once. The result is written into ``output_dir``
    under the source file's base name with a ``.flac`` extension.

    Args:
        example: A dataset row holding an ``"audio"`` dict (with ``"array"``,
            ``"sampling_rate"`` and ``"path"``) plus the ``"sentence"`` and
            ``"normalized_transcription"`` fields.

    Returns:
        dict: ``"audio"`` set to the path of the written FLAC file, with the
        two text fields passed through unchanged.
    """
    audio = example["audio"]
    original_sr = audio["sampling_rate"]

    # Shape: [1, T], float32, on the selected device.
    waveform = torch.tensor(audio["array"], dtype=torch.float32).unsqueeze(0).to(device)

    # Only build (and cache) a resampler when one is actually needed.
    if original_sr != target_sr:
        cache_key = str(original_sr)
        resampler = resampler_dict.get(cache_key)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(
                orig_freq=original_sr, new_freq=target_sr
            ).to(device)
            resampler_dict[cache_key] = resampler
        waveform = resampler(waveform)

    resampled = waveform.squeeze(0).cpu().numpy()

    # Keep only the base name: os.path.join drops output_dir entirely when the
    # second argument is absolute, which would write the FLAC next to the
    # source file instead of into output_dir.
    stem = os.path.splitext(os.path.basename(audio["path"]))[0]
    output_path = os.path.join(output_dir, stem + ".flac")
    sf.write(output_path, resampled, target_sr, format="FLAC")

    return {
        "audio": output_path,
        "sentence": example["sentence"],
        "normalized_transcription": example["normalized_transcription"],
    }

# Convert every row with process(), then mark the returned file paths as an
# Audio feature at the target sampling rate.
new_dataset = dataset.map(process).cast_column(
    "audio", Audio(sampling_rate=target_sr)
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]