In [17]:
# !pip install -r ../requirements.txt

# Create dataset: export audio files and a JSON-lines manifest

In [5]:
import os
import json
import torch
import torchaudio
from datasets import load_dataset

In [6]:
# Hugging Face dataset repository, and how many examples of its "dev" split to keep.
dataset_name = "hsekhalilian/commonvoice"
sample_size = 1000

dataset = load_dataset(dataset_name, split="dev")
# Cap the subset at the split's actual length: select() rejects out-of-range
# indices, so a hard-coded range(1000) fails on a split with fewer rows.
dataset = dataset.select(range(min(sample_size, len(dataset))))

In [7]:
# Output layout under ~/.cache/datasets/<owner>___<name>/ : the audio copies go into
# audio_files/ and the manifest is written next to that folder.
# The directory name is computed outside the f-string because reusing double quotes
# inside a double-quoted f-string is a SyntaxError before Python 3.12.
dataset_cache_name = dataset_name.replace("/", "___")
output_dir = os.path.expanduser(f"~/.cache/datasets/{dataset_cache_name}/audio_files/")
os.makedirs(output_dir, exist_ok=True)

# normpath collapses the "audio_files/.." detour into the plain parent directory.
output_manifest = os.path.normpath(os.path.join(output_dir, "..", "dev_manifest.json"))
target_sr = 16000

# One Resample transform per distinct source rate, built on first use and reused,
# instead of constructing a new transform for every sample.
resamplers = {}

with open(output_manifest, "w", encoding="utf-8") as fout:
    for sample in dataset:
        text = sample["sentence"]
        audio = sample["audio"]

        # Keep only the file name of the source path; the copy is written into output_dir.
        audio_filename = os.path.basename(audio["path"].strip("/"))
        output_audio_path = os.path.join(output_dir, audio_filename)

        # unsqueeze(0) adds a leading axis so the tensor is 2D when passed to torchaudio.save.
        # NOTE(review): float32 is a dtype torchaudio.save documents as supported; the
        # decoded array may arrive as float64 -- confirm against the installed datasets version.
        waveform = torch.tensor(audio["array"], dtype=torch.float32).unsqueeze(0)
        sr = audio["sampling_rate"]

        if sr != target_sr:
            if sr not in resamplers:
                resamplers[sr] = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
            waveform = resamplers[sr](waveform)
            sr = target_sr

        torchaudio.save(output_audio_path, waveform, sr)

        # Length of the (possibly resampled) waveform divided by its sampling rate.
        duration = waveform.shape[1] / sr

        # One JSON object per line: audio_filepath, duration, text.
        fout.write(json.dumps({
            "audio_filepath": os.path.abspath(output_audio_path),
            "duration": duration,
            "text": text
        }) + "\n")

# Inference: transcribe the manifest audio and compute WER

In [8]:
import os
import json
from tqdm import tqdm
from jiwer import wer
import nemo.collections.asr as nemo_asr


import logging

# Raise the 'nemo_logger' threshold to CRITICAL so lower-severity records are dropped.
nemo_logger = logging.getLogger('nemo_logger')
nemo_logger.setLevel(logging.CRITICAL)

In [9]:
# Load the pretrained hybrid RNNT/CTC BPE model by name, then switch it to eval mode.
pretrained_model_name = "nvidia/stt_fa_fastconformer_hybrid_large"
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
    model_name=pretrained_model_name
)
asr_model = asr_model.eval()

In [10]:
# NOTE: relies on `dataset_name` defined in the dataset-loading cell earlier in the notebook.
# The directory name is computed outside the f-string because reusing double quotes
# inside a double-quoted f-string is a SyntaxError before Python 3.12.
dataset_cache_name = dataset_name.replace("/", "___")
manifest_path = os.path.expanduser(f"~/.cache/datasets/{dataset_cache_name}/dev_manifest.json")

# Parallel lists: audio_files[i] is the recording whose reference text is references[i].
audio_files = []
references = []

with open(manifest_path, "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if not line.strip():
            continue
        entry = json.loads(line)
        audio_files.append(entry["audio_filepath"])
        references.append(entry["text"])

# Transcribe every file, then keep the `.text` attribute of each returned result.
results = asr_model.transcribe(audio_files)
predictions = [result.text for result in results]

# Word error rate of the predictions against the manifest references.
error_rate = wer(references, predictions)
print(f"\nWER: {error_rate:.2%}")

Transcribing: 100%|██████████| 250/250 [00:13<00:00, 19.15it/s]


WER: 14.72%





In [11]:
import numpy as np

# Test: build the dataset with the `create_nemo_dataset` helper

In [1]:
import sys_append

In [2]:
from utils.create_dataset import create_nemo_dataset

In [3]:
# Settings passed to create_nemo_dataset: dataset id, split name and sample count.
nemo_dataset_config = {
    "dataset": "hsekhalilian/commonvoice",
    "split": "dev",
    "sample_size": 1000,
}
create_nemo_dataset(nemo_dataset_config)

Processing dev split: 100%|██████████| 1000/1000 [00:04<00:00, 215.56it/s]
