In [1]:
import unicodedata
import string

# Characters that disqualify a whole utterance: if any of them occurs in the
# text, maybe_normalize() returns None instead of a cleaned string.
SKIP = set(string.ascii_letters) | {
    "=",  # occurs only 2x in utterance (transl.): "twenty = xx"
    "ā",  # occurs only 4x together with "š"
    "š",
    # Arabic letters
    "ة",  # TEH MARBUTA
}

# Substrings that maybe_normalize() deletes from the text (replaced with "").
# Entries are applied in list order with str.replace, so the multi-character
# "(laughter)" marker must stay ahead of the bare "(" and ")" entries,
# otherwise its parentheses would be stripped first and the word left behind.
DISCARD = [
    # "(laughter)" in Farsi
    "(خنده)",
    # ASCII punctuation.
    # Note: words that *start* with "#" are dropped as hashtags before this
    # list is applied; the "#" entry here only removes the remaining ones.
    "!",
    '"',
    "#",
    "&",
    "'",
    "(",
    ")",
    ",",
    "-",
    ".",
    ":",
    ";",
    # Non-ASCII punctuation: dash, curly quotes, ellipsis, Arabic question
    # mark / comma / semicolon, plus the tatweel (elongation) character.
    "–",
    "“",
    "”",
    "…",
    "؟",
    "،",
    "؛",
    "ـ",
    # Arabic combining diacritics (tanween, short vowels, shadda, sukun,
    # hamza above). These are combining marks, not whitespace.
    "ً",
    "ٌ",
    "َ",
    "ُ",
    "ِ",
    "ّ",
    "ْ",
    "ٔ",
    # Other: guillemet quotation marks
    "«",
    "»",
]

# Character substitutions applied by maybe_normalize() via str.replace, in
# dict insertion order. They map Arabic-script variants and presentation forms
# onto the letters used for Farsi text.
#
# Order matters: the second entry produces "ە", which is only turned into "ه"
# by the LAST entry. Moving that last entry earlier would leave "ە" in the
# output.
REPLACEMENTS = {
    "أ": "ا",
    "ۀ": "ە",  # rewritten again to "ه" by the final entry below
    "ك": "ک",
    "ي": "ی",
    "ى": "ی",
    "ﯽ": "ی",
    "ﻮ": "و",
    "ے": "ی",
    "ﺒ": "ب",
    # NOTE(review): the target here looks like another presentation-form meem
    # rather than the base letter; the NFKC step in maybe_normalize() would
    # fold it to the base letter anyway -- confirm this is intentional.
    "ﻢ": "ﻡ",
    "٬": " ",  # separator character becomes a space (collapsed later)
    "ە": "ه",
}


def maybe_normalize(text: str) -> str | None:

    # Skip selected with banned characters
    if set(text) & SKIP:
        return None  # skip this

    # Remove hashtags - they are not being read in Farsi CV
    text = " ".join(w for w in text.split() if not w.startswith("#"))

    # Replace selected characters with others
    for lhs, rhs in REPLACEMENTS.items():
        text = text.replace(lhs, rhs)

    # Replace selected characters with empty strings
    for tok in DISCARD:
        text = text.replace(tok, "")

    # Unify the symbols that have the same meaning but different Unicode representation.
    text = unicodedata.normalize("NFKC", text)

    # Remove hamza's that were not merged with any letter by NFKC.
    text = text.replace("ء", "")

    # Remove double whitespace etc.
    return " ".join(t for t in text.split() if t)


In [4]:
from datasets import load_dataset
# Load only the "validation" split of the "hsekhalilian/fleurs" dataset.
dataset = load_dataset("hsekhalilian/fleurs", split="validation")

In [5]:
# Peek at the first record to see which fields each sample carries.
dataset[0]

{'id': 1567,
 'num_samples': 152640,
 'path': '/home/jovyan/.cache/huggingface/datasets/downloads/extracted/208b445c44866c7760854933a1fe8f3e632c83bf1655d9cd40f3a134a7625264/10049006222377318397.wav',
 'audio': {'path': '10049006222377318397.wav',
  'array': array([0.        , 0.        , 0.        , ..., 0.0001471 , 0.00012559,
         0.00013351]),
  'sampling_rate': 16000},
 'transcription': 'بزرگترین مسابقه سال در ماه دسامبر در زمین\u200cهای چوگان لاس کانیتاس اتفاق می\u200cافتد',
 'raw_transcription': 'بزرگترین مسابقه سال در ماه دسامبر در زمین\u200cهای چوگان «لاس کانیتاس» اتفاق می\u200cافتد.',
 'gender': 0,
 'lang_id': 22,
 'language': 'Persian',
 'lang_group_id': 2,
 'normalized_transcription': 'بزرگترین مسابقه سال در ماه دسامبر در زمین\u200cهای چوگان لاس کانیتاس اتفاق می\u200cافتد'}

In [9]:
import nemo.collections.asr as nemo_asr
# Restore the pretrained hybrid RNNT/CTC BPE checkpoint
# "nvidia/stt_fa_fastconformer_hybrid_large" by name.
# NOTE(review): the recorded log reports "CUDA is not available", so this ran
# on CPU; `asr_model` is not used by any cell visible in this notebook.
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(model_name="nvidia/stt_fa_fastconformer_hybrid_large")

[NeMo I 2025-07-28 13:13:39 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2025-07-28 13:13:41 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: dummy
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 10
    min_duration: 0.5
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    use_lhotse: true
    lhotse:
      shar_path: /data_artifacts/data/shar/train
      batch_duration: 1200
      quadratic_duration: 15
      num_buckets: 10
      num_cuts_for_bins_estimate: 10000
      buffer_size: 10000
      shuffle_buffer_size: 10000
    
[NeMo W 2025-07-28 13:13:41 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and prov

[NeMo I 2025-07-28 13:13:41 nemo_logging:393] PADDING: 0
[NeMo I 2025-07-28 13:13:42 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-07-28 13:13:42 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}


[NeMo W 2025-07-28 13:13:42 nemo_logging:405] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: CUDA is not available


[NeMo I 2025-07-28 13:13:43 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}


[NeMo W 2025-07-28 13:13:43 nemo_logging:405] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: CUDA is not available


[NeMo I 2025-07-28 13:13:44 nemo_logging:393] Model EncDecHybridRNNTCTCBPEModel was successfully restored from /home/jovyan/.cache/huggingface/hub/models--nvidia--stt_fa_fastconformer_hybrid_large/snapshots/249cf5bf70dda7220a60ddeeecff2f6aad8e1784/stt_fa_fastconformer_hybrid_large.nemo.


In [5]:
from text_normalizer.persian_normalizer import persian_normalizer_no_punc

In [12]:
from pathlib import Path

In [17]:
# Write the audio of every sample in `dataset` to disk at TARGET_SAMPLING_RATE.
# NOTE(review): `torch`, `TARGET_SAMPLING_RATE`, `resample_audio` and
# `save_audio` are not defined in any visible cell -- presumably they come from
# cells that are not shown; confirm this survives Restart Kernel -> Run All.
# All names assigned below are notebook globals and stay bound to the values
# from the last iteration (a later cell displays `output_audio_path`).
for sample in dataset:
    audio_info = sample["audio"]
    # Keep only the file name component of the dataset-provided path.
    audio_filename = Path(audio_info["path"]).name
    # NOTE(review): hard-coded absolute output location; whether the directory
    # is created depends on save_audio(), which is not visible here.
    output_audio_path = Path("/home/jovyan/.cache") / "audio_files" / audio_filename
    # NOTE(review): the FLEURS record displayed earlier has no "sentence" key
    # (it has "transcription"), so .get() yields None for that dataset.
    # `text` is not used anywhere in this loop.
    text = sample.get("sentence")
    # unsqueeze(0) adds a leading axis of size 1 in front of the sample array.
    waveform = torch.tensor(audio_info["array"]).unsqueeze(0)
    sr = audio_info["sampling_rate"]

    # Resample only when the source rate differs from the target rate.
    if sr != TARGET_SAMPLING_RATE:
        waveform = resample_audio(waveform, sr, TARGET_SAMPLING_RATE)

    save_audio(waveform, TARGET_SAMPLING_RATE, output_audio_path)

    # Length along axis 1 divided by the sampling rate, i.e. seconds --
    # assuming axis 1 is the sample axis after resampling (TODO confirm).
    # `duration` is computed but not used inside this loop.
    duration = waveform.shape[1] / TARGET_SAMPLING_RATE


In [18]:
# Show the path left over from the last iteration of the export loop.
# NOTE(review): the recorded output is a Common Voice ".flac" file name, while
# the dataset loaded in this notebook is FLEURS with ".wav" paths -- the output
# looks stale (produced with a different dataset); re-run on a fresh kernel.
output_audio_path

PosixPath('/home/jovyan/.cache/audio_files/common_voice_fa_19036153.flac')