# setup

In [3]:
# !pip install -r ../requirements.txt
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

In [4]:
# Shell escape: print the NVIDIA driver / GPU status for the machine running this kernel.
!nvidia-smi

Mon Jul  7 06:42:02 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.06             Driver Version: 570.124.06     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off |   00000000:00:07.0 Off |                  N/A |
| 30%   18C    P8              9W /  350W |       2MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# load dataset

In [1]:
from datasets import load_dataset

# Fetch the "dev" split of the `hsekhalilian/commonvoice` dataset repository.
dataset = load_dataset(
    path="hsekhalilian/commonvoice",
    split="dev",
)
# Optional: uncomment to restrict the run to the first 100 rows.
# dataset = dataset.select(indices=range(100))

# Bare last expression so the notebook displays the dataset summary.
dataset

Dataset({
    features: ['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment', 'audio', 'normalized_transcription'],
    num_rows: 10676
})

# load model

In [2]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [3]:
# Identifier of the pretrained checkpoint handed to `from_pretrained` further down.
model_name_or_path = "m3hrdadfi/wav2vec2-large-xlsr-persian-v3"

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Bare last expression so the notebook displays the selected device.
device

device(type='cuda')

In [4]:
# The processor and the CTC model are both loaded from the same checkpoint id.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)

# Load the weights first, then move the module onto the selected device.
model = Wav2Vec2ForCTC.from_pretrained(model_name_or_path)
model = model.to(device)



# predict

In [5]:
import sys_append
import numpy as np
from utils.normalizer import persian_normalizer

In [6]:
def predict(batch):
    """Transcribe a batch of audio samples with the CTC model (greedy argmax decoding).

    Args:
        batch: Batched mapping whose ``"audio"`` entry is a sequence of samples,
            each exposing ``"array"`` (the waveform) and ``"sampling_rate"``
            (presumably a decoded ``datasets.Audio`` column -- confirm against
            the dataset's features).

    Returns:
        The same ``batch`` object, with a ``"prediction"`` entry added that holds
        one decoded string per sample.

    Raises:
        ValueError: If any sample's sampling rate differs from the rate expected
            by the processor's feature extractor.
    """
    audio_samples = batch["audio"]  # read the column once and reuse it below
    expected_rate = processor.feature_extractor.sampling_rate

    # The processor is told that the audio is already at `expected_rate`, so it
    # cannot detect a mismatch by itself. Verify the claim here instead of
    # silently transcribing audio recorded at another rate.
    unexpected_rates = {sample["sampling_rate"] for sample in audio_samples} - {expected_rate}
    if unexpected_rates:
        raise ValueError(
            f"Audio sampling rate(s) {sorted(unexpected_rates)} do not match the "
            f"{expected_rate} Hz expected by the processor; resample the audio first."
        )

    features = processor(
        [sample["array"] for sample in audio_samples],
        sampling_rate=expected_rate,
        return_tensors="pt",
        padding=True,
    )

    # Inputs must live on the same device as the model.
    input_values = features.input_values.to(device)
    attention_mask = features.attention_mask.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    # Pick the highest-scoring token id along the last axis, then let the
    # processor turn the id sequences into text.
    pred_ids = torch.argmax(logits, dim=-1)

    batch["prediction"] = processor.batch_decode(pred_ids)

    return batch


def normalize_prediction(example):
    """Add a ``'normalized_prediction'`` field computed from ``example['prediction']``.

    Normalization is best-effort: if it fails, a ``RuntimeWarning`` describing
    the failure is emitted and ``None`` is returned instead of raising.

    Args:
        example: Mapping with a ``'prediction'`` entry; it is updated in place.

    Returns:
        The same ``example`` with ``'normalized_prediction'`` set, or ``None``
        when normalization failed.
        NOTE(review): how the caller (e.g. ``Dataset.map``) treats a ``None``
        return is not established here -- confirm before mapping with this.
    """
    try:
        example['normalized_prediction'] = persian_normalizer(example['prediction'])
    except Exception as exc:
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate, and report the cause
        # instead of swallowing it silently.
        import warnings

        warnings.warn(
            f"normalize_prediction failed ({type(exc).__name__}: {exc}); returning None",
            RuntimeWarning,
            stacklevel=2,
        )
        return None
    return example

In [7]:
# Apply `predict` to the dataset in batches of 64 examples.
result = dataset.map(
    function=predict,
    batched=True,
    batch_size=64,
)



Map:   0%|          | 0/10676 [00:00<?, ? examples/s]

In [8]:
result

Dataset({
    features: ['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment', 'audio', 'normalized_transcription', 'prediction'],
    num_rows: 10676
})

# evaluate

In [9]:
from utils.evaluate import evaluate_asr

In [11]:
# NOTE(review): the references are the normalized transcriptions, while the
# hypotheses are the raw `prediction` strings; `normalize_prediction` is defined
# in this notebook but no visible cell applies it -- confirm whether the
# predictions should be normalized before scoring.
references = result["normalized_transcription"]
hypotheses = result["prediction"]
evaluate_asr(references, hypotheses)

{'wer': 0.31191391970501287, 'cer': 0.07564881281060187}

In [12]:
from jiwer import wer

# Second measurement: score the predictions against the `sentence` column
# (rather than `normalized_transcription`) using jiwer directly.
raw_references = result["sentence"]
raw_hypotheses = result["prediction"]
wer(raw_references, raw_hypotheses)

0.32022248708780293