# setup

In [11]:
import sys_append
from utils.normalizer import persian_normalizer_no_punc
from utils.evaluate import evaluate_asr

In [1]:
import torch
from datasets import load_dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torchaudio
from jiwer import wer
from tqdm import tqdm

# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load the "dev" split and keep only its first 1000 examples so the
# evaluation cells below stay quick to run.
dataset = load_dataset("hsekhalilian/commonvoice", split="dev").select(range(1000))

In [5]:
# Checkpoint to evaluate: a Hugging Face Hub repo id or a local directory.
# (A locally cached copy of m3hrdadfi/wav2vec2-large-xlsr-persian-v3 can be
# evaluated the same way by pointing model_path at its directory.)
model_path = "hsekhalilian/wav2vec2-custom-model"

# Short "<owner>/<name>" label built from the last two path components;
# joining a slice also tolerates a path with only one component.
parts = model_path.strip("/").split("/")
model_name = "/".join(parts[-2:])
print(model_name)

# The processor turns raw audio into model inputs and decodes token ids;
# the CTC model itself is moved to the selected device.
processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path).to(device)

hsekhalilian/wav2vec2-custom-model


# one sample

In [6]:
# Take the first example and unpack its decoded audio
sample = dataset[0]
audio = sample["audio"]
speech_array, sampling_rate = audio["array"], audio["sampling_rate"]

# Resample to 16 kHz if needed (the processor below is told the audio is 16 kHz)
if sampling_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
    # Cast explicitly: Resample caches a float32 kernel, so a float64 numpy
    # array would otherwise produce a dtype mismatch inside the transform.
    waveform = torch.as_tensor(speech_array, dtype=torch.float32)
    speech_array = resampler(waveform).numpy()

# Build model inputs and move every tensor to the selected device
inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
inputs = {key: val.to(device) for key, val in inputs.items()}

# Inference (no gradients needed for evaluation)
with torch.no_grad():
    logits = model(**inputs).logits

# Greedy CTC decoding: most likely token per frame, then decode to text
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])

# Print results
print("Prediction:", transcription)
print("Reference:", sample["sentence"])

Prediction: این اولین قدم برای تغییر خودم
Reference: این اولین قدم برای تغییر خودم


# for loop

In [7]:
predictions = []
references = []

# One Resample transform per source sampling rate, built lazily, so the
# resampling kernel is not recomputed for every example.
resamplers = {}

for sample in tqdm(dataset):
    audio = sample["audio"]
    speech_array, sampling_rate = audio["array"], audio["sampling_rate"]

    # Resample to 16 kHz if needed (the processor below is told the audio is 16 kHz)
    if sampling_rate != 16000:
        if sampling_rate not in resamplers:
            resamplers[sampling_rate] = torchaudio.transforms.Resample(
                orig_freq=sampling_rate, new_freq=16000
            )
        # Cast explicitly: Resample caches a float32 kernel, so a float64
        # numpy array would otherwise produce a dtype mismatch.
        waveform = torch.as_tensor(speech_array, dtype=torch.float32)
        speech_array = resamplers[sampling_rate](waveform).numpy()

    # Build model inputs and move every tensor to the selected device
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Inference (no gradients needed for evaluation)
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: most likely token per frame, then decode to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])

    # Store results in matching order so index i of each list is one utterance
    references.append(sample["sentence"])
    predictions.append(transcription)

# Compute WER over the whole subset with the project's evaluation helper
error_rate = evaluate_asr(references, predictions)["wer"]
print(f"\nWER: {error_rate:.2%}")

100%|██████████| 1000/1000 [00:25<00:00, 39.97it/s]


WER: 25.36%





In [8]:
# Dump every reference/prediction pair for manual inspection, one
# dashed rule between utterances.
separator = "-" * 30
for ref_text, hyp_text in zip(references, predictions):
    print(f"reference: {ref_text}", f"predicted: {hyp_text}", separator, sep="\n")

reference: این اولین قدم برای تغییر خودم
predicted: این اولین قدم برای تغییر خودم
------------------------------
reference: با خنده ای ترسناک چرا وحشت کردین؟ چرا تهمت می زنی؟
predicted: با خنده‌ای ترسناک چرا وحشت کردین چرا تهمت می‌ زنی
------------------------------
reference: من همه جا دنبالت گشتم
predicted: من و همه جا دنبالت گشتم
------------------------------
reference: افسانهها میگن سگها واسطهی دنیای زندهها با مردههان
predicted: درسانه‌ها میگن تک‌ها واسطه دنیای زنده‌ها با مرده هان
------------------------------
reference: فكر می کنم همین جا باید تمومش کنیم
predicted: فکر می کنم همین جا باید تمومش کنیم
------------------------------
reference: اَفراسیاب
predicted: عطراسیاب
------------------------------
reference: طاهره چی بهش گفتی رنگش پرید
predicted: طاهره چی بهش گفتی رنگش پرید
------------------------------
reference: من شبا خواب میبینم که سگها به هم حمله میکنن
predicted: من شبا خواب میبینم که سگابه هم حمله میکنم
------------------------------
reference: از وقتی که فقط پنج سالت 

# huggingface datasets

In [16]:
def preprocess_and_predict(sample):
    """Transcribe one dataset example and attach the result to it.

    Written for ``Dataset.map``: reads ``sample["audio"]`` (its ``array`` and
    ``sampling_rate``) and ``sample["sentence"]``, then stores two keys on the
    example, ``prediction`` and ``reference``.

    Uses the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        sample: A single example dict with ``audio`` and ``sentence`` fields.

    Returns:
        The same ``sample`` dict, with ``prediction`` set to the decoded
        transcription (or ``""`` when the audio is missing/empty or inference
        fails) and ``reference`` set to the unchanged ``sentence``.
    """
    try:
        audio = sample["audio"]
        speech_array, sampling_rate = audio["array"], audio["sampling_rate"]

        # Nothing to transcribe: record an empty prediction and stop early.
        if speech_array is None or len(speech_array) == 0:
            sample["prediction"] = ""
            sample["reference"] = sample["sentence"]
            return sample

        # Resample to 16 kHz if needed (the processor is told the audio is 16 kHz)
        if sampling_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
            # Cast explicitly: Resample caches a float32 kernel, so a float64
            # numpy array would otherwise produce a dtype mismatch.
            waveform = torch.as_tensor(speech_array, dtype=torch.float32)
            speech_array = resampler(waveform).numpy()

        inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits

        # Check the device *type*: comparing a torch.device object with the
        # string "cuda" does not match, which left this branch unreachable.
        if torch.device(device).type == "cuda":
            # Wait for the GPU so any CUDA error surfaces inside this try block.
            torch.cuda.synchronize()

        # Greedy CTC decoding: most likely token per frame, then decode to text
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        sample["prediction"] = transcription
        sample["reference"] = sample["sentence"]

    except Exception as e:
        # Best effort: one bad example should not abort the whole map() run.
        print(f"Error with sample: {e}")
        sample["prediction"] = ""
        sample["reference"] = sample["sentence"]

    return sample

# Run preprocess_and_predict over every example so each one gains the
# "prediction" and "reference" keys the function writes; the result
# replaces `dataset`.
# NOTE(review): the recorded run of this cell was interrupted at 0% —
# confirm it completes before relying on the mapped dataset.
dataset = dataset.map(preprocess_and_predict, num_proc=1)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

KeyboardInterrupt: 