# setup

In [1]:
# Optional one-time setup: uncomment to install the packages listed in ../requirements.txt.
# The %pip magic is used so the install targets the running kernel's environment.
# %pip install --upgrade -r ../requirements.txt

In [1]:
import warnings
# Ignore FutureWarning messages emitted after this point.
warnings.simplefilter(action='ignore', category=FutureWarning)

# NOTE(review): sys_append is a project-local module that is not visible here;
# presumably it extends sys.path so the `utils` package below can be imported — confirm.
import sys_append
from utils.normalizer import persian_normalizer

In [2]:
import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
from jiwer import wer
from tqdm import tqdm

# Use the GPU with half precision when CUDA is available; otherwise fall back
# to the CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = torch.device("cuda"), torch.float16
else:
    device, torch_dtype = torch.device("cpu"), torch.float32

print(f"Using device: {device}")

Using device: cuda


In [3]:
model_id = "openai/whisper-large-v3"

# Load the checkpoint in the dtype chosen above and move it to the target device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    # low_cpu_mem_usage=True, use_safetensors=True
).to(device)

# Clear the forced decoder ids stored in the generation config, presumably so
# that the task/language supplied via generate_kwargs take effect instead.
# Assigning None (rather than `del`) does not raise when the attribute is absent.
model.generation_config.forced_decoder_ids = None

# The processor supplies both the tokenizer and the feature extractor used below.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline wrapping the already-loaded model.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Device set to use cuda


In [4]:
# Evaluation subset: the first 100 rows of the "dev" split.
dataset = load_dataset("hsekhalilian/commonvoice", split="dev").select(range(100))

# one sample

In [5]:
# Sanity check: transcribe the first row and show it next to its reference sentence.
sample = dataset[0]
result = pipe(
    sample["audio"],
    generate_kwargs={"task": "transcribe", "language": "persian"},
)

print("Prediction:", result["text"])
print("Reference:", sample["sentence"])

Prediction:  این اولین قدم برای تغییر خودم
Reference: این اولین قدم برای تغییر خودم


# for loop

In [6]:
# Transcribe the rows one at a time, collecting the normalized prediction and
# the dataset's `normalized_transcription` for each row.
predictions, references = [], []

for sample in tqdm(dataset):
    result = pipe(
        sample["audio"],
        generate_kwargs={"task": "transcribe", "language": "persian"},
    )
    predictions.append(persian_normalizer(result["text"]))
    references.append(sample["normalized_transcription"])

# Word error rate over all collected pairs.
error_rate = wer(references, predictions)
print(f"\nWER: {error_rate:.2%}")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [01:19<00:00,  1.26it/s]


WER: 38.73%





In [18]:
# Print every reference/prediction pair, each followed by a 30-dash divider.
for reference, prediction in zip(references, predictions):
    print(
        f"reference: {reference}",
        f"predicted: {prediction}",
        "-" * 30,
        sep="\n",
    )

reference: این اولین قدم برای تغییر خودم
predicted: این اولین قدم برای تغییر خودم
------------------------------
reference: با خنده ای ترسناک چرا وحشت کردین؟ چرا تهمت می زنی؟
predicted: با خنده ترسناک چرا وحشت کردین؟ چرا تهمت میزنین؟
------------------------------
reference: من همه جا دنبالت گشتم
predicted: من همه جا دنبالت گشتم
------------------------------
reference: افسانهها میگن سگها واسطهی دنیای زندهها با مردههان
predicted: افثانه ها میگن تکا وسط دنیا زنده ها با مرده ها
------------------------------
reference: فر می کنم همین جا باید تمومش کنیم
predicted: فکر می کنم همینجا باید تمومش کنیم
------------------------------
reference: افراسیاب
predicted: افراسیاب
------------------------------
reference: طاهره چی بهش گفتی رنگش پرید
predicted: تا هر چی بهش گفتی رنگست برید؟
------------------------------
reference: من شبا خواب میبینم که سگها به هم حمله میکنن
predicted: من شبا خواب میبینم که سگا به هم حمله میکنه
------------------------------
reference: از وقتی که فقط پنج سالت بود وضع هم

# huggingface datasets

In [7]:
def transcribe(sample):
    """Transcribe one example and store the normalized text on it.

    Calls the module-level ``pipe`` on ``sample["audio"]`` with timestamps
    enabled and the Persian transcribe settings, passes the returned text
    through ``persian_normalizer``, and writes it to ``sample["prediction"]``.

    Args:
        sample: Mapping with an ``"audio"`` entry; it is modified in place.

    Returns:
        The same ``sample`` object, now carrying a ``"prediction"`` entry.
    """
    generation_options = {"task": "transcribe", "language": "persian"}
    output = pipe(
        sample["audio"],
        return_timestamps=True,
        generate_kwargs=generation_options,
    )
    sample["prediction"] = persian_normalizer(output["text"])
    return sample

# Apply `transcribe` one example at a time through Dataset.map.
processed_dataset = dataset.map(transcribe, batched=False)

# Word error rate of the `prediction` column against `normalized_transcription`.
references, predictions = (
    processed_dataset["normalized_transcription"],
    processed_dataset["prediction"],
)
error_rate = wer(references, predictions)
print(f"\nWER: {error_rate:.2%}")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]


WER: 39.97%


In [10]:
def transcribe_batch(batch):
    """Transcribe a batch of examples and store the normalized texts on it.

    Calls the module-level ``pipe`` on ``batch["audio"]`` with timestamps
    enabled and the Persian transcribe settings, then writes one normalized
    string per returned result to ``batch["prediction"]``.

    Args:
        batch: Mapping with an ``"audio"`` entry; it is modified in place.

    Returns:
        The same ``batch`` object, now carrying a ``"prediction"`` list.
    """
    # NOTE(review): no batch_size is passed to `pipe` here, so whether these
    # inputs are actually batched on the device depends on the pipeline's own
    # configuration — confirm.
    outputs = pipe(
        batch["audio"],
        return_timestamps=True,
        generate_kwargs={"task": "transcribe", "language": "persian"},
    )
    # A lone dict result is treated as a one-element list so both shapes share a path.
    if isinstance(outputs, dict):
        outputs = [outputs]
    batch["prediction"] = [persian_normalizer(item["text"]) for item in outputs]
    return batch

# Apply `transcribe_batch` through Dataset.map, handing it 4 examples per call
# (the batch size can be tuned).
processed_dataset = dataset.map(transcribe_batch, batched=True, batch_size=4)

# Word error rate of the `prediction` column against `normalized_transcription`.
references, predictions = (
    processed_dataset["normalized_transcription"],
    processed_dataset["prediction"],
)
error_rate = wer(references, predictions)
print(f"\nWER: {error_rate:.2%}")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]


WER: 39.97%


# test

In [20]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# NOTE(review): this cell rebinds `device`, `torch_dtype`, `model_id`, `model`,
# `processor`, `pipe`, `dataset`, `sample` and `result`, which the evaluation
# cells earlier in the notebook also use.

# GPU with half precision when CUDA is available, otherwise CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# First audio clip of the long-form LibriSpeech validation split.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

# Timestamps are enabled for this long-form clip.
result = pipe(sample, return_timestamps=True)

# Only the transcribed text is shown.
print(result["text"])

Device set to use cuda:0
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of Up Guards and Adam paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth, and Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampooer in a Turkish bath next man


In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# NOTE(review): this cell rebinds `device`, `torch_dtype`, `model_id`, `model`,
# `processor`, `pipe`, `dataset`, `sample` and `result`, which earlier cells
# in the notebook also use.

# GPU with half precision when CUDA is available, otherwise CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
).to(device)

processor = AutoProcessor.from_pretrained(model_id)

# This pipeline is configured with 30-second chunks and a batch size of 16.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=16,  # batch size for inference - set based on your device
    torch_dtype=torch_dtype,
    device=device,
)

# First audio clip of the long-form LibriSpeech validation split.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

result = pipe(sample)
print(result["text"])


Device set to use cuda:0
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Leighton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of Upguards and Adam paintings, and Mason's exquisite idylls are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath, Next man!
