
# Fine‑tune Whisper Large‑v3 Turbo on Urdu Common Voice 17.0  

This notebook follows the official [Hugging Face Whisper fine‑tuning guide](https://huggingface.co/blog/fine-tune-whisper) and adapts it to Urdu.  
We will:  

1. Install & import dependencies  
2. Load the Urdu subset of *Common Voice 17.0*  
3. Pre‑process audio & text with `WhisperProcessor`  
4. Fine‑tune **`openai/whisper-large-v3-turbo`** with `Seq2SeqTrainer`  
5. Track metrics & checkpoints on the Hub (push_to_hub)  
6. Run a quick inference demo  


In [1]:
!pip install -q transformers accelerate evaluate jiwer huggingface_hub soundfile librosa tensorboardX

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
from huggingface_hub import login
import os

hf_token = os.environ.get("HF_TOKEN")
login(hf_token)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
import os
import re
import torch
import numpy as np
from datasets import load_dataset, Audio
from transformers import (
    AutoProcessor,                    # replaces WhisperProcessor
    AutoModelForSpeechSeq2Seq,        # replaces WhisperForConditionalGeneration
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from evaluate import load as load_metric

from dataclasses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Unionsses import dataclass
from typing import Any, Dict, List, Union

In [5]:
# ----------------------------------
# Configuration – edit as you like
# ----------------------------------
MODEL_ID = "openai/whisper-large-v3-turbo"
HF_USERNAME = "kingabzpro"  #  <-- CHANGE
PUSH_MODEL_ID = f"{HF_USERNAME}/whisper-large-v3-turbo-urdu"
LANG_ID = "ur"
MAX_AUDIO_SEC = 30
SAMPLING_RATE = 16_000
SEED = 42


In [6]:
# Load Common Voice 17.0 Urdu subset
dataset = load_dataset(
    "mozilla-foundation/common_voice_17_0",
    LANG_ID,
    split={"train": "train", "test": "test[:400]"},
    trust_remote_code=True,
)

# Remove imports with missing audio
dataset = dataset.remove_columns(
    [col for col in dataset["train"].column_names if col not in ("audio", "sentence")]
)

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 5368
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 400
    })
})


In [10]:
processor = AutoProcessor.from_pretrained(MODEL_ID, language="Urdu", task="transcribe")


def remove_special_chars(text):
    text = text.lower()
    text = re.sub(r"[^\w\s\u0600-\u06FF]+", " ", text)  # keep Urdu chars & digits
    return re.sub(r"\s+", " ", text).strip()


def prepare_example(batch):
    # Pre‑process text
    batch["sentence"] = remove_special_chars(batch["sentence"])

    # Resample & load audio to 16 kHz
    audio = batch["audio"]
    batch["input_features"] = processor.feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    # Tokenise labels
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch


# Cast audio & preprocess
dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
dataset = dataset.map(
    prepare_example,
    remove_columns=dataset["train"].column_names,
    desc="Pre‑processing",
)


Pre‑processing:   0%|          | 0/5368 [00:00<?, ? examples/s]

Pre‑processing:   0%|          | 0/400 [00:00<?, ? examples/s]

In [28]:
wer_metric = load_metric("wer")


def compute_metrics(eval_pred):
    pred_ids = eval_pred.predictions  # already (batch, seq_len)
    label_ids = eval_pred.label_ids

    # Replace -100 so we can decode the references
    label_ids = np.where(
        label_ids != -100,
        label_ids,
        processor.tokenizer.pad_token_id,
    )

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


In [29]:
torch.manual_seed(SEED)

training_args = Seq2SeqTrainingArguments(
    output_dir=PUSH_MODEL_ID,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    max_steps=2_000,  # 🔧 adapt or set num_train_epochs
    learning_rate=2e-5,
    warmup_steps=200,
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=2,
    eval_steps=500,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    predict_with_generate=True,
    generation_max_length=225,
    metric_for_best_model="wer",
    greater_is_better=False,
    report_to=["tensorboard"],  # or "wandb" if you prefer
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_model_id=PUSH_MODEL_ID,
    hub_private_repo=False,
    hub_strategy="every_save",  # push checkpoints & metrics
)


In [13]:
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID,
)
model.config.forced_decoder_ids = None  # disable lang‑id tokens


In [30]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [31]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [33]:
trainer.train()

Step,Training Loss,Validation Loss,Wer
500,0.5561,1.070738,0.662857
1000,0.6472,0.700715,0.508
1500,0.5162,0.545962,0.426286
2000,0.3663,0.462983,0.382571


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=2000, training_loss=0.5836709747314454, metrics={'train_runtime': 1926.8107, 'train_samples_per_second': 2.076, 'train_steps_per_second': 1.038, 'total_flos': 6.81980461056e+18, 'train_loss': 0.5836709747314454, 'epoch': 0.7451564828614009})

In [34]:
!nvidia-smi

Sat Jun 28 19:52:34 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.05             Driver Version: 550.127.05     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:8A:00.0 Off |                    0 |
| N/A   27C    P0             68W /  400W |   25175MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [35]:
# Push the final artefacts
trainer.push_to_hub()

Uploading...:   0%|          | 0.00/3.24G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kingabzpro/whisper-large-v3-turbo-urdu/commit/9f74e922a30cc085570b6b1ccf32ac9ade0e8128', commit_message='End of training', commit_description='', oid='9f74e922a30cc085570b6b1ccf32ac9ade0e8128', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kingabzpro/whisper-large-v3-turbo-urdu', endpoint='https://huggingface.co', repo_type='model', repo_id='kingabzpro/whisper-large-v3-turbo-urdu'), pr_revision=None, pr_num=None)

In [36]:
from datasets import Audio

sample = dataset["test"][0]
input_features = dataset["test"][0]["input_features"]

input_features = torch.tensor(input_features).unsqueeze(0).to(model.device)
pred_ids = model.generate(input_features)[0]
print("Prediction:", processor.decode(pred_ids, skip_special_tokens=True))


Prediction: بیہ زوق نہیں اگرچے فطرت
