
# Fine‑tune Whisper Large‑v3 Turbo on Urdu Common Voice 17.0  

End-to-end fine-tuning of Whisper-v3-turbo on Urdu with TensorBoard experiment tracking.


## 0. Install dependencies

In [1]:
%%capture
!pip install transformers accelerate datasets evaluate peft librosa bitsandbytes \
             huggingface_hub tensorboard jiwer

In [2]:
import os
import random
import re
import unicodedata
import numpy as np
import torch
import librosa

from dataclasses import dataclass
from huggingface_hub import login, HfApi
from datasets import load_dataset, load_from_disk, Audio
from transformers import (
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    DefaultFlowCallback,
)
from transformers.integrations import TensorBoardCallback
from peft import LoraConfig, get_peft_model
from evaluate import load as load_metric

## 1. Configuration & Reproducibility

In [3]:
SEED         = 420
HF_TOKEN     = os.environ.get("HF_TOKEN")           # your HF token
HF_USERNAME  = "kingabzpro"                         # replace as needed
MODEL_ID     = "openai/whisper-large-v3-turbo"
LANG_ID      = "ur"
SAMPLING_RATE= 16_000
CACHE_DIR    = "./cached_cv_urdu"
PUSH_MODEL_ID = f"{HF_USERNAME}/whisper-large-v3-turbo-urdu"

assert HF_TOKEN, "Please set HF_TOKEN env var to your Hugging Face token"

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x79a1d419b030>

## 2. Helpers: text & audio cleaning

In [4]:
def normalize_urdu(text: str) -> str:
    text = unicodedata.normalize("NFC", text)
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[“”«»„”…—–\[\]\(\)]', '', text)
    digits_map = str.maketrans("۰۱۲۳۴۵۶۷۸۹", "0123456789")
    return text.translate(digits_map)



## 3. Login & Load Model + Processor

In [5]:
login(HF_TOKEN)

processor = AutoProcessor.from_pretrained(
    MODEL_ID, language=LANG_ID, task="transcribe"
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID, ignore_mismatched_sizes=True
)
model.config.use_cache = False
model.generation_config.language = "ur"
model.generation_config.task = "transcribe"


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

## 4. Preprocess + Cache Dataset

In [6]:
def prepare_example(batch):
    # batch["audio"] is a list of dictionaries because batched=True.
    # We create a list of all the audio arrays from the batch.
    audio_arrays = [x["array"] for x in batch["audio"]]
    
    # We can assume the sampling rate is the same for all items in the batch.
    sampling_rate = batch["audio"][0]["sampling_rate"]

    input_feats = processor.feature_extractor(
        audio_arrays, sampling_rate=sampling_rate, return_tensors="np"
    ).input_features

    cleaned = [normalize_urdu(t) for t in batch["sentence"]]
    labels  = processor.tokenizer(
        cleaned,
        truncation=True,
        max_length=model.config.max_length, # Uncomment if 'model' is defined
        padding="max_length",
        return_tensors="np"
    ).input_ids

    # The function should return a dictionary
    return {"input_features": input_feats, "labels": labels}


In [7]:
from datasets import load_dataset, load_from_disk, DatasetDict, logging
logging.set_verbosity_error()

if not os.path.isdir(CACHE_DIR):
    # 1. load both splits into a DatasetDict
    dataset = load_dataset(
        "mozilla-foundation/common_voice_17_0",
        LANG_ID,
        split={"train": "train+validation", "validation": "test[:600]"},
        cache_dir="./hf_cache",
        trust_remote_code=True
    )
    # 2. drop unwanted columns
    dataset = dataset.remove_columns(
        [col for col in raw["train"].column_names if col not in ("audio","sentence")]
    )
    print(raw)
    
    # 3. 
    # Cast audio & preprocess
    dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
    dataset = dataset.map(
        prepare_example,
        remove_columns=dataset["train"].column_names,
        desc="Pre‑processing",
        batched=True,
        batch_size=125,
        load_from_cache_file=True,
    )

    # 4. save all splits
    DatasetDict(dataset).save_to_disk(CACHE_DIR)

# 5. load and set torch format
dataset = load_from_disk(CACHE_DIR)
dataset.set_format(type="torch", columns=["input_features","labels"])


Loading dataset from disk:   0%|          | 0/29 [00:00<?, ?it/s]

## 5. Data Collator

In [8]:
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # 1. Gather raw audio features
        input_feats = [feat["input_features"] for feat in features]
        # 2. Pad them (this returns both 'input_features' and 'attention_mask')
        batch_inputs = self.processor.feature_extractor.pad(
            {"input_features": input_feats},
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        # 3. Gather label sequences
        label_ids = [feat["labels"] for feat in features]
        # 4. Pad them (this returns 'input_ids' and its 'attention_mask')
        label_batch = self.processor.tokenizer.pad(
            {"input_ids": label_ids},
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        # 5. Replace pad token ids in labels with -100 so they're ignored in loss
        labels = label_batch["input_ids"].masked_fill(
            label_batch["attention_mask"].ne(1), -100
        )

        # 6. If a bos token was prepended earlier, drop it here
        if labels.size(1) > 0 and torch.all(labels[:, 0] == self.decoder_start_token_id):
            labels = labels[:, 1:]

        # 7. Package everything up
        batch_inputs["labels"] = labels
        # and give the decoder its own attention mask
        batch_inputs["decoder_attention_mask"] = label_batch["attention_mask"]

        return batch_inputs
        
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

## 6. Metric Definition

In [9]:
wer_metric = load_metric("wer")

def compute_metrics(eval_pred):
    pred_ids  = eval_pred.predictions            # already (batch, seq_len)
    label_ids = eval_pred.label_ids

    # Replace -100 so we can decode the references
    label_ids = np.where(
        label_ids != -100,
        label_ids,
        processor.tokenizer.pad_token_id,
    )

    pred_str  = processor.batch_decode(pred_ids,  skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)*100
    return {"wer": wer}

Downloading builder script: 0.00B [00:00, ?B/s]

## 7. Full Training

In [10]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir               = PUSH_MODEL_ID,

    # --- Core Performance Optimizations ---
    per_device_train_batch_size   = 8,
    gradient_accumulation_steps   = 1,
    per_device_eval_batch_size    = 4,
    bf16                          = True,
    fp16                          = False,
    gradient_checkpointing        = False,

    # --- Learning Schedule ---
    learning_rate            = 2e-5,
    warmup_steps             = 100,
    max_steps                = 1500,
    lr_scheduler_type        = "cosine",

    # --- Logging and Saving ---
    eval_strategy            = "steps",
    eval_steps               = 300,
    logging_steps            = 100,

    # Turn off automatic checkpointing:
    save_strategy            = "no",        # ← disable all intermediate saves
    save_steps               = None,        # ← ignored when save_strategy="no"
    save_total_limit         = None,

    # We’ll load the final model manually if you like, so disable this:
    load_best_model_at_end   = False,

    # --- Generation & Hub Push ---
    predict_with_generate    = True,
    generation_max_length    = 225,

    report_to                = ["tensorboard"],

    push_to_hub              = True,
    hub_private_repo         = False,
    hub_strategy             = "end",       # ← only push once after training
)


In [12]:
trainer = Seq2SeqTrainer(
    model           = model,
    args            = training_args,
    train_dataset   = dataset["train"],
    eval_dataset    = dataset["validation"],
    data_collator   = data_collator,
    compute_metrics = compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,Wer
300,0.6764,0.624419,44.977595
600,0.5881,0.508855,37.621359
900,0.4662,0.434867,32.132188
1200,0.3661,0.363361,26.568335
1500,0.2293,0.353437,25.784167


TrainOutput(global_step=1500, training_loss=0.4755880928039551, metrics={'train_runtime': 2339.7763, 'train_samples_per_second': 5.129, 'train_steps_per_second': 0.641, 'total_flos': 2.044747917361152e+19, 'train_loss': 0.4755880928039551, 'epoch': 1.272264631043257})

In [13]:
!nvidia-smi

Sat Jul  5 13:09:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.20             Driver Version: 570.133.20     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:07:00.0 Off |                    0 |
| N/A   35C    P0             64W /  400W |   33809MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

## 8. Save & Push to Hugging Face Hub

In [20]:
trainer.push_to_hub()
processor.save_pretrained(PUSH_MODEL_ID)
processor.push_to_hub(PUSH_MODEL_ID)


Uploading...:   0%|          | 0.00/3.24G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kingabzpro/whisper-large-v3-turbo-urdu/commit/bef7b66cc8393e9330613c53c60e656ac62d8fee', commit_message='Upload processor', commit_description='', oid='bef7b66cc8393e9330613c53c60e656ac62d8fee', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kingabzpro/whisper-large-v3-turbo-urdu', endpoint='https://huggingface.co', repo_type='model', repo_id='kingabzpro/whisper-large-v3-turbo-urdu'), pr_revision=None, pr_num=None)

In [26]:
# grab your single example
feat = dataset["validation"][30]["input_features"]

# turn it into a tensor and unsqueeze
input_tensor = torch.tensor(feat).unsqueeze(0)

# move to device AND cast to the model’s dtype
model_dtype = next(model.parameters()).dtype
input_tensor = input_tensor.to(device=model.device, dtype=model_dtype)

# now generate
pred_ids = model.generate(input_tensor)[0]
print("Prediction:", processor.decode(pred_ids, skip_special_tokens=True))

  input_tensor = torch.tensor(feat).unsqueeze(0)


Prediction: ہاروی وائنسٹن کے خلاف دوسری خاتون بھی جیری کے سامنے پیش


## 9. Loading and Testing

In [22]:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
import torch, warnings, os

device = "cuda:0"
torch_dtype = torch.float16
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
).to(device)
model.config.use_cache = False
model.generation_config.language = "ur"
model.generation_config.task = "transcribe"

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

ds = load_dataset(
    "mozilla-foundation/common_voice_17_0",
    "ur",
    split="test",
    trust_remote_code=True,
    cache_dir="./hf_cache",
)
audio = ds[100]["audio"]

result = pipe(audio)
print("Original  :", ds[100]["sentence"])
print("Predicted :", result["text"])


Device set to use cuda:0


Original  : اگر عمران خان ٹھیک کر رہے ہیں۔
Predicted : اگر عمران خان ٹھیک کر رہے ہیں۔
