
# Fine‑tune Whisper Large‑v3 Turbo on Urdu Common Voice 17.0  

End-to-end fine-tuning of Whisper-v3-turbo on Urdu with TensorBoard experiment tracking.


## 0. Install dependencies

In [1]:
%%capture
!pip install torch torchaudio "transformers==4.52.2" accelerate "datasets==3.4.1" evaluate peft librosa bitsandbytes \
             huggingface_hub tensorboard jiwer 

In [2]:
import os
import random
import re
import unicodedata
import numpy as np
import torch
import librosa

from dataclasses import dataclass
from huggingface_hub import login, HfApi
from datasets import load_dataset, load_from_disk, Audio
from transformers import (
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    DefaultFlowCallback,
)
from transformers.integrations import TensorBoardCallback
from peft import LoraConfig, get_peft_model
from evaluate import load as load_metric

  from .autonotebook import tqdm as notebook_tqdm


## 1. Configuration & Reproducibility

In [4]:
SEED         = 420
HF_TOKEN     = os.environ.get("HF_TOKEN")           # your HF token
HF_USERNAME  = "kingabzpro"                         # replace as needed
MODEL_ID     = "openai/whisper-large-v3"
LANG_ID      = "ur"
SAMPLING_RATE= 16_000
CACHE_DIR    = "./cached_cv_urdu"
PUSH_MODEL_ID = f"{HF_USERNAME}/whisper-large-v3-urdu"

assert HF_TOKEN, "Please set HF_TOKEN env var to your Hugging Face token"

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fa339af69b0>

## 2. Helpers: text & audio cleaning

In [5]:
def normalize_urdu(text: str) -> str:
    text = unicodedata.normalize("NFC", text)
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[“”«»„”…—–\[\]\(\)]', '', text)
    digits_map = str.maketrans("۰۱۲۳۴۵۶۷۸۹", "0123456789")
    return text.translate(digits_map)



## 3. Login & Load Model + Processor

In [6]:
login(HF_TOKEN)

processor = AutoProcessor.from_pretrained(
    MODEL_ID, language=LANG_ID, task="transcribe"
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID, ignore_mismatched_sizes=True
)
model.config.use_cache = False
model.generation_config.language = "ur"
model.generation_config.task = "transcribe"


## 4. Preprocess + Cache Dataset

In [7]:
def prepare_example(batch):
    # batch["audio"] is a list of dictionaries because batched=True.
    # We create a list of all the audio arrays from the batch.
    audio_arrays = [x["array"] for x in batch["audio"]]
    
    # We can assume the sampling rate is the same for all items in the batch.
    sampling_rate = batch["audio"][0]["sampling_rate"]

    input_feats = processor.feature_extractor(
        audio_arrays, sampling_rate=sampling_rate, return_tensors="np"
    ).input_features

    cleaned = [normalize_urdu(t) for t in batch["sentence"]]
    labels  = processor.tokenizer(
        cleaned,
        truncation=True,
        max_length=model.config.max_length, # Uncomment if 'model' is defined
        padding="max_length",
        return_tensors="np"
    ).input_ids

    # The function should return a dictionary
    return {"input_features": input_feats, "labels": labels}


In [8]:
from datasets import load_dataset, load_from_disk, DatasetDict, logging
logging.set_verbosity_error()

if not os.path.isdir(CACHE_DIR):
    # 1. load both splits into a DatasetDict
    dataset = load_dataset(
        "mozilla-foundation/common_voice_17_0",
        LANG_ID,
        split={"train": "train+validation", "validation": "test[:600]"},
        cache_dir="./hf_cache",
        trust_remote_code=True
    )
    # 2. drop unwanted columns
    dataset = dataset.remove_columns(
        [col for col in dataset["train"].column_names if col not in ("audio","sentence")]
    )
    print(dataset)
    
    # 3. 
    # Cast audio & preprocess
    dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
    dataset = dataset.map(
        prepare_example,
        remove_columns=dataset["train"].column_names,
        desc="Pre‑processing",
        batched=True,
        batch_size=125,
        load_from_cache_file=True,
    )

    # 4. save all splits
    DatasetDict(dataset).save_to_disk(CACHE_DIR)

# 5. load and set torch format
dataset = load_from_disk(CACHE_DIR)
dataset.set_format(type="torch", columns=["input_features","labels"])


## 5. Data Collator

In [9]:
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # 1. Gather raw audio features
        input_feats = [feat["input_features"] for feat in features]
        # 2. Pad them (this returns both 'input_features' and 'attention_mask')
        batch_inputs = self.processor.feature_extractor.pad(
            {"input_features": input_feats},
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        # 3. Gather label sequences
        label_ids = [feat["labels"] for feat in features]
        # 4. Pad them (this returns 'input_ids' and its 'attention_mask')
        label_batch = self.processor.tokenizer.pad(
            {"input_ids": label_ids},
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        # 5. Replace pad token ids in labels with -100 so they're ignored in loss
        labels = label_batch["input_ids"].masked_fill(
            label_batch["attention_mask"].ne(1), -100
        )

        # 6. If a bos token was prepended earlier, drop it here
        if labels.size(1) > 0 and torch.all(labels[:, 0] == self.decoder_start_token_id):
            labels = labels[:, 1:]

        # 7. Package everything up
        batch_inputs["labels"] = labels
        # and give the decoder its own attention mask
        batch_inputs["decoder_attention_mask"] = label_batch["attention_mask"]

        return batch_inputs
        
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

## 6. Metric Definition

In [10]:
wer_metric = load_metric("wer")
cer_metric = load_metric("cer")

def compute_metrics(eval_pred):
    pred_ids  = eval_pred.predictions            # already (batch, seq_len)
    label_ids = eval_pred.label_ids

    # Replace -100 so we can decode the references
    label_ids = np.where(
        label_ids != -100,
        label_ids,
        processor.tokenizer.pad_token_id,
    )

    pred_str  = processor.batch_decode(pred_ids,  skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)*100
    cer = cer_metric.compute(predictions=pred_str, references=label_str) * 100
    return {"wer": wer, "cer": cer}

Downloading builder script: 6.61kB [00:00, 5.90MB/s]


## 7. Full Training

In [11]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir               = PUSH_MODEL_ID,

    # --- Core Performance Optimizations ---
    per_device_train_batch_size   = 8,
    gradient_accumulation_steps   = 2,
    per_device_eval_batch_size    = 4,
    bf16                          = True,
    fp16                          = False,
    gradient_checkpointing        = False,

    # --- Learning Schedule ---
    learning_rate            = 3e-5,
    warmup_steps             = 100,
    max_steps                = 1500,
    lr_scheduler_type        = "cosine",

    # --- Logging and Saving ---
    eval_strategy            = "steps",
    eval_steps               = 300,
    logging_steps            = 100,

    # Turn off automatic checkpointing:
    save_strategy            = "no",        # ← disable all intermediate saves
    save_steps               = None,        # ← ignored when save_strategy="no"
    save_total_limit         = None,

    # We’ll load the final model manually if you like, so disable this:
    load_best_model_at_end   = False,

    # --- Generation & Hub Push ---
    predict_with_generate    = True,
    generation_max_length    = 225,

    report_to                = ["tensorboard"],

    push_to_hub              = True,
    hub_private_repo         = False,
    hub_strategy             = "end",       # ← only push once after training
)


In [12]:
trainer = Seq2SeqTrainer(
    model           = model,
    args            = training_args,
    train_dataset   = dataset["train"],
    eval_dataset    = dataset["validation"],
    data_collator   = data_collator,
    compute_metrics = compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,Wer,Cer
300,0.0261,0.025424,30.022405,10.364577
600,0.0211,0.022594,25.85885,8.578023
900,0.0121,0.02064,24.215833,7.941189
1200,0.0093,0.01948,21.303211,7.201778
1500,0.0043,0.020351,21.471247,7.197504


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.


TrainOutput(global_step=1500, training_loss=0.01930886177221934, metrics={'train_runtime': 2256.3553, 'train_samples_per_second': 10.637, 'train_steps_per_second': 0.665, 'total_flos': 8.14380344082432e+19, 'train_loss': 0.01930886177221934, 'epoch': 2.542832909245123})

In [13]:
!nvidia-smi

Fri Jul 11 13:14:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.163.01             Driver Version: 550.163.01     CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          Off |   00000000:DB:00.0 Off |                    0 |
| N/A   28C    P0            116W /  700W |   65448MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

## 8. Save & Push to Hugging Face Hub

In [14]:
trainer.push_to_hub()
processor.save_pretrained(PUSH_MODEL_ID)
processor.push_to_hub(PUSH_MODEL_ID)




CommitInfo(commit_url='https://huggingface.co/kingabzpro/whisper-large-v3-urdu/commit/369eeb93b84a69e864684cff06637cded0d61a08', commit_message='Upload processor', commit_description='', oid='369eeb93b84a69e864684cff06637cded0d61a08', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kingabzpro/whisper-large-v3-urdu', endpoint='https://huggingface.co', repo_type='model', repo_id='kingabzpro/whisper-large-v3-urdu'), pr_revision=None, pr_num=None)

In [15]:
# grab your single example
feat = dataset["validation"][30]["input_features"]

# turn it into a tensor and unsqueeze
input_tensor = torch.tensor(feat).unsqueeze(0)

# move to device AND cast to the model’s dtype
model_dtype = next(model.parameters()).dtype
input_tensor = input_tensor.to(device=model.device, dtype=model_dtype)

# now generate
pred_ids = model.generate(input_tensor)[0]
print("Prediction:", processor.decode(pred_ids, skip_special_tokens=True))

  input_tensor = torch.tensor(feat).unsqueeze(0)
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Prediction: ہاروی وائنسٹن کے خلاف دوسری خاتون بھی جیری کے سامنے پیش
