
# Fine‑tune Whisper Large‑v3 Turbo on Urdu Common Voice 17.0  

This notebook follows the official [Hugging Face Whisper fine‑tuning guide](https://huggingface.co/blog/fine-tune-whisper) and adapts it to Urdu.  
We will:  

1. Install & import dependencies  
2. Load the Urdu subset of *Common Voice 17.0*  
3. Pre‑process audio & text with `WhisperProcessor`  
4. Fine‑tune **`openai/whisper-large-v3-turbo`** with `Seq2SeqTrainer`  
5. Track metrics & checkpoints on the Hub (push_to_hub)  
6. Run a quick inference demo  


In [1]:
!pip install -q transformers accelerate evaluate jiwer huggingface_hub soundfile librosa tensorboardX

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
# ----------------------------------
# Configuration – edit as you like
# ----------------------------------
MODEL_ID = "openai/whisper-large-v3-turbo"
HF_USERNAME = "kingabzpro"  #  <-- CHANGE
PUSH_MODEL_ID = f"{HF_USERNAME}/whisper-large-v3-turbo-urdu"
LANG_ID = "ur"
MAX_AUDIO_SEC = 30
SAMPLING_RATE = 16_000
SEED = 42


In [6]:
# Load Common Voice 17.0 Urdu subset
dataset = load_dataset(
    "mozilla-foundation/common_voice_17_0",
    LANG_ID,
    split={"train": "train", "test": "test[:400]"},
    trust_remote_code=True,
)

# Remove imports with missing audio
dataset = dataset.remove_columns(
    [col for col in dataset["train"].column_names if col not in ("audio", "sentence")]
)

print(dataset)


ur_other_2.tar:   6%|5         | 73.4M/1.31G [00:00<?, ?B/s]

ur_other_3.tar:   0%|          | 0.00/465M [00:00<?, ?B/s]

ur_invalidated_0.tar:   0%|          | 0.00/177M [00:00<?, ?B/s]

ur_validated_0.tar:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

ur_validated_1.tar:   0%|          | 0.00/323M [00:00<?, ?B/s]

train.tsv:   0%|          | 0.00/1.75M [00:00<?, ?B/s]

dev.tsv:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

test.tsv:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

other.tsv:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

invalidated.tsv:   0%|          | 0.00/2.30M [00:00<?, ?B/s]

validated.tsv:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 5368it [00:00, 85594.79it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 4057it [00:00, 232358.24it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 4056it [00:00, 233737.23it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 21724it [00:00, 217228.30it/s][A
Reading metadata...: 43447it [00:00, 209169.73it/s][A
Reading metadata...: 64385it [00:00, 205408.65it/s][A
Reading metadata...: 84938it [00:00, 203027.84it/s][A
Reading metadata...: 105247it [00:00, 200343.13it/s][A
Reading metadata...: 135861it [00:00, 202066.81it/s][A


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 6818it [00:00, 227808.21it/s]


Generating validated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 23152it [00:00, 231509.18it/s][A
Reading metadata...: 53858it [00:00, 216928.15it/s][A


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 200
    })
})


In [7]:
processor = AutoProcessor.from_pretrained(MODEL_ID, language="Urdu", task="transcribe")


def remove_special_chars(text):
    text = text.lower()
    text = re.sub(r"[^\w\s\u0600-\u06FF]+", " ", text)  # keep Urdu chars & digits
    return re.sub(r"\s+", " ", text).strip()


def prepare_example(batch):
    # Pre‑process text
    batch["sentence"] = remove_special_chars(batch["sentence"])
    # Resample & load audio to 16 kHz
    audio = batch["audio"]
    batch["input_features"] = processor.feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    # Tokenise labels
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch


# Cast audio & preprocess
dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
dataset = dataset.map(
    prepare_example,
    remove_columns=dataset["train"].column_names,
    desc="Pre‑processing",
)

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Pre‑processing:   0%|          | 0/2000 [00:00<?, ? examples/s]

Pre‑processing:   0%|          | 0/200 [00:00<?, ? examples/s]

In [35]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

wer_metric = load_metric("wer")


def compute_metrics(eval_pred):
    pred_ids = eval_pred.predictions  # already (batch, seq_len)
    label_ids = eval_pred.label_ids

    # Replace -100 so we can decode the references
    label_ids = np.where(
        label_ids != -100,
        label_ids,
        processor.tokenizer.pad_token_id,
    )

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


In [38]:
torch.manual_seed(SEED)

training_args = Seq2SeqTrainingArguments(
    output_dir=PUSH_MODEL_ID,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    max_steps=2_000,  # 🔧 adapt or set num_train_epochs
    learning_rate=2e-4,
    warmup_steps=200,
    fp16=True,
    fp16_full_eval=False,
    half_precision_backend="auto",
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    eval_steps=500,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    predict_with_generate=True,
    generation_max_length=225,
    metric_for_best_model="wer",
    greater_is_better=False,
    report_to=["tensorboard"],  # or "wandb" if you prefer
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_model_id=PUSH_MODEL_ID,
    hub_private_repo=False,
    hub_strategy="every_save",  # push checkpoints & metrics
)


In [39]:
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID,
)
model.config.forced_decoder_ids = None  # disable lang‑id tokens


In [40]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [42]:
trainer.train()

Step,Training Loss,Validation Loss,Wer
500,0.3122,0.444916,0.335977
1000,0.1068,0.479238,0.323513
1500,0.0323,0.519182,0.351275
2000,0.0039,0.577517,0.501983


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=2000, training_loss=0.15848407854139804, metrics={'train_runtime': 3452.0178, 'train_samples_per_second': 4.635, 'train_steps_per_second': 0.579, 'total_flos': 2.727921844224e+19, 'train_loss': 0.15848407854139804, 'epoch': 8.0})

In [43]:
# Push the final artefacts
trainer.push_to_hub()

Uploading...:   0%|          | 0.00/3.24G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kingabzpro/whisper-large-v3-turbo-urdu/commit/bbcb1faaa069a21eb172a383c39d3443be7ea2f5', commit_message='End of training', commit_description='', oid='bbcb1faaa069a21eb172a383c39d3443be7ea2f5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kingabzpro/whisper-large-v3-turbo-urdu', endpoint='https://huggingface.co', repo_type='model', repo_id='kingabzpro/whisper-large-v3-turbo-urdu'), pr_revision=None, pr_num=None)

Prediction: بے ذوق نہیں اگر چک فترت
