<a href="https://colab.research.google.com/github/l-ordkp/CelebSearch/blob/master/fine_tuning_whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import re
v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
!pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
!pip install --no-deps unsloth
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2
!pip install librosa soundfile evaluate jiwer torchcodec "datasets>=3.4.1,<4.0.0"

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting xformers==0.0.33.post1
  Downloading xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.11.4-py3-none-any.whl.metadata (32 kB)
Downloading xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl (122.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/122.9 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.25.1-py3-none-any.whl (465 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import evaluate
import numpy as np
from unsloth import FastLanguageModel
from transformers import WhisperProcessor, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, Audio
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# --- CONFIGURATION ---
MODEL_ID = "openai/whisper-large-v3"
DATASET_ID = "ai4bharat/IndicVoices"
# 🚩 UPDATED FOR MAITHILI 🚩
DATASET_SUBSET = "maithili"
LANGUAGE_CODE = "mai"    # Maithili ISO 639-2 code is 'mai' (used by Hugging Face)
TASK = "transcribe"

# 1. Load the Processor
# This sets the correct language and task tokens for Maithili transcription.
processor = WhisperProcessor.from_pretrained(
    MODEL_ID, language=LANGUAGE_CODE, task=TASK
)

# Fix for AttributeError: Use the EOS token ID for decoder start.
decoder_start_token_id = processor.tokenizer.eos_token_id

print(f"✅ Processor loaded for {LANGUAGE_CODE.upper()} language.")


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

✅ Processor loaded for MAI language.


In [5]:
from google.colab import userdata

In [9]:
from huggingface_hub import notebook_login
# This will open an interactive window where you paste your HF token.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
# 2. Load the IndicVoices Dataset for Maithili
try:
    dataset = load_dataset(
        DATASET_ID,
        DATASET_SUBSET,
        split="train"
    )
    test_dataset = load_dataset(
        DATASET_ID,
        DATASET_SUBSET,
        split="valid"
    )

    # Rename 'text' to 'sentence' and remove unnecessary columns
    dataset = dataset.rename_column("text", "sentence")
    test_dataset = test_dataset.rename_column("text", "sentence")

    columns_to_remove = [col for col in dataset.column_names if col not in ['audio', 'sentence']]
    dataset = dataset.remove_columns(columns_to_remove)
    test_dataset = test_dataset.remove_columns(columns_to_remove)

except Exception as e:
    print(f"FATAL ERROR: Dataset loading failed for {DATASET_ID} ({DATASET_SUBSET}): {e}")
    raise SystemExit(1)

# Cast audio to 16kHz
target_sampling_rate = processor.feature_extractor.sampling_rate
dataset = dataset.cast_column("audio", Audio(sampling_rate=target_sampling_rate))
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=target_sampling_rate))

# 3. Preprocessing Function
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = processor.feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids

    # Filter out audio longer than 30 seconds
    if len(audio["array"]) > target_sampling_rate * 30:
        return None

    return batch

# 4. Apply Preprocessing and Filter
dataset = dataset.map(
    prepare_dataset,
    remove_columns=dataset.column_names,
    num_proc=4,
    fn_kwargs={"processor": processor}
).filter(lambda x: x is not None)

test_dataset = test_dataset.map(
    prepare_dataset,
    remove_columns=test_dataset.column_names,
    num_proc=4,
    fn_kwargs={"processor": processor}
).filter(lambda x: x is not None)

print(f"Training Samples (Filtered): {len(dataset)}")
print(f"Test Samples (Filtered): {len(test_dataset)}")

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



FATAL ERROR: Dataset loading failed for ai4bharat/IndicVoices (maithili): Dataset 'ai4bharat/IndicVoices' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/ai4bharat/IndicVoices to ask for access.
Traceback (most recent call last):
  File "/tmp/ipython-input-1368571358.py", line 3, in <cell line: 0>
    dataset = load_dataset(
              ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/datasets/load.py", line 2062, in load_dataset
    builder_instance = load_dataset_builder(
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/datasets/load.py", line 1782, in load_dataset_builder
    dataset_module = dataset_module_factory(
                     ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/datasets/load.py", line 1652, in dataset_module_factory
    raise e1 from None
  File "/usr/local/lib/python3.12/dist-packages/datasets/load.py", line 1636, in dataset_module_facto

TypeError: object of type 'NoneType' has no len()