# Finetuning Whisper
Due to limited resources, the entire procedure was performed in Google Colab. The necessary libraries were first installed and imported into the notebook. Following the assignment's Notion guidelines, the pre-trained model was first evaluated using the WER (Word Error Rate) metric, and then the fine-tuning process was started. Once fine-tuning was completed, the model was re-evaluated to assess its performance.

In [None]:
from huggingface_hub import notebook_login
from datasets import load_dataset
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from datasets import Audio
from transformers import WhisperForConditionalGeneration
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments

In [None]:
# Log in to the Hugging Face Hub interactively. The training arguments later in
# this notebook set push_to_hub=True, so a token with write access is expected.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

After this, paste a token with *write* access to your Hugging Face account.

In [None]:
# Fetch the Gujarati ASR dataset from the Hub. The printout in the next cell
# shows train/test/validation splits with 'audio' and 'transcription' columns.
dataset = load_dataset("haideraqeeb/gujrati_asr_16kHz")

In [None]:
# Show the available splits, their columns, and their row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 71058
    })
    test: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 1994
    })
    validation: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 7983
    })
})


Downloading the model and its associated files (feature extractor, tokenizer, and processor).

In [None]:
# Feature extractor published with the openai/whisper-small checkpoint;
# prepare_dataset uses it to turn raw audio arrays into model input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [None]:
# Tokenizer for the same checkpoint, configured for Gujarati transcription.
# It encodes transcriptions into label ids (prepare_dataset) and decodes ids
# back to text for scoring (compute_metrics).
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="gujarati", task="transcribe")

In [None]:
# Processor for the same checkpoint with the same language/task settings.
# The data collator pads batches through its .feature_extractor and .tokenizer.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="gujarati", task="transcribe")

In [None]:
# Load the pre-trained whisper-small weights that will be fine-tuned.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

Inspecting the dataset files.

In [None]:
# Peek at one training example: the audio payload (array and sampling rate)
# together with its transcription.
print(dataset["train"][0])

{'audio': {'path': None, 'array': array([ 0.00146484,  0.00186157,  0.00216675, ..., -0.00079346,
       -0.00088501, -0.00091553]), 'sampling_rate': 16000}, 'transcription': 'order આપવા એટલે પછી એ સાહેબ ને એટલે જે phone આયો તો મેડમ નો એમાં એમને કીધું મેં દેખો મેડમ હું આવી રીતે એક જગ્યા એ મેં કીધું ફસાયો છું એમ'}


In [None]:
# Declare the audio column as 16 kHz audio. The sample printed above already
# reports sampling_rate 16000, so this is presumably a safeguard that keeps
# every clip at the same rate — TODO confirm against the datasets Audio docs.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

Preparing the dataset.

In [None]:
def prepare_dataset(batch):
    """Turn one raw example into model inputs and label ids.

    Reads ``batch["audio"]`` (its ``array`` and ``sampling_rate``) and
    ``batch["transcription"]``, then stores two new entries on the same
    mapping:

    * ``input_features`` - first entry of the feature extractor's output
      for the audio array.
    * ``labels`` - token ids of the transcription, truncated to the model's
      ``max_target_positions``.

    Uses the module-level ``feature_extractor``, ``tokenizer`` and ``model``.
    The mapping is modified in place and also returned.
    """
    clip = batch["audio"]
    label_cap = model.config.max_target_positions

    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    # [0] picks the first entry of the extractor output (presumably the only
    # one, since a single clip is passed in).
    batch["input_features"] = extracted.input_features[0]

    encoded = tokenizer(batch["transcription"], truncation=True, max_length=label_cap)
    batch["labels"] = encoded.input_ids
    return batch

In [None]:
# Run prepare_dataset over the dataset and drop the original columns (names
# taken from the train split), leaving the input_features and labels entries
# that prepare_dataset adds.
# NOTE(review): num_proc=8 asks for 8 worker processes — confirm the runtime
# actually has that many cores available.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"], num_proc=8)

In [None]:
# Pin generation to Gujarati transcription, matching the tokenizer/processor
# settings used above.
model.generation_config.language = "gujarati"
model.generation_config.task = "transcribe"
# Clear any forced decoder ids stored in the checkpoint's generation config —
# presumably so the language/task set above are what drive decoding; TODO confirm.
model.generation_config.forced_decoder_ids = None

Using a data collator, we will pad the prepared examples into batches for the model.

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad a list of prepared examples into a single training batch.

    Audio features and label ids are padded separately: features through
    ``processor.feature_extractor.pad`` and labels through
    ``processor.tokenizer.pad``. Padded label positions are overwritten with
    -100, and a leading ``decoder_start_token_id`` is dropped from the labels
    when every sequence in the batch starts with it.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Token id that is stripped from the front of the labels (see __call__).
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` (each with ``input_features`` and ``labels``).

        Returns the padded feature batch with an extra ``labels`` entry.
        """
        audio_items = [{"input_features": item["input_features"]} for item in features]
        token_items = [{"input_ids": item["labels"]} for item in features]

        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")
        padded_labels = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Every position the attention mask does not mark with 1 is padding;
        # overwrite those label ids with -100.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the first column only when *all* rows begin with the decoder
        # start token; a mixed batch is left untouched.
        first_column = labels[:, 0]
        if torch.all(first_column == self.decoder_start_token_id).cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Build the collator from the shared processor and the model's decoder start
# token id; the collator strips that id from the front of the labels when
# every sequence in a batch begins with it.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [None]:
# Load the WER (Word Error Rate) metric used by compute_metrics.
metric = evaluate.load("wer")

Creating the evaluation metric function.

In [None]:
def compute_metrics(pred):
    """Score predictions against references with word error rate.

    ``pred`` must expose ``predictions`` and ``label_ids``. Both are decoded
    to text with the module-level ``tokenizer`` (special tokens skipped) and
    compared with the module-level ``metric``.

    Note that ``pred.label_ids`` is modified in place: entries equal to -100
    (the value the collator in this file writes over padded label positions)
    are replaced by the tokenizer's pad token id before decoding.

    Returns a dict with a single ``"wer"`` key holding the metric times 100.
    """
    hypotheses = pred.predictions
    references = pred.label_ids

    # -100 is not a real token id, so swap in the pad id before decoding.
    ignored = references == -100
    references[ignored] = tokenizer.pad_token_id

    hypothesis_text = tokenizer.batch_decode(hypotheses, skip_special_tokens=True)
    reference_text = tokenizer.batch_decode(references, skip_special_tokens=True)

    score = metric.compute(predictions=hypothesis_text, references=reference_text)
    return {"wer": 100 * score}

Creating the hyperparameters for the training of the model.

In [None]:
# Hyperparameters for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # Checkpoints and logs are written here.
    output_dir="./whisper-gujarati-finetuned",
    # 16 examples per device per optimizer step (no gradient accumulation).
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=500,
    # The run length is fixed in optimizer steps rather than epochs.
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers
    # releases. NOTE(review): needs a transformers version that knows
    # `eval_strategy` — confirm against the installed release.
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    # Evaluate by generating transcriptions so compute_metrics can decode
    # them and compute WER.
    predict_with_generate=True,
    generation_max_length=225,
    # Save and evaluate on the same 1000-step schedule; log every 25 steps.
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    # Keep the checkpoint with the lowest "wer" (lower is better).
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)



Creating the finetuning trainer.

In [None]:
# Wire the model, the train/validation splits, the padding collator and the
# WER metric into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `processing_class` replaces the deprecated `tokenizer` keyword of the
    # trainer constructor; the same feature extractor object is passed, so
    # what the trainer receives is unchanged.
    # NOTE(review): needs a transformers release that accepts
    # `processing_class` — confirm against the installed version.
    processing_class=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning with the schedule, evaluation and checkpoint settings
# defined in training_args.
trainer.train()

Saving the model to the Hugging Face Hub.

In [None]:
# Export the fine-tuned model together with its tokenizer and feature
# extractor into one local directory.
save_dir = "/content/whisper-small-gujarati"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
feature_extractor.save_pretrained(save_dir)
# NOTE(review): the processor is written to training_args.output_dir
# ("./whisper-gujarati-finetuned"), not to save_dir — confirm that writing to
# a different directory from the files above is intentional.
processor.save_pretrained(training_args.output_dir)