In [None]:
!pip install --upgrade pip
!pip install --upgrade "datasets[audio]" transformers accelerate evaluate jiwer tensorboard gradio


In [None]:
# Log in to the Hugging Face Hub: generate an access token in your HF account and
# enter it at the prompt, so that the fine-tuned model can be saved to the Hub.
from huggingface_hub import notebook_login

notebook_login()

Run the next two cells only when working on Google Colab.

In [None]:
# One-time setup: download the "dataset" folder from OneDrive and put it in your Google Drive.
# The dataset is then read from Drive, because downloading it from the HF Hub is slow on Colab.
from google.colab import drive
drive.mount('/content/drive')  # Mount Google Drive

In [None]:
from datasets import load_dataset, DatasetDict

# USE THIS ON COLAB: read the "train" split from the dataset folder on the mounted
# Google Drive and wrap it in a DatasetDict under the "train" key.
my_dataset = DatasetDict(
    {"train": load_dataset("/content/drive/My Drive/dataset", split='train')}
)
print(my_dataset)

Run this cell instead when working on your own machine.

In [None]:
from datasets import load_dataset, DatasetDict

# USE THIS ON YOUR MACHINE: load the "train" split of the "itskavya/gp" dataset
# and wrap it in a DatasetDict under the "train" key.
my_dataset = DatasetDict(
    {"train": load_dataset("itskavya/gp", split='train')}
)
print(my_dataset)


In [None]:
# List the column names of the train split and display the name of the second one.
# (The cells below refer to the text column directly as "transcription".)
column_names = my_dataset["train"].column_names
second_column = column_names[1]  # index 1 = second column
second_column

In [None]:
from datasets import Dataset

# Assuming your dataset is already loaded into 'my_dataset'
# Here's a way to filter out rows containing '[موسيقى]' in 'transcription'

def filter_dataset(dataset, column='transcription', marker='[موسيقى]'):
    """Return ``dataset`` without the rows whose ``column`` text contains ``marker``.

    Args:
        dataset: Object supporting ``dataset[column]`` (the values of one column)
            and ``dataset.select(indices)``.
        column: Name of the text column to inspect.
        marker: Substring that identifies the rows to drop. The default is the
            bracketed Arabic "[music]" tag.

    Returns:
        The result of ``dataset.select`` called with the indices of the kept rows.
    """
    # Read the column a single time. Indexing `dataset[column]` inside the loop
    # asks the dataset for the column again on every iteration, which can mean
    # re-reading all values for each row.
    texts = dataset[column]
    kept_rows = [index for index, text in enumerate(texts) if marker not in text]
    return dataset.select(kept_rows)

# Drop the rows whose transcription contains '[موسيقى]'
filtered_dataset = filter_dataset(my_dataset['train'])

# Print the filtered dataset to verify the result
print(filtered_dataset)

# Replace the train split in the DatasetDict with the filtered dataset
my_dataset['train'] = filtered_dataset

# 'my_dataset' now holds the filtered train split
print(my_dataset)


Listen to a few audio samples

In [None]:
import random

# Pick a random row index (randint includes both end points, hence the -1)
# and print that row of the train split.
rand_int = random.randint(0, len(my_dataset['train'])-1)
print(my_dataset['train'][rand_int])

In [None]:
import IPython.display as ipd

# Look the example up once and reuse it: each `my_dataset['train'][rand_int]`
# access builds the row again, so three separate look-ups repeat the same work.
sample = my_dataset['train'][rand_int]
print(sample["transcription"])
# The audio widget must stay the last expression so the notebook renders it.
ipd.Audio(data=sample["audio"]["array"], autoplay=True, rate=sample["audio"]["sampling_rate"])


In [None]:
# Hold out 20% of the examples for testing.
# A fixed seed makes the split identical after a kernel restart, so evaluation
# results from different runs are measured on the same test examples.
split_dataset = my_dataset["train"].train_test_split(test_size=0.2, seed=42)
split_dataset

In [None]:
# Store both parts of the split in the DatasetDict under "train" and "test".
my_dataset['train'] = split_dataset['train']
my_dataset['test'] = split_dataset['test']
my_dataset

In [None]:
# Whisper expects log-Mel input; the Whisper feature extractor computes it automatically
# and also performs padding and truncation.
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium")

In [10]:

# Load the Whisper tokenizer, which maps the token indices predicted by the model to text.
# It is configured for Arabic transcription.
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium", language="Arabic", task="transcribe")


In [None]:
# Round-trip check: tokenize the first training transcription, decode it again with and
# without special tokens, and compare the plain decoding with the original text.
input_str = my_dataset["train"][0]["transcription"]
labels = tokenizer(input_str).input_ids # the tokenizer output holds input ids and an attention mask; only the input ids are needed
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


In [12]:
# The processor combines the tokenizer and the feature extractor into a single object.
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-medium", language="Arabic", task="transcribe")


In [None]:
# Inspect the first training example before the audio column is cast to 16 kHz.
print(my_dataset["train"][0])

In [14]:
# The audio must be resampled to match Whisper's sampling rate (16 kHz).
# Casting the column makes the resampling happen on the fly when the audio is loaded.
from datasets import Audio

my_dataset = my_dataset.cast_column("audio", Audio(sampling_rate=16000))


In [None]:
# Inspect the first training example again, now that the audio column is cast to 16 kHz.
print(my_dataset["train"][0])

In [16]:
def prepare_dataset(batch):
    """Add model inputs and target label ids to one dataset example.

    Reads ``batch["audio"]`` and ``batch["transcription"]``, writes
    ``batch["input_features"]`` and ``batch["labels"]``, and returns ``batch``.
    Uses the notebook-level ``feature_extractor`` and ``tokenizer``.
    """
    # accessing the audio entry loads the clip (resampled to 16kHz by the column cast)
    clip = batch["audio"]

    # log-Mel features; the extractor returns a batch, so keep its single entry
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # target text -> label ids
    encoded = tokenizer(batch["transcription"])
    batch["labels"] = encoded.input_ids
    return batch


In [None]:
# Apply prepare_dataset to every example and drop the original columns of the train split,
# leaving only what prepare_dataset adds.
# num_proc=4 runs the mapping in 4 processes to make it faster; remove it if it causes an error.
my_dataset = my_dataset.map(prepare_dataset, remove_columns=my_dataset.column_names["train"], num_proc=4)


In [None]:
# Load the pretrained Whisper-medium checkpoint that will be fine-tuned.
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")


In [None]:
# The tokenized transcript can't be longer than the decoder's position limit (448 tokens).
# Read that limit from `max_target_positions`, the architecture field of the Whisper config;
# `config.max_length` is a text-generation setting and is not guaranteed to hold this value.
max_label_length = model.config.max_target_positions
def is_labels_in_length_range(labels):
    """Return True when the label id sequence is shorter than ``max_label_length``."""
    return len(labels) < max_label_length

# Keep only the examples whose labels fit; only the "labels" column is passed to the predicate.
my_dataset = my_dataset.filter(is_labels_in_length_range, num_proc=4, input_columns=["labels"])
print(my_dataset)

In [19]:
# Set the language and task used when generating transcriptions.
model.generation_config.language = "Arabic"
model.generation_config.task = "transcribe"

# Clear the legacy forced_decoder_ids setting; the language/task config above is used instead.
model.generation_config.forced_decoder_ids = None


In [20]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass  # generates __init__ from the annotated fields below
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch of tensors.

    Each example must provide ``"input_features"`` and ``"labels"``. Audio
    features and label ids are padded separately (by the processor's feature
    extractor and tokenizer respectively) because they have different lengths
    and need different padding methods.
    """
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # separate the audio inputs from the tokenized transcripts
        audio_inputs = []
        token_inputs = []
        for example in features:
            audio_inputs.append({"input_features": example["input_features"]})
            token_inputs.append({"input_ids": example["labels"]})

        # pad each group with its own component and get torch tensors back
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")
        padded_tokens = self.processor.tokenizer.pad(token_inputs, return_tensors="pt")

        # positions where the attention mask is not 1 are padding; set them to -100
        # so the loss function ignores them
        is_padding = padded_tokens.attention_mask.ne(1)
        labels = padded_tokens["input_ids"].masked_fill(is_padding, -100)

        # when every sequence starts with the decoder start token (added during
        # tokenization), cut it off here since it is appended again later
        starts_with_bos = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


In [21]:
# Build the collator from the processor and the model's decoder start token id.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)


In [22]:
import evaluate

# Load the "wer" metric; it is used by compute_metrics during evaluation.
metric = evaluate.load("wer")


In [23]:
def compute_metrics(pred):
    """Compute the "wer" metric, as a percentage, for one evaluation pass.

    ``pred`` must expose ``predictions`` (predicted token ids) and ``label_ids``
    (reference token ids in which ignored positions are marked with -100).
    Uses the notebook-level ``tokenizer`` and ``metric``.

    Returns:
        A dict ``{"wer": value}`` where value is 100 times the metric result.
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # put the pad token back where the labels were masked with -100 so they can
    # be decoded (this writes into pred.label_ids in place)
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    # decode predictions first, then references, dropping special tokens
    hypotheses = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    score = metric.compute(predictions=hypotheses, references=references)
    return {"wer": 100 * score}


In [None]:
from transformers import Seq2SeqTrainingArguments

# NOTE: `eval_strategy` is the current name of the former `evaluation_strategy` argument.
# The first cell installs the latest transformers, where the old name is no longer accepted.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-medium-informal-arabic",  # change "medium" if a different checkpoint is used
    per_device_train_batch_size=16, # this can be reduced if out of memory
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size; gradients are accumulated before the weights are updated
    learning_rate=1e-5,
    warmup_steps=500, # learning-rate warmup
    max_steps=3000, # train for at most 3000 steps
    gradient_checkpointing=True, # keep a subset of activations in the forward pass and recompute the rest in the backward pass to save memory
    fp16=True, # mixed-precision training with 16 bits instead of 32, for speed and memory
    eval_strategy="steps", # evaluate every eval_steps steps rather than every epoch
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225, # tokens
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False, # because lower wer is better
    push_to_hub=True,
)


In [None]:
from transformers import Seq2SeqTrainer

# Wire together the model, data, collator and metric function.
# `processing_class` is the current name of the Trainer argument formerly called
# `tokenizer`; the old name is deprecated in recent transformers releases, and the
# first cell installs the latest release.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=my_dataset["train"],
    eval_dataset=my_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor.feature_extractor,
)


In [None]:
# Start fine-tuning with the settings from training_args (max_steps=3000, evaluation every 1000 steps).
trainer.train()


In [None]:
# Metadata passed along when pushing the trained model to the Hub.
kwargs = {
    "dataset_tags": "itskavya/gp",
    "dataset": "Informal Arabic",
    "language": ["ar"],
    "model_name": "Whisper Medium Informal Arabic",
    "finetuned_from": "openai/whisper-medium", # change this if a different checkpoint is used
    "tags": ["automatic-speech-recognition", "arabic"],
    "tasks": "automatic-speech-recognition",
}
trainer.push_to_hub(**kwargs)


References:

- https://huggingface.co/blog/fine-tune-whisper
