# Prepare Environment

In [None]:
# Verify that the Colab notebook is connected to a GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Dec  9 20:18:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0              33W /  70W |   4497MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Install the dependencies
!pip install --upgrade --quiet pip
!pip install --upgrade --quiet datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio
# !pip install fsspec==2024.10.0
# !pip install tensorboard==2.17.0

In [None]:
from huggingface_hub import notebook_login

# Link the Colab notebook to the Hugging Face Hub
# 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
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load Dataset

In [None]:
from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load the CORAAL Dataset
coraal = load_dataset("DynamicSuperb/AAVESpeechRecognition_CORAAL", split = "test")

# Convert the CORAAL Dataset into a DataFrame
coraal_df = coraal.to_pandas()

# Perform an 80/10/10 split to create training, validation, and test sets
train_df, temp_df = train_test_split(
    coraal_df, test_size = 0.2, random_state = 11, shuffle = True
)

val_df, test_df = train_test_split(
    temp_df, test_size = 0.5, random_state = 11, shuffle = True
)

# Convert the DataFrames into Datasets
coraal = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "val": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df)
})

# Check the sizes of each set
print(coraal)

DatasetDict({
    train: Dataset({
        features: ['audio', 'file', 'instruction', 'label', '__index_level_0__'],
        num_rows: 640
    })
    val: Dataset({
        features: ['audio', 'file', 'instruction', 'label', '__index_level_0__'],
        num_rows: 80
    })
    test: Dataset({
        features: ['audio', 'file', 'instruction', 'label', '__index_level_0__'],
        num_rows: 80
    })
})


# Prepare Feature Extractor, Tokenizer and Data

In [None]:
from transformers import WhisperFeatureExtractor

# Load the feature extractor from the pre-trained checkpoint with the default values
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

In [None]:
from transformers import WhisperTokenizer

# Load the Whisper tokenizer
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language = "English", task = "transcribe")

In [None]:
from transformers import WhisperProcessor

# Create a Whisper processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language = "English", task = "transcribe")

In [None]:
# Print the first example of the CORAAL dataset
print(coraal["train"][0])

{'audio': {'bytes': b'RIFF\xeaa\x03\x00WAVEfmt \x12\x00\x00\x00\x03\x00\x01\x00\x80>\x00\x00\x00\xfa\x00\x00\x04\x00 \x00\x00\x00fact\x04\x00\x00\x00n\xd8\x00\x00data\xb8a\x03\x00\x004\x00:\x00\xcc#:\x00\xcc*:\x00\xac\x1a:\x00 \x1a:\x00,\x1f:\x00\x9c\x17:\x008\x19:\x00,\x18:\x00\xf0\xf59\x00 \xa99\x00\x80\x929\x00\xd0m9\x00\xe0\xf48\x00\x80\x12\xb8\x00 \n\xb9\x00@j\xb9\x008\x93\xb9\x00\x98\xa5\xb9\x00\xd8\xc9\xb9\x00\x98\xd4\xb9\x00\x88\xff\xb9\x00\x0c\x1b\xba\x00\xa4"\xba\x00\xe4%\xba\x00\x1c\x1f\xba\x00\xfc\n\xba\x00\x8c\x02\xba\x00 \x05\xba\x00\x90\xf9\xb9\x00\x18\xd1\xb9\x008\xc9\xb9\x00\x08\xbb\xb9\x000\xa1\xb9\x00\xb0\x80\xb9\x00\x00\xf8\xb8\x00@\xe0\xb8\x00\xa0\xd4\xb8\x00@ 8\x00\x8029\x00\xd0n9\x00P:9\x00\xd0\x149\x00p?9\x00`x9\x00X\xa19\x00\xf8\xbb9\x00\xc8\xb49\x00\x88\xb49\x00\xa8\xd29\x00 \xea9\x00h\r:\x00\x00 :\x004\x1a:\x00x\x14:\x00x\xe09\x00\xb8\xaf9\x00\x80=9\x00`\xba8\x00\xd0!9\x000\x149\x00\x00U8\x00\x80\xb4\xb7\x000\x0b\xb9\x00\xb8\xae\xb9\x00\xa8\xdb\xb9\x00\xf0\xe

In [None]:
from datasets import Audio

# Downsample the audio to 16kHz
coraal = coraal.cast_column("audio", Audio(sampling_rate = 16000))

In [None]:
# Print the first example of the CORAAL dataset, downsampled
print(coraal["train"][0])

{'audio': {'path': 'utt_238_DCB_se2_ag1_f_01_1.wav', 'array': array([ 0.00048906,  0.00062484,  0.00065154, ..., -0.00034851,
       -0.0003866 , -0.00040179]), 'sampling_rate': 16000}, 'file': 'utt_238_DCB_se2_ag1_f_01_1', 'instruction': 'Convert the given spoken phrase into a transcription. Use uppercase letters only and remove all punctuation.', 'label': 'THEY THEY WERE LIKE I DIDNT EVER HAVE A PROBLEM LIKE I DONT GET IN TROUBLE IN SCHOOL', '__index_level_0__': 153}


In [None]:
# Prepares the parameter batch data to be inputted into the Whisper model.
def prepare_dataset(batch):

    # Load and resample the audio from 48kHz to 16kHz
    audio = batch["audio"]

    # Compute the log-Mel spectrogram input features from the 1-dimensional audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate = audio["sampling_rate"]).input_features[0]

    # Encode the target text to label IDs
    batch["labels"] = tokenizer(batch["label"]).input_ids

    return batch

In [None]:
# Apply prepare_dataset() to the training examples
coraal = coraal.map(prepare_dataset, remove_columns = coraal.column_names["train"], num_proc = 2)

Map (num_proc=2):   0%|          | 0/640 [00:00<?, ? examples/s]

TimeoutError: 

# Training and Evaluation

In [None]:
from transformers import WhisperForConditionalGeneration

# Uncomment only one of the following lines of code

# Load the untrained Whisper tiny model
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

# Load the trained Whisper tiny model
# model = WhisperForConditionalGeneration.from_pretrained("melvinrajendran/whisper-tiny-coraal")

In [None]:
# Configure the model to transcribe to English
model.generation_config.language = "english"
model.generation_config.task = "transcribe"

In [None]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Define a data collator
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split the inputs and labels because they must have different lengths and need different padding methods

        # Treat the audio inputs by returning PyTorch Tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors = "pt")

        # Get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad the labels to the maximum length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors = "pt")

        # Replace the padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If the BOS token is appended in the previous tokenization step, cut it because it is appended later
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [None]:
# Initialize the data collator
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor = processor,
    decoder_start_token_id = model.config.decoder_start_token_id,
)

In [None]:
import evaluate

# Use the Word Error Rate (WER) metric
metric = evaluate.load("wer")

In [None]:
# Computes the WER metrics for the parameter model predictions.
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Replace -100 with the padding token ID
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Do not group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens = True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens = True)

    # Compute the WER
    wer = 100 * metric.compute(predictions = pred_str, references = label_str)

    return {"wer": wer}

In [None]:
from transformers import Seq2SeqTrainingArguments, get_scheduler
from torch.optim import AdamW

# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir = "./whisper-tiny-coraal",
    per_device_train_batch_size = 16,
    gradient_accumulation_steps = 1, # Increase by 2x for every 2x decrease in batch size
    learning_rate = 1e-5,
    warmup_steps = 100, # 500
    max_steps = 1000, # 4000
    gradient_checkpointing = True,
    fp16 = True,
    eval_strategy = "steps",
    per_device_eval_batch_size = 8,
    predict_with_generate = True,
    generation_max_length = 225,
    save_steps = 25,
    eval_steps = 25,
    logging_steps = 25,
    logging_dir = "./logs",
    report_to = ["tensorboard"],
    load_best_model_at_end = True,
    metric_for_best_model = "wer",
    greater_is_better = False,
    push_to_hub = True,
    disable_tqdm = False,
    label_names = [],
)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr = training_args.learning_rate)

# Define the scheduler
lr_scheduler = get_scheduler(
    name = "linear",
    optimizer=optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=training_args.max_steps,
)

In [None]:
from transformers import Seq2SeqTrainer

# Forward the training arguments to Hugging Face
trainer = Seq2SeqTrainer(
    args = training_args,
    model = model,
    train_dataset = coraal["train"],
    eval_dataset = coraal["val"],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    processing_class = processor.feature_extractor,
    optimizers = (optimizer, lr_scheduler)
)

In [None]:
# Save the processor object before training
processor.save_pretrained(training_args.output_dir)

In [None]:
# Evalute the untrained model on the test set
test_results = trainer.evaluate(coraal["test"])

# Print the evaluation results
print(f"Test results: {test_results}")

In [None]:
# Train the Whisper model
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Model Preparation Time,Wer
25,2.0383,1.996065,0.0029,98.265896
50,1.7246,1.707464,0.0029,97.816313


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.proc

KeyboardInterrupt: 

In [None]:
# Define the Hugging Face Hub arguments
kwargs = {
    "dataset_tags": "DynamicSuperb/AAVESpeechRecognition_CORAAL",
    "dataset": "AAVE Speech Recognition CORAAL",
    "dataset_args": "config: hi, split: test",
    "model_name": "Whisper Tiny - CORAAL",
    "finetuned_from": "openai/whisper-tiny",
    "tasks": "automatic-speech-recognition",
}

In [None]:
# Push the training results to Hugging Face Hub
# trainer.push_to_hub(**kwargs)

CommitInfo(commit_url='https://huggingface.co/melvinrajendran/whisper-tiny-coraal/commit/8544e9d751dded2d9a60fc51e1d21b6e2a60b208', commit_message='End of training', commit_description='', oid='8544e9d751dded2d9a60fc51e1d21b6e2a60b208', pr_url=None, repo_url=RepoUrl('https://huggingface.co/melvinrajendran/whisper-tiny-coraal', endpoint='https://huggingface.co', repo_type='model', repo_id='melvinrajendran/whisper-tiny-coraal'), pr_revision=None, pr_num=None)

# Evaluate Performance

In [None]:
# Evaluate the model on the test set
test_results = trainer.evaluate(coraal["test"])

# Print the evaluation results
print(f"Test results: {test_results}")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Test results: {'eval_loss': 1.3805172443389893, 'eval_model_preparation_time': 0.0026, 'eval_wer': 44.77898782831518, 'eval_runtime': 20.9253, 'eval_samples_per_second': 3.823, 'eval_steps_per_second': 0.478}


In [None]:
# Output the prediction vs. ground truth for 10 examples in the test set