## Initial Setup

In [None]:
# System-level audio dependency: register the jonathonf/ffmpeg-4 PPA,
# refresh the package index, then install ffmpeg from it.
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

Repository: 'deb https://ppa.launchpadcontent.net/jonathonf/ffmpeg-4/ubuntu/ jammy main'
Description:
Backport of FFmpeg 4 and associated libraries. Now includes AOM/AV1 support!

FDK AAC is not compatible with GPL and FFmpeg can't be redistributed with it included. Please don't ask for it to be added to this public PPA.

---

PPA supporters:

BigBlueButton (https://bigbluebutton.org)

---

Donate to FFMPEG: https://ffmpeg.org/donations.html
Donate to Debian: https://www.debian.org/donations
Donate to this PPA: https://ko-fi.com/jonathonf
More info: https://launchpad.net/~jonathonf/+archive/ubuntu/ffmpeg-4
Adding repository.
Adding deb entry to /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-4-jammy.list
Adding disabled deb-src entry to /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-4-jammy.list
Adding key to /etc/apt/trusted.gpg.d/jonathonf-ubuntu-ffmpeg-4.gpg with fingerprint 4AB0F789CBA31744CC7DA76A8CF63AD3F06FC659
Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [1

In [None]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.
ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[0mCollecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ua1525qb
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-ua1525qb
  Resolved https://github.com/huggingface/transformers to commit 653eb40425344b89b5a24e7b07eb3095b04cdc9d
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Bui

Linking the notebook to the Hub is straightforward - it simply requires entering your Hub authentication token when prompted. Find your Hub authentication token [here](https://huggingface.co/settings/tokens):

In [None]:
from huggingface_hub import notebook_login

# Ask for the Hub token interactively; later cells push to the Hub
# (push_to_hub=True in the training arguments and trainer.push_to_hub).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import os

# Restrict CUDA to device index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Checkpoint id used by the feature extractor, tokenizer and processor loaders.
model_name_or_path = "openai/whisper-small"
# Language and task passed to the tokenizer and processor.
language = "English"
language_abbr = "en"
task = "transcribe"
# Hub dataset id loaded in the "Load Dataset" section.
dataset_name = "lord-reso/inbrowser-proctor-dataset"

## Load Dataset

In [None]:
from datasets import load_dataset, DatasetDict

# Fetch the train and validation splits and collect them under their split names.
dataset = DatasetDict(
    {
        split_name: load_dataset(dataset_name, split=split_name)
        for split_name in ("train", "validation")
    }
)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'start_time', 'end_time'],
        num_rows: 6
    })
    validation: Dataset({
        features: ['audio', 'text', 'start_time', 'end_time'],
        num_rows: 5
    })
})


## Prepare Feature Extractor, Tokenizer and Data

In [None]:
from transformers import WhisperFeatureExtractor

# Feature extractor of the base checkpoint; used by prepare_dataset to build input_features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)

In [None]:
from transformers import WhisperTokenizer

# Tokenizer of the base checkpoint, configured with the notebook's language and task;
# used to encode label text and to decode ids in compute_metrics.
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)

In [None]:
from transformers import WhisperProcessor

# Processor of the base checkpoint; the data collator uses its
# .feature_extractor and .tokenizer for padding.
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)

### Prepare Data

In [None]:
# Inspect the first raw training example (the output shows sampling_rate 44100).
print(dataset["train"][0])

{'audio': {'path': 'segment_0.wav', 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00305176,
       -0.00241089, -0.00100708]), 'sampling_rate': 44100}, 'text': 'Search online for the key terms in the WIS and send them to the messenger. Take the answers for none of them and send them back to me. Use chat jpt to find the solutions for the fifth option and send them to me. Can you glance at your notes for the last question and tell me the answer? Send correct answers via Facebook. Use jminite to find solutions. Share the correct equation for the solution to the fourth question in the chat.', 'start_time': '00:00:00.000', 'end_time': '00:00:28.000'}


In [None]:
from datasets import Audio

# Re-declare the audio column at 16 kHz; the example printed after this cell
# reports sampling_rate 16000 instead of the original 44100.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Same example after the cast: the output shows sampling_rate 16000.
print(dataset["train"][0])

{'audio': {'path': 'segment_0.wav', 'array': array([-1.30289561e-06,  1.78798587e-06, -2.44241937e-06, ...,
       -8.57556734e-06, -1.58315583e-03, -2.22158781e-03]), 'sampling_rate': 16000}, 'text': 'Search online for the key terms in the WIS and send them to the messenger. Take the answers for none of them and send them back to me. Use chat jpt to find the solutions for the fifth option and send them to me. Can you glance at your notes for the last question and tell me the answer? Send correct answers via Facebook. Use jminite to find solutions. Share the correct equation for the solution to the fourth question in the chat.', 'start_time': '00:00:00.000', 'end_time': '00:00:28.000'}


In [None]:
def prepare_dataset(batch):
    """Turn one dataset example into model inputs and label ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"``) and
    ``batch["text"]``; writes ``batch["input_features"]`` and
    ``batch["labels"]`` and returns the same ``batch`` object.

    Relies on the notebook-level ``feature_extractor`` and ``tokenizer``.
    No resampling happens here: the sampling rate is whatever the audio
    entry reports.
    """
    sample = batch["audio"]

    # Log-Mel features; the extractor returns a batch, so keep its only item.
    extracted = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Target transcript -> token ids used as training labels.
    encoded_text = tokenizer(batch["text"])
    batch["labels"] = encoded_text.input_ids

    return batch

In [None]:
# Columns and row counts before mapping prepare_dataset over the splits.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'start_time', 'end_time'],
        num_rows: 6
    })
    validation: Dataset({
        features: ['audio', 'text', 'start_time', 'end_time'],
        num_rows: 5
    })
})


In [None]:
# Apply prepare_dataset to every example with 2 worker processes and drop the
# original columns; the splits shown below keep only input_features and labels.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"], num_proc=2)

In [None]:
# Processed training split.
dataset["train"]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 6
})

In [None]:
# Processed validation split.
dataset["validation"]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 5
})

## Training and Evaluation

### Load a Pre-Trained Checkpoint

In [None]:
from transformers import WhisperForConditionalGeneration

# Load the same checkpoint the feature extractor, tokenizer and processor were
# loaded from, so changing model_name_or_path in the config cell switches all of them.
model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path, device_map="auto")

In [None]:
# Drive generation from the notebook-level language/task settings instead of
# repeating the literals here.
model.generation_config.language = language
model.generation_config.task = task

# Clear the legacy forced decoder ids in both configs. Leaving them on the
# generation config conflicts with `task`, and generate() warns that
# `forced_decoder_ids` will be ignored.
model.generation_config.forced_decoder_ids = None
model.config.forced_decoder_ids = None

### Define a Data Collator

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch.

    Each example must provide ``"input_features"`` and ``"labels"``. Audio
    features and label ids are padded separately (via the processor's feature
    extractor and tokenizer respectively), label padding is replaced with
    -100, and a leading BOS column is removed when every row starts with it.
    """

    # Object exposing `.feature_extractor.pad(...)`, `.tokenizer.pad(...)`
    # and `.tokenizer.bos_token_id`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio side: wrap each example and let the feature extractor build tensors.
        audio_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: the tokenizer pads the id sequences to a common length.
        token_items = []
        for example in features:
            token_items.append({"input_ids": example["labels"]})
        padded_labels = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding; mark them
        # with -100 so they are ignored by the loss.
        padded_ids = padded_labels["input_ids"]
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_ids.masked_fill(is_padding, -100)

        # If every sequence already begins with BOS, drop that column because
        # it is added again later.
        bos_id = self.processor.tokenizer.bos_token_id
        every_row_starts_with_bos = (labels[:, 0] == bos_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Collator instance handed to the trainer; it pads with the shared processor.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

### Evaluation Metrics

In [None]:
import evaluate

# "wer" metric object; compute_metrics calls its .compute(predictions=..., references=...).
metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate (in percent) for one evaluation pass.

    Args:
        pred: object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids) arrays; -100 marks ignored or
            padded positions.

    Returns:
        ``{"wer": <float>}`` where the value is 100 times the metric result.

    Relies on the notebook-level ``tokenizer`` and ``metric``.
    """
    pad_id = tokenizer.pad_token_id

    # Work on copies so the arrays owned by the caller are left untouched.
    pred_ids = pred.predictions.copy()
    label_ids = pred.label_ids.copy()

    # -100 is not a real token id. It masks label padding, and it can also
    # appear in predictions when generated batches of different lengths are
    # padded together. Swap it for the pad token so decoding succeeds and
    # skip_special_tokens drops it.
    pred_ids[pred_ids == -100] = pad_id
    label_ids[label_ids == -100] = pad_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

### Apply LoRA

Here comes the magic with `peft`! Let's wrap the model in a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) via the `get_peft_model` utility function from `peft`.

In [None]:
from peft import LoraConfig, LoraModel, PeftModel, get_peft_model


def make_inputs_require_grad(module, input, output):
    """Forward hook that marks a module's output as requiring gradients."""
    output.requires_grad_(True)


# Gradient checkpointing is enabled in the training arguments. With the base
# weights frozen, the encoder's first convolution output must require grad,
# otherwise no gradient flows back through the checkpointed encoder layers.
model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

# Apply LoRA to the attention query/value projections.
config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
model = get_peft_model(model, config)

# Do not set requires_grad=True on every parameter. That trains the whole base
# model, while saving or pushing a PEFT model stores only the adapter weights,
# so the uploaded adapter would not reproduce the model that was evaluated.
# NOTE(review): with only the adapters trainable, the learning rate in the
# training configuration may need to be raised - confirm empirically.
model.print_trainable_parameters()

### Define the Training Configuration

In the final step, we define all the parameters related to training. For more detail on the training arguments, refer to the Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments).

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the Seq2SeqTrainer.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-inbrowser-proctor",  # repo name
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=5e-6,
    warmup_steps=50,
    max_steps=500,
    gradient_checkpointing=True,
    fp16=True,
    # `evaluation_strategy` is the deprecated spelling of this argument; the
    # notebook installs transformers from `main`, so use the current name.
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=125,
    eval_steps=25,
    logging_steps=10,
    report_to=["tensorboard"],
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=True,
    weight_decay=0.1,
    # The prepared dataset only has input_features/labels and the model is
    # wrapped by PEFT, so keep the columns as-is and name the label column.
    remove_unused_columns=False,
    label_names=["labels"]
)



In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, processed splits, collator and WER metric into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): the feature extractor, not the text tokenizer, is passed as
    # `tokenizer`; presumably so it is saved alongside checkpoints - confirm
    # against the installed transformers version.
    tokenizer=processor.feature_extractor,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Write the processor files into the training output directory.
processor.save_pretrained(training_args.output_dir)

[]

In [None]:
# Run fine-tuning; evaluation, logging and checkpoint cadence come from training_args.
trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
25,0.241,0.41662,5.965909
50,0.0912,0.42448,7.670455
75,0.0441,0.41022,31.25
100,0.0009,0.394779,7.670455
125,0.0001,0.389588,7.102273
150,0.0001,0.394627,7.102273
175,0.0,0.397132,7.102273
200,0.0,0.398771,7.102273
225,0.0,0.399715,7.102273
250,0.0,0.40016,7.102273


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.a

TrainOutput(global_step=250, training_loss=0.0718426004202338, metrics={'train_runtime': 621.3708, 'train_samples_per_second': 6.437, 'train_steps_per_second': 0.402, 'total_flos': 4.4052221952e+17, 'train_loss': 0.0718426004202338, 'epoch': 250.0})

In [None]:
# Model-card metadata passed to trainer.push_to_hub(**kwargs).
# The values reuse the notebook's configuration so the card describes the
# dataset, language and base checkpoint that were actually used.
kwargs = {
    "dataset_tags": dataset_name,
    "dataset": "Proctor-Dataset",
    # Evaluation in this notebook runs on the validation split.
    "dataset_args": "split: validation",
    "language": language_abbr,
    "model_name": "Whisper-Small-Proctor-lora",
    "finetuned_from": model_name_or_path,
    "tasks": "automatic-speech-recognition",
}

In [None]:
# Upload the training result to the Hub, using kwargs as model-card metadata.
trainer.push_to_hub(**kwargs)

events.out.tfevents.1726842704.d96cecff2f87.521.8:   0%|          | 0.00/20.6k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lord-reso/whisper-small-proctor-lora/commit/694038f2c3389d9a76badceaefe64d09cb87083a', commit_message='End of training', commit_description='', oid='694038f2c3389d9a76badceaefe64d09cb87083a', pr_url=None, pr_revision=None, pr_num=None)