# Model PEFT training

## Create LoRA model

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))

from modules.data import get_quebecois_data
from modules.peft_utils import add_peft_to_model
from modules.model import get_whisper
from transformers import (
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperFeatureExtractor,
    WhisperProcessor
)

model = get_whisper()

# 2 French tokenizer
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small.en",
)

# 3. English extractor
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small"
)

# 4. Hybrid processor
processor = WhisperProcessor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer
)

model.resize_token_embeddings(len(tokenizer))

model.config.forced_decoder_ids = None 
model.generation_config.forced_decoder_ids = None
model.generation_config.language = "English"
model.generation_config.task = "transcribe"
model.requires_grad_(False)
lora_model = add_peft_to_model(model)
lora_model.print_trainable_parameters()


  from .autonotebook import tqdm as notebook_tqdm


trainable params: 3,594,240 || all params: 245,328,384 || trainable%: 1.4651


## Setup trainer

In [2]:
from modules.training import compute_metrics, DataCollatorSpeechSeq2SeqWithPadding
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, WhisperProcessor

checkpoint_dir = "../checkpoints/"
# checkpoint_dir.mkdir(parents=True, exist_ok=True)
training_args = Seq2SeqTrainingArguments(
    output_dir=checkpoint_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    eval_strategy="no",
    save_strategy="steps",
    save_steps=1000,
    #eval_steps=500,
    gradient_checkpointing=False,
    fp16=True,
    per_device_eval_batch_size=1,
    predict_with_generate=True,
    generation_max_length=225,
    logging_steps=25,
    report_to=[],
    #load_best_model_at_end=True,
    #metric_for_best_model="wer",
    #greater_is_better=False,
    push_to_hub=False,
    #save_only_model=True,
    save_total_limit=2,
)
data = get_quebecois_data()
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor,
                                                     decoder_start_token_id=model.config.decoder_start_token_id)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=lora_model,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    data_collator=data_collator,
    processing_class=processor,
    compute_metrics=compute_metrics,
)

Downloading builder script: 5.13kB [00:00, 10.0MB/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


## Training

In [3]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 50256, 'bos_token_id': 50256, 'pad_token_id': 50256}.


Step,Training Loss
25,3.7611
50,3.7771
75,3.7136
100,3.5797
125,3.5557
150,3.7087
175,3.5087
200,3.5988
225,3.4498
250,3.4266




TrainOutput(global_step=4000, training_loss=1.3623435163497926, metrics={'train_runtime': 10656.7235, 'train_samples_per_second': 3.003, 'train_steps_per_second': 0.375, 'total_flos': 9.40035538944e+18, 'train_loss': 1.3623435163497926, 'epoch': 4.514672686230249})

In [4]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))

import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from peft import PeftModel
from modules.data import get_quebecois_data
from modules.peft_utils import add_peft_to_model
from modules.model import get_whisper
from transformers import (
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperFeatureExtractor,
    WhisperProcessor
)

path_to_peft = "/home/ulaval.ca/lemun9/work/peftTest/checkpoints/checkpoint-4000"
base_model_name = "openai/whisper-small"

local_adapter_path = path_to_peft
model = get_whisper()

# French tokenizer
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small.en",
)

# English extractor
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small"
)

# Hybrid processor
processor = WhisperProcessor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer
)

model.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(model, local_adapter_path)

device = "cuda"if torch.cuda.is_available() else"cpu"
model.to(device)
 

PeftModel(
  (base_model): LoraModel(
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 768)
          (layers): ModuleList(
            (0-11): 12 x WhisperEncoderLayer(
              (self_attn): WhisperAttention(
                (k_proj): Linear(in_features=768, out_features=768, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=32, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (d

In [5]:
import torch



def transcribe_from_audio_with(model, processor, audio_array):
    device = next(model.parameters()).device

    inputs = processor(
        audio_array,
        sampling_rate=16000,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        ids = model.generate(
            inputs["input_features"],
            max_new_tokens=225
        )

    return processor.batch_decode(ids, skip_special_tokens=True)[0]

In [6]:
from jiwer import wer
from tqdm import tqdm
import torch

# Make sure models are on GPU
model = model.to("cuda")

def compute_dataset_wer(model, processor, dataset):
    preds = []
    refs = []
    for item in tqdm(dataset, desc="Processing"):
        audio = item["audio"]["array"]
        text = item["text"]
        inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
        with torch.no_grad():
            ids = model.generate(inputs["input_features"].to("cuda"), max_length=225)
        pred = processor.tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
        preds.append(pred)
        refs.append(text)
    return wer(refs, preds)

# Use the RAW dataset
from datasets import load_dataset

common_voice_raw = load_dataset("rishabbahal/quebecois_canadian_french_dataset", split="test")

print("Evaluating fine-tuned model...")
wer_finetuned = compute_dataset_wer(model, processor, common_voice_raw)

print(f"\n{'='*60}")
print("RESULTS")
print(f"{'='*60}")
print(f"WER fine-tuned: {wer_finetuned*100:.2f}%")

Generating train split: 100%|██████████| 5389/5389 [00:15<00:00, 350.85 examples/s] 
Generating test split: 100%|██████████| 1348/1348 [00:00<00:00, 3331.24 examples/s]


Evaluating fine-tuned model...


Processing:   0%|          | 0/1348 [00:00<?, ?it/s]Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Processing: 100%|██████████| 1348/1348 [14:55<00:00,  1.51it/s]


RESULTS
WER fine-tuned: 104.32%





In [7]:
#Compare with test data
from datasets import load_dataset, DatasetDict

common_voice_raw = DatasetDict()
common_voice_raw["train"] = load_dataset("rishabbahal/quebecois_canadian_french_dataset", split="train")
common_voice_raw["test"] = load_dataset("rishabbahal/quebecois_canadian_french_dataset", split="test")

sample = common_voice_raw["test"][1]
audio = sample["audio"]["array"]
sr = sample["audio"]["sampling_rate"]
expected_text = sample["text"]
print("Reference text :", expected_text)

txt_ft = transcribe_from_audio_with(model, processor, audio)

print("Fine-tuned :", txt_ft)

import evaluate
metric = evaluate.load("wer")

wer_ft = metric.compute(predictions=[txt_ft], references=[expected_text])

print("WER fine tuned:", wer_ft)

Reference text : Oui, c'est bien, mais je ne vois pas d'alligator. Ah non ?
Fine-tuned :  pale c'uliance text unforth misses Pres' stig walk.
WER fine tuned: 1.0
