# Model PEFT training

In [2]:
!pip install evaluate
!pip install jiwer
!pip install IProgress
!pip install bitsandbytes

# !pip install transformers[torch]
# !pip install soundfile
# !pip install torchaudio

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo2023/x86-64-v3, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo2023/generic, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic
Processing /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic/pyarrow-9999+dummy.computecanada-py3-none-any.whl (from datasets>=2.0.0->evaluate)
Processing /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic/pyarrow_noinstall-9999+dummy.computecanada.tar.gz (from pyarrow>=21.0.0->datasets>=2.0.0->evaluate)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[19 lines of output][0m
  [31m   [0m 
  [31m   [0m

## Load Data

In [1]:
import gc
import os
import sys

sys.path.append(os.path.abspath(os.path.join('..')))

from modules.data import load_common_voice_data

data = load_common_voice_data()
gc.collect()

  from .autonotebook import tqdm as notebook_tqdm


50

## Create LoRA model

In [10]:
from peft import get_peft_model, LoraConfig

sys.path.append(os.path.abspath(os.path.join('..')))

from config.variables import TOKENIZER_LANGUAGE, MODEL_VERSION, MODEL_LANGUAGE
from modules.model import get_model, get_tokenizer

model = get_model()
tokenizer = get_tokenizer()

model.generation_config.language = MODEL_LANGUAGE
model.generation_config.task = "transcribe"

#Freeze Whisper weights
model.requires_grad_(False)
model.resize_token_embeddings(len(tokenizer))
peft_config = LoraConfig(r=32,
                         lora_alpha=64,
                         target_modules=['q_proj', 'v_proj'],
                         bias='none',
                         use_dora=True)

lora_model = get_peft_model(model, peft_config)
lora_model.config.use_cache = False
lora_model.print_trainable_parameters()


trainable params: 3,594,240 || all params: 245,329,152 || trainable%: 1.4651


## Setup trainer

In [4]:
from pathlib import Path
from modules.training import compute_metrics, DataCollatorSpeechSeq2SeqWithPadding
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from modules.model import get_processor

processor = get_processor()

training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # increase by 2x for every 2x decrease in batch size
    gradient_checkpointing=False,
    learning_rate=1e-5,
    max_steps=7000,
    warmup_steps=500,
    save_strategy='best',  #Keep only best model when saving
    save_steps=500,
    save_only_model=True,
    save_total_limit=2,
    # eval_strategy="no",
    eval_strategy="steps",
    eval_steps=500,
    fp16=True,
    per_device_eval_batch_size=1,
    predict_with_generate=True,
    generation_max_length=225,
    logging_steps=100,
    report_to=[],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    optim="adamw_torch"
)
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor,
                                                     decoder_start_token_id=model.config.decoder_start_token_id)

data['test'].take(1000)
trainer = Seq2SeqTrainer(
    args=training_args,
    model=lora_model,
    train_dataset=data["train"],
    eval_dataset=data['test'],
    data_collator=data_collator,
    processing_class=processor,
    compute_metrics=compute_metrics,
)
print(trainer)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


<transformers.trainer_seq2seq.Seq2SeqTrainer object at 0x7f66d658c0b0>


## Training

In [4]:
from datetime import datetime

#Create new directory for checkpoints with the training specs
output_dir = Path(
    f'cps/cp_{MODEL_VERSION.replace('/', '-')}-{MODEL_LANGUAGE}_T{TOKENIZER_LANGUAGE}_{datetime.now().strftime("%d-%m-%Y_%H:%M")}')
output_dir.mkdir(parents=True, exist_ok=True)
trainer.args.output_dir = output_dir
trainer.train()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Wer
500,2.1401,1.8531,87.26025
1000,1.3482,1.295434,69.675641
1500,1.172,1.124788,61.129685
2000,1.0635,1.03079,58.243885
2500,1.0132,0.97513,55.205584
3000,0.9271,0.936408,53.281717
3500,0.9141,0.90971,51.481025
4000,0.8483,0.889016,50.853422
4500,0.8318,0.873036,49.920816
5000,0.8187,0.862203,49.11725


TrainOutput(global_step=7000, training_loss=1.1183025665283204, metrics={'train_runtime': 34971.2198, 'train_samples_per_second': 1.601, 'train_steps_per_second': 0.2, 'total_flos': 1.645062193152e+19, 'train_loss': 1.1183025665283204, 'epoch': 7.114})

## Evaluate

In [None]:
from jiwer import wer
from tqdm import tqdm


def compute_dataset_wer(model, processor, dataset):
    preds = []
    refs = []
    for item in tqdm(dataset, desc="Processing"):
        audio = item["audio"]["array"]
        text = item["text"]
        inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
        with torch.no_grad():
            ids = model.generate(inputs["input_features"].to("cuda"), max_length=225)
        pred = processor.tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
        preds.append(pred)
        refs.append(text)
    return wer(refs, preds)


common_voice_test = load_common_voice_data()["test"]

print("Evaluating fine-tuned model...")
wer_finetuned = compute_dataset_wer(lora_model, processor, common_voice_test)

print(f"\n{'=' * 60}")
print("RESULTS")
print(f"{'=' * 60}")
print(f"WER fine-tuned: {wer_finetuned * 100:.2f}%")

## Try example

In [17]:
import torch
from peft import PeftModel
from modules.data import load_common_voice_raw
from modules.model import get_model


def transcribe_from_audio_with(model, processor, audio_array):
    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        ids = model.generate(inputs["input_features"], max_length=225)

    txt = processor.tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
    return txt


sample = load_common_voice_raw()["test"][119]
audio = sample["audio"]["array"]
sr = sample["audio"]["sampling_rate"]
expected_text = sample["text"]
print("Reference text :", expected_text)
model = get_model()

lora_model = PeftModel.from_pretrained(model,
                                       './cps/cp_openai-whisper-small-french_TFrench_21-12-2025_23:25/checkpoint-7000')
txt_ft = transcribe_from_audio_with(lora_model, processor, audio)

print("Fine-tuned :", txt_ft)

import evaluate

metric = evaluate.load("wer")

wer_ft = metric.compute(predictions=[txt_ft], references=[expected_text])

print("WER fine tuned:", wer_ft)

Reference text : Il est directeur de l'Observatoire astronomique de Majorque.
Fine-tuned : Il était éteint de l'observatoire astronomique de Majork.
WER fine tuned: 0.5
