In [None]:
#Download required imports
import sys
!{sys.executable} -m pip install torch
!{sys.executable} -m pip install torchcodec
!{sys.executable} -m pip install evaluate
!{sys.executable} -m pip install jiwer
!{sys.executable} -m pip install transformers[torch]
!{sys.executable} -m pip install soundfile
!{sys.executable} -m pip install torchaudio


In [1]:
import datasets, huggingface_hub
print("datasets:", datasets.__version__)
print("huggingface_hub:", huggingface_hub.__version__)


  from .autonotebook import tqdm as notebook_tqdm


datasets: 4.4.1
huggingface_hub: 0.36.0


In [2]:
#Import dataset and split
from datasets import load_dataset, DatasetDict

common_voice = DatasetDict()

common_voice["train"] = load_dataset("rishabbahal/quebecois_canadian_french_dataset", "default", split="train")
common_voice["test"] = load_dataset("rishabbahal/quebecois_canadian_french_dataset", "default", split="test")

print(common_voice)

#Keep only needed data (audio and text)
common_voice = common_voice.remove_columns(["audio_filepath", "__index_level_0__"])

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'audio_filepath', '__index_level_0__'],
        num_rows: 5389
    })
    test: Dataset({
        features: ['audio', 'text', 'audio_filepath', '__index_level_0__'],
        num_rows: 1348
    })
})
DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 5389
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 1348
    })
})


In [3]:
#Prepare data
sample = common_voice["train"][2]
audio_decoder = sample['audio']

sampling_rate = audio_decoder["sampling_rate"] 

print(f"Sampling rate: {sampling_rate}")

Sampling rate: 16000


In [4]:
#Prepare dataset
def prepare_dataset(batch):

    audio_array = batch["audio"]["array"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio_array, sampling_rate=16000).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["text"]).input_ids
    return batch

In [7]:
from transformers import (
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperFeatureExtractor,
    WhisperProcessor
)

# English model
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-small.en"
)

# 2 French tokenizer
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    language="fr",
    task="transcribe"
)

# 3. English extractor
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small.en"
)

# 4. Hybrid processor
processor = WhisperProcessor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer
)

model.resize_token_embeddings(len(tokenizer))

model.config.forced_decoder_ids = None
model.generation_config.forced_decoder_ids = None
model.generation_config.language = None
model.generation_config.task = "transcribe"
model.generation_config.suppress_tokens = []

model.generation_config.repetition_penalty = 1.5
model.generation_config.no_repeat_ngram_size = 4
model.generation_config.max_new_tokens = 225

model.eval()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [8]:
#Apply data preparation function to training examples
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=None)

Map: 100%|██████████| 1348/1348 [00:18<00:00, 71.82 examples/s]


In [9]:
# freeze 100% of the encoder
for param in model.model.encoder.parameters():
    param.requires_grad = False

# freeze 80% of the decoder
decoder_layers = model.model.decoder.layers
total_layers = len(decoder_layers)
layers_to_freeze = int(total_layers * 0.8)

for i in range(layers_to_freeze):
    for param in decoder_layers[i].parameters():
        param.requires_grad = False

# freeze embedding layer
for param in model.model.decoder.embed_tokens.parameters():
    param.requires_grad = False

for param in model.model.decoder.embed_positions.parameters():
    param.requires_grad = False

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen_params = total_params - trainable_params

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Frozen parameters: {frozen_params:,}")
print(f"Percentage trainable: {100 * trainable_params / total_params:.2f}%")

Total parameters: 241,734,912
Trainable parameters: 28,352,256
Frozen parameters: 213,382,656
Percentage trainable: 11.73%


In [10]:
#Define data collector
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # create attention mask for encoder inputs
        attention_mask = torch.ones(batch["input_features"].shape[:-1], dtype=torch.long)
        is_padding = (batch["input_features"] == 0).all(dim=-1)
        attention_mask[is_padding] = 0
        batch["attention_mask"] = attention_mask

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [11]:
#Initialize data collector
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [12]:
#Choose evaluation metrics (error rate)
import evaluate

metric = evaluate.load("wer")

In [13]:
#Define a function that takes the model's predictions and returns the evaluation metric
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [14]:
import shutil
import os

ckpt_dir = "/scratch/lemun9@ulaval.ca/whisper_checkpoints_en"

if os.path.exists(ckpt_dir):
    shutil.rmtree(ckpt_dir)

In [15]:
#Define training configuration
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir = "/scratch/lemun9@ulaval.ca/whisper_checkpoints_en",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    eval_strategy="no", 
    save_strategy="steps", 
    save_steps=1000,
    eval_steps=1000, 
    gradient_checkpointing=False,
    fp16=True,
    per_device_eval_batch_size=1,
    predict_with_generate=False,
    generation_max_length=225,
    logging_steps=25,
    report_to=[],
    # load_best_model_at_end=True,
    # metric_for_best_model="wer",
    # greater_is_better=False,
    push_to_hub=False,
    save_only_model=True,
    save_total_limit=2,
)
model.gradient_checkpointing_disable()

In [16]:
#Forward training agruments, model, dataset, data collector, and compute metrics function to HuggingFace trainer
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    processing_class=processor,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [17]:
#Save processor object
processor.save_pretrained(training_args.output_dir)

[]

In [18]:
#Make space for the model training
import torch, gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [19]:
#Train the model
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 50257, 'pad_token_id': 50257}.


Step,Training Loss
25,10.983
50,10.8033
75,10.5778
100,10.2082
125,9.8123
150,9.4495
175,8.9725
200,8.657
225,8.3686
250,8.1449




TrainOutput(global_step=4000, training_loss=6.5095822143554685, metrics={'train_runtime': 4919.7006, 'train_samples_per_second': 6.504, 'train_steps_per_second': 0.813, 'total_flos': 9.2304040292352e+18, 'train_loss': 6.5095822143554685, 'epoch': 5.93523844869178})

In [10]:
#Load trained model
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import torch
import soundfile as sf

processor = WhisperProcessor.from_pretrained("/scratch/lemun9@ulaval.ca/whisper_checkpoints_en")

model = WhisperForConditionalGeneration.from_pretrained("/scratch/lemun9@ulaval.ca/whisper_checkpoints_en/checkpoint-4000")

model.generation_config.forced_decoder_ids = None
model.generation_config.language = None
model.generation_config.task = None
model.generation_config.suppress_tokens = []
model.generation_config.begin_suppress_tokens = []
model.generation_config.repetition_penalty = 1.5
model.generation_config.no_repeat_ngram_size = 4
model.generation_config.max_length = 225

model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

model.eval()


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [2]:
#Load original model
from transformers import WhisperForConditionalGeneration, WhisperProcessor

base_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
base_proc = WhisperProcessor.from_pretrained("openai/whisper-small")
base_model.eval()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [3]:
import torchaudio
import torch

def transcribe_from_path_with(model, processor, path):
    audio, sr = torchaudio.load(path)

    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        audio = resampler(audio)

    audio = audio.squeeze().numpy()
import torchaudio
import torch

def transcribe_from_path_with(model, processor, path):
    audio, sr = torchaudio.load(path)
    
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)
    
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        audio = resampler(audio)
    
    audio = audio.squeeze().numpy()
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        ids = model.generate(
            inputs["input_features"],
            max_length=225
        )
    
    text = processor.tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
    return text
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        ids = model.generate(
    inputs["input_features"],
    max_length=225
)

    text = processor.tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
    return text

In [7]:
path = "audioQc.mp3"

txt_finetuned = transcribe_from_path_with(model, processor, path)
txt_base = transcribe_from_path_with(base_model, base_proc, path)

print("fine tuned model transcription:")
print(txt_finetuned)
print()
print("original model transcription:")
print(txt_base)

Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361], 'begin_suppress_tokens': [220, 50256]}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may ob

fine tuned model transcription:
C'est vraiment un petit, c'est vraiment un petit?  Puis, c'est vraiment un petit?  Puis, c'est vraiment un petit?  Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? Puis, c'est vraiment un petit? P fe, c'est vraiment un petit? P

original model transcription:
 C'est tellement beau, les étoiles. Hum. Garde, c'est la grande ours. Oh! Et ça, c'est quoi? Ça, c'est... Attends, c'est la terre, ça! Je t'avais dit qu'on n'avait pas pris le bon chemin. Ça, à gauche, il faut continuer de tourner par aupar. Ben, je pense que c'est tout droit parce qu'à gauche, c'est l'heure d'avoir un vor

In [11]:
def transcribe_from_audio_with(model, processor, audio_array):
    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        ids = model.generate(inputs["input_features"], 
                             max_length=225,
                             repetition_penalty=1.5,      
                             no_repeat_ngram_size=4,      
                             temperature=1.0,             
                             do_sample=False)             

    txt = processor.tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
    return txt

In [12]:
#Compare with test data
from datasets import load_dataset, DatasetDict

common_voice_raw = DatasetDict()
common_voice_raw["train"] = load_dataset("rishabbahal/quebecois_canadian_french_dataset", split="train")
common_voice_raw["test"] = load_dataset("rishabbahal/quebecois_canadian_french_dataset", split="test")

sample = common_voice_raw["test"][1]
audio = sample["audio"]["array"]
sr = sample["audio"]["sampling_rate"]
expected_text = sample["text"]
print("Reference text :", expected_text)

txt_ft = transcribe_from_audio_with(model, processor, audio)
txt_orig = transcribe_from_audio_with(base_model, base_proc, audio)

print("Fine-tuned :", txt_ft)
print("Original :", txt_orig)

import evaluate
metric = evaluate.load("wer")

wer_ft = metric.compute(predictions=[txt_ft], references=[expected_text])
wer_orig = metric.compute(predictions=[txt_orig], references=[expected_text])

print("WER fine tuned:", wer_ft)
print("WER original:", wer_orig)

Reference text : Oui, c'est bien, mais je ne vois pas d'alligator. Ah non ?


`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [], 'begin_suppress_tokens': []}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> to see related `.generate()` flags.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokens

Fine-tuned : az是 down?
Original :  oui c'est bien mais là je vois pas d-alié-gâteau


Downloading builder script: 5.13kB [00:00, 19.5MB/s]


WER fine tuned: 1.0
WER original: 0.6666666666666666


In [13]:
from jiwer import wer
from tqdm import tqdm
import torch

# Make sure models are on GPU
model = model.to("cuda")

def compute_dataset_wer(model, processor, dataset):
    preds = []
    refs = []
    for item in tqdm(dataset, desc="Processing"):
        audio = item["audio"]["array"]
        text = item["text"]
        inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
        with torch.no_grad():
            ids = model.generate(inputs["input_features"].to("cuda"), max_length=225)
        pred = processor.tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
        preds.append(pred)
        refs.append(text)
    return wer(refs, preds)

# Use the RAW dataset
from datasets import load_dataset

common_voice_raw = load_dataset("rishabbahal/quebecois_canadian_french_dataset", split="test")

print("Evaluating fine-tuned model...")
wer_finetuned = compute_dataset_wer(model, processor, common_voice_raw)

print(f"\n{'='*60}")
print("RESULTS")
print(f"{'='*60}")
print(f"WER fine-tuned: {wer_finetuned*100:.2f}%")

Evaluating fine-tuned model...


Processing: 100%|██████████| 1348/1348 [04:11<00:00,  5.37it/s]


RESULTS
WER fine-tuned: 100.26%



