In [None]:
#Download required imports
import sys
!{sys.executable} -m pip install torch
!{sys.executable} -m pip install torchcodec
!{sys.executable} -m pip install evaluate
!{sys.executable} -m pip install jiwer
!{sys.executable} -m pip install transformers[torch]
!{sys.executable} -m pip install soundfile
!{sys.executable} -m pip install torchaudio


In [7]:
import datasets, huggingface_hub
print("datasets:", datasets.__version__)
print("huggingface_hub:", huggingface_hub.__version__)


datasets: 4.4.1
huggingface_hub: 0.36.0


In [8]:
#Import dataset and split
from datasets import load_dataset, DatasetDict

common_voice = DatasetDict()

common_voice["train"] = load_dataset("rishabbahal/quebecois_canadian_french_dataset", "default", split="train")
common_voice["test"] = load_dataset("rishabbahal/quebecois_canadian_french_dataset", "default", split="test")

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'audio_filepath', '__index_level_0__'],
        num_rows: 5389
    })
    test: Dataset({
        features: ['audio', 'text', 'audio_filepath', '__index_level_0__'],
        num_rows: 1348
    })
})


In [9]:
#Keep only needed data (audio and text)
common_voice = common_voice.remove_columns(["audio_filepath", "__index_level_0__"])

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 5389
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 1348
    })
})


In [10]:
#Load feature extractor from pre-trained check-point
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [11]:
#Load tokenizer
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="French", task="transcribe")

In [12]:
#Combine feature extractor and tokenizer to create the processor
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="French", task="transcribe")

In [13]:
#Prepare data
sample = common_voice["train"][2]
audio_decoder = sample['audio']
#audio_samples = audio_decoder.get_all_samples()

sampling_rate = audio_decoder["sampling_rate"] 

print(f"Sampling rate: {sampling_rate}")

Sampling rate: 16000


In [14]:
#Prepare dataset
def prepare_dataset(batch):

    audio_array = batch["audio"]["array"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio_array, sampling_rate=16000).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["text"]).input_ids
    return batch

In [15]:
test_example = common_voice["train"][0]
print("Original keys:", test_example.keys())

try:
    result = prepare_dataset(test_example)
    print("Success! Result keys:", result.keys())
    print("Input features shape:", result["input_features"].shape)
    print("Labels:", result["labels"])
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()

print("Current columns:", common_voice["train"].column_names)

Original keys: dict_keys(['audio', 'text'])
Success! Result keys: dict_keys(['audio', 'text', 'input_features', 'labels'])
Input features shape: (80, 3000)
Labels: [50258, 50265, 50359, 50363, 50257]
Current columns: ['audio', 'text']


In [16]:
#Apply data preparation function to training examples
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=None)

In [11]:
#Load pre-trained checkpoint
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [12]:
#Disable automatic language detection and force model to generate french
model.generation_config.language = "french"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None

In [13]:
# freeze 100% of the encoder
for param in model.model.encoder.parameters():
    param.requires_grad = False

# freeze 80% of the decoder
decoder_layers = model.model.decoder.layers
total_layers = len(decoder_layers)
layers_to_freeze = int(total_layers * 0.8)

for i in range(layers_to_freeze):
    for param in decoder_layers[i].parameters():
        param.requires_grad = False

# freeze embedding layer
for param in model.model.decoder.embed_tokens.parameters():
    param.requires_grad = False

for param in model.model.decoder.embed_positions.parameters():
    param.requires_grad = False

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen_params = total_params - trainable_params

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Frozen parameters: {frozen_params:,}")
print(f"Percentage trainable: {100 * trainable_params / total_params:.2f}%")

Total parameters: 241,734,912
Trainable parameters: 28,352,256
Frozen parameters: 213,382,656
Percentage trainable: 11.73%


In [14]:
#Define data collector
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # create attention mask for encoder inputs
        attention_mask = torch.ones(batch["input_features"].shape[:-1], dtype=torch.long)
        is_padding = (batch["input_features"] == 0).all(dim=-1)
        attention_mask[is_padding] = 0
        batch["attention_mask"] = attention_mask

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [15]:
#Initialize data collector
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [16]:
#Choose evaluation metrics (error rate)
import evaluate

metric = evaluate.load("wer")

In [17]:
#Define a function that takes the model's predictions and returns the evaluation metric
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [18]:
import shutil
import os

ckpt_dir = "/scratch/lemun9@ulaval.ca/whisper_checkpoints"

if os.path.exists(ckpt_dir):
    shutil.rmtree(ckpt_dir)

In [19]:
#Define training configuration
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir = "/scratch/lemun9@ulaval.ca/whisper_checkpoints",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    eval_strategy="steps", 
    save_strategy="steps", 
    save_steps=1000,
    eval_steps=500, 
    gradient_checkpointing=False,
    fp16=True,
    per_device_eval_batch_size=1,
    predict_with_generate=True,
    generation_max_length=225,
    logging_steps=25,
    report_to=[],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    save_only_model=True,
    save_total_limit=2,
)
model.gradient_checkpointing_disable()

In [20]:
#Forward training agruments, model, dataset, data collector, and compute metrics function to HuggingFace trainer
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    processing_class=processor,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [21]:
#Save processor object
processor.save_pretrained(training_args.output_dir)

[]

In [22]:
#Make space for the model training
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [23]:
#Train the model
trainer.train()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Wer
500,1.2358,1.224333,60.594542
1000,1.0244,1.11899,58.540875
1500,0.907,1.081819,57.627503
2000,0.8701,1.060266,57.701254
2500,0.8187,1.0536,55.483066
3000,0.7752,1.052394,58.098372
3500,0.7575,1.047832,56.623362
4000,0.7039,1.048281,56.83894


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=4000, training_loss=0.9608395533561707, metrics={'train_runtime': 9767.5105, 'train_samples_per_second': 3.276, 'train_steps_per_second': 0.41, 'total_flos': 9.2304040292352e+18, 'train_loss': 0.9608395533561707, 'epoch': 5.935064935064935})

In [28]:
#Load trained model
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import torch
import soundfile as sf

model = WhisperForConditionalGeneration.from_pretrained("/scratch/lemun9@ulaval.ca/whisper_checkpoints/checkpoint-4000")
model_proc = WhisperProcessor.from_pretrained("/scratch/lemun9@ulaval.ca/whisper_checkpoints")
model.eval()


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [29]:
#Load original model
from transformers import WhisperForConditionalGeneration, WhisperProcessor

base_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
base_proc = WhisperProcessor.from_pretrained("openai/whisper-small")
base_model.eval()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [30]:
import torchaudio
import torch

def transcribe_from_path_with(model, processor, path):
    audio, sr = torchaudio.load(path)

    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        audio = resampler(audio)

    audio = audio.squeeze().numpy()

    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        ids = model.generate(inputs["input_features"], max_length=225)

    text = processor.tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
    return text

In [31]:
path = "audioQc.mp3"

txt_finetuned = transcribe_from_path_with(model, model_proc, path)
txt_base = transcribe_from_path_with(base_model, base_proc, path)

print("fine tuned model transcription:")
print(txt_finetuned)
print()
print("original model transcription:")
print(txt_base)

fine tuned model transcription:
T'es tellement beau les étoiles. Garde, c'est la grande ours. Puis, ça c'est quoi? Ça c'est... Attends, c'est la tête ça! Je te l'avais dit qu'on n'avait pas pris le bon chemin. Ça à gauche, il faut continuer de se trouver pour aller au parc. Je pense que c'est tout droit, parce qu'à gauche, il a l'air d'avoir un vortex spatio-temporel qui mène vers une autre planète. Non, c'est sûr que c'est un vortex qui mène au parc. S'il y a autre l'effet,

original model transcription:
 C'est tellement beau, les étoiles. Hum. Garde, c'est la grande ours. Oh! Et ça, c'est quoi? Ça, c'est... Attends, c'est la terre, ça! Je t'avais dit qu'on n'avait pas pris le bon chemin. Ça, à gauche, il faut continuer de tourner par aupar. Ben, je pense que c'est tout droit parce qu'à gauche, c'est l'heure d'avoir un vortex spatio-temporel qui mène vers une autre planète. Non, c'est sûr que c'est un vortex qui mène à haupar. Toute l'effet, vous avez pas le sens d'orientation.


In [32]:
def transcribe_from_audio_with(model, processor, audio_array):
    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        ids = model.generate(inputs["input_features"], max_length=225)

    txt = processor.tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
    return txt

In [39]:
#Compare with test data
from datasets import load_dataset, DatasetDict

common_voice_raw = DatasetDict()
common_voice_raw["train"] = load_dataset("rishabbahal/quebecois_canadian_french_dataset", split="train")
common_voice_raw["test"] = load_dataset("rishabbahal/quebecois_canadian_french_dataset", split="test")

sample = common_voice_raw["train"][1]
audio = sample["audio"]["array"]
sr = sample["audio"]["sampling_rate"]
expected_text = sample["text"]
print("Reference text :", expected_text)

txt_ft = transcribe_from_audio_with(model, processor, audio)
txt_orig = transcribe_from_audio_with(base_model, base_proc, audio)

print("Fine-tuned :", txt_ft)
print("Original :", txt_orig)

import evaluate
metric = evaluate.load("wer")

wer_ft = metric.compute(predictions=[txt_ft], references=[expected_text])
wer_orig = metric.compute(predictions=[txt_orig], references=[expected_text])

print("WER fine tuned:", wer_ft)
print("WER original:", wer_orig)

Reference text : l'île qu'on voit pas du côté sud c'est
magnifique les champs fleurs les maisons mais
Fine-tuned : l'île qu'on voit pas du côté sud. C'est magnifique les champs de fleurs, les maisons, mais
Original :  qu'on voit pas du côté sud. C'est magnifique les champs de fleurs, les maisons, les canons.
WER fine tuned: 0.42857142857142855
WER original: 0.6428571428571429
