# Resources
Based on https://www.kaggle.com/code/vitouphy/phoneme-recognition-with-wav2vec2 with alterations for my own understanding and use case.

Additional resources:
* https://huggingface.co/blog/fine-tune-xlsr-wav2vec2

In [1]:
!pip install datasets>=1.18.3
!pip install transformers>=4.22.0
!pip install evaluate
!pip install wandb
!pip install torch
!pip install jiwer
!pip install librosa
!pip install seaborn

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting click>=8.1.8 (from jiwer)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading click-8.2.1-py3-none-any.whl (102 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)


In [2]:
import torch
import torch.nn as nn
import multiprocessing
from evaluate import load

from datasets import load_dataset, Audio, Dataset, load_from_disk
from tqdm.auto import tqdm

from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from transformers import TrainingArguments
from transformers import Trainer
from tokenizers.processors import TemplateProcessing

from huggingface_hub import login, logout
from kaggle_secrets import UserSecretsClient
import wandb, os
import numpy as np
import pandas as pd

In [3]:
# Fetch the API tokens stored as Kaggle secrets.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("huggingface")  # Hugging Face token, used by login() below
secret_value_1 = user_secrets.get_secret("WANDB_API_KEY")  # Weights & Biases API key

# Authenticate against the Hugging Face Hub.
login(secret_value_0)

In [4]:
# https://docs.wandb.ai/quickstart/
# Export the key as an environment variable, then log in to Weights & Biases.
os.environ["WANDB_API_KEY"] = secret_value_1
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpccady[0m ([33mpccady-uppsala-universitet[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

# Model setup

Define collator and metrics to be fetched by child processes

In [5]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the audio inputs and the CTC labels of a batch.
    also here: https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Pad one batch of examples.

        Args:
            features: one dict per example; each must provide "input_values" and "labels".

        Returns:
            The padded input batch with an added "labels" tensor in which padded
            positions are set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Passing `labels=` pads the label ids with the tokenizer; this replaces the
        # deprecated `with self.processor.as_target_processor():` context manager.
        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch


# Evaluation Metrics
Character Error Rate (CER) is a good start, but some diphthongs are present, which will require processing.

In [6]:
def make_compute_metrics(processor, cer_metric):
    """
    Build the `compute_metrics` callback handed to the Trainer.

    Args:
        processor: object whose `.tokenizer` provides `pad_token_id` and `batch_decode`.
        cer_metric: metric object exposing `compute(predictions=..., references=...)`.

    Returns:
        A function mapping an EvalPrediction to `{"cer": <value>}`.
    """
    # Use the tokenizer that belongs to the processor passed in, rather than
    # relying on a global `tokenizer` that may not exist when this is called.
    tokenizer = processor.tokenizer

    def compute_metrics(pred):
        """
        https://huggingface.co/docs/transformers/en/main_classes/trainer
        Must take a EvalPrediction and return a dictionary string to metric values
        """
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)

        # -100 marks ignored label positions; map them to the pad id so they can be
        # decoded. np.where builds a new array, leaving pred.label_ids untouched.
        label_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

        pred_str = tokenizer.batch_decode(pred_ids)
        # labels are decoded with group_tokens=False so repeated tokens are kept as-is
        label_str = tokenizer.batch_decode(label_ids, group_tokens=False)

        cer = cer_metric.compute(predictions=pred_str, references=label_str)

        return {"cer": cer}

    return compute_metrics

# Training

In [7]:
from transformers import Wav2Vec2ForCTC
from transformers import TrainingArguments
# Early stopping and Wandb logging for later analysis
from transformers import EarlyStoppingCallback
from transformers.integrations import WandbCallback

In [8]:
# What fold are we using for this run? (0-3)
# Used as an index into the `speakers` list and in the wandb run name.
fold = 2

In [9]:
# Speaker IDs, one per fold; `fold` picks the speaker whose fold is used in this run.
speakers = ["EBVS","ERMS","MBMPS","NJS"]
spkr = speakers[fold]

In [10]:
# Load the saved dataset for the selected speaker's fold and take its train/validation splits.
dataset_dict = load_from_disk(f"/kaggle/input/l2-arctic-phoneme-data-prep-for-wav2vec2/spanish_loso_es/fold_{spkr}/")
train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["validation"]

# Tokenizer files are read from the data-prep input directory, not the current
# directory (presumably its vocab.json - confirm).
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('/kaggle/input/l2-arctic-phoneme-data-prep-for-wav2vec2/', unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|", )
# Feature extractor configured for 16 kHz input, with normalisation and attention masks enabled.
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
# Bundle feature extractor and tokenizer; the collator pads batches through this processor.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Character error rate metric loaded via `evaluate.load`, wrapped into the Trainer callback.
cer_metric = load("cer")
compute_metrics=make_compute_metrics(processor,cer_metric)

Downloading builder script: 0.00B [00:00, ?B/s]

TODO: if I keep using dry runs for any purpose, wrap the training arguments in a helper with a `dry_run` flag, e.g.:
def get_training_args(dry_run=False):
    return TrainingArguments(...

In [11]:
# save to same project to simplify comparison later
wandb.init(
    project="wav2vec2-cross-validation",
    name=f"fold_{fold}",
    group="4-fold-cv",
)

# Load the pretrained checkpoint for this fold. The CTC head is sized to this
# tokenizer's vocabulary, and the pad token id is passed in from the tokenizer.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    attention_dropout=0.1,
    layerdrop=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.75,
    mask_time_length=10,
    mask_feature_prob=0.25,
    mask_feature_length=64,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)
# Keep the feature encoder frozen during fine-tuning.
model.freeze_feature_encoder()

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir=f'/kaggle/working/spanish_{spkr}/',
    group_by_length=True,
    per_device_train_batch_size=4,  # down from 8 for comparability with english model
    gradient_accumulation_steps=1,  # down from 4 for comparability with english model
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy` (transformers >= 4.41)
    save_strategy="epoch",  # same cadence as evaluation
    gradient_checkpointing=True,
    fp16=True,  # comment out for dry run
    # max_steps=5,  # for dry run only. comment out for full run
    logging_steps=100,
    learning_rate=3e-5,
    warmup_steps=2000,
    save_total_limit=2,
    num_train_epochs=30,
    load_best_model_at_end=True,  # False for dry runs
    metric_for_best_model="cer",
    greater_is_better=False,  # minimizing, not maximizing the above
    report_to="wandb",  # "none" for dry runs, wandb otherwise
    push_to_hub=True,  # pushes on every checkpoint save; False for dry runs
    hub_model_id=f"duck-hug-567/xls-r-300m-es-ipa-{spkr}",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` replaces the deprecated `tokenizer=` argument
    # (transformers >= 4.46); pass the processor or processor.feature_extractor.
    processing_class=processor,
    callbacks=[
        # Stop once the monitored metric has not improved for 6 evaluations.
        # Remove for dry runs.
        EarlyStoppingCallback(early_stopping_patience=6),
        # No explicit WandbCallback: report_to="wandb" already enables W&B logging.
    ],
)

# Start the training
trainer.train()  # trainer saves to output_dir
wandb.finish()  # signals run is finished, ensure logs are synced. maybe redundant

[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250824_121126-g9almbiw[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfold_2[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/pccady-uppsala-universitet/wav2vec2-cross-validation[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/pccady-uppsala-universitet/wav2vec2-cross-validation/runs/g9almbiw[0m


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Cer
1,3.0461,2.830713,1.0
2,2.5713,2.652215,0.988185
3,2.1513,2.10616,0.941759
4,1.918,1.645556,0.883371
5,1.5178,0.938182,0.323708
6,1.2655,0.622709,0.185704
7,1.098,0.493614,0.154574
8,1.0511,0.414251,0.133297
9,0.9338,0.364303,0.121482
10,0.9201,0.336449,0.114276


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

optuna hyperparameter tuning is built into HF trainers! This is also connected to WandB for l33t graphs!
https://huggingface.co/docs/transformers/en/hpo_train

#using optuna

def optuna_hp_space(trial):
    """
    Define the Optuna search space for the hyperparameter search.

    Args:
        trial: Optuna trial object used to sample each value.

    Returns:
        Dict with sampled values for "learning_rate", "warmup_steps" and
        "per_device_train_batch_size".
    """
    return {
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True),
        "warmup_steps": trial.suggest_int("warmup_steps", 0, 2000),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16]),
    }

# NOTE(review): the Trainer docs state that hyperparameter_search needs a Trainer
# built with `model_init=` (a function returning a fresh model) rather than
# `model=` - confirm the trainer is constructed that way before running this.
best_trials = trainer.hyperparameter_search(
    direction="minimize",
    backend="optuna",
    hp_space=optuna_hp_space,  # the search-space function defined above; `hp_space` was an undefined name
    n_trials=10,
)
