# Fine-tuning Wav2Vec2 (CTC) on a Creolese audio dataset

In [2]:
# pip install datasets transformers torchaudio jiwer librosa soundfile

^C
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl (491 kB)
Collecting transformers
  Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Collecting torchaudio
  Using cached torchaudio-2.7.0-cp311-cp311-win_amd64.whl (2.5 MB)
Collecting jiwer
  Using cached jiwer-3.1.0-py3-none-any.whl (22 kB)
Collecting pandas (from datasets)
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl (143 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Using cached huggingface_hub-0.31.1-py3-none-any.whl (484 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl (2.4 MB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached aiohttp-3.11.18-cp311-cp311-win_amd64.whl (443 kB)
Collecting aiosignal>=1.1.2 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025

In [1]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset, Audio
import torch
import json
import os

# Prefer CUDA when PyTorch can see a GPU; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


Using device: cpu


In [3]:
audio_path = "./creolese-audio-dataset/Audio Files/finetune_eligible"
transcription_path = "./creolese-audio-dataset/Audio Files/finetune_eligible/transcripts.json"

# Load the transcript manifest. The encoding is given explicitly because
# open() otherwise falls back to the platform's locale encoding, which can
# mis-decode non-ASCII transcript text (JSON files are UTF-8 by convention).
with open(transcription_path, 'r', encoding='utf-8') as f:
    transcripts = json.load(f)

# Pair each manifest entry with its audio file. Entries whose audio file is
# not on disk are reported and left out of `data`.
data = []
for item in transcripts:
    audio_file = os.path.join(audio_path, item['audio'])
    if os.path.exists(audio_file):
        print(f"Found file: {audio_file}")
        data.append({'audio': audio_file, 'text': item['text']})
    else:
        print(f"Missing file: {audio_file}")

Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\2.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\7.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\12.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\13.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\14.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\20.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\21.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\27.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\33.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\34.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\36.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\./41.wav
Found file: ./creolese-audio-dataset/Audio Files/finetune_eligible\./47.wav
Found file: ./creolese-audio-dataset

In [4]:
# Build the dataset from the (audio path, transcript) records and cast the
# "audio" column to the Audio feature at 16000 Hz so the audio is loaded
# automatically when a row is accessed.
dataset = Dataset.from_list(data).cast_column("audio", Audio(sampling_rate=16000))
print(dataset)



Dataset({
    features: ['audio', 'text'],
    num_rows: 21
})


In [36]:
# Hold out 20% of the examples. A fixed seed keeps the split identical across
# kernel restarts, so results can be reproduced.
# NOTE(review): `split_dataset` is not referenced by the later cells shown in
# this notebook; training and evaluation use the full `dataset` -- confirm
# whether the held-out split was meant to be used for evaluation.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)


In [5]:
from transformers import Wav2Vec2Processor

# Load the processor (tokenizer + feature extractor) published with the
# "facebook/wav2vec2-large-960h" checkpoint; the same checkpoint name is used
# when the model itself is loaded below.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")


In [6]:
def prepare_dataset(batch, processor):
    """Convert one dataset row into the fields used for CTC training.

    Args:
        batch: A single example holding an "audio" entry (a mapping with
            "array" and "sampling_rate") and a "text" transcript.
        processor: Callable feature extractor that also exposes a
            ``tokenizer`` attribute for encoding the transcript.

    Returns:
        A new dict with "input_values" (the first item of the processor's
        ``input_values`` output) and "labels" (the transcript's token ids).
        The input ``batch`` is left unmodified.
    """
    audio = batch["audio"]

    # Extract model inputs from the raw waveform.
    features = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    )

    # Encode the transcript as label ids.
    # NOTE(review): the character vocabulary is checkpoint-specific (the 960h
    # English checkpoints are believed to be upper-case only) -- confirm the
    # transcript casing matches the tokenizer, otherwise characters may be
    # encoded as the unknown token.
    labels = processor.tokenizer(batch["text"]).input_ids

    # Return only the fields needed for CTC.
    return {
        "input_values": features.input_values[0],
        "labels": labels,
    }

# Run the preprocessing over every row. The original columns are removed so
# only the fields returned by `prepare_dataset` remain.
processed_dataset = dataset.map(
    prepare_dataset,
    fn_kwargs={"processor": processor},
    remove_columns=dataset.column_names,
    num_proc=4,
)


Map (num_proc=4):   0%|          | 0/21 [00:00<?, ? examples/s]

In [7]:
from transformers import Wav2Vec2ForCTC

# Load the pretrained CTC model from the same checkpoint as the processor.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-960h",
    ctc_loss_reduction="mean",  # loss reduction mode passed to the model config
    pad_token_id=processor.tokenizer.pad_token_id,  # keep the model's pad id in sync with the tokenizer
)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# Print the installed transformers version so the environment used for this
# run is recorded alongside the results.
import transformers
print(transformers.__version__)

4.51.3


# This is a custom attempt

In [9]:
import torch
from dataclasses import dataclass
from typing import Dict, List, Union, Any

@dataclass
class SimpleCTCDataCollator:
    """Collate variable-length CTC examples into a single right-padded batch.

    Each feature is a mapping with "input_values" (a tensor or a list) and,
    optionally, "labels" (a list of ints or a tensor). The returned batch has:

    * "input_values": inputs right-padded with 0.0 to the longest example,
    * "attention_mask": 1 for real values and 0 for padding,
    * "labels" (only when the first feature carries them): label ids
      right-padded with -100 and stored as ``torch.long``.

    NOTE(review): Hugging Face's Wav2Vec2 documentation advises not passing an
    attention mask for checkpoints whose feature extractor was configured
    without one -- confirm this for the checkpoint in use.
    """

    # Stored at construction time; ``__call__`` does not read it.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        waveforms = [self._as_waveform(item["input_values"]) for item in features]
        longest = max(len(wave) for wave in waveforms)

        padded_waves, masks = [], []
        for wave in waveforms:
            missing = longest - len(wave)
            # Mask starts as all ones over the real values...
            mask = torch.ones(len(wave))
            if missing > 0:
                # ...and both tensors are extended on the right with zeros.
                wave = torch.nn.functional.pad(wave, (0, missing), value=0.0)
                mask = torch.nn.functional.pad(mask, (0, missing), value=0.0)
            padded_waves.append(wave)
            masks.append(mask)

        batch = {
            "input_values": torch.stack(padded_waves),
            "attention_mask": torch.stack(masks),
        }

        # Labels are optional; their presence is decided by the first feature.
        if "labels" in features[0]:
            batch["labels"] = self._pad_labels([item["labels"] for item in features])

        return batch

    @staticmethod
    def _as_waveform(value):
        """Return ``value`` as a tensor, dropping a leading size-1 dim from tensors."""
        if isinstance(value, torch.Tensor):
            return value.squeeze(0)
        return torch.tensor(value)

    @staticmethod
    def _pad_labels(labels):
        """Right-pad every label sequence with -100 and stack them as long tensors."""
        width = max(len(ids) for ids in labels)
        rows = []
        for ids in labels:
            if isinstance(ids, torch.Tensor):
                ids = ids.tolist()
            if len(ids) < width:
                ids = ids + [-100] * (width - len(ids))
            rows.append(torch.tensor(ids, dtype=torch.long))
        return torch.stack(rows)

In [10]:
# Instantiate the custom collator. Note that the Trainer cell further down
# builds its own SimpleCTCDataCollator instead of reading this variable.
data_collator = SimpleCTCDataCollator(processor=processor)

# Alternative collator: `DataCollatorWithPadding` (this version does not work)

In [48]:
from transformers import DataCollatorWithPadding
# NOTE: running this cell rebinds `data_collator`, replacing the
# SimpleCTCDataCollator instance assigned in the previous cell. The Trainer
# cell below constructs its own SimpleCTCDataCollator, so it does not pick up
# this object.
data_collator = DataCollatorWithPadding(tokenizer=processor.tokenizer, padding=True)


# Continue

In [55]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Checkpoint location and cap on how many checkpoints are retained.
    output_dir="./wav2vec2-creolese-finetuned",
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=25,
    learning_rate=1e-4,
    # One example per forward pass, with gradients accumulated over 4 steps.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Logging cadence and numeric precision.
    logging_steps=10,
    fp16=False,  # True if on GPU with mixed precision
)


In [12]:
import jiwer
import torch

def compute_metrics(pred):
    """Compute word-, match- and character-error rates for an evaluation pass.

    Args:
        pred: Evaluation output exposing ``predictions`` (per-frame logits)
            and ``label_ids`` (reference token ids, padded with -100).

    Returns:
        dict with the keys "wer", "mer" and "cer".
    """
    # Greedy decoding: take the highest-scoring token for every frame.
    pred_ids = torch.argmax(torch.as_tensor(pred.predictions), dim=-1)
    pred_str = processor.batch_decode(pred_ids)

    # The collator pads labels with -100, which is not a vocabulary id.
    # Swap it for the tokenizer's pad id before decoding so the padding does
    # not end up in the reference text. masked_fill returns a new tensor, so
    # pred.label_ids itself is left untouched.
    label_ids = torch.as_tensor(pred.label_ids)
    label_ids = label_ids.masked_fill(label_ids == -100, processor.tokenizer.pad_token_id)
    # group_tokens=False: reference ids are plain text, not CTC frame output,
    # so repeated characters must not be collapsed.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    return {
        "wer": jiwer.wer(label_str, pred_str),
        "mer": jiwer.mer(label_str, pred_str),
        "cer": jiwer.cer(label_str, pred_str),
    }
    

In [13]:
# To avoid maxing out ram
import torch
import numpy as np
import os

# Directory that holds one cached feature file per example.
os.makedirs("processed_features", exist_ok=True)

# Run the processor on one example at a time and write each result to disk,
# so the extracted features for the whole dataset are never held in memory
# together.
for idx, example in enumerate(dataset):
    print(f"Processing example {idx+1}/{len(dataset)}")

    audio = example["audio"]
    encoded = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    )

    payload = {
        "input_values": encoded.input_values[0],
        "labels": processor.tokenizer(example["text"]).input_ids,
    }
    torch.save(payload, f"processed_features/example_{idx}.pt")

# Create a custom dataset that loads from disk
class AudioFeatureDataset(torch.utils.data.Dataset):
    """Map-style dataset that reads pre-computed examples from disk on demand.

    Example ``i`` is read from ``{feature_dir}/example_{i}.pt``.

    Args:
        feature_dir: Directory containing the ``example_{i}.pt`` files.
        num_examples: Number of examples; valid indices are
            ``0 .. num_examples - 1``.
    """

    def __init__(self, feature_dir, num_examples):
        self.feature_dir = feature_dir
        self.num_examples = num_examples

    def __len__(self):
        return self.num_examples

    def __getitem__(self, idx):
        """Load and return the stored example at ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``0 .. num_examples - 1``. This
                is what lets plain ``for item in dataset`` loops terminate and
                keeps stale files from an earlier, larger run out of reach.
        """
        if not 0 <= idx < self.num_examples:
            raise IndexError(
                f"index {idx} is out of range for a dataset of {self.num_examples} examples"
            )
        # weights_only=True restricts unpickling to tensors and plain
        # containers, which is all the cached files hold.
        return torch.load(f"{self.feature_dir}/example_{idx}.pt", weights_only=True)

# Use the disk-based dataset. Its length comes from `dataset`, the same
# object the caching loop above enumerated, so every index has a file.
train_dataset = AudioFeatureDataset("processed_features", len(dataset))

Processing example 1/21
Processing example 2/21
Processing example 3/21
Processing example 4/21
Processing example 5/21
Processing example 6/21
Processing example 7/21
Processing example 8/21
Processing example 9/21
Processing example 10/21
Processing example 11/21
Processing example 12/21
Processing example 13/21
Processing example 14/21
Processing example 15/21
Processing example 16/21
Processing example 17/21
Processing example 18/21
Processing example 19/21
Processing example 20/21
Processing example 21/21


In [14]:
from transformers import Trainer

# NOTE(review): `processed_dataset` is derived from the same `dataset` that
# `train_dataset` was cached from, so the evaluation metrics are measured on
# the training examples rather than on held-out data -- confirm this is
# intended.
trainer = Trainer(
    model=model,
    data_collator=SimpleCTCDataCollator(processor=processor),
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=processed_dataset,
    # `tokenizer=` is deprecated in recent transformers releases;
    # `processing_class=` is its replacement and takes the same object.
    processing_class=processor.feature_extractor,
)


  trainer = Trainer(


In [None]:
# Fine-tune the model, then evaluate on the Trainer's eval_dataset. As the
# cell's last expression, the result of evaluate() is shown as its output.
trainer.train()
trainer.evaluate()



Step,Training Loss


In [None]:
# Save the fine-tuned model and the processor side by side, in the same
# directory that was given to TrainingArguments as `output_dir`.
model.save_pretrained("./wav2vec2-creolese-finetuned")
processor.save_pretrained("./wav2vec2-creolese-finetuned")
