In [14]:
!pip install --upgrade pip
!pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio


Collecting pip
  Using cached pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-25.1.1-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-25.1.1
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.meta

In [3]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset, Audio
import torch
import json
import os

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


Using device: cpu


# Dataset creation

Using the guideline: https://huggingface.co/blog/fine-tune-whisper

In [8]:
audio_path = "./creolese-audio-dataset/"
transcription_path = "./creolese-audio-dataset/transcripts.json"

# Load transcripts JSON
with open(transcription_path, 'r') as f:
    transcripts = json.load(f)

# Create a list of dicts pairing audio files and transcripts
data = []
for item in transcripts:
    audio_file = os.path.join(audio_path, item['audio'])
    if os.path.exists(audio_file):
        print(f"Found file: {audio_file}")
        data.append({'audio': audio_file, 'text': item['text']})
    else:
        print(f"Missing file: {audio_file}")

Found file: ./creolese-audio-dataset/Audio Files/1.wav
Found file: ./creolese-audio-dataset/Audio Files/2.wav
Found file: ./creolese-audio-dataset/Audio Files/3.wav
Found file: ./creolese-audio-dataset/Audio Files/4.wav
Found file: ./creolese-audio-dataset/Audio Files/5.wav
Found file: ./creolese-audio-dataset/Audio Files/6.wav
Found file: ./creolese-audio-dataset/Audio Files/7.wav
Found file: ./creolese-audio-dataset/Audio Files/8.wav
Found file: ./creolese-audio-dataset/Audio Files/9.wav
Found file: ./creolese-audio-dataset/Audio Files/10.wav
Found file: ./creolese-audio-dataset/Audio Files/11.wav
Found file: ./creolese-audio-dataset/Audio Files/12.wav
Found file: ./creolese-audio-dataset/Audio Files/13.wav
Found file: ./creolese-audio-dataset/Audio Files/14.wav
Found file: ./creolese-audio-dataset/Audio Files/15.wav
Found file: ./creolese-audio-dataset/Audio Files/16.wav
Found file: ./creolese-audio-dataset/Audio Files/17.wav
Found file: ./creolese-audio-dataset/Audio Files/18.wav
F

In [15]:
dataset = Dataset.from_list(data)

# Cast the audio column to automatically load audio
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
print(dataset)



Dataset({
    features: ['audio', 'text'],
    num_rows: 36
})


## Load the Model

In [22]:
model_id = "openai/whisper-large-v3"  
processor = WhisperProcessor.from_pretrained(model_id, task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(model_id)

model.to(device)


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bia

In [24]:
# Set the language and task for the processor
processor.tokenizer.set_prefix_tokens(task="transcribe")


In [None]:
def batch_prepare_dataset(examples):
    audio_arrays = [audio["array"] for audio in examples["audio"]]
    sampling_rates = [audio["sampling_rate"] for audio in examples["audio"]]

    # Process all examples in a batch
    inputs = processor(
        audio_arrays, 
        sampling_rate=sampling_rates[0],  # Assuming all are same rate
        return_tensors="pt",
        padding=True
    )

    # Process all texts in batch
    labels = processor.tokenizer(examples["text"], return_tensors="pt", padding=True).input_ids

    return {
        "input_features": inputs.input_features,
        "labels": labels
    }

# Process in batches
prepared_dataset = dataset.map(
    batch_prepare_dataset,
    batched=True,
    batch_size=4,  # Adjust based on memory
    remove_columns=dataset.column_names,
    num_proc=4
)

Map (num_proc=4):   0%|          | 0/36 [00:00<?, ? examples/s]

In [None]:
# OG VER

# Create a function to prepare the dataset for training
def prepare_dataset(examples):
    # Load and resample audio data
    audio = examples["audio"]

    # Process audio
    input_features = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt"
    ).input_features

    # Process text
    labels = processor.tokenizer(examples["text"], return_tensors="pt").input_ids

    examples["input_features"] = input_features[0]
    examples["labels"] = labels[0]

    return examples

# Prepare dataset for training
prepared_dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names, num_proc=4)

In [None]:
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None
