In [8]:
from datasets import load_dataset
from transformers import WhisperProcessor

# Load TORGO
dataset = load_dataset("abnerh/TORGO-database")

# Load Whisper processor
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="en",
    task="transcribe"
)

print("Processor loaded")

Processor loaded


In [12]:
sample = dataset["train"][0]

audio = sample["audio"]["array"]
sampling_rate = sample["audio"]["sampling_rate"]
text = sample["transcription"]

# Convert audio → log-Mel
inputs = processor(
    audio,
    sampling_rate=sampling_rate,
    return_tensors="pt"
)

# Tokenize text
labels = processor.tokenizer(
    text,
    return_tensors="pt"
).input_ids

print("Audio features shape:", inputs.input_features.shape)
print("Label tokens shape:", labels.shape)
print("Text:", text)

Audio features shape: torch.Size([1, 80, 3000])
Label tokens shape: torch.Size([1, 5])
Text: alpha


In [13]:
def prepare_sample(batch):
    audio = batch["audio"]["array"]
    sampling_rate = batch["audio"]["sampling_rate"]
    text = batch["transcription"]

    inputs = processor(
        audio,
        sampling_rate=sampling_rate
    )

    labels = processor.tokenizer(text).input_ids

    batch["input_features"] = inputs.input_features[0]
    batch["labels"] = labels

    return batch

In [None]:
processed_dataset = dataset.map(
    prepare_sample,
    remove_columns=dataset["train"].column_names,
    num_proc=1

)

Map (num_proc=1): 100%|██████████| 16552/16552 [02:17<00:00, 120.64 examples/s]


In [15]:
print(processed_dataset)

print(processed_dataset["train"][0].keys())

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 16552
    })
})
dict_keys(['input_features', 'labels'])


In [17]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-base"
)

model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [18]:
for param in model.model.encoder.parameters():
    param.requires_grad = False

In [19]:
from dataclasses import dataclass
import torch


@dataclass
class DataCollatorWhisper:
    processor: any

    def __call__(self, features):
        input_features = torch.stack(
            [torch.tensor(f["input_features"]) for f in features]
        )

        labels = [f["labels"] for f in features]
        labels = self.processor.tokenizer.pad(
            {"input_ids": labels},
            return_tensors="pt"
        ).input_ids

        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        return {
            "input_features": input_features,
            "labels": labels,
        }

In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./whisper-torgo",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=100,
    max_steps=2000,
    fp16=False,
    logging_steps=25,
    save_steps=500,
    # evaluation_strategy="no",
    report_to="none",
    push_to_hub=False
)

In [23]:
from transformers import Trainer

data_collator = DataCollatorWhisper(processor)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor
)

  trainer = Trainer(


In [24]:
trainer.train()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,10.8531
50,7.6708
75,2.9029
100,0.87
125,0.6892
150,0.5822
175,0.5707
200,0.5591
225,0.517
250,0.5959




TrainOutput(global_step=2000, training_loss=0.572132069349289, metrics={'train_runtime': 2064.6884, 'train_samples_per_second': 7.749, 'train_steps_per_second': 0.969, 'total_flos': 1.03775993856e+18, 'train_loss': 0.572132069349289, 'epoch': 0.9666505558240696})