# Import Libraries

In [None]:
# !pip install audiomentations==0.41.0 --no-dependencies
# !pip install librosa numpy-minmax numpy-rms python-stretch
# !pip install jiwer
# !pip install evaluate




In [27]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import Dataset, Audio, load_dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate

In [28]:
print("Torch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB") if torch.cuda.is_available() else print("No GPU")


Torch version: 2.6.0+cu124
CUDA version: 12.4
CUDA available: True
CUDA device: Tesla T4
GPU Memory: 15.83 GB


In [29]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["NUMPY_EXPERIMENTAL_ARRAY_FUNCTION"] = "0"

In [30]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [31]:
# Adjust these based on your hardware constraints:
MODEL_SIZE = "small"
MAX_AUDIO_LENGTH = 15.0 
BATCH_SIZE = 16  
GRADIENT_ACCUMULATION = 2  
USE_AUGMENTATION = True 
NUM_PROC = None
model_checkpoint = f"openai/whisper-{MODEL_SIZE}"

### Import dataset

In [32]:
dataset_root = "/kaggle/input/common-voice-ds/cv-corpus-17.0-2024-03-15/id"
clips_dir = os.path.join(dataset_root, "clips")
train_tsv = os.path.join(dataset_root, "train.tsv")
dev_tsv = os.path.join(dataset_root, "dev.tsv")

# Load the TSV files into pandas DataFrames
train_df = pd.read_csv(train_tsv, sep='\t')
dev_df = pd.read_csv(dev_tsv, sep='\t')

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(dev_df)}")

Training set size: 4970
Validation set size: 3349


In [33]:
def create_dataset_from_df(df, clips_dir):
    # Get the full paths to audio files
    audio_paths = [os.path.join(clips_dir, filename) for filename in df['path'].values]
    
    # Create a dictionary with our data
    dataset_dict = {
        "audio": audio_paths,
        "sentence": df['sentence'].values,
        "path": df['path'].values,
    }
    
    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(dataset_dict)
    
    # Add audio feature
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    
    return dataset

In [34]:
train_dataset = create_dataset_from_df(train_df, clips_dir)
eval_dataset = create_dataset_from_df(dev_df, clips_dir)

# Filter out examples that don't have transcriptions
train_dataset = train_dataset.filter(lambda example: example["sentence"] is not None)
eval_dataset = eval_dataset.filter(lambda example: example["sentence"] is not None)

print(f"Training set after filtering: {len(train_dataset)} examples")
print(f"Validation set after filtering: {len(eval_dataset)} examples")

Filter:   0%|          | 0/4970 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3349 [00:00<?, ? examples/s]

Training set after filtering: 4970 examples
Validation set after filtering: 3349 examples


# Prepare Feature Processor

In [35]:
processor = WhisperProcessor.from_pretrained(model_checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(model_checkpoint)

In [36]:
model = model.to(device)

# Preparing Data

In [37]:
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
augment_waveform = Compose([
    AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=0.3),
    TimeStretch(min_rate=0.9, max_rate=1.25, p=0.3, leave_length_unchanged=False),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.3)
    ,])

In [38]:
def prepare_dataset(batch, apply_augmentation=False):
    # Process audio data
    audio = batch["audio"]
    array = batch["audio"]["array"]

    if apply_augmentation:
        # Apply your augmentations here
        array = augment_waveform(samples=array, sample_rate=16000)
    # Extract features
    input_features = processor(
        array, 
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt"
    ).input_features[0]
    
    # Tokenize text
    labels = processor.tokenizer(batch["sentence"]).input_ids
    
    return {"input_features": input_features, "labels": labels}

# Map the preparation function to our datasets
train_dataset = train_dataset.map(lambda batch: prepare_dataset(batch, apply_augmentation=True), remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(lambda batch: prepare_dataset(batch, apply_augmentation=False), remove_columns=eval_dataset.column_names)

Map:   0%|          | 0/4970 [00:00<?, ? examples/s]



Map:   0%|          | 0/3349 [00:00<?, ? examples/s]

In [39]:
from typing import Any, Dict, List, Union
import torch

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Extract input_features and labels
        input_features = [feature["input_features"] for feature in features]
        label_features = [feature["labels"] for feature in features]

        # Pad input features using the feature extractor
        batch = {
            "input_features": torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f) for f in input_features], batch_first=True
            )
        }

        # Pad labels using the tokenizer
        labels_batch = self.processor.tokenizer.pad(
            [{"input_ids": l} for l in label_features],
            return_tensors="pt",
        )

        # Replace padding token id's with -100 to ignore in loss computation
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [40]:
output_dir = f"/kaggle/working/{model_checkpoint}-indonesian"

# Training and evaluation

In [42]:
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

In [None]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
    cer = 100 * cer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer, "cer": cer}

# Load pre-trained Checkpoint

In [44]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model.config.use_cache = False

In [45]:
torch.cuda.empty_cache()

# Training config

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=BATCH_SIZE//2,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=1500,
    gradient_checkpointing=True,
    fp16=torch.cuda.is_available(),
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=50,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    generation_max_length=225,
    predict_with_generate=True,
)

In [47]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.tokenizer,
)


  trainer = Seq2SeqTrainer(


# Training

In [None]:
try:
     print("Starting training...")
     trainer.train()
     model.save_pretrained(os.path.join(output_dir,'-final'))
     print("Training completed successfully!")
     
     trainer.save_model()
     print(f"Model saved to {training_args.output_dir}")
        
except Exception as e:
    print(f"Training error: {str(e)}")
    try:
        trainer.save_model("./checkpoint-error")
        print("Saved model checkpoint despite error")
    except:
        print("Could not save checkpoint")

Starting training...




Step,Training Loss,Validation Loss,Wer,Cer
500,0.1565,0.388132,40.672961,14.528497
1000,0.0106,0.437587,38.247796,13.814474


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


KeyboardInterrupt: 

Error happened due to stopping the training midway because of time constraint on the session runtime