# Import Libraries

In [2]:
import os
import sys
import pandas as pd
import numpy as np
from datasets import DatasetDict, Dataset, concatenate_datasets, Audio
import torchaudio
import torch
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Import Datasets

#### Edit LibriVox CSV

In [31]:
# Locations of the original LibriVox Indonesia metadata files, one per split.
base_path_train = "./librivox-indonesia/data/train/metadata_train.csv"
base_path_test = "./librivox-indonesia/data/test/metadata_test.csv"

# Read both metadata tables; each split ends up in its own DataFrame.
lib_train, lib_test = (
    pd.read_csv(metadata_file)
    for metadata_file in (base_path_train, base_path_test)
)

In [32]:
# Preview the first rows of the unfiltered training metadata.
lib_train.head()

Unnamed: 0,path,language,reader,sentence
0,train/sundanese/universal-declaration-of-human...,sun,3174,pernyataan umum ngeunaan hak hak asasi manusa ...
1,train/sundanese/universal-declaration-of-human...,sun,3174,gubragna ka alam dunya teh bari nampa hak hak ...
2,train/sundanese/universal-declaration-of-human...,sun,3174,kalawan dibarung ku ayana kabebasan anu fundam...
3,train/sundanese/universal-declaration-of-human...,sun,3174,perserikatan bangsa bangsa boga komitmen pikeu...
4,train/sundanese/universal-declaration-of-human...,sun,3174,ieu komitmen teh awalna lahir dina piagem pers...


In [33]:
# Preview the first rows of the unfiltered test metadata.
lib_test.head()

Unnamed: 0,path,language,reader,sentence
0,test/sundanese/universal-declaration-of-human-...,sun,3174,mun inget kana ieu pernyataan rek satekah pola...
1,test/sundanese/universal-declaration-of-human-...,sun,3174,asal usul kabangsaan atawa kamasarakatan hak ...
2,test/sundanese/universal-declaration-of-human-...,sun,3174,sacara gembleng
3,test/sundanese/universal-declaration-of-human-...,sun,3174,pon kitu deui dipahing nibankeun hukuman leuwi...
4,test/sundanese/universal-declaration-of-human-...,sun,3174,pasal lima belas sing saha bae boga hak dina n...


In [34]:
# Keep only the Indonesian rows and discard the reader id.
# The two splits are filtered on different language codes:
# 'ind' for train and 'id' for test.
lib_train = lib_train.loc[lib_train['language'] == 'ind'].drop(columns=['reader'])
lib_test = lib_test.loc[lib_test['language'] == 'id'].drop(columns=['reader'])

In [35]:
# Preview the test metadata after the language filter and column drop.
lib_test.head()

Unnamed: 0,path,language,sentence
151,test/indonesian/mengelilingi-doenia-dalam-80-h...,id,perdjalanannja itoe seolah olah seperti seboea...
152,test/indonesian/mengelilingi-doenia-dalam-80-h...,id,sampailah ia keroemah reform club di pall mall
153,test/indonesian/mengelilingi-doenia-dalam-80-h...,id,makanan paginja itoe jaitoe
154,test/indonesian/mengelilingi-doenia-dalam-80-h...,id,makanan itoe matjamnja sama djoega dengan maka...
155,test/indonesian/mengelilingi-doenia-dalam-80-h...,id,djadi pentjoerinja diketahoei orang tanda tand...


In [36]:
# Preview the training metadata after the language filter and column drop.
lib_train.head()

Unnamed: 0,path,language,sentence
1426,train/indonesian/mengelilingi-doenia-dalam-80-...,ind,bab jang ketiga peri meriwajatkan pertjakapan ...
1427,train/indonesian/mengelilingi-doenia-dalam-80-...,ind,djam poekoel setengah doea belas phileas fogg ...
1428,train/indonesian/mengelilingi-doenia-dalam-80-...,ind,setelah lima ratoes toedjoeh poeloeh lima kali...
1429,train/indonesian/mengelilingi-doenia-dalam-80-...,ind,jaitoe seboeah roemah jang telah didirikan den...
1430,train/indonesian/mengelilingi-doenia-dalam-80-...,ind,phileas fogg teroes menoedjoe kekamar makan


In [37]:
# Persist the Indonesian-only metadata where the LibriVox loader further down
# reads it: "<data dir>/<split>/id_metadata_<split>.csv", with the columns it
# indexes ("audio_path" and "transcription"). Writing to the working directory
# under the original column names ("path", "sentence") left the loader without
# a usable file on a fresh Restart & Run All.
librivox_data_dir = "./librivox-indonesia/data"
librivox_column_names = {"path": "audio_path", "sentence": "transcription"}

csv_train_name = os.path.join(librivox_data_dir, "train", "id_metadata_train.csv")
csv_test_name = os.path.join(librivox_data_dir, "test", "id_metadata_test.csv")

# index=False: the pandas row index carries no information the loader needs and
# would otherwise come back as an "Unnamed: 0" column when the file is re-read.
lib_train.rename(columns=librivox_column_names).to_csv(csv_train_name, index=False)
lib_test.rename(columns=librivox_column_names).to_csv(csv_test_name, index=False)

### Import dataset

In [6]:
# Load common-voice dataset
# Path configuration
base_path = "./"
# Build the path from separate components instead of embedding a backslash:
# "\i" is an invalid escape sequence in a normal string literal, and a
# backslash is only a path separator on Windows.
cv_path = os.path.join(base_path, "cv-corpus-17.0-2024-03-15", "id")

# Load TSV files
def load_cv_split(split):
    """Load one Common Voice split as a Dataset with 'audio' and 'text' columns.

    Args:
        split: Base name of the TSV file inside ``cv_path`` (the file read is
            ``<cv_path>/<split>.tsv``).

    Returns:
        A ``Dataset`` whose 'audio' column (cast to ``Audio()``) points at
        ``<cv_path>/clips/<path>`` and whose 'text' column holds the sentence.
    """
    tsv_file = os.path.join(cv_path, f"{split}.tsv")
    frame = pd.read_csv(
        tsv_file,
        sep="\t",
        usecols=["path", "sentence", "client_id"],
    )

    # Only the clip location and the sentence are carried forward.
    clip_files = [os.path.join(cv_path, "clips", clip_name) for clip_name in frame["path"]]
    table = pd.DataFrame({"audio": clip_files, "text": frame["sentence"]})

    split_dataset = Dataset.from_pandas(table)
    return split_dataset.cast_column("audio", Audio())

# Map each dataset split name to the TSV file it is read from
# (the "validation" split comes from dev.tsv).
common_voice = DatasetDict({
    split_name: load_cv_split(tsv_name)
    for split_name, tsv_name in (
        ("train", "train"),
        ("validation", "dev"),
        ("test", "test"),
    )
})

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 4970
    })
    validation: Dataset({
        features: ['audio', 'text'],
        num_rows: 3349
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 3641
    })
})


In [7]:
# Preview the first five Common Voice training examples as a DataFrame.
common_voice['train'].select(range(5)).to_pandas()

Unnamed: 0,audio,text
0,"{'bytes': None, 'path': './cv-corpus-17.0-2024...",Saya mendengarkan cerita membosankan dari tema...
1,"{'bytes': None, 'path': './cv-corpus-17.0-2024...",halo dunia!
2,"{'bytes': None, 'path': './cv-corpus-17.0-2024...",Sudah makan? sudah sholat...?
3,"{'bytes': None, 'path': './cv-corpus-17.0-2024...",mau pergi kemana hari ini?
4,"{'bytes': None, 'path': './cv-corpus-17.0-2024...",udah keluar hasil testnya?


In [8]:
# Load Librivox dataset

def load_librivox_split(base_path: str, split: str) -> Dataset:
    """Load and process a single LibriVox split (train/test).

    Reads ``<base_path>/<split>/id_metadata_<split>.csv``, resolves each
    ``audio_path`` entry against ``base_path``, silently drops rows whose audio
    file does not exist on disk, and returns the remaining rows.

    Args:
        base_path: Directory that the ``audio_path`` values are relative to.
        split: Split name; used for both the sub-directory and the CSV name.

    Returns:
        A ``Dataset`` with exactly two columns: 'audio' (cast to ``Audio()``)
        and 'text' (taken from the CSV's ``transcription`` column).
    """

    # Load metadata CSV
    csv_path = os.path.join(base_path, split, f"id_metadata_{split}.csv")
    df = pd.read_csv(csv_path)

    # Construct full audio paths
    df["full_audio_path"] = df["audio_path"].apply(
        lambda x: os.path.join(base_path, x)
    )

    # Verify files exist
    df = df[df["full_audio_path"].apply(os.path.exists)]

    # Create dataset with proper columns.
    # preserve_index=False: once rows have been filtered out, the frame no longer
    # has a plain RangeIndex, and from_pandas would otherwise store the index as
    # an extra "__index_level_0__" column. That column would make this dataset's
    # features differ from the Common Voice ones when the two are concatenated.
    return Dataset.from_pandas(
        df[["full_audio_path", "transcription"]],
        preserve_index=False,
    ).rename_columns({
        "full_audio_path": "audio",
        "transcription": "text"
    }).cast_column("audio", Audio())

# Configuration
# Root directory holding the per-split LibriVox metadata and audio.
BASE_PATH = "./librivox-indonesia/data"

# One Dataset per split, keyed by split name.
librivox_dataset = DatasetDict({
    split_name: load_librivox_split(BASE_PATH, split_name)
    for split_name in ("train", "test")
})

# Show the resulting splits, columns and row counts.
print("Dataset structure:", librivox_dataset)

Dataset structure: DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 5635
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 603
    })
})


In [9]:
# Preview the first five LibriVox training examples as a DataFrame.
librivox_dataset['train'].select(range(5)).to_pandas()

Unnamed: 0,audio,text
0,"{'bytes': None, 'path': './librivox-indonesia/...",bab jang ketiga peri meriwajatkan pertjakapan ...
1,"{'bytes': None, 'path': './librivox-indonesia/...",djam poekoel setengah doea belas phileas fogg ...
2,"{'bytes': None, 'path': './librivox-indonesia/...",setelah lima ratoes toedjoeh poeloeh lima kali...
3,"{'bytes': None, 'path': './librivox-indonesia/...",jaitoe seboeah roemah jang telah didirikan den...
4,"{'bytes': None, 'path': './librivox-indonesia/...",phileas fogg teroes menoedjoe kekamar makan


# Prepare Feature Extractor, Tokenizer and Data

In [10]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# The feature extractor, tokenizer and processor are all loaded from the same
# pretrained checkpoint; the tokenizer and processor share the same
# language/task settings.
whisper_checkpoint = "openai/whisper-medium"
whisper_task_options = {"language": "Indonesian", "task": "transcribe"}

feature_extractor = WhisperFeatureExtractor.from_pretrained(whisper_checkpoint)
tokenizer = WhisperTokenizer.from_pretrained(whisper_checkpoint, **whisper_task_options)
processor = WhisperProcessor.from_pretrained(whisper_checkpoint, **whisper_task_options)



# Preparing Data

In [11]:
# Re-declare the audio column of both corpora with a 16 kHz sampling rate so
# every clip is delivered at the same rate.
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
librivox_dataset = librivox_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [12]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# Text normalizer used by prepare_dataset (when punctuation removal is on)
# and by compute_metrics.
normalizer = BasicTextNormalizer()

# Optional transcript pre-processing switches read by prepare_dataset;
# both are off, so training transcripts keep their case and punctuation.
do_remove_punctuation = False
do_lower_case = False

### Audio augmentation (audiomentations)

In [13]:
# Waveform augmentation pipeline: each transform is applied independently
# with probability 0.3.
waveform_transforms = [
    AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=0.3),
    TimeStretch(min_rate=0.9, max_rate=1.25, p=0.3, leave_length_unchanged=False),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.3),
]
augment_waveform = Compose(waveform_transforms)

def augment_dataset(batch):
    """Apply the ``augment_waveform`` pipeline to one example's audio.

    Args:
        batch: A single example whose ``"audio"`` entry is a dict holding the
            waveform under ``"array"`` and its rate under ``"sampling_rate"``.

    Returns:
        The same example with ``batch["audio"]["array"]`` replaced by the
        augmented waveform.
    """
    audio = batch["audio"]

    # Use the rate stored with the sample rather than a hard-coded 16000, so the
    # augmentation stays correct if the resampling rate is ever changed.
    augmented_audio = augment_waveform(
        samples=audio["array"],
        sample_rate=audio["sampling_rate"],
    )

    batch["audio"]["array"] = augmented_audio

    return batch

In [1]:
def prepare_dataset(batch):
    """Turn one decoded audio example into Whisper training features.

    Reads ``batch["audio"]`` (a dict with ``"array"`` and ``"sampling_rate"``)
    and the transcript, then adds three entries to ``batch``:

    * ``input_features`` - log-Mel features from ``processor.feature_extractor``
    * ``input_length``   - clip duration in seconds
    * ``labels``         - token ids from ``processor.tokenizer``

    The transcript is read from the ``"text"`` column, which is the name both
    dataset loaders in this notebook produce; ``"transcription"`` is accepted
    as a fallback so data using that column name keeps working.

    Returns:
        The same ``batch`` with the three entries above added.
    """
    # load and (possibly) resample audio data to 16kHz
    audio = batch["audio"]
    # NOTE(review): the train splits are switched to torch format before this
    # runs, so the rate may arrive as a 0-d tensor rather than an int - confirm.
    # int() gives a plain Python number either way.
    sampling_rate = int(audio["sampling_rate"])

    # compute log-Mel input features from input audio array
    batch["input_features"] = processor.feature_extractor(
        audio["array"], sampling_rate=sampling_rate
    ).input_features[0]
    # compute input length of audio sample in seconds
    batch["input_length"] = len(audio["array"]) / sampling_rate

    # optional pre-processing steps
    transcription = batch["text"] if "text" in batch else batch["transcription"]
    if do_lower_case:
        transcription = transcription.lower()
    if do_remove_punctuation:
        transcription = normalizer(transcription).strip()

    # encode target text to label ids
    batch["labels"] = processor.tokenizer(transcription).input_ids
    return batch

In [None]:
# Ensure compatibility with NumPy and resolve potential LLVM issues
# NOTE(review): numpy is already imported at the top of the notebook by the
# time this runs - confirm the variable is still honoured when set this late.
os.environ["NUMPY_EXPERIMENTAL_ARRAY_FUNCTION"] = "0"

# Augment only the training split of each corpus (Common Voice first, then
# LibriVox) and switch the result to torch format.
for corpus in (common_voice, librivox_dataset):
    corpus['train'] = corpus['train'].map(augment_dataset, num_proc=None).with_format("torch")

Map:   0%|          | 0/4970 [00:00<?, ? examples/s]

### Merging datasets

In [None]:
dataset = DatasetDict()

# Train on both corpora; evaluate on the Common Voice test split only.
dataset['train'] = concatenate_datasets([common_voice['train'], librivox_dataset['train']])
dataset['test'] = common_voice['test']

# The length filter, the data collator and the trainer further down consume
# "input_length", "input_features" and "labels". Only prepare_dataset creates
# those columns, so it has to be mapped over every split here.
# prepare_dataset looks the transcript up under "transcription", whereas both
# corpora were loaded with a "text" column, hence the rename first.
dataset = dataset.rename_column("text", "transcription")
# Drop the raw columns afterwards; only the model-ready ones are needed.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"])

In [None]:
# Upper bound (exclusive) on clip duration, in seconds.
max_input_length = 30.0


def is_audio_in_length_range(length):
    """Return True when ``length`` is strictly below ``max_input_length``."""
    return max_input_length > length

# Keep only training examples whose "input_length" passes the duration check;
# the test split is left untouched.
train_split = dataset['train']
dataset['train'] = train_split.filter(is_audio_in_length_range, input_columns=["input_length"])

# Training and evaluation

In [None]:


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples into one batch, padding audio features and labels separately.

    ``processor`` must expose ``feature_extractor.pad`` and ``tokenizer.pad``;
    the tokenizer must also provide ``bos_token_id``.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from examples carrying "input_features" and "labels".

        Returns the padded feature batch with a "labels" tensor added, in which
        padded positions are set to -100 and a shared leading BOS token is removed.
        """
        tokenizer = self.processor.tokenizer

        # Inputs and labels have different lengths and padding rules, so they
        # are separated before padding.
        audio_inputs = [{"input_features": example["input_features"]} for example in features]
        token_inputs = [{"input_ids": example["labels"]} for example in features]

        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")
        padded_labels = tokenizer.pad(token_inputs, return_tensors="pt")

        # Positions the attention mask marks as padding become -100 so that the
        # loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every sequence starts with the BOS token, strip it here because it
        # gets added again later.
        starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        batch["labels"] = labels[:, 1:] if starts_with_bos else labels

        return batch
    
# Collator instance handed to the trainer; it pads with the shared Whisper processor.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
# Word error rate and character error rate, both used by compute_metrics.
wer_metric, cer_metric = (evaluate.load(metric_name) for metric_name in ("wer", "cer"))

In [None]:
# evaluate with the 'normalised' WER
# evaluate with the 'normalised' WER
do_normalize_eval = True

def compute_metrics(pred):
    """Compute WER and CER (as percentages) for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids, with -100 marking ignored
            positions).

    Returns:
        ``{"wer": ..., "cer": ...}``, each multiplied by 100.

    Note:
        ``pred.label_ids`` is modified in place: -100 entries are overwritten
        with the tokenizer's pad token id so they can be decoded.
    """
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    if do_normalize_eval:
        pred_str = [normalizer(text) for text in pred_str]
        label_str = [normalizer(text) for text in label_str]

        # A reference that normalises to an empty string (e.g. punctuation only)
        # cannot be scored: the error rate is measured against the reference
        # length. Skip such pairs instead of letting them break the evaluation.
        scorable = [(hyp, ref) for hyp, ref in zip(pred_str, label_str) if len(ref) > 0]
        pred_str = [hyp for hyp, _ in scorable]
        label_str = [ref for _, ref in scorable]

    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
    cer = 100 * cer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer, "cer": cer}

# Load pre-trained Checkpoint

In [None]:
from transformers import WhisperForConditionalGeneration

# Start from the pretrained checkpoint that the processor was also loaded from.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")

In [None]:
# Generation/config overrides applied before training.
# NOTE(review): use_cache is presumably disabled because gradient checkpointing
# is enabled in the training arguments - confirm.
model.config.use_cache = False
# Clear any decoder ids and suppressed-token list carried by the checkpoint config.
model.config.suppress_tokens = []
model.config.forced_decoder_ids = None

# Training config

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # --- where checkpoints and logs are written ---
    output_dir="./",
    report_to=["tensorboard"],
    logging_steps=25,
    push_to_hub=False,               # push to hub = false

    # --- optimisation schedule ---
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=10000,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size

    # --- memory / precision ---
    gradient_checkpointing=True,
    fp16=True,

    # --- evaluation (generation-based, every 1000 steps) ---
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,

    # --- checkpointing: keep the checkpoint with the lowest WER ---
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, data, collator and metrics into the trainer.
# The feature extractor is passed through the `tokenizer` argument.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [None]:
# Save the processor into the training output directory before training starts.
processor.save_pretrained(training_args.output_dir)

# Training

In [None]:
# Run fine-tuning with the configuration defined above.
trainer.train()