In [None]:
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
import torch
from transformers import WhisperForConditionalGeneration

from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
from transformers import WhisperProcessor
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
import speech_utils as su
import os
import os
import pandas as pd
from datasets import Dataset, Audio

# Word-error-rate metric object; compute_metrics() calls metric.compute() on decoded text.
metric = evaluate.load("wer")

# Base checkpoint name that the feature extractor and tokenizer are loaded from.
model_name = 'openai/whisper-medium'
# Forwarded to the feature extractor as `chunk_length`.
# NOTE(review): presumably seconds of audio per window, and presumably the value the
# checkpoint below was fine-tuned with — confirm.
chunk_length = 16

feature_extractor = WhisperFeatureExtractor.from_pretrained(
    model_name,
    chunk_length=chunk_length,
)
tokenizer = WhisperTokenizer.from_pretrained(
    model_name,
    language="Punjabi",
    task="transcribe",
)
processor = WhisperProcessor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

# Model weights under evaluation, read from a local checkpoint directory.
model = WhisperForConditionalGeneration.from_pretrained(
    '/home/kd/Desktop/proj/apr/Punjabi_ASR/checkpoints/whisper/whisper-medium-pa/checkpoint-20400'
)

# Generation settings: language/task are set on the generation config and the
# forced decoder ids are cleared.
model.generation_config.language = "punjabi"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None

def prepare_dataset(batch):
    """Add model inputs and target ids to one dataset example.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["text"]``; writes ``batch["input_features"]`` and
    ``batch["labels"]``. The same ``batch`` object is returned.
    """
    # Look the audio entry up once and reuse it for both fields.
    clip = batch["audio"]

    # Run the module-level feature extractor and keep the first (only) item
    # of its `input_features` output.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Tokenize the reference transcript into label ids.
    encoded = tokenizer(batch["text"])
    batch["labels"] = encoded.input_ids

    return batch


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech examples into one padded batch for a seq2seq model.

    Audio features and label ids are padded separately (by the processor's
    feature extractor and tokenizer respectively), padded label positions are
    set to -100, and a leading start token shared by every label row is removed.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Token id compared against the first label column to detect a leading start token.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch.

        Args:
            features: examples, each providing ``"input_features"`` and ``"labels"``.

        Returns:
            The padded feature batch with a ``"labels"`` entry added.
        """
        # Audio inputs and label ids need different padding, so gather them separately.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Positions the attention mask marks as padding become -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every row starts with the start token, drop that column here,
        # since it is added again later anyway.
        every_row_has_start = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if every_row_has_start:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch
    
# Collator instance handed to the trainer; the start-token id is read from the
# loaded model's config.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    decoder_start_token_id=model.config.decoder_start_token_id,
    processor=processor,
)

def compute_metrics(pred):
    """Compute the word error rate (as a percentage) for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored positions).

    Returns:
        ``{"wer": value}`` where ``value`` is 100 times the metric's result.

    Note:
        ``pred.label_ids`` is modified in place: every -100 is overwritten with
        the tokenizer's pad token id so the sequences can be decoded.
    """
    hypothesis_ids = pred.predictions
    reference_ids = pred.label_ids

    # -100 is not a real token id; swap it for the pad id before decoding.
    ignored = reference_ids == -100
    reference_ids[ignored] = tokenizer.pad_token_id

    # Decode both sides to plain text, dropping special tokens.
    hypothesis_text = tokenizer.batch_decode(hypothesis_ids, skip_special_tokens=True)
    reference_text = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    error_rate = metric.compute(predictions=hypothesis_text, references=reference_text)
    return {"wer": 100 * error_rate}

# Arguments passed to Seq2SeqTrainer. Keyword arguments are grouped by purpose;
# their order has no effect on the resulting object.
training_args = Seq2SeqTrainingArguments(
    # --- run bookkeeping ---
    output_dir="./checkpoints/whisper/whisper-medium-pa-eval",
    run_name='whisper-medium-pa-1-eval',
    report_to="none",
    logging_steps=10,
    # --- evaluation ---
    evaluation_strategy="steps",
    eval_steps=300,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    # --- checkpointing and best-model selection ---
    # "wer" is the key returned by compute_metrics; lower is better.
    save_steps=300,
    save_total_limit=4,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # --- optimisation ---
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    num_train_epochs=14,
    warmup_ratio=0.1,
    gradient_checkpointing=True,
    bf16=True,
    # --- data loading ---
    dataloader_num_workers=4,
    dataloader_prefetch_factor=8,
)


In [None]:
# Evaluate the loaded model on the Punjabi split of each benchmark directory and
# report the word error rate (WER). Results are collected as
# (benchmark_name, wer) tuples; the WER is stored as a float so it can be averaged.
wers_res = []

benchmarks_dir = '/mnt/sea/speech/benchmarks/vistaar/benchmarks/'
# os.listdir() returns entries in arbitrary order; sorting keeps the `[:1]`
# selection below reproducible between runs.
benchmark_names = sorted(os.listdir(benchmarks_dir))
# Examples whose label sequence has this many tokens or more are dropped before evaluation.
max_label_tokens = 448

# NOTE(review): the `[:1]` slice limits the run to a single benchmark — presumably a
# debugging leftover; remove the slice to evaluate every benchmark.
for d in benchmark_names[:1]:
    su.print_red(f'Processing {d}...')
    manifest = f'{d}/punjabi/manifest.json'  # path in manifest is {d}/punjabi/wavs/

    df = pd.read_json(f'{benchmarks_dir}{manifest}', lines=True)
    # Prefix the manifest's audio paths with the benchmarks directory.
    df['audio_filepath'] = df['audio_filepath'].apply(lambda x: f'{benchmarks_dir}{x}')
    df = df.rename(columns={'audio_filepath': 'audio'})

    ds = Dataset.from_pandas(df.reset_index(drop=True))
    ds = ds.cast_column('audio', Audio(sampling_rate=16000))
    print(ds)

    ds = ds.map(prepare_dataset, num_proc=1)

    # TODO: Handle this case of long sequences
    selected_indexes = [
        i for i, label_ids in enumerate(ds['labels'])
        if len(label_ids) < max_label_tokens
    ]
    su.print_red(f"Removing {len(ds) - len(selected_indexes)} samples from this benchmark")
    ds = ds.select(selected_indexes)

    trainer = Seq2SeqTrainer(
        args=training_args,
        model=model,
        eval_dataset=ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=processor.feature_extractor,
    )
    wer = trainer.evaluate(eval_dataset=ds)['eval_wer']
    print(f'WER of {d}: {wer:.2f}')
    # Keep the numeric value: the previous f'{wer}:.2f' stored a string (format spec
    # outside the braces), which made the averaging step below fail.
    wers_res.append((d, wer))


print(wers_res)
if wers_res:
    # Use the collected scores; the old code divided the builtin `sum` function.
    average_wer = sum(score for _, score in wers_res) / len(wers_res)
    print(f'Average WER: {average_wer:.2f}')
else:
    print('Average WER: n/a (no benchmarks were evaluated)')

In [None]:
# Recorded results, kept for reference. The bare list literals in this cell are
# expression statements: they are evaluated and discarded, and nothing but the
# final print() produces output.

# Vistaar benchmark
# NOTE(review): these values look like fractional WERs (0-1), while the Whisper
# numbers below are on the 0-100 scale that compute_metrics returns — confirm the
# units (and the source of these figures) before comparing the two lists.
[('commonvoice', 0.224), ('fleurs', 0.231), ('kathbath', 0.169), ('kathbath_noisy', 0.197)]

# WHISPER

# Checkpoint these numbers were recorded for:
# /home/kd/Desktop/proj/apr/Punjabi_ASR/checkpoints/whisper/whisper-medium-pa/checkpoint-20400
[('commonvoice', 21.604938271604937), ('fleurs', 24.047335516522775), ('kathbath', 23.58632650266989), ('kathbath_noisy', 26.694354433846012)]



print('-----WER-----')