In [1]:
import pandas as pd
import os
from datasets import Dataset, Audio
import speech_utils as su
from transformers import Wav2Vec2CTCTokenizer
from transformers import SeamlessM4TFeatureExtractor
from transformers import Wav2Vec2BertProcessor
from transformers import Wav2Vec2BertForCTC
from transformers import TrainingArguments
from transformers import Trainer
from data_collator import DataCollatorCTCWithPadding


# --- Evaluation inputs ---------------------------------------------------
# Processor that carries the language-model decoder.
processor_with_lm_path = 'kdcyberdude/w2v-bert-punjabi'
# Uncomment below line if you want to evaluate on benchmarks without LM decoder
# processor_with_lm_path = None

# Root folder of the benchmarks; each entry below it is one benchmark name.
dir = '/mnt/sea/speech/benchmarks/vistaar/benchmarks/'
# Fine-tuned checkpoint under evaluation.
model_path = '/home/kd/Desktop/proj/apr/Punjabi_ASR/checkpoints/wav2vec2-bert-pa_indicvoice_verbatim_2/checkpoint-3500'
# Folder that holds the CTC tokenizer files.
tokenizer_path = './'

dirs = os.listdir(dir)

# --- Tokenizer and feature extractor -------------------------------------
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    tokenizer_path,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained(model_path)

# --- Processor: plain CTC, or CTC + LM when a processor path is given ----
# `for_lm` ends up True only once the LM processor has been loaded.
for_lm = False
if processor_with_lm_path is None:
    processor = Wav2Vec2BertProcessor(
        feature_extractor=feature_extractor,
        tokenizer=tokenizer,
    )
else:
    # Imported lazily so the project-local module is only needed for LM runs.
    from m4t_processor_with_lm import M4TProcessorWithLM
    processor = M4TProcessorWithLM.from_pretrained(processor_with_lm_path)
    for_lm = True

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Load the checkpoint under evaluation with a CTC head sized to the
# processor's tokenizer.
# NOTE(review): ignore_mismatched_sizes=True presumably lets a checkpoint whose
# head size differs from `vocab_size` load without an error — confirm that the
# checkpoint and tokenizer really agree before trusting the WER numbers.
model = Wav2Vec2BertForCTC.from_pretrained(
    model_path,
    ignore_mismatched_sizes=True,
    # Every dropout / masking / layer-drop rate is set to 0.0.
    **dict.fromkeys(
        (
            "attention_dropout",
            "hidden_dropout",
            "feat_proj_dropout",
            "mask_time_prob",
            "layerdrop",
        ),
        0.0,
    ),
    ctc_loss_reduction="mean",
    add_adapter=True,
    use_intermediate_ffn_before_adapter=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

# Batch geometry, reused for both the train and eval settings below.
batch_size = 16
accumulation_steps = 1

# Arguments for the Trainer instances created in the benchmark loop.
training_args = TrainingArguments(
    # Where Trainer writes its artefacts; no external experiment tracker.
    output_dir="./benchmark_runs/",
    report_to="none",
    # Half-precision execution.
    fp16=True,
    # Batching.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=accumulation_steps,
    eval_accumulation_steps=accumulation_steps,
    # Data loading.
    dataloader_num_workers=8,
    dataloader_prefetch_factor=8,
    dataloader_persistent_workers=False,
)
 

# Collects one (benchmark_name, wer) tuple per evaluated benchmark.
wers_res = []
from datasets import load_metric
import numpy as np

# Passing trust_remote_code=True is what the `datasets` warning emitted by the
# bare call asks for ("will be mandatory ... from the next major release"); it
# loads the same metric script without the warning.
wer_metric = load_metric("wer", trust_remote_code=True)

# Characters handed to su.normalize_texts_for_inference as the allowed set:
# every vocabulary key except the last five, plus a plain space.
# NOTE(review): the last five keys are presumably the special tokens — this
# relies on the key order of get_vocab(); verify against the vocab file.
vocab_chars = list(tokenizer.get_vocab().keys())[:-5]
vocab_chars.append(' ')
def compute_wer_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Prediction object passed in by ``Trainer``. ``pred.predictions``
            is read as the model logits and ``pred.label_ids`` as the
            reference label ids, where ``-100`` entries are mapped to the pad
            token before decoding.

    Returns:
        A dict ``{"wer": <value returned by wer_metric.compute>}``.
    """
    pred_logits = pred.predictions
    # Build a padded copy rather than overwriting pred.label_ids in place, so
    # the caller's label array is left untouched.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    if processor_with_lm_path is None:
        # needed for without-LM decoding: greedy argmax over the vocab axis
        pred_ids = np.argmax(pred_logits, axis=-1)
        pred_str = processor.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
        label_str = processor.batch_decode(label_ids, group_tokens=False)
    else:
        # we do not want to group tokens when computing the metrics
        label_str = tokenizer.batch_decode(label_ids, group_tokens=False)
        # The LM processor decodes straight from the logits; `.text` holds the
        # decoded strings.
        pred_str = processor.batch_decode(pred_logits).text
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


# Evaluate the model on every benchmark folder and collect (name, WER) pairs.
# os.listdir() returns entries in arbitrary order, so iterate in sorted order
# to keep `wers_res` reproducible between runs.
for d in sorted(dirs):
    manifest = f'{d}/punjabi/manifest.json' # path in manifest is {d}/punjabi/wavs/
    manifest_path = os.path.join(dir, manifest)

    # The benchmark root may contain stray files or folders without a Punjabi
    # manifest; skip them loudly instead of aborting the whole run.
    if not os.path.isfile(manifest_path):
        su.print_red(f'Skipping {d}: no manifest found at {manifest_path}')
        continue

    su.print_red(f'Processing {d}...')

    # One JSON object per line; audio paths are relative to the benchmark root.
    df = pd.read_json(manifest_path, lines=True)
    df['audio_filepath'] = df['audio_filepath'].apply(lambda x: os.path.join(dir, x))
    df = df.rename(columns={'audio_filepath': 'audio'})

    ds = Dataset.from_pandas(df.reset_index(drop=True))
    ds = ds.cast_column('audio', Audio(sampling_rate = 16000))

    # Normalise the reference texts against the allowed character set, then
    # turn every sample into model inputs and labels.
    ds = su.normalize_texts_for_inference(ds, vocab_chars, strategy='remove')
    ds = ds.map(lambda batch: su.process_dataset(batch, processor), remove_columns=ds.column_names, num_proc=1, batch_size=64, writer_batch_size=64, )

    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_wer_metrics,
        eval_dataset=ds,
        tokenizer=processor.feature_extractor,
    )
    wer = trainer.evaluate(eval_dataset=ds)['eval_wer']
    print(f'WER of {d}: {wer}')
    wers_res.append((d, wer))


print(wers_res)

  wer_metric = load_metric("wer")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading the LM will be faster if you build a binary file.
Reading /home/kd/.cache/pyctcdecode/models--kdcyberdude--w2v-bert-punjabi/snapshots/242700f010dcc5181e31b66b3ef0965a3c1aa307/language_model/5gram_pa_3_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


[31m"Processing commonvoice..."[0m


Map:   0%|          | 0/171 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/11 [00:00<?, ?it/s]

WER of commonvoice: 0.13503086419753085
[31m"Processing fleurs..."[0m
Removed 104 sentences


Map:   0%|          | 0/470 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  return F.conv1d(input, weight, bias, self.stride,


  0%|          | 0/30 [00:00<?, ?it/s]

WER of fleurs: 0.11489810260014055
[31m"Processing kathbath..."[0m


Map:   0%|          | 0/1914 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/120 [00:00<?, ?it/s]

  return F.conv1d(input, weight, bias, self.stride,


WER of kathbath: 0.11317567567567567
[31m"Processing kathbath_noisy..."[0m


Map:   0%|          | 0/1914 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/120 [00:00<?, ?it/s]

WER of kathbath_noisy: 0.12367604090577064
[('commonvoice', 0.13503086419753085), ('fleurs', 0.11489810260014055), ('kathbath', 0.11317567567567567), ('kathbath_noisy', 0.12367604090577064)]


In [2]:
# Disabled one-off run: evaluates the same model on the separate `indictts_ds`
# set under /mnt/sea/speech/ and appends its WER to `wers_res`. Unlike the
# benchmark loop it also calls su.add_silence and su.remove_text_samples, and
# builds `vocab_chars` with a [1:-5] slice. Uncomment the whole cell to re-run.
# dir = '/mnt/sea/speech/'
# d = 'indictts_ds'
# su.print_red(f'Processing {d}...')
# manifest = f'{d}/punjabi/manifest.json' # path in manifest is {d}/punjabi/wavs/

# df = pd.read_json(f'{dir}{manifest}', lines=True)
# df['audio_filepath'] = df['audio_filepath'].apply(lambda x: f'{dir}{x}')
# df = df.rename(columns={'audio_filepath': 'audio'})

# ds = Dataset.from_pandas(df.reset_index(drop=True))
# ds = ds.cast_column('audio', Audio(sampling_rate = 16000))
# ds = su.add_silence(ds)

# vocab_chars = list(tokenizer.get_vocab().keys())[1:-5]
# ds = su.normalize_texts_for_inference(ds, vocab_chars)
# ds = su.remove_text_samples(ds, column_name='normalized_text')
# ds = ds.map(lambda batch: su.process_dataset(batch, processor), remove_columns=ds.column_names, num_proc=1, batch_size=64, writer_batch_size=64, )

# trainer = Trainer(
#     model=model,
#     data_collator=data_collator,
#     args=training_args,
#     compute_metrics=compute_wer_metrics,
#     eval_dataset=ds,
#     tokenizer=processor.feature_extractor,
# )
# wer = trainer.evaluate(eval_dataset=ds)['eval_wer']
# print(f'WER of {d}: {wer}')
# wers_res.append((d, wer))


# print(wers_res) 

In [3]:
# Vistaar benchmark
[('commonvoice', 0.224), ('fleurs', 0.231), ('kathbath', 0.169), ('kathbath_noisy', 0.197)]

# at checkpoint 6900
[('commonvoice', 0.19135802469135801), ('fleurs', 0.1991391426563598), ('kathbath', 0.17808406736342478), ('kathbath_noisy', 0.1969330473278262)]

# at checkpoint 6900 with LM decoding (Vocab with vowels) - FIRST 
[('commonvoice', 0.1111111111111111), ('fleurs', 0.06175333801827126), ('kathbath', 0.07343343526082789), ('kathbath_noisy', 0.08584729131486468)]

# checkpoint 12000
[('commonvoice', 0.22376543209876543), ('fleurs', 0.19246310611384398), ('kathbath', 0.17904249007347908), ('kathbath_noisy', 0.19780019168454202)]

# checkpoint 7500 previous data
[('commonvoice', 0.24228395061728394), ('fleurs', 0.2122276879831342), ('kathbath', 0.2253206152161015), ('kathbath_noisy', 0.2402446259869472)]

# checkpoint wav2vec2-bert-pa_4/checkpoint-12300
[('commonvoice', 0.19521604938271606), ('fleurs', 0.18587491215741392), ('kathbath', 0.170005933092967), ('kathbath_noisy', 0.18716626352060609)]

# checkpoint wav2vec2-bert-pa_4/checkpoint-12300 - text normalization function changes
[('commonvoice', 0.19444444444444445), ('fleurs', 0.18640196767392833), ('kathbath', 0.16996895544192842), ('kathbath_noisy', 0.18672388604821039), ('indictts_ds', 0.3322171662399517)]

# checkpoint wav2vec2-bert-pa_4/checkpoint-12300 - text normalization function changes with lm (Vocab with no vowels) - SECOND
[('commonvoice', 0.10802469135802469), ('fleurs', 0.06553056921995784), ('kathbath', 0.07432432432432433), ('kathbath_noisy', 0.09085098612125639), ('indictts_ds', 0.32349093836297976)]

# checkpoint wav2vec2-bert-pa_4/checkpoint-12300 with LM - trained on stories and wiki
[('commonvoice', 0.1697530864197531), ('fleurs', 0.13729796205200281), ('kathbath', 0.12413257852447042), ('kathbath_noisy', 0.14024835646457268), ('indictts_ds', 0.3242511990876871)]

# above with strategy num2word
(0, 104, 0, 0) # number of numeric samples converted
[('commonvoice', 0.1697530864197531), ('fleurs', 0.15446811502495958), ('kathbath', 0.12432111724704487), ('kathbath_noisy', 0.14043174661129113)]

# above with not removing non-vocab characters
[('commonvoice', 0.1705246913580247), ('fleurs', 0.2245491977850348), ('kathbath', 0.12432111724704487), ('kathbath_noisy', 0.14043174661129113)]
# above + fleurs with strategy remove -
[('fleurs', 0.208237378210806)] # characters not presented in vocab - ['-', ':', '?', 'n', '.', '6', "'", '8', 'r', 'h', '2', '1', 'i', '3', '¾', '7', '½', '4', '‘', '5', 'f', '~', 'c', '$', '।', '0', '"', 'o', 'g', 'l', 's', '”', '9', 'a', 'e', ';', '/'] - same without numbers - ['-', ':', '?', 'n', '.', 'r', "'", 'h', 'i', '¾', '½', '‘', 'f', '~', 'c', '$', '।', '"', 'o', 'g', 'l', 's', '”', 'a', 'e', ';', '/']

# further fine tune based model with clean data sources and vocabulary extension - removed ds are shrutilipi, cmu & google synth. with no LM - /home/kd/Desktop/proj/apr/speech_pa/wav2vec2-bert-pa_5/checkpoint-8400
[('commonvoice', 0.18209876543209877), ('fleurs', 0.15144061841180603), ('kathbath', 0.15362490869247625), ('kathbath_noisy', 0.1681428049671293)]

# above with - /home/kd/Desktop/proj/apr/speech_pa/wav2vec2-bert-pa_5/checkpoint-1800
[('commonvoice', 0.21141975308641975), ('fleurs', 0.19167252283907238), ('kathbath', 0.19133491599707816), ('kathbath_noisy', 0.2114682249817385)]

# wav2vec2-bert-pa_5/checkpoint-1800 with LM(all data)
[('commonvoice', 0.11805555555555555), ('fleurs', 0.07290934645115953), ('kathbath', 0.08391161431701973), ('kathbath_noisy', 0.09893170197224252)]

# wav2vec2-bert-pa_5/checkpoint-1800 with LM(all data) - do not remove out of vocab characters
[('commonvoice', 0.11496913580246913), ('fleurs', 0.07782853127196064), ('kathbath', 0.08213111760409057), ('kathbath_noisy', 0.095827246165084)]

# wav2vec2-bert-pa_5/checkpoint-1800 with LM - On Wiki And ASR DS - do not remove out of vocab characters on Training LM (Vocab with vowels) - THIRD
[('commonvoice', 0.10185185185185185), ('fleurs', 0.06728742094167252), ('kathbath', 0.08222242512783054), ('kathbath_noisy', 0.091672753834916), ('indictts_ds', 0.26857328130764846)]

# /home/kd/Desktop/proj/apr/speech_pa/wav2vec2-bert-pa_5/checkpoint-4000 - above LM wav2vec2-bert-pa-lm-processor-all_3 - FOURTH
[('commonvoice', 0.10185185185185185), ('fleurs', 0.06965917076598735), ('kathbath', 0.08308984660336012), ('kathbath_noisy', 0.0931336742147553)]

# wav2vec2-bert-pa_5/checkpoint-1800 with LM - On Wiki And ASR DS - do not remove out of vocab characters on Training LM (Vocab with vowels) - wav2vec2-bert-pa-lm-processor-all_3 - strategy - nothing
[('commonvoice', 0.11574074074074074), ('fleurs', 0.10082704063286588), ('kathbath', 0.08464207450693938), ('kathbath_noisy', 0.09888604821037253)]

# wav2vec2-bert-pa_5/checkpoint-1800 with LM - On Wiki And ASR DS - do not remove out of vocab characters on Training LM (Vocab with vowels) - wav2vec2-bert-pa-lm-processor-all_3 - strategy - remove
[('commonvoice', 0.11574074074074074), ('fleurs', 0.07703794799718904), ('kathbath', 0.08464207450693938), ('kathbath_noisy', 0.09888604821037253)]


# IndicVoice Verbatim trained for 2 epochs checkpoint - checkpoints/wav2vec2-bert-pa_indicvoice_verbatim/checkpoint-3000
[('commonvoice', 0.13040123456790123), ('fleurs', 0.11226282501756851), ('kathbath', 0.1039536157779401), ('kathbath_noisy', 0.11641709276844411)]


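# checkpoints/wav2vec2-bert-pa_indicvoice_verbatim_2/checkpoint-3500 with LM decoding (matches the recorded output of cell 1)
[('commonvoice', 0.13503086419753085), ('fleurs', 0.11489810260014055), ('kathbath', 0.11317567567567567), ('kathbath_noisy', 0.12367604090577064)]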


print('-----WER-----')

-----WER-----


In [9]:
# Sanity check: the string holds two separate Gurmukhi letters, so len() is 2.
len('ਏਐ')

2