# Goal of this notebook

Develop a training loop for fine-tuning ASR models with a TTS loss, recreating the RL training loop found in `RL4LMs/rl4lms/envs/text_generation/training_utils.py`.

```bash
conda activate speller
cd /home/s1785140/rlspeller
```

# automatic reloading magic

# imports

In [373]:
import torch
from typing import List, Dict, Tuple, Any
import hyperpyyaml
from tqdm import tqdm
from torchaudio.models.decoder import ctc_decoder
from torch.nn.functional import softmax
import random
from jiwer import cer
import numpy as np
import speechbrain as sb
from IPython.display import Audio

## check if gpu available

In [374]:
# print hostname to make sure we are on correct node
import socket
print(socket.gethostname())

levi.inf.ed.ac.uk


In [375]:
torch.cuda.is_available()

True

In [376]:
import os
os.getcwd()

'/disk/nfs/ostrom/s1785140/rlspeller'

# HPARAMS

In [377]:
# Notebook-level settings; the SpeechBrain hyperparameters themselves are loaded
# later from the YAML file named by 'speechbrain_hparams_file'.
hparams = {
    "softdtw_temp": 0.01,
    "softdtw_bandwidth": 120,
    "dist_func": "l1",
    "sentencepiece_model_path": "/home/s1785140/speechbrain/templates/speech_recognition_CharTokens_NoLM/Tokenizer/save/0_char.model",
    # Alternative inference configs (file names differ by subsampling factor);
    # keep exactly one 'speechbrain_hparams_file' entry uncommented.
    # 'speechbrain_hparams_file': '/home/s1785140/rlspeller/infer_speechbrain_subsampling1x_nowhitespace.yaml',
    'speechbrain_hparams_file': '/home/s1785140/rlspeller/infer_speechbrain_subsampling1x.yaml',
    # 'speechbrain_hparams_file': '/home/s1785140/rlspeller/infer_speechbrain_subsampling2x.yaml',
    # 'speechbrain_hparams_file': '/home/s1785140/rlspeller/infer_speechbrain.yaml', # 4x subsampling
    'sample_rate': 16000,
}

# TOKENIZER

In [378]:
# load the pretrained SentencePiece tokenizer used to tokenize the ASR training inputs
import sentencepiece as spm 
spm_path = hparams["sentencepiece_model_path"]
sp = spm.SentencePieceProcessor()
sp.load(spm_path)
# sanity check: size of the loaded vocabulary
print(sp.vocab_size())

28


In [379]:
# test tokenizer: encode a sample sentence into SentencePiece ids
s = "hello world my name is jason"
# TODO pass string through text cleaners?
encoded = sp.EncodeAsIds(s)
# NOTE(review): presumably id 0 is the unknown piece of this model -- confirm
assert 0 not in encoded, "tried to encode an unknown character"
print(" ".join(str(idx) for idx in encoded))

1 10 2 12 12 4 1 17 4 9 12 11 1 16 20 1 6 5 16 2 1 7 8 1 26 5 8 4 6


In [380]:
sp.DecodeIds(encoded)

'hello world my name is jason'

# NEW! SIMPLE TOKENIZER

In [381]:
from speechbrain.tokenizers.SimpleTokenizer import SimpleTokenizer

In [382]:
tokenizer = SimpleTokenizer()

In [383]:
# SimpleTokenizer example: spaces are replaced with '|' before encoding
text = "hello my name is jason"
text = text.replace(' ', '|')
print(text)
ids = tokenizer.encode_as_ids(text)
# last expression of the cell: display the encoded ids
ids

hello|my|name|is|jason


[9, 6, 13, 13, 16, 1, 14, 26, 1, 15, 2, 14, 6, 1, 10, 20, 1, 11, 2, 20, 16, 15]

In [384]:
tokenizer.decode_ids(ids)

'hello|my|name|is|jason'

## test simple tokenizer with probability distribution, and see if CTC decoder successfully generates n-best lists

In [385]:
# sizes for a random batch of fake CTC emissions
min_len, max_len = 50, 100
bsz = 4
# one valid length per sample, drawn from [min_len, max_len) (torch.randint excludes the upper bound)
lens = torch.randint(min_len, max_len, (bsz,))
vocab_size = len(tokenizer.vocab)

# randomly assign probability distribution to each timestep

# try to decode

In [386]:
# random scores of shape [bsz, max_len, vocab_size]; normalised in the next cell
randn = torch.randn(bsz, max_len, vocab_size)

In [387]:
# Normalise over the vocabulary axis (last dim) so that every timestep holds a
# probability distribution over tokens. `randn` is [bsz, max_len, vocab_size],
# so dim=1 would normalise across timesteps instead.
ctc_probs = softmax(randn, dim=-1)
# ctc_probs

In [388]:
# Lexicon-free CTC beam-search decoder over the SimpleTokenizer vocabulary,
# configured to return the 2 best hypotheses per sample.
ctc_beamsearch_decoder_test = ctc_decoder(
    lexicon=None,
    # tokens="/home/s1785140/rlspeller/templates/speech_recognition_CharTokens_NoLM/Tokenizer/save/tokens.txt",
    tokens=tokenizer.vocab,
    nbest=2,
    blank_token='-',
    sil_token="|",
)

# decode the random emissions; `lens` holds the per-sample lengths drawn above
predicted_ids = ctc_beamsearch_decoder_test(ctc_probs, lens)

# collect one (label, words) tuple per hypothesis of every sample
predicted_words = []
for i, hyps in enumerate(predicted_ids):
    for j, hyp in enumerate(hyps):
        # NOTE(review): the decoded text uses '|' between words, so splitting on " "
        # yields a single-element list (see the printed output) -- confirm intent
        words = tokenizer.decode_ids(hyp.tokens.tolist()).split(" ")
        tup = (f"sample {i+1}, hyp {j+1}/{len(hyps)}", words)
        predicted_words.append(tup)
        print(tup)

('sample 1, hyp 1/2', ['|jshwvbilavjumrpeinyvzvznefekdebiz|shc|viyv|rgeqbzax|aonxq|iayewknbt|bwkoswuo|bxmncoadxydlm|'])
('sample 1, hyp 2/2', ['|jshwvbilavjumrpeinyvzvznefekdebiz|shc|viyv|rgeqbzax|aonxq|iayewknbt|bwkoswuoybxmncoadxydlm|'])
('sample 2, hyp 1/2', ['|omecu|f|ufsgkeglhxwkcnsotflrmjdwqjenpnzsrvlwpxakir|xsvnkmhcoxaidtobunwjpcgqhuityfzlhlizvj|'])
('sample 2, hyp 2/2', ['|omecu|f|ufsgkeglhxwkcnsotflrmjdwqjenpnzsrvlwpxakir|xsvnkmhcoxaidtobunwjpcgqhubtyfzlhlizvj|'])
('sample 3, hyp 1/2', ['|mcabdpfgrstqlczxyxkokni|auarevcd|hrprnljrfvbqzckckwf|xfebos|tctgmsmocnoiofhcg|nteswduywlcxus|'])
('sample 3, hyp 2/2', ['|mcabdpfgrstqlczxyxkokni|auarevcd|hrprnljrfvbqzckckwf|xfebos|tctgmsmocnoiofhcg|bteswduywlcxus|'])
('sample 4, hyp 1/2', ['|tusmtncyfzkvyhjtkubup|gapbtuzltjvosrobeliyxzdlgeoeoivqlybygnlwjxzrc|chxtndimx|uardhszinjypf|'])
('sample 4, hyp 2/2', ['|tusmtncvfzkvyhjtkubup|gapbtuzltjvosrobeliyxzdlgeoeoivqlybygnlwjxzrc|chxtndimx|uardhszinjypf|'])


# LOAD ASR (PRETRAINED)

In [389]:
from templates.speech_recognition_CharTokens_NoLM.ASR.train import ASR
from templates.speech_recognition_CharTokens_NoLM.ASR.train import dataio_prepare
from torch.utils.data import DataLoader
from speechbrain.dataio.dataloader import LoopedLoader

In [390]:
# Load the SpeechBrain hyperparameters YAML (no overrides are passed to load_hyperpyyaml here)
speechbrain_hparams_file = hparams['speechbrain_hparams_file']
with open(speechbrain_hparams_file) as f:
    speechbrain_hparams = hyperpyyaml.load_hyperpyyaml(f)

/home/s1785140/speechbrain/templates/speech_recognition_CharTokens_NoLM/data/rirs_noises.zip exists. Skipping download


In [391]:
speechbrain_hparams['save_folder']

'/home/s1785140/speechbrain/templates/speech_recognition_CharTokens_NoLM/ASR/results/CRDNN_CHAR_LJSpeech_halved_subsampling1x/2602/save'

In [392]:
# initialise trainer (we don't want to train, but model is tightly coupled with trainer)
# modules, optimizer class and checkpointer all come from the loaded SpeechBrain hparams
asr_brain = ASR(
    modules=speechbrain_hparams["modules"],
    opt_class=speechbrain_hparams["opt_class"],
    hparams=speechbrain_hparams,
    checkpointer=speechbrain_hparams["checkpointer"],
)



def setup_asr_brain_for_infer(asr_brain, enable_dropout=False):
    """Restore the best checkpoint and select train or eval mode for inference.

    Arguments
    ---------
    asr_brain : object
        Brain exposing ``on_evaluate_start(min_key=...)`` and a ``modules``
        attribute that provides ``train()`` and ``eval()``.
    enable_dropout : bool
        If True, ``modules.train()`` is called so dropout stays active;
        otherwise ``modules.eval()`` is called.
    """
    # on_evaluate_start loads the best model (selected by minimum WER)
    asr_brain.on_evaluate_start(min_key="WER")

    # NOTE(review): train() switches every submodule to training mode, not only
    # dropout layers -- confirm this is acceptable for any normalisation layers.
    switch_mode = asr_brain.modules.train if enable_dropout else asr_brain.modules.eval
    switch_mode()

print("if on_evaluate_start() get runtime error, likely need to restart notebook kernel")
# dropout is enabled here; a later section decodes repeatedly to check whether outputs vary
setup_asr_brain_for_infer(asr_brain, enable_dropout=True)

if on_evaluate_start() get runtime error, likely need to restart notebook kernel


In [393]:
# create dataset and dataloader for inference
datasets = dataio_prepare(speechbrain_hparams)

test_set = datasets['test']

# Wrap the dataset in a dataloader unless it already is one. The previous
# condition (`not isinstance(x, DataLoader) or isinstance(x, LoopedLoader)`)
# parsed as `(not A) or B`, so an existing LoopedLoader would be wrapped again.
if not isinstance(test_set, (DataLoader, LoopedLoader)):
    test_loader_kwargs = speechbrain_hparams["test_dataloader_opts"]
    test_set = asr_brain.make_dataloader(
        test_set, stage=sb.Stage.TEST, **test_loader_kwargs
    )

In [394]:
# get vocab from tokenizer (needed for ctc decoding)
vocab_size = len(asr_brain.hparams.tokenizer)
vocab = []
for i in range(vocab_size):
    # decode each id on its own to recover the symbol it maps to
    vocab.append(asr_brain.hparams.tokenizer.decode_ids([i]))
print(vocab)

# edit vocab to match default ctc decoder symbols for blank and silence
# NOTE(review): assumes id 0 can serve as the CTC blank and id 1 is the whitespace
# piece (they decode to ' ⁇ ' and '' in the printout) -- confirm against blank_index
vocab[0] = '-'
vocab[1] = "|"

print(vocab)

[' ⁇ ', '', 'e', 't', 'o', 'a', 'n', 'i', 's', 'r', 'h', 'd', 'l', 'c', 'f', 'u', 'm', 'w', 'p', 'g', 'y', 'b', 'v', 'k', 'x', 'q', 'j', 'z']
['-', '|', 'e', 't', 'o', 'a', 'n', 'i', 's', 'r', 'h', 'd', 'l', 'c', 'f', 'u', 'm', 'w', 'p', 'g', 'y', 'b', 'v', 'k', 'x', 'q', 'j', 'z']


In [395]:
# Lexicon-free CTC beam-search decoder over the ASR vocabulary built above;
# returns up to 100 hypotheses per sample. Used as a notebook-level global by
# the transcribe_dataset functions below.
ctc_beamsearch_decoder = ctc_decoder(
    lexicon=None,
    # tokens="/home/s1785140/rlspeller/templates/speech_recognition_CharTokens_NoLM/Tokenizer/save/tokens.txt",
    tokens=vocab,
    nbest=100,
    blank_token='-',
    sil_token="|",
)

In [396]:
# generate transcriptions for all batches in test set
def transcribe_dataset(asr_brain, dataset, greedy=False, num_batches_to_transcribe=None):
    """Transcribe batches with the ASR model, greedily or with CTC beam search.

    Note that a later cell of this notebook defines another function with the
    same name, which shadows this one once that cell has been run.

    Arguments
    ---------
    asr_brain : ASR
        Brain whose ``compute_forward`` returns a dict containing
        ``"ctc_logprobs"`` and which exposes ``feat_lens`` and ``hparams``.
    dataset : iterable
        Iterable of batches providing ``words`` and ``utt_id``.
    greedy : bool
        If True use ``sb.decoders.ctc_greedy_decode``; otherwise use the
        notebook-level ``ctc_beamsearch_decoder`` and print every hypothesis
        together with its CER.
    num_batches_to_transcribe : int, optional
        Number of leading batches to process; ``None`` processes all of them.

    Returns
    -------
    transcripts : list
        One entry per processed batch holding the decoded hypotheses.
    ctc_probs : torch.Tensor or None
        ``"ctc_logprobs"`` of the last processed batch (kept for debugging),
        or ``None`` when no batch was processed.
    """
    from itertools import islice  # function-scope import keeps the cell self-contained

    if num_batches_to_transcribe is not None and num_batches_to_transcribe < 0:
        # negative counts keep the original slice semantics (drop batches from the end)
        batches = list(dataset)[:num_batches_to_transcribe]
        total = len(batches)
    else:
        # islice stops after the requested number of batches instead of first
        # iterating the whole loader just to slice the resulting list
        batches = islice(dataset, num_batches_to_transcribe)
        total = num_batches_to_transcribe

    transcripts = []
    ctc_probs = None  # avoids an UnboundLocalError at return when no batch is processed

    # Now we iterate over the dataset and we simply compute_forward and decode
    with torch.no_grad():
        for batch in tqdm(batches, dynamic_ncols=True, total=total):
            orig_transcriptions = batch.words

            # Make sure that your compute_forward returns the predictions !!!
            # In the case of the template, when stage = TEST, a beam search is applied
            # in compute_forward().
            predictions = asr_brain.compute_forward(batch, stage=sb.Stage.TEST)

            ctc_probs = predictions['ctc_logprobs'] # FOR DEBUG

            if greedy:
                predicted_ids = sb.decoders.ctc_greedy_decode(
                    predictions["ctc_logprobs"], asr_brain.feat_lens, blank_id=asr_brain.hparams.blank_index
                )
                # same tokenizer object as the beam-search branch, for consistency
                predicted_words = [
                    asr_brain.hparams.tokenizer.decode_ids(ids).split(" ")
                    for ids in predicted_ids
                ]
            else:
                # get mel lens from wav len ratios since torch ctc decoder requires lens in frames
                batch_max_len = predictions["ctc_logprobs"].size(1)
                bsz = predictions["ctc_logprobs"].size(0)
                mel_lens = torch.zeros(bsz)
                for i, len_ratio in enumerate(asr_brain.feat_lens):
                    mel_lens[i] = int(torch.round(len_ratio * batch_max_len))

                predicted_ids = ctc_beamsearch_decoder(
                    predictions["ctc_logprobs"], lengths=mel_lens
                )

                predicted_words = []
                for i, (utt_id, orig_text, hyps) in enumerate(zip(batch.utt_id, orig_transcriptions, predicted_ids)):
                    print(f"\nsample {i+1} - ({utt_id}: '{orig_text}')")
                    sample_cers = []
                    for j, hyp in enumerate(hyps):
                        words = asr_brain.hparams.tokenizer.decode_ids(hyp.tokens.tolist()) # .split("|")
                        # words = tokenizer.decode_ids(hyp.tokens.tolist()) # .split("|")
                        hyp_cer = 100 * cer(orig_text, words)
                        sample_cers.append(hyp_cer)
                        print(f"\thyp {j+1}/{len(hyps)} (CER={hyp_cer:.1f}%): '{words}'")
                        predicted_words.append((f"sample {i+1}, hyp {j+1}/{len(hyps)}", words))

                    print(f"\t=== Mean CER: {np.mean(sample_cers):.1f}%, Std CER: {np.std(sample_cers):.1f}% ===")

            transcripts.append(predicted_words)

    return transcripts, ctc_probs

# transcripts, ctc_probs = transcribe_dataset(asr_brain, test_set, greedy=False, num_batches_to_transcribe=1)

# LOAD WORD ALIGNED WAVS into dataset

In [397]:
# imitate CLAs
import sys
import argparse
import math
import glob
from tqdm import tqdm

In [398]:
# set these in yaml config!
# word-token annotation manifests (JSON) for the train / valid / test splits
train_annotation_path = '/home/s1785140/speechbrain/templates/speech_recognition_CharTokens_NoLM/data/respeller_train_wordtoken_annotation.json'
valid_annotation_path = '/home/s1785140/speechbrain/templates/speech_recognition_CharTokens_NoLM/data/respeller_valid_wordtoken_annotation.json'
test_annotation_path = '/home/s1785140/speechbrain/templates/speech_recognition_CharTokens_NoLM/data/respeller_test_wordtoken_annotation.json'

In [399]:
# Point the SpeechBrain hparams at the word-token annotation files. The paths
# are taken from the *_annotation_path variables defined in the previous cell
# rather than repeating the same string literals.
speechbrain_hparams['train_annotation'] = train_annotation_path
speechbrain_hparams['valid_annotation'] = valid_annotation_path
speechbrain_hparams['test_annotation'] = test_annotation_path

In [400]:
def dataio_prepare(hparams):
    """Build the train/valid/test datasets for the word-aligned wavs.

    The data pipelines (audio loading plus pass-through of the annotation
    fields) are defined here, the datasets are created from the JSON manifests,
    filtered, and the training split is optionally sorted by length.

    Arguments
    ---------
    hparams : dict
        Must provide ``train_annotation``, ``valid_annotation``,
        ``test_annotation``, ``{split}_dataloader_opts`` for each split,
        ``min_samples_to_graphemes_ratio``, ``max_samples_to_graphemes_ratio``,
        ``min_length_seconds`` and ``sorting``. The ``shuffle`` entry of the
        dataloader options is overwritten in place.

    Returns
    -------
    datasets : dict
        Dictionary with "train", "valid" and "test" keys mapping to the
        filtered DynamicItemDataset objects.
    """

    # Audio pipeline: read the file named by the "wav" field.
    # wav path like: data/ljspeech_wavs_16khz_word_aligned/differs/differs__LJ001-0001__occ1__len8320.wav
    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig", "wav_path", "utt_id")
    def audio_pipeline(wav_path):
        """Yield the audio signal, its path, and the file stem used as utterance id."""
        yield sb.dataio.dataio.read_audio(wav_path)
        yield wav_path
        # file name without directory and without everything after the first "."
        yield wav_path.split("/")[-1].split(".")[0]

    # The three pipelines below forward annotation fields unchanged.
    @sb.utils.data_pipeline.takes("samples_to_graphemes_ratio")
    @sb.utils.data_pipeline.provides("samples_to_graphemes_ratio")
    def ratio_pipeline(samples_to_graphemes_ratio):
        yield samples_to_graphemes_ratio

    @sb.utils.data_pipeline.takes("length")
    @sb.utils.data_pipeline.provides("length")
    def length_pipeline(length):
        yield length

    @sb.utils.data_pipeline.takes("words")
    @sb.utils.data_pipeline.provides("words")
    def text_pipeline(words):
        """Forward the transcription as-is.

        NB yield exactly what is declared in @sb.utils.data_pipeline.provides()"""
        yield words

        # TODO also yield mel for calculating fastpitch softdtw loss

    # One JSON manifest per split
    data_info = {
        "train": hparams["train_annotation"],
        "valid": hparams["valid_annotation"],
        "test": hparams["test_annotation"],
    }

    datasets = {}
    for split, json_path in data_info.items():
        datasets[split] = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path=json_path,
            dynamic_items=[audio_pipeline, ratio_pipeline, length_pipeline, text_pipeline],
            output_keys=[
                "id",
                "sig",
                "wav_path",
                "utt_id",
                "samples_to_graphemes_ratio",
                "length",
                "words",
            ],
        )
        # shuffling is switched on for every split; the train split may be
        # switched off again by the sorting step at the end
        hparams[f"{split}_dataloader_opts"]["shuffle"] = True

    def print_dataset_lens(extra_str):
        for name in data_info:
            print(f"{name} dataset has {len(datasets[name])} samples", extra_str)

    def filter_all_splits(**filter_kwargs):
        # apply the same filtered_sorted call to every split
        for name in data_info:
            datasets[name] = datasets[name].filtered_sorted(**filter_kwargs)

    print_dataset_lens("before any filtering")

    # Drop samples whose samples_to_graphemes_ratio is too small or too large
    min_ratio = hparams["min_samples_to_graphemes_ratio"]
    ratio_lower = {} if min_ratio is None else {"samples_to_graphemes_ratio": min_ratio}
    max_ratio = hparams["max_samples_to_graphemes_ratio"]
    ratio_upper = {} if max_ratio is None else {"samples_to_graphemes_ratio": max_ratio}
    filter_all_splits(key_min_value=ratio_lower, key_max_value=ratio_upper)
    print_dataset_lens("after filtering by min and max samples to graphemes ratio")

    # Drop samples that are too short
    min_length = hparams["min_length_seconds"]
    length_lower = {} if min_length is None else {"length": min_length}
    filter_all_splits(key_min_value=length_lower)
    print_dataset_lens("after filtering by minimum length")

    # Sorting the training data by length minimises zero-padding, which makes
    # the code much faster; sorted data must not be shuffled by the dataloader.
    sorting = hparams["sorting"]
    if sorting not in ("ascending", "descending", "random"):
        raise NotImplementedError(
            "sorting must be random, ascending or descending"
        )

    if sorting == "random":
        hparams["train_dataloader_opts"]["shuffle"] = True
    else:
        if sorting == "ascending":
            datasets["train"] = datasets["train"].filtered_sorted(sort_key="length")
        else:
            datasets["train"] = datasets["train"].filtered_sorted(
                sort_key="length", reverse=True
            )
        hparams["train_dataloader_opts"]["shuffle"] = False

    return datasets

# Rebuild the datasets with the dataio_prepare defined in this cell, which
# shadows the version imported from the ASR template earlier in the notebook.
datasets = dataio_prepare(speechbrain_hparams)

train dataset has 11453 samples before any filtering
valid dataset has 415 samples before any filtering
test dataset has 414 samples before any filtering
train dataset has 10642 samples after filtering by min and max samples to graphemes ratio
valid dataset has 391 samples after filtering by min and max samples to graphemes ratio
test dataset has 384 samples after filtering by min and max samples to graphemes ratio
train dataset has 10622 samples after filtering by minimum length
valid dataset has 389 samples after filtering by minimum length
test dataset has 384 samples after filtering by minimum length


In [401]:
# convert from datasets to dataloaders
split2stage = {"train": sb.Stage.TRAIN, "valid": sb.Stage.VALID, "test": sb.Stage.TEST}
for split in ["train", "valid", "test"]:
    # Skip splits that are already loaders, so re-running this cell is a no-op.
    # The previous condition parsed as `(not DataLoader) or LoopedLoader` and
    # would wrap an existing LoopedLoader a second time.
    if not isinstance(datasets[split], (DataLoader, LoopedLoader)):
        dataloader_kwargs = speechbrain_hparams[f"{split}_dataloader_opts"]
        datasets[split] = asr_brain.make_dataloader(
            datasets[split], stage=split2stage[split], **dataloader_kwargs
        )

In [402]:
def set_whitespace_to_0_probability(ctc_probs, vocab, log_probs=True, whitespace_symbol="|"):
    """Give the whitespace token zero probability at every frame.

    ``ctc_probs`` is expected as [bsz, max_seq_len, vocab_size]. It is modified
    in place and the same object is returned. The written value is ``-inf``
    when ``log_probs`` is True and ``0.0`` otherwise.
    """
    if log_probs:
        fill_value = -math.inf
    else:
        fill_value = 0.0
    # position of the whitespace symbol along the vocabulary axis
    whitespace_idx = vocab.index(whitespace_symbol)
    ctc_probs[:, :, whitespace_idx] = fill_value
    return ctc_probs

# TRANSCRIBE WORD ALIGNED WAVS

In [403]:
from collections import defaultdict
from itertools import islice

# generate transcriptions for all batches in test set
def transcribe_dataset(asr_brain, dataset,
                       num_batches_to_transcribe=None,
                       hack_whitespace_probs=False, collapse_whitespace=True,
                       print_info=False, max_hyps_per_sample=None,
                       wordtypes_to_transcribe=None):
    """Decode batches with CTC beam search and collect n-best spellings per word.

    Relies on the notebook-level ``ctc_beamsearch_decoder`` and ``vocab``.

    Arguments
    ---------
    asr_brain : ASR
        Brain whose ``compute_forward`` returns a dict containing
        ``"ctc_logprobs"`` and which exposes ``feat_lens`` and
        ``hparams.tokenizer``.
    dataset : iterable
        Iterable of batches providing ``words``, ``sig`` and ``utt_id``.
    num_batches_to_transcribe : int, optional
        Maximum number of batches to process. ``None`` (the default) processes
        every batch; values below zero process none.
    hack_whitespace_probs : bool
        If True, the whitespace token's log-probability is set to ``-inf``
        before decoding.
    collapse_whitespace : bool
        If True, every space character is removed from each decoded hypothesis.
    print_info : bool
        Currently unused; the per-sample printouts are returned, not printed.
    max_hyps_per_sample : int, optional
        Only the first ``max_hyps_per_sample`` hypotheses of each sample are
        used; ``None`` uses all of them.
    wordtypes_to_transcribe : collection of str, optional
        If non-empty, only samples whose reference word is contained in it are
        recorded in the output. ``None`` or empty records every sample.

    Returns
    -------
    dict
        ``"orig_words"`` (reference word per recorded sample),
        ``"transcribed_words"`` (defaultdict mapping reference word to the list
        of decoded hypotheses), ``"wavs"`` (waveform per recorded sample) and
        ``"printouts"`` (formatted report per recorded sample).
    """
    orig_words = []
    printouts = []
    orig_wavs = []
    transcribed_words = defaultdict(list)

    # None means "no limit". Previously the default None crashed on the
    # `n >= num_batches_to_transcribe` comparison.
    if num_batches_to_transcribe is None:
        batch_limit = None
    else:
        batch_limit = max(0, num_batches_to_transcribe)

    # Now we iterate over the dataset and we simply compute_forward and decode
    with torch.no_grad():
        # islice stops right after the last requested batch, so no extra batch
        # is fetched from the loader
        for batch in tqdm(islice(dataset, batch_limit), dynamic_ncols=True, total=batch_limit):
            orig_transcriptions = batch.words
            wavs = batch.sig.data

            # Make sure that your compute_forward returns the predictions !!!
            # In the case of the template, when stage = TEST, a beam search is applied
            # in compute_forward().
            predictions = asr_brain.compute_forward(batch, stage=sb.Stage.TEST)
            ctc_probs = predictions['ctc_logprobs']

            # hack probabilities to set all probs to 0 for whitespace
            if hack_whitespace_probs:
                ctc_probs = set_whitespace_to_0_probability(ctc_probs, vocab, log_probs=True, whitespace_symbol="|")

            # get mel lens from wav len ratios since torch ctc decoder requires lens in frames
            batch_max_len = ctc_probs.size(1)
            bsz = ctc_probs.size(0)
            mel_lens = torch.zeros(bsz)
            for i, len_ratio in enumerate(asr_brain.feat_lens):
                mel_lens[i] = int(torch.round(len_ratio * batch_max_len))

            # decode the (possibly hacked) tensor explicitly instead of relying
            # on the helper having mutated predictions["ctc_logprobs"] in place
            predicted_ids = ctc_beamsearch_decoder(ctc_probs, lengths=mel_lens)

            # iterate over samples in batch
            for i, (utt_id, orig_text, hyps, wav) in enumerate(zip(batch.utt_id, orig_transcriptions, predicted_ids, wavs)):
                # an empty / None filter keeps every sample
                keep_sample = not wordtypes_to_transcribe or orig_text in wordtypes_to_transcribe

                sample_printout = f"\nsample {i+1} - ({utt_id}: '{orig_text}')"
                orig_utt_text = "INSERT ORIG UTT TEXT"
                sample_printout += f"\nOriginal Utterance: {orig_utt_text}"

                sample_cers = []
                for j, hyp in enumerate(list(hyps)[:max_hyps_per_sample]):
                    words = asr_brain.hparams.tokenizer.decode_ids(hyp.tokens.tolist())
                    if collapse_whitespace:
                        words = "".join(c for c in words if c != " ")
                    hyp_cer = 100 * cer(orig_text, words)
                    sample_cers.append(hyp_cer)
                    # the denominator is the total number of hypotheses returned by
                    # the decoder, not the truncated count
                    sample_printout += f"\n\thyp {j+1}/{len(hyps)} (CER={hyp_cer:.1f}%): '{words}'"
                    if keep_sample:
                        transcribed_words[orig_text].append(words)

                sample_printout += f"\n\t=== Mean CER: {np.mean(sample_cers):.1f}%, Std CER: {np.std(sample_cers):.1f}% ==="

                if keep_sample:
                    orig_words.append(orig_text)
                    printouts.append(sample_printout)
                    orig_wavs.append(wav)

    output_dict = {
        "orig_words": orig_words,
        "transcribed_words": transcribed_words,
        "wavs": orig_wavs,
        "printouts": printouts,
    }

    return output_dict


## Compare hack with no hack (to set prob of whitespace to 0)

In [404]:
# NUM_BATCHES_TO_TRANSCRIBE = 1
# MAX_HYPS_PER_SAMPLE = 20

# transcription_output_dict_no_hack = transcribe_dataset(asr_brain, datasets["train"], 
#                                                num_batches_to_transcribe=NUM_BATCHES_TO_TRANSCRIBE,
#                                                collapse_whitespace=False,
#                                                max_hyps_per_sample=MAX_HYPS_PER_SAMPLE)

# transcription_output_dict_hack = transcribe_dataset(asr_brain, datasets["train"], 
#                                                num_batches_to_transcribe=NUM_BATCHES_TO_TRANSCRIBE,
#                                                collapse_whitespace=False, hack_whitespace_probs=True,
#                                                max_hyps_per_sample=MAX_HYPS_PER_SAMPLE)

# for printout1, printout2, wav in zip(transcription_output_dict_no_hack["printouts"], 
#                          transcription_output_dict_hack["printouts"], 
#                          transcription_output_dict_no_hack["wavs"]):
#     print('NO HACK', printout1)
#     print('HACK', printout2)
#     display(Audio(wav, rate=16000))

100%|███████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.63s/it]
100%|███████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.40s/it]

NO HACK 
sample 1 - (unconstitutionality__LJ021-0191__occ1__len25760: 'unconstitutionality')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=10.5%): 'on constitutionality '
	hyp 2/50 (CER=5.3%): 'un constitutionality '
	hyp 3/50 (CER=15.8%): 'on constitution ality '
	hyp 4/50 (CER=10.5%): 'un constitution ality '
	hyp 5/50 (CER=15.8%): 'on constitutionaloity '
	hyp 6/50 (CER=15.8%): 'on constitu tionality '
	hyp 7/50 (CER=15.8%): 'on constitutionaloty '
	hyp 8/50 (CER=10.5%): 'un constitutionaloity '
	hyp 9/50 (CER=21.1%): 'on constitution aloity '
	hyp 10/50 (CER=10.5%): 'un constitu tionality '
	hyp 11/50 (CER=10.5%): 'un constitutionaloty '
	hyp 12/50 (CER=21.1%): 'on constitu tion ality '
	hyp 13/50 (CER=21.1%): 'on constitution aloty '
	hyp 14/50 (CER=15.8%): 'un constitution aloity '
	hyp 15/50 (CER=15.8%): 'un constitu tion ality '
	hyp 16/50 (CER=15.8%): 'un constitution aloty '
	hyp 17/50 (CER=10.5%): 'oun constitutionality '
	hyp 18/50 (CER=15.8%): 'on constituti onal




NO HACK 
sample 2 - (anesthesiologists__LJ031-0023__occ1__len24000: 'anesthesiologists')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=52.9%): 'an as thease eolgiests '
	hyp 2/50 (CER=47.1%): 'an as thease eolgists '
	hyp 3/50 (CER=58.8%): 'an as thease eolgiest s '
	hyp 4/50 (CER=52.9%): 'an as thease eolgests '
	hyp 5/50 (CER=58.8%): 'an as thease e olgiests '
	hyp 6/50 (CER=52.9%): 'an as thease eolgist s '
	hyp 7/50 (CER=58.8%): 'an as thease eolgest s '
	hyp 8/50 (CER=52.9%): 'an as thease e olgists '
	hyp 9/50 (CER=64.7%): 'an as thease e olgiest s '
	hyp 10/50 (CER=47.1%): 'an as theaseeolgiests '
	hyp 11/50 (CER=58.8%): 'an as thease e olgests '
	hyp 12/50 (CER=58.8%): 'and as thease eolgiests '
	hyp 13/50 (CER=47.1%): 'anas thease eolgiests '
	hyp 14/50 (CER=58.8%): 'an as thease e olgist s '
	hyp 15/50 (CER=41.2%): 'an as theaseeolgists '
	hyp 16/50 (CER=52.9%): 'an as theaseeolgiest s '
	hyp 17/50 (CER=64.7%): 'an as thease e olgest s '
	hyp 18/50 (CER=52.9%): 'and

NO HACK 
sample 3 - (misrepresentations__LJ005-0034__occ1__len23200: 'misrepresentations')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=11.1%): 'misrepras entations '
	hyp 2/50 (CER=16.7%): 'mis repras entations '
	hyp 3/50 (CER=11.1%): 'misreprs entations '
	hyp 4/50 (CER=16.7%): 'misreprase entations '
	hyp 5/50 (CER=16.7%): 'misrepras untations '
	hyp 6/50 (CER=16.7%): 'misrebras entations '
	hyp 7/50 (CER=16.7%): 'mis reprs entations '
	hyp 8/50 (CER=22.2%): 'mis reprase entations '
	hyp 9/50 (CER=16.7%): 'misrebpras entations '
	hyp 10/50 (CER=22.2%): 'mis repras untations '
	hyp 11/50 (CER=22.2%): 'mis rebras entations '
	hyp 12/50 (CER=16.7%): 'missrepras entations '
	hyp 13/50 (CER=22.2%): 'mis rebpras entations '
	hyp 14/50 (CER=11.1%): 'misreprse entations '
	hyp 15/50 (CER=16.7%): 'misreprs untations '
	hyp 16/50 (CER=22.2%): 'miss repras entations '
	hyp 17/50 (CER=16.7%): 'misrebrs entations '
	hyp 18/50 (CER=11.1%): 'misrepris entations '
	hyp 19/50 (CER=16.7%)

NO HACK 
sample 4 - (institutionalized__LJ040-0144__occ1__len21760: 'institutionalized')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/44 (CER=0.0%): 'institutionalized '
	hyp 2/44 (CER=5.9%): 'institutionalizsed '
	hyp 3/44 (CER=5.9%): 'nstitutionalized '
	hyp 4/44 (CER=11.8%): 'a institutionalized '
	hyp 5/44 (CER=5.9%): 'institutionaliezed '
	hyp 6/44 (CER=5.9%): 'institutionalinzed '
	hyp 7/44 (CER=5.9%): 'institutionallized '
	hyp 8/44 (CER=11.8%): 'e institutionalized '
	hyp 9/44 (CER=11.8%): 'nstitutionalizsed '
	hyp 10/44 (CER=17.6%): 'a institutionalizsed '
	hyp 11/44 (CER=5.9%): 'institutionalised '
	hyp 12/44 (CER=11.8%): 'a nstitutionalized '
	hyp 13/44 (CER=11.8%): 'institutionaliezsed '
	hyp 14/44 (CER=11.8%): 'nstitutionaliezed '
	hyp 15/44 (CER=17.6%): 'a institutionaliezed '
	hyp 16/44 (CER=0.0%): 'institutionalized '
	hyp 17/44 (CER=0.0%): 'institutionalized '
	hyp 18/44 (CER=11.8%): 'institutionalinzsed '
	hyp 19/44 (CER=11.8%): 'nstitutionalinzed '
	hyp 20/44 (CER

NO HACK 
sample 5 - (specifications__LJ043-0092__occ1__len21600: 'specifications')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/47 (CER=0.0%): 'specifications '
	hyp 2/47 (CER=7.1%): 'spescifications '
	hyp 3/47 (CER=0.0%): 'specifications '
	hyp 4/47 (CER=7.1%): 'specificcations '
	hyp 5/47 (CER=7.1%): 'speecifications '
	hyp 6/47 (CER=7.1%): 'specificationsd '
	hyp 7/47 (CER=7.1%): 'spectifications '
	hyp 8/47 (CER=7.1%): 'spefcifications '
	hyp 9/47 (CER=7.1%): 'spe cifications '
	hyp 10/47 (CER=7.1%): 'specifocations '
	hyp 11/47 (CER=7.1%): 'speciffications '
	hyp 12/47 (CER=7.1%): 'pecifications '
	hyp 13/47 (CER=7.1%): 'speccifications '
	hyp 14/47 (CER=7.1%): 'spetcifications '
	hyp 15/47 (CER=7.1%): 'specificationes '
	hyp 16/47 (CER=7.1%): 'supecifications '
	hyp 17/47 (CER=7.1%): 'spescifications '
	hyp 18/47 (CER=7.1%): 'specificatsions '
	hyp 19/47 (CER=14.3%): 'spescificcations '
	hyp 20/47 (CER=7.1%): 'specsifications '
	=== Mean CER: 6.8%, Std CER: 2.7% ===
HACK 
sam

## Generate multiple times to see if outputs change (i.e. should change if dropout is enabled in model)

In [424]:
NUM_BATCHES_TO_TRANSCRIBE = 1
MAX_HYPS_PER_SAMPLE = 20

wordtypes_to_transcribe = None
# wordtypes_to_transcribe = ['anesthesiologists'] # used for debugging and checking results only for a subset of wordtypes

NUM_ITERATIONS = 5

# reference word -> set of distinct spellings seen across all iterations
word_sets = defaultdict(set)

# A named loop variable is used instead of `_`, which IPython reserves for the
# last cell output.
for iteration_idx in range(NUM_ITERATIONS):
    transcription_output_dict_hack = transcribe_dataset(asr_brain, datasets["train"],
                                                num_batches_to_transcribe=NUM_BATCHES_TO_TRANSCRIBE,
                                                collapse_whitespace=False, hack_whitespace_probs=True,
                                                max_hyps_per_sample=MAX_HYPS_PER_SAMPLE,
                                                wordtypes_to_transcribe=wordtypes_to_transcribe)

    for orig_word, printout, wav in zip(transcription_output_dict_hack["orig_words"], transcription_output_dict_hack["printouts"],
                            transcription_output_dict_hack["wavs"]):
        print(printout)
        # playback rate comes from the notebook config instead of a hard-coded 16000
        display(Audio(wav, rate=hparams["sample_rate"]))

        # report how many previously unseen spellings this iteration contributed
        old_count = len(word_sets[orig_word])
        word_sets[orig_word].update(transcription_output_dict_hack["transcribed_words"][orig_word])
        new_count = len(word_sets[orig_word])
        if new_count > old_count:
            print(f"added {new_count - old_count} new spellings for '{orig_word}' to its word set")

100%|███████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.41s/it]


sample 1 - (unconstitutionality__LJ021-0191__occ1__len25760: 'unconstitutionality')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/49 (CER=0.0%): 'unconstitutionality '
	hyp 2/49 (CER=5.3%): 'onconstitutionality '
	hyp 3/49 (CER=5.3%): 'unconstitutionaloity '
	hyp 4/49 (CER=10.5%): 'onconstitutionaloity '
	hyp 5/49 (CER=5.3%): 'unconstitotionality '
	hyp 6/49 (CER=5.3%): 'unconstitutionaleity '
	hyp 7/49 (CER=5.3%): 'gunconstitutionality '
	hyp 8/49 (CER=5.3%): 'unconstitutionaloty '
	hyp 9/49 (CER=5.3%): 'unconstitutcionality '
	hyp 10/49 (CER=5.3%): 'unconstitutionaality '
	hyp 11/49 (CER=5.3%): 'unconstitutionalty '
	hyp 12/49 (CER=5.3%): 'tunconstitutionality '
	hyp 13/49 (CER=10.5%): 'onconstitotionality '
	hyp 14/49 (CER=5.3%): 'ounconstitutionality '
	hyp 15/49 (CER=5.3%): 'unconstitutionhality '
	hyp 16/49 (CER=5.3%): 'cunconstitutionality '
	hyp 17/49 (CER=10.5%): 'onconstitutionaleity '
	hyp 18/49 (CER=10.5%): 'gonconstitutionality '
	hyp 19/49 (CER=10.5%): 'onconstitutiona




added 20 new spellings for 'unconstitutionality' to its word set

sample 2 - (anesthesiologists__LJ031-0023__occ1__len24000: 'anesthesiologists')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=35.3%): 'anistheaseeolegests '
	hyp 2/50 (CER=35.3%): 'anastheaseeolegests '
	hyp 3/50 (CER=29.4%): 'anistheaseeologests '
	hyp 4/50 (CER=29.4%): 'anastheaseeologests '
	hyp 5/50 (CER=29.4%): 'anistheaseeolegists '
	hyp 6/50 (CER=29.4%): 'anastheaseeolegists '
	hyp 7/50 (CER=23.5%): 'anistheaseeologists '
	hyp 8/50 (CER=23.5%): 'anastheaseeologists '
	hyp 9/50 (CER=29.4%): 'anistheaseolegests '
	hyp 10/50 (CER=29.4%): 'anastheaseolegests '
	hyp 11/50 (CER=23.5%): 'anistheaseologests '
	hyp 12/50 (CER=23.5%): 'anastheaseologests '
	hyp 13/50 (CER=23.5%): 'anistheaseolegists '
	hyp 14/50 (CER=29.4%): 'anestheaseeolegests '
	hyp 15/50 (CER=23.5%): 'anastheaseolegists '
	hyp 16/50 (CER=17.6%): 'anistheaseologists '
	hyp 17/50 (CER=23.5%): 'anestheaseeologests '
	hyp 18/50 (CER=41.2%): 'anist

added 20 new spellings for 'anesthesiologists' to its word set

sample 3 - (misrepresentations__LJ005-0034__occ1__len23200: 'misrepresentations')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/48 (CER=5.6%): 'misrepresuntations '
	hyp 2/48 (CER=11.1%): 'misreprasuntations '
	hyp 3/48 (CER=0.0%): 'misrepresentations '
	hyp 4/48 (CER=11.1%): 'misreprisuntations '
	hyp 5/48 (CER=5.6%): 'misreprasentations '
	hyp 6/48 (CER=11.1%): 'misrepreasuntations '
	hyp 7/48 (CER=5.6%): 'misreprisentations '
	hyp 8/48 (CER=5.6%): 'misrepreasentations '
	hyp 9/48 (CER=16.7%): 'misrepriasuntations '
	hyp 10/48 (CER=11.1%): 'misreprsuntations '
	hyp 11/48 (CER=5.6%): 'misrepreseuntations '
	hyp 12/48 (CER=11.1%): 'misrepriasentations '
	hyp 13/48 (CER=11.1%): 'misrepraseuntations '
	hyp 14/48 (CER=5.6%): 'misreprsentations '
	hyp 15/48 (CER=5.6%): 'misrepreseentations '
	hyp 16/48 (CER=11.1%): 'misrepriseuntations '
	hyp 17/48 (CER=11.1%): 'misrepraseentations '
	hyp 18/48 (CER=11.1%): 'misrepreaseuntat

added 20 new spellings for 'misrepresentations' to its word set

sample 4 - (institutionalized__LJ040-0144__occ1__len21760: 'institutionalized')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/47 (CER=5.9%): 'einstitutionalized '
	hyp 2/47 (CER=11.8%): 'einstitutionaliezed '
	hyp 3/47 (CER=0.0%): 'institutionalized '
	hyp 4/47 (CER=5.9%): 'institutionaliezed '
	hyp 5/47 (CER=11.8%): 'heinstitutionalized '
	hyp 6/47 (CER=17.6%): 'heinstitutionaliezed '
	hyp 7/47 (CER=5.9%): 'enstitutionalized '
	hyp 8/47 (CER=5.9%): 'hinstitutionalized '
	hyp 9/47 (CER=11.8%): 'enstitutionaliezed '
	hyp 10/47 (CER=11.8%): 'hinstitutionaliezed '
	hyp 11/47 (CER=5.9%): 'nstitutionalized '
	hyp 12/47 (CER=11.8%): 'einstotutionalized '
	hyp 13/47 (CER=11.8%): 'nstitutionaliezed '
	hyp 14/47 (CER=17.6%): 'einstotutionaliezed '
	hyp 15/47 (CER=5.9%): 'instotutionalized '
	hyp 16/47 (CER=11.8%): 'henstitutionalized '
	hyp 17/47 (CER=11.8%): 'instotutionaliezed '
	hyp 18/47 (CER=17.6%): 'henstitutionaliezed '
	

added 20 new spellings for 'institutionalized' to its word set

sample 5 - (specifications__LJ043-0092__occ1__len21600: 'specifications')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/47 (CER=0.0%): 'specifications '
	hyp 2/47 (CER=7.1%): 'speciffications '
	hyp 3/47 (CER=7.1%): 'spescifications '
	hyp 4/47 (CER=7.1%): 'spetcifications '
	hyp 5/47 (CER=7.1%): 'speecifications '
	hyp 6/47 (CER=7.1%): 'specsifications '
	hyp 7/47 (CER=7.1%): 'specifocations '
	hyp 8/47 (CER=14.3%): 'spesciffications '
	hyp 9/47 (CER=7.1%): 'speifications '
	hyp 10/47 (CER=7.1%): 'sprecifications '
	hyp 11/47 (CER=14.3%): 'spetciffications '
	hyp 12/47 (CER=14.3%): 'speeciffications '
	hyp 13/47 (CER=7.1%): 'speccifications '
	hyp 14/47 (CER=7.1%): 'sppecifications '
	hyp 15/47 (CER=7.1%): 'spesifications '
	hyp 16/47 (CER=14.3%): 'spestcifications '
	hyp 17/47 (CER=14.3%): 'specsiffications '
	hyp 18/47 (CER=7.1%): 'specificcations '
	hyp 19/47 (CER=14.3%): 'speescifications '
	hyp 20/47 (CER=14.3%): '

added 20 new spellings for 'specifications' to its word set


100%|███████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.40s/it]


sample 1 - (unconstitutionality__LJ021-0191__occ1__len25760: 'unconstitutionality')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=5.3%): 'unconstitutionaloity '
	hyp 2/50 (CER=0.0%): 'unconstitutionality '
	hyp 3/50 (CER=5.3%): 'unconstitutionaloty '
	hyp 4/50 (CER=10.5%): 'onconstitutionaloity '
	hyp 5/50 (CER=5.3%): 'onconstitutionality '
	hyp 6/50 (CER=10.5%): 'onconstitutionaloty '
	hyp 7/50 (CER=5.3%): 'unconstitutionalioty '
	hyp 8/50 (CER=5.3%): 'unconstitutionaleity '
	hyp 9/50 (CER=10.5%): 'onconstitutionalioty '
	hyp 10/50 (CER=10.5%): 'unconstitutionaleoty '
	hyp 11/50 (CER=10.5%): 'onconstitutionaleity '
	hyp 12/50 (CER=10.5%): 'unconstitutiownaloity '
	hyp 13/50 (CER=5.3%): 'unconstitutiownality '
	hyp 14/50 (CER=10.5%): 'unconstitotionaloity '
	hyp 15/50 (CER=5.3%): 'unconstitotionality '
	hyp 16/50 (CER=10.5%): 'unconstitutionaloirty '
	hyp 17/50 (CER=5.3%): 'unconstitutionalirty '
	hyp 18/50 (CER=15.8%): 'onconstitutionaleoty '
	hyp 19/50 (CER=10.5%): 'uncons




added 11 new spellings for 'unconstitutionality' to its word set

sample 2 - (anesthesiologists__LJ031-0023__occ1__len24000: 'anesthesiologists')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=29.4%): 'andastheseeologests '
	hyp 2/50 (CER=35.3%): 'andastheseeologeests '
	hyp 3/50 (CER=35.3%): 'andastheseheologests '
	hyp 4/50 (CER=35.3%): 'andastheaseeologests '
	hyp 5/50 (CER=41.2%): 'andastheseheologeests '
	hyp 6/50 (CER=41.2%): 'andastheaseeologeests '
	hyp 7/50 (CER=41.2%): 'andastheaseheologests '
	hyp 8/50 (CER=23.5%): 'anastheseeologests '
	hyp 9/50 (CER=47.1%): 'andastheaseheologeests '
	hyp 10/50 (CER=29.4%): 'anastheseeologeests '
	hyp 11/50 (CER=29.4%): 'anastheseheologests '
	hyp 12/50 (CER=29.4%): 'andatheseeologests '
	hyp 13/50 (CER=29.4%): 'anastheaseeologests '
	hyp 14/50 (CER=35.3%): 'anastheseheologeests '
	hyp 15/50 (CER=35.3%): 'andasheseeologests '
	hyp 16/50 (CER=35.3%): 'andatheseeologeests '
	hyp 17/50 (CER=35.3%): 'anastheaseeologeests '
	hyp 18/50 (

added 19 new spellings for 'anesthesiologists' to its word set

sample 3 - (misrepresentations__LJ005-0034__occ1__len23200: 'misrepresentations')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=5.6%): 'misreprsentations '
	hyp 2/50 (CER=5.6%): 'misreprasentations '
	hyp 3/50 (CER=11.1%): 'misreprsuntations '
	hyp 4/50 (CER=11.1%): 'misreprasuntations '
	hyp 5/50 (CER=11.1%): 'misreprsontations '
	hyp 6/50 (CER=11.1%): 'misrepersentations '
	hyp 7/50 (CER=11.1%): 'misreprasontations '
	hyp 8/50 (CER=11.1%): 'misreperasentations '
	hyp 9/50 (CER=16.7%): 'misrepersuntations '
	hyp 10/50 (CER=16.7%): 'misreperasuntations '
	hyp 11/50 (CER=11.1%): 'misrebrsentations '
	hyp 12/50 (CER=11.1%): 'mistreprsentations '
	hyp 13/50 (CER=11.1%): 'misreprseentations '
	hyp 14/50 (CER=11.1%): 'misrebrasentations '
	hyp 15/50 (CER=11.1%): 'misrepprsentations '
	hyp 16/50 (CER=11.1%): 'mistreprasentations '
	hyp 17/50 (CER=11.1%): 'misrepraseentations '
	hyp 18/50 (CER=11.1%): 'misrepprasentatio

added 15 new spellings for 'misrepresentations' to its word set

sample 4 - (institutionalized__LJ040-0144__occ1__len21760: 'institutionalized')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=5.9%): 'einstitutionalized '
	hyp 2/50 (CER=0.0%): 'institutionalized '
	hyp 3/50 (CER=11.8%): 'heinstitutionalized '
	hyp 4/50 (CER=5.9%): 'hinstitutionalized '
	hyp 5/50 (CER=11.8%): 'einstitutionallized '
	hyp 6/50 (CER=5.9%): 'institutionallized '
	hyp 7/50 (CER=11.8%): 'einstitutionalizced '
	hyp 8/50 (CER=11.8%): 'einstitutionalizsed '
	hyp 9/50 (CER=5.9%): 'institutionalizced '
	hyp 10/50 (CER=5.9%): 'institutionalizsed '
	hyp 11/50 (CER=17.6%): 'heinstitutionallized '
	hyp 12/50 (CER=11.8%): 'hinstitutionallized '
	hyp 13/50 (CER=17.6%): 'hheinstitutionalized '
	hyp 14/50 (CER=17.6%): 'heinstitutionalizced '
	hyp 15/50 (CER=17.6%): 'heinstitutionalizsed '
	hyp 16/50 (CER=11.8%): 'hhinstitutionalized '
	hyp 17/50 (CER=11.8%): 'hinstitutionalizced '
	hyp 18/50 (CER=5.9%): 'ainstitut

added 16 new spellings for 'institutionalized' to its word set

sample 5 - (specifications__LJ043-0092__occ1__len21600: 'specifications')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/43 (CER=0.0%): 'specifications '
	hyp 2/43 (CER=7.1%): 'spescifications '
	hyp 3/43 (CER=7.1%): 'speciffications '
	hyp 4/43 (CER=7.1%): 'specificcations '
	hyp 5/43 (CER=7.1%): 'spefcifications '
	hyp 6/43 (CER=7.1%): 'speccifications '
	hyp 7/43 (CER=7.1%): 'specificastions '
	hyp 8/43 (CER=7.1%): 'speifications '
	hyp 9/43 (CER=7.1%): 'specsifications '
	hyp 10/43 (CER=14.3%): 'spesciffications '
	hyp 11/43 (CER=7.1%): 'specificationns '
	hyp 12/43 (CER=7.1%): 'specefications '
	hyp 13/43 (CER=7.1%): 'specafications '
	hyp 14/43 (CER=7.1%): 'spesifications '
	hyp 15/43 (CER=7.1%): 'sppecifications '
	hyp 16/43 (CER=14.3%): 'spesscifications '
	hyp 17/43 (CER=14.3%): 'spescificcations '
	hyp 18/43 (CER=7.1%): 'spetcifications '
	hyp 19/43 (CER=14.3%): 'specifficcations '
	hyp 20/43 (CER=7.1%): 'spefif

added 9 new spellings for 'specifications' to its word set


100%|███████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.41s/it]


sample 1 - (unconstitutionality__LJ021-0191__occ1__len25760: 'unconstitutionality')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=0.0%): 'unconstitutionality '
	hyp 2/50 (CER=5.3%): 'unconstitutionalioty '
	hyp 3/50 (CER=5.3%): 'unconstitutionaloity '
	hyp 4/50 (CER=5.3%): 'onconstitutionality '
	hyp 5/50 (CER=5.3%): 'unconstitutionaloty '
	hyp 6/50 (CER=10.5%): 'onconstitutionalioty '
	hyp 7/50 (CER=10.5%): 'onconstitutionaloity '
	hyp 8/50 (CER=5.3%): 'unconstitutionaulity '
	hyp 9/50 (CER=5.3%): 'unconstitutionalty '
	hyp 10/50 (CER=10.5%): 'onconstitutionaloty '
	hyp 11/50 (CER=5.3%): 'unconsttitutionality '
	hyp 12/50 (CER=10.5%): 'unconstitutionaulioty '
	hyp 13/50 (CER=10.5%): 'unconstitutionauloity '
	hyp 14/50 (CER=5.3%): 'cunconstitutionality '
	hyp 15/50 (CER=10.5%): 'unconsttitutionalioty '
	hyp 16/50 (CER=10.5%): 'onconstitutionaulity '
	hyp 17/50 (CER=10.5%): 'onconstitutionalty '
	hyp 18/50 (CER=10.5%): 'unconsttitutionaloity '
	hyp 19/50 (CER=10.5%): 'unconst




added 10 new spellings for 'unconstitutionality' to its word set

sample 2 - (anesthesiologists__LJ031-0023__occ1__len24000: 'anesthesiologists')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=41.2%): 'andastheaseeolegests '
	hyp 2/50 (CER=35.3%): 'andastheseeolegests '
	hyp 3/50 (CER=35.3%): 'andastheaseeolegists '
	hyp 4/50 (CER=29.4%): 'andastheseeolegists '
	hyp 5/50 (CER=35.3%): 'anastheaseeolegests '
	hyp 6/50 (CER=29.4%): 'anastheseeolegests '
	hyp 7/50 (CER=47.1%): 'andastheaseheolegests '
	hyp 8/50 (CER=41.2%): 'andastheseheolegests '
	hyp 9/50 (CER=29.4%): 'anastheaseeolegists '
	hyp 10/50 (CER=23.5%): 'anastheseeolegists '
	hyp 11/50 (CER=41.2%): 'andistheaseeolegests '
	hyp 12/50 (CER=41.2%): 'andastheaseheolegists '
	hyp 13/50 (CER=35.3%): 'andistheseeolegests '
	hyp 14/50 (CER=35.3%): 'andastheseheolegists '
	hyp 15/50 (CER=35.3%): 'andistheaseeolegists '
	hyp 16/50 (CER=29.4%): 'andistheseeolegists '
	hyp 17/50 (CER=41.2%): 'anastheaseheolegests '
	hyp 18/50 (CE

added 17 new spellings for 'anesthesiologists' to its word set

sample 3 - (misrepresentations__LJ005-0034__occ1__len23200: 'misrepresentations')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=5.6%): 'misreprasentations '
	hyp 2/50 (CER=11.1%): 'misreprasuntations '
	hyp 3/50 (CER=11.1%): 'misreperasentations '
	hyp 4/50 (CER=5.6%): 'misreprsentations '
	hyp 5/50 (CER=16.7%): 'misreperasuntations '
	hyp 6/50 (CER=11.1%): 'misreprsuntations '
	hyp 7/50 (CER=11.1%): 'misrepersentations '
	hyp 8/50 (CER=16.7%): 'misrepersuntations '
	hyp 9/50 (CER=11.1%): 'misreprasontations '
	hyp 10/50 (CER=16.7%): 'misreperasontations '
	hyp 11/50 (CER=11.1%): 'missreprasentations '
	hyp 12/50 (CER=11.1%): 'misreporasentations '
	hyp 13/50 (CER=11.1%): 'misreprsontations '
	hyp 14/50 (CER=16.7%): 'missreprasuntations '
	hyp 15/50 (CER=11.1%): 'misreprasemntations '
	hyp 16/50 (CER=16.7%): 'misreporasuntations '
	hyp 17/50 (CER=16.7%): 'missreperasentations '
	hyp 18/50 (CER=16.7%): 'misreprasu

added 10 new spellings for 'misrepresentations' to its word set

sample 4 - (institutionalized__LJ040-0144__occ1__len21760: 'institutionalized')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=5.9%): 'einstitutionalized '
	hyp 2/50 (CER=0.0%): 'institutionalized '
	hyp 3/50 (CER=11.8%): 'heinstitutionalized '
	hyp 4/50 (CER=5.9%): 'hinstitutionalized '
	hyp 5/50 (CER=11.8%): 'teinstitutionalized '
	hyp 6/50 (CER=5.9%): 'tinstitutionalized '
	hyp 7/50 (CER=5.9%): 'ainstitutionalized '
	hyp 8/50 (CER=11.8%): 'einstitutionallized '
	hyp 9/50 (CER=11.8%): 'einstitutionalizsed '
	hyp 10/50 (CER=11.8%): 'einstitutionalised '
	hyp 11/50 (CER=5.9%): 'institutionallized '
	hyp 12/50 (CER=5.9%): 'institutionalizsed '
	hyp 13/50 (CER=5.9%): 'institutionalised '
	hyp 14/50 (CER=17.6%): 'hheinstitutionalized '
	hyp 15/50 (CER=11.8%): 'einstiutionalized '
	hyp 16/50 (CER=11.8%): 'hhinstitutionalized '
	hyp 17/50 (CER=17.6%): 'theinstitutionalized '
	hyp 18/50 (CER=11.8%): 'hainstitutionalize

added 9 new spellings for 'institutionalized' to its word set

sample 5 - (specifications__LJ043-0092__occ1__len21600: 'specifications')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/47 (CER=0.0%): 'specifications '
	hyp 2/47 (CER=7.1%): 'spesifications '
	hyp 3/47 (CER=7.1%): 'speifications '
	hyp 4/47 (CER=7.1%): 'spescifications '
	hyp 5/47 (CER=7.1%): 'spetcifications '
	hyp 6/47 (CER=7.1%): 'specsifications '
	hyp 7/47 (CER=14.3%): 'spetsifications '
	hyp 8/47 (CER=14.3%): 'spessifications '
	hyp 9/47 (CER=7.1%): 'speccifications '
	hyp 10/47 (CER=7.1%): 'speciffications '
	hyp 11/47 (CER=7.1%): 'spectifications '
	hyp 12/47 (CER=7.1%): 'spetifications '
	hyp 13/47 (CER=14.3%): 'spetscifications '
	hyp 14/47 (CER=14.3%): 'spesscifications '
	hyp 15/47 (CER=14.3%): 'spesiffications '
	hyp 16/47 (CER=14.3%): 'spestifications '
	hyp 17/47 (CER=14.3%): 'spetcsifications '
	hyp 18/47 (CER=14.3%): 'spescsifications '
	hyp 19/47 (CER=14.3%): 'specscifications '
	hyp 20/47 (CER=14.3%): 

added 11 new spellings for 'specifications' to its word set


100%|███████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.42s/it]


sample 1 - (unconstitutionality__LJ021-0191__occ1__len25760: 'unconstitutionality')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=5.3%): 'unconstitutionaloity '
	hyp 2/50 (CER=5.3%): 'unconstitutionaloty '
	hyp 3/50 (CER=10.5%): 'onconstitutionaloity '
	hyp 4/50 (CER=10.5%): 'onconstitutionaloty '
	hyp 5/50 (CER=0.0%): 'unconstitutionality '
	hyp 6/50 (CER=5.3%): 'unconstitutionalioty '
	hyp 7/50 (CER=5.3%): 'onconstitutionality '
	hyp 8/50 (CER=10.5%): 'onconstitutionalioty '
	hyp 9/50 (CER=10.5%): 'unconstitoutionaloity '
	hyp 10/50 (CER=10.5%): 'unconstitutionhaloity '
	hyp 11/50 (CER=10.5%): 'unconstatutionaloity '
	hyp 12/50 (CER=10.5%): 'tunconstitutionaloity '
	hyp 13/50 (CER=10.5%): 'unconstitutiornaloity '
	hyp 14/50 (CER=10.5%): 'unconstitotionaloity '
	hyp 15/50 (CER=10.5%): 'unconstitoutionaloty '
	hyp 16/50 (CER=10.5%): 'unconstitutionalolty '
	hyp 17/50 (CER=10.5%): 'unconstitutionhaloty '
	hyp 18/50 (CER=10.5%): 'unconstatutionaloty '
	hyp 19/50 (CER=10.5%): '




added 10 new spellings for 'unconstitutionality' to its word set

sample 2 - (anesthesiologists__LJ031-0023__occ1__len24000: 'anesthesiologists')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=29.4%): 'anastheasheologiests '
	hyp 2/50 (CER=29.4%): 'anastheasheologests '
	hyp 3/50 (CER=35.3%): 'anastheaseheologiests '
	hyp 4/50 (CER=35.3%): 'anasheasheologiests '
	hyp 5/50 (CER=23.5%): 'anastheasheologists '
	hyp 6/50 (CER=35.3%): 'anastheaseheologests '
	hyp 7/50 (CER=35.3%): 'andastheasheologiests '
	hyp 8/50 (CER=35.3%): 'anasheasheologests '
	hyp 9/50 (CER=41.2%): 'anasheaseheologiests '
	hyp 10/50 (CER=29.4%): 'anistheasheologiests '
	hyp 11/50 (CER=35.3%): 'andastheasheologests '
	hyp 12/50 (CER=29.4%): 'anastheashyologiests '
	hyp 13/50 (CER=29.4%): 'anastheaseheologists '
	hyp 14/50 (CER=41.2%): 'andastheaseheologiests '
	hyp 15/50 (CER=29.4%): 'anasheasheologists '
	hyp 16/50 (CER=23.5%): 'anasthesheologiests '
	hyp 17/50 (CER=41.2%): 'anasheaseheologests '
	hyp 18/50 

added 19 new spellings for 'anesthesiologists' to its word set

sample 3 - (misrepresentations__LJ005-0034__occ1__len23200: 'misrepresentations')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/46 (CER=5.6%): 'misreprsentations '
	hyp 2/46 (CER=5.6%): 'misreprasentations '
	hyp 3/46 (CER=11.1%): 'misreprsuntations '
	hyp 4/46 (CER=11.1%): 'misreprasuntations '
	hyp 5/46 (CER=11.1%): 'misreprsontations '
	hyp 6/46 (CER=11.1%): 'misreprasontations '
	hyp 7/46 (CER=11.1%): 'misrebrsentations '
	hyp 8/46 (CER=11.1%): 'misrebrasentations '
	hyp 9/46 (CER=11.1%): 'misreprsantations '
	hyp 10/46 (CER=16.7%): 'misrebrsuntations '
	hyp 11/46 (CER=11.1%): 'misreprasantations '
	hyp 12/46 (CER=16.7%): 'misrebrasuntations '
	hyp 13/46 (CER=11.1%): 'missreprsentations '
	hyp 14/46 (CER=11.1%): 'misreprseentations '
	hyp 15/46 (CER=11.1%): 'missreprasentations '
	hyp 16/46 (CER=5.6%): 'misreprisentations '
	hyp 17/46 (CER=16.7%): 'missreprsuntations '
	hyp 18/46 (CER=11.1%): 'misrepraseentations '
	

added 6 new spellings for 'misrepresentations' to its word set

sample 4 - (institutionalized__LJ040-0144__occ1__len21760: 'institutionalized')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/44 (CER=5.9%): 'nstitutionalized '
	hyp 2/44 (CER=11.8%): 'nstotutionalized '
	hyp 3/44 (CER=0.0%): 'institutionalized '
	hyp 4/44 (CER=11.8%): 'nstitutionaliszed '
	hyp 5/44 (CER=11.8%): 'nstitutionalizsed '
	hyp 6/44 (CER=11.8%): 'nstitutionalinzed '
	hyp 7/44 (CER=11.8%): 'nstetutionalized '
	hyp 8/44 (CER=5.9%): 'tnstitutionalized '
	hyp 9/44 (CER=11.8%): 'nstitutionalizced '
	hyp 10/44 (CER=11.8%): 'nstitutionallized '
	hyp 11/44 (CER=5.9%): 'enstitutionalized '
	hyp 12/44 (CER=5.9%): 'instotutionalized '
	hyp 13/44 (CER=17.6%): 'nstotutionaliszed '
	hyp 14/44 (CER=11.8%): 'nstituntionalized '
	hyp 15/44 (CER=5.9%): 'institutionaliszed '
	hyp 16/44 (CER=11.8%): 'nstitutionaliczed '
	hyp 17/44 (CER=5.9%): 'hnstitutionalized '
	hyp 18/44 (CER=11.8%): 'nstiutionalized '
	hyp 19/44 (CER=11.8%): '

added 15 new spellings for 'institutionalized' to its word set

sample 5 - (specifications__LJ043-0092__occ1__len21600: 'specifications')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/49 (CER=0.0%): 'specifications '
	hyp 2/49 (CER=7.1%): 'specificcations '
	hyp 3/49 (CER=7.1%): 'specsifications '
	hyp 4/49 (CER=7.1%): 'spetcifications '
	hyp 5/49 (CER=7.1%): 'spescifications '
	hyp 6/49 (CER=7.1%): 'spesifications '
	hyp 7/49 (CER=7.1%): 'specifocations '
	hyp 8/49 (CER=7.1%): 'specifoications '
	hyp 9/49 (CER=7.1%): 'speifications '
	hyp 10/49 (CER=7.1%): 'speccifications '
	hyp 11/49 (CER=7.1%): 'specfifications '
	hyp 12/49 (CER=14.3%): 'specsificcations '
	hyp 13/49 (CER=14.3%): 'spetcificcations '
	hyp 14/49 (CER=14.3%): 'spescificcations '
	hyp 15/49 (CER=14.3%): 'spetcsifications '
	hyp 16/49 (CER=14.3%): 'spescsifications '
	hyp 17/49 (CER=7.1%): 'spefcifications '
	hyp 18/49 (CER=7.1%): 'speciffications '
	hyp 19/49 (CER=7.1%): 'spefifications '
	hyp 20/49 (CER=14.3%): 'spe

added 5 new spellings for 'specifications' to its word set


100%|███████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.40s/it]


sample 1 - (unconstitutionality__LJ021-0191__occ1__len25760: 'unconstitutionality')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=0.0%): 'unconstitutionality '
	hyp 2/50 (CER=5.3%): 'onconstitutionality '
	hyp 3/50 (CER=5.3%): 'unconstitutionaloty '
	hyp 4/50 (CER=5.3%): 'unconstatutionality '
	hyp 5/50 (CER=5.3%): 'cunconstitutionality '
	hyp 6/50 (CER=5.3%): 'unconstitusionality '
	hyp 7/50 (CER=5.3%): 'unconstitutionhality '
	hyp 8/50 (CER=5.3%): 'unconstitutionalty '
	hyp 9/50 (CER=5.3%): 'tunconstitutionality '
	hyp 10/50 (CER=5.3%): 'unconstitucionality '
	hyp 11/50 (CER=5.3%): 'unconstiatutionality '
	hyp 12/50 (CER=5.3%): 'unconstitutsionality '
	hyp 13/50 (CER=5.3%): 'unconsttutionality '
	hyp 14/50 (CER=10.5%): 'onconstitutionaloty '
	hyp 15/50 (CER=5.3%): 'unconstitutcionality '
	hyp 16/50 (CER=5.3%): 'unconstitudionality '
	hyp 17/50 (CER=5.3%): 'unconstitutionalit '
	hyp 18/50 (CER=5.3%): 'ounconstitutionality '
	hyp 19/50 (CER=5.3%): 'unconstitutionaloity '
	hy




added 8 new spellings for 'unconstitutionality' to its word set

sample 2 - (anesthesiologists__LJ031-0023__occ1__len24000: 'anesthesiologists')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/50 (CER=29.4%): 'anestheaseeolegests '
	hyp 2/50 (CER=23.5%): 'anestheaseeologests '
	hyp 3/50 (CER=35.3%): 'anistheaseeolegests '
	hyp 4/50 (CER=29.4%): 'anistheaseeologests '
	hyp 5/50 (CER=23.5%): 'anestheseeolegests '
	hyp 6/50 (CER=17.6%): 'anestheseeologests '
	hyp 7/50 (CER=29.4%): 'anistheseeolegests '
	hyp 8/50 (CER=23.5%): 'anistheseeologests '
	hyp 9/50 (CER=35.3%): 'anesheaseeolegests '
	hyp 10/50 (CER=29.4%): 'anesheaseeologests '
	hyp 11/50 (CER=41.2%): 'anisheaseeolegests '
	hyp 12/50 (CER=35.3%): 'anisheaseeologests '
	hyp 13/50 (CER=29.4%): 'anesheseeolegests '
	hyp 14/50 (CER=23.5%): 'anesheseeologests '
	hyp 15/50 (CER=35.3%): 'anastheaseeolegests '
	hyp 16/50 (CER=29.4%): 'anastheaseeologests '
	hyp 17/50 (CER=35.3%): 'anisheseeolegests '
	hyp 18/50 (CER=29.4%): 'anisheseeolog

added 14 new spellings for 'anesthesiologists' to its word set

sample 3 - (misrepresentations__LJ005-0034__occ1__len23200: 'misrepresentations')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/49 (CER=11.1%): 'misrepersentations '
	hyp 2/49 (CER=5.6%): 'misreprsentations '
	hyp 3/49 (CER=11.1%): 'misreperasentations '
	hyp 4/49 (CER=5.6%): 'misreprasentations '
	hyp 5/49 (CER=16.7%): 'misrepersontations '
	hyp 6/49 (CER=16.7%): 'missrepersentations '
	hyp 7/49 (CER=16.7%): 'misrepersuntations '
	hyp 8/49 (CER=11.1%): 'misreprsontations '
	hyp 9/49 (CER=11.1%): 'missreprsentations '
	hyp 10/49 (CER=11.1%): 'misreprsuntations '
	hyp 11/49 (CER=5.6%): 'misreperesentations '
	hyp 12/49 (CER=11.1%): 'misreparsentations '
	hyp 13/49 (CER=16.7%): 'misreperasontations '
	hyp 14/49 (CER=16.7%): 'missreperasentations '
	hyp 15/49 (CER=0.0%): 'misrepresentations '
	hyp 16/49 (CER=16.7%): 'misreperasuntations '
	hyp 17/49 (CER=16.7%): 'misrepersantations '
	hyp 18/49 (CER=16.7%): 'misrepersenetat

added 6 new spellings for 'misrepresentations' to its word set

sample 4 - (institutionalized__LJ040-0144__occ1__len21760: 'institutionalized')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/49 (CER=11.8%): 'heinstitutionalized '
	hyp 2/49 (CER=11.8%): 'henstitutionalized '
	hyp 3/49 (CER=5.9%): 'einstitutionalized '
	hyp 4/49 (CER=5.9%): 'enstitutionalized '
	hyp 5/49 (CER=5.9%): 'hinstitutionalized '
	hyp 6/49 (CER=5.9%): 'hnstitutionalized '
	hyp 7/49 (CER=11.8%): 'hiinstitutionalized '
	hyp 8/49 (CER=11.8%): 'hainstitutionalized '
	hyp 9/49 (CER=11.8%): 'hanstitutionalized '
	hyp 10/49 (CER=0.0%): 'institutionalized '
	hyp 11/49 (CER=5.9%): 'nstitutionalized '
	hyp 12/49 (CER=5.9%): 'iinstitutionalized '
	hyp 13/49 (CER=5.9%): 'ainstitutionalized '
	hyp 14/49 (CER=5.9%): 'anstitutionalized '
	hyp 15/49 (CER=17.6%): 'theinstitutionalized '
	hyp 16/49 (CER=17.6%): 'heinstitutionallized '
	hyp 17/49 (CER=17.6%): 'heinstitutionaliszed '
	hyp 18/49 (CER=17.6%): 'thenstitutionalized '
	

added 8 new spellings for 'institutionalized' to its word set

sample 5 - (specifications__LJ043-0092__occ1__len21600: 'specifications')
Original Utterance: INSERT ORIG UTT TEXT
	hyp 1/46 (CER=0.0%): 'specifications '
	hyp 2/46 (CER=7.1%): 'spescifications '
	hyp 3/46 (CER=7.1%): 'specsifications '
	hyp 4/46 (CER=7.1%): 'speciffications '
	hyp 5/46 (CER=7.1%): 'spectifications '
	hyp 6/46 (CER=7.1%): 'spesifications '
	hyp 7/46 (CER=7.1%): 'speifications '
	hyp 8/46 (CER=14.3%): 'spescsifications '
	hyp 9/46 (CER=14.3%): 'spesscifications '
	hyp 10/46 (CER=7.1%): 'specificcations '
	hyp 11/46 (CER=14.3%): 'spesciffications '
	hyp 12/46 (CER=14.3%): 'spesctifications '
	hyp 13/46 (CER=14.3%): 'specsiffications '
	hyp 14/46 (CER=7.1%): 'specisfications '
	hyp 15/46 (CER=7.1%): 'spetcifications '
	hyp 16/46 (CER=7.1%): 'specifocations '
	hyp 17/46 (CER=7.1%): 'spefcifications '
	hyp 18/46 (CER=14.3%): 'spestifications '
	hyp 19/46 (CER=7.1%): 'spetifications '
	hyp 20/46 (CER=7.1%): 'spee

added 2 new spellings for 'specifications' to its word set


In [425]:
# Summarise the respellings collected for every original word: one header line
# per word, then each respelling in alphabetical order together with its
# character error rate (CER) measured against the original spelling.
for orig_word, word_set in word_sets.items():
    n_respellings = len(word_set)
    print(f"\nGenerated {n_respellings} respellings for '{orig_word}' (ordered a-z):")
    for word in sorted(word_set):
        # cer() takes the reference first and the hypothesis second; the
        # result is scaled by 100 so it is printed as a percentage.
        cer_percent = 100 * cer(orig_word, word)
        print(f"\t{word}: {cer_percent:.2f}% CER")


Respellings for 'unconstitutionality' (ordered a-z):
	cunconstitutionality : 5.26% CER
	dunconstitutionality : 5.26% CER
	gonconstitutionality : 10.53% CER
	gunconstitutionality : 5.26% CER
	onconstitotionality : 10.53% CER
	onconstitutionaleity : 10.53% CER
	onconstitutionaleoty : 15.79% CER
	onconstitutionalioty : 10.53% CER
	onconstitutionality : 5.26% CER
	onconstitutionaloity : 10.53% CER
	onconstitutionaloty : 10.53% CER
	onconstitutionalty : 10.53% CER
	onconstitutionaulity : 10.53% CER
	ounconstitutionality : 5.26% CER
	tunconstitutionality : 5.26% CER
	tunconstitutionaloity : 10.53% CER
	tunconstitutionaloty : 10.53% CER
	unconstatutionality : 5.26% CER
	unconstatutionaloity : 10.53% CER
	unconstatutionaloty : 10.53% CER
	unconstiatutionality : 5.26% CER
	unconstitotionality : 5.26% CER
	unconstitotionaloity : 10.53% CER
	unconstitoutionaloity : 10.53% CER
	unconstitoutionaloty : 10.53% CER
	unconstitucionality : 5.26% CER
	unconstitudionality : 5.26% CER
	unconstituionality 