# Goal of this notebook

- use certain metrics (LM perplexity, G2P error) to identify wordtypes that are more likely to be mispronounced by our grapheme-input TTS system

# automatic reloading magic

In [1]:
%load_ext autoreload
%autoreload 2

# Imports 

In [2]:
import json
from datasets import load_dataset
import torch
import jiwer

# check if have correct type of node 

In [3]:
import socket

# print hostname to make sure we are on correct node
disallowed_nodes = ['escience6']
hostname = socket.gethostname()
print(hostname)
node = hostname.split('.')[0]
if node in disallowed_nodes:
    raise ValueError(f"Running on disallowed node {node}!")

levi.inf.ed.ac.uk


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.is_available()
assert torch.cuda.is_available()

# Load OOV list

(wordtypes not seen in the half of LJSpeech used to train TTS and ASR)

In [5]:
# get oov wordtypes list (words that are not seen in tts training)
oov_wordlist_path = '/home/s1785140/data/ljspeech_fastpitch/oov_list.json'
with open(oov_wordlist_path, 'r') as f:
    oovs_and_freqs = json.load(f)
    
oovs = set(w.strip() for w in oovs_and_freqs.keys())
print(f'original before cleaning/sampling {len(oovs)=}')

original before cleaning/sampling len(oovs)=8343


# Load G2P pronunciation lexicon

## LibriG2P Lexicon 

In [6]:
# load lexicon that G2P model was trained on
dataset_dict = load_dataset("flexthink/librig2p-nostress")

Found cached dataset librig2p-nostress (/home/s1785140/.cache/huggingface/datasets/flexthink___librig2p-nostress/default/0.0.0/95c204c6be42796a753ef410b5dfce2bfa21d61b51f0c3ffe85cf6e3a4dee65f)


  0%|          | 0/6 [00:00<?, ?it/s]

In [7]:
type(dataset_dict['lexicon_train'])

datasets.arrow_dataset.Dataset

In [8]:
# combine data splits in the lexicon 
# (as we are not training G2P, don't need train valid test splits)
from datasets import concatenate_datasets
datasets = [dataset_dict['lexicon_train'], dataset_dict['lexicon_valid'], dataset_dict['lexicon_test']]
dataset = concatenate_datasets(datasets)

In [9]:
# create dict of wordtype to pronunciation
lexicon = {}
for char, phn in zip(dataset['char'], dataset['phn']):
    lexicon[char.lower().strip()] = phn

## CMUdict

## combine the libriG2P with CMUdict

# only consider the OOV words that are in the pronunciation lexicon


In [10]:
len(oovs)

8343

In [11]:
oovs_in_lexicon = oovs.intersection(lexicon.keys())
len(oovs_in_lexicon)

7966

# G2P Error Rates

Only will work with words that occur in the lexicon?

In [12]:
oovs_in_lexicon = sorted(oovs_in_lexicon, key=lambda x: len(x), reverse=True) # sort by len to make batches more efficient (shud help with soundchoiceG2P, not sure about phonetisaurus)

## use g2p to predict phn seqs

In [13]:
from tqdm import tqdm
import os
wordtype2predicted_phnseq = {}

In [16]:
use_phonetisaurus = True

if use_phonetisaurus:
    import subprocess
    # to stop warning
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'

    shell_cmd = [
        "phonetisaurus",
        "predict",
        "--model", 
        "/home/s1785140/data/ljspeech_fastpitch/train_meta_half_phonetisaurusG2P_model.fst"
    ]
    words = oovs_in_lexicon
    shell_result = subprocess.run(shell_cmd + words, capture_output=True, text=True)
    lines = shell_result.stdout.split('\n')
    lines = [l for l in lines if l != ""] 
    
    wordtype2predicted_phnseq = {}
    for l in lines:
        spelling = l.split(" ")[0]
        phones = l.split(" ")[1:]
        wordtype2predicted_phnseq[spelling] = phones
else:
    # load G2P model 
    from speechbrain.pretrained import GraphemeToPhoneme
    g2p = GraphemeToPhoneme.from_hparams("speechbrain/soundchoice-g2p")
    
    batch_size = 128
    warning_phn_seqs = []
    for i in tqdm(range(0, len(oovs_in_lexicon), batch_size)):
        batch = oovs_in_lexicon[i:i+batch_size]
        predictions = g2p(batch)
        for wordtype, phn_seq in zip(batch, predictions):
            if "" in phn_seq or " " in phn_seq:
                warning_phn_seqs.append((wordtype, phn_seq))
            phn_seq = [phn for phn in phn_seq if phn not in ["", " "]] # strip empty phns
            wordtype2predicted_phnseq[wordtype] = phn_seq
        # break # debug

## calc PER for each phn seqs

In [17]:
import pandas as pd

In [18]:
import textdistance
lev = textdistance.Levenshtein() # create an instance of Levenshtein distance
def per(ref, hyp):
    dist = lev.distance(ref, hyp) # calculate the distance between ref and hyp
    per = dist / len(ref) # calculate the PER by dividing by the length of ref
    return per

In [19]:
data = []
for wordtype, predicted_phnseq in wordtype2predicted_phnseq.items():
    data.append({
        "wordtype": wordtype, 
        "lexicon_entry": lexicon[wordtype], 
        "predicted_phnseq": predicted_phnseq, 
        "per": per(lexicon[wordtype], predicted_phnseq),
    })

In [20]:
df = pd.DataFrame(data)
df.sort_values('per', ascending=False)

Unnamed: 0,wordtype,lexicon_entry,predicted_phnseq,per
7954,ah,[AA],"[AH, HH]",2.000000
4168,deutsche,"[D, OY, CH]","[D, IH, UW, T, S, K, IY]",2.000000
7874,nux,"[N, UW]","[N, AH, K, S]",1.500000
4423,jacques,"[ZH, AA, K]","[JH, AH, K, W, Z]",1.333333
6791,chaos,"[K, EY, AA, S]","[CH, AE, OW, Z]",1.000000
...,...,...,...,...
3566,fiercely,"[F, IH, R, S, L, IY]","[F, IH, R, S, L, IY]",0.000000
3565,laughing,"[L, AE, F, IH, NG]","[L, AE, F, IH, NG]",0.000000
3564,indulged,"[IH, N, D, AH, L, JH, D]","[IH, N, D, AH, L, JH, D]",0.000000
3562,coverest,"[K, AH, V, R, AH, S, T]","[K, AH, V, R, AH, S, T]",0.000000


##  save oov list with PERs to disk 

In [21]:
outpath = '/home/s1785140/data/ljspeech_fastpitch/oov_list_with_PER.pickle'
df.to_pickle(outpath)

## load oov with PER from disk

In [22]:
loaded_df = pd.read_pickle(outpath)
orig_num = len(loaded_df)
orig_num

7966

### do some filtering

In [23]:
str_min_len_threshold = 10
loaded_df = loaded_df[loaded_df['wordtype'].str.len() >= str_min_len_threshold]
new_num = len(loaded_df)
print(f'filtered out {orig_num - new_num}')
new_num

filtered out 5967


1999

### display

In [24]:
NUM_TO_DISPLAY = 50
loaded_df.sort_values('per', ascending=False).head(NUM_TO_DISPLAY)

Unnamed: 0,wordtype,lexicon_entry,predicted_phnseq,per
1275,saccharine,"[S, AE, K, ER, AY, N]","[S, EY, K, CH, EH, R, AH, N]",0.833333
692,tocqueville,"[T, OW, K, V, IH, L]","[T, AH, K, K, W, EH, V, IH, L]",0.666667
42,gloucestershire,"[G, L, AO, S, T, ER, SH, ER]","[G, L, AW, S, EH, S, T, ER, SH, IH, R]",0.625
1407,peripheral,"[P, ER, IH, F, R, AH, L]","[P, IH, R, IY, F, ER, AH, L]",0.571429
1963,allegiance,"[AH, L, IY, JH, AH, N, S]","[AE, L, AH, G, AY, AH, N, S]",0.571429
1543,khrushchev,"[K, R, UW, S, CH, EH, V]","[K, R, AH, SH, SH, IH, V]",0.571429
1630,bequeathed,"[B, AH, K, W, IY, TH, T]","[B, IH, K, IY, DH, D]",0.571429
1787,sepulchres,"[S, EY, P, UW, L, CH, R, Z]","[S, EH, P, Y, AH, L, CH, ER, Z]",0.5
1180,gloucester,"[G, L, AO, S, T, ER]","[G, L, AW, S, EH, S, T, ER]",0.5
729,exhortation,"[EH, G, Z, AO, R, T, EY, SH, AH, N]","[IH, K, S, HH, ER, T, EY, SH, AH, N]",0.5


# LM Perplexity