# Goal of this notebook

- use certain metrics (LM perplexity, G2P error) to identify wordtypes that are more likely to be mispronounced by our grapheme-input TTS system

# automatic reloading magic

In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Imports 

In [44]:
import json
from datasets import load_dataset
import torch
import jiwer

# check if have correct type of node 

In [45]:
import socket

# print hostname to make sure we are on correct node
disallowed_nodes = ['escience6']
hostname = socket.gethostname()
print(hostname)
node = hostname.split('.')[0]
if node in disallowed_nodes:
    raise ValueError(f"Running on disallowed node {node}!")

greider.inf.ed.ac.uk


In [46]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.is_available()
assert torch.cuda.is_available()

# Load OOV list

(wordtypes not seen in the half of LJSpeech used to train TTS and ASR)

In [47]:
# get oov wordtypes list (words that are not seen in tts training)
oov_wordlist_path = '/home/s1785140/data/ljspeech_fastpitch/oov_list.json'
with open(oov_wordlist_path, 'r') as f:
    oovs_and_freqs = json.load(f)
    
oovs = set(w.strip() for w in oovs_and_freqs.keys())
print(f'original before cleaning/sampling {len(oovs)=}')

original before cleaning/sampling len(oovs)=8343


# Load G2P pronunciation lexicon

In [48]:
# load lexicon that G2P model was trained on
dataset_dict = load_dataset("flexthink/librig2p-nostress")

Found cached dataset librig2p-nostress (/home/s1785140/.cache/huggingface/datasets/flexthink___librig2p-nostress/default/0.0.0/95c204c6be42796a753ef410b5dfce2bfa21d61b51f0c3ffe85cf6e3a4dee65f)


  0%|          | 0/6 [00:00<?, ?it/s]

In [49]:
type(dataset_dict['lexicon_train'])

datasets.arrow_dataset.Dataset

In [50]:
# combine data splits in the lexicon 
# (as we are not training G2P, don't need train valid test splits)
from datasets import concatenate_datasets
datasets = [dataset_dict['lexicon_train'], dataset_dict['lexicon_valid'], dataset_dict['lexicon_test']]
dataset = concatenate_datasets(datasets)

In [51]:
# create dict of wordtype to pronunciation
lexicon = {}
for char, phn in zip(dataset['char'], dataset['phn']):
    lexicon[char.lower().strip()] = phn

# only consider the OOV words that are in the pronunciation lexicon


In [52]:
len(oovs)

8343

In [53]:
oovs_in_lexicon = oovs.intersection(lexicon.keys())
len(oovs_in_lexicon)

7966

# G2P Error Rates

Only will work with words that occur in the lexicon?

In [75]:
# load G2P model 
from speechbrain.pretrained import GraphemeToPhoneme
g2p = GraphemeToPhoneme.from_hparams("speechbrain/soundchoice-g2p")

In [76]:
oovs_in_lexicon = sorted(oovs_in_lexicon, key=lambda x: len(x), reverse=True) # sort by len to make batches more efficient

In [77]:
l = ["AH", "BEE"]


## use g2p to predict phn seqs

In [78]:
from tqdm import tqdm
wordtype2predicted_phnseq = {}
batch_size = 128
warning_phn_seqs = []
for i in tqdm(range(0, len(oovs_in_lexicon), batch_size)):
    batch = oovs_in_lexicon[i:i+batch_size]
    predictions = g2p(batch)
    for wordtype, phn_seq in zip(batch, predictions):
        if "" in phn_seq or " " in phn_seq:
            warning_phn_seqs.append((wordtype, phn_seq))
        phn_seq = [phn for phn in phn_seq if phn not in ["", " "]] # strip empty phns
        wordtype2predicted_phnseq[wordtype] = phn_seq
    # break # debug

  0%|                                                              | 0/63 [00:00<?, ?it/s]Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█████████████████████████████████████████████████████| 63/63

## calc PER for each phn seqs

In [79]:
import pandas as pd

In [80]:
import textdistance
lev = textdistance.Levenshtein() # create an instance of Levenshtein distance
def per(ref, hyp):
    dist = lev.distance(ref, hyp) # calculate the distance between ref and hyp
    per = dist / len(ref) # calculate the PER by dividing by the length of ref
    return per

In [81]:
data = []
for wordtype, predicted_phnseq in wordtype2predicted_phnseq.items():
    data.append({
        "wordtype": wordtype, 
        "lexicon_entry": lexicon[wordtype], 
        "predicted_phnseq": predicted_phnseq, 
        "per": per(lexicon[wordtype], predicted_phnseq),
    })

In [82]:
df = pd.DataFrame(data)
df.sort_values('per', ascending=False)

Unnamed: 0,wordtype,lexicon_entry,predicted_phnseq,per
7884,hon,"[AH, N, ER, AH, B, AH, L]","[HH, AA, N]",1.000000
7953,un,"[Y, UW, EH, N]","[AH, N]",0.750000
7961,ne,"[N, AO, R, TH, IY, S, T]","[N, IY]",0.714286
5160,jacques,"[ZH, AA, K]","[JH, AA, K, EY]",0.666667
7926,moi,"[M, W, AA]","[M, OY]",0.666667
...,...,...,...,...
2996,detriment,"[D, EH, T, R, AH, M, AH, N, T]","[D, EH, T, R, AH, M, AH, N, T]",0.000000
2995,analogous,"[AH, N, AE, L, AH, G, AH, S]","[AH, N, AE, L, AH, G, AH, S]",0.000000
2994,equerries,"[IH, K, W, EH, R, IY, Z]","[IH, K, W, EH, R, IY, Z]",0.000000
2993,divisions,"[D, IH, V, IH, ZH, AH, N, Z]","[D, IH, V, IH, ZH, AH, N, Z]",0.000000


##  save oov list with PERs to disk 

In [115]:
outpath = '/home/s1785140/data/ljspeech_fastpitch/oov_list_with_PER.pickle'
df.to_pickle(outpath)

## load oov with PER from disk

In [119]:
loaded_df = pd.read_pickle(outpath)
orig_num = len(loaded_df)
orig_num

7966

### do some filtering

In [120]:
str_min_len_threshold = 10
loaded_df = loaded_df[loaded_df['wordtype'].str.len() >= str_min_len_threshold]
new_num = len(loaded_df)
print(f'filtered out {orig_num - new_num}')
new_num

filtered out 5967


1999

### display

In [125]:
NUM_TO_DISPLAY = 50
loaded_df.sort_values('per', ascending=False).head(NUM_TO_DISPLAY)

Unnamed: 0,wordtype,lexicon_entry,predicted_phnseq,per
1505,exhumation,"[EH, K, S, HH, Y, UW, M, EY, SH, AH, N]","[EH, G, Z, AH, M, EY, SH, AH, N]",0.454545
1223,subpoenaed,"[S, AH, P, IY, N, AH, D]","[S, AH, P, OW, AH, N, EH, D]",0.428571
395,experimental,"[IH, K, S, P, ER, M, EH, N, AH, L]","[IH, K, S, P, EH, R, AH, M, EH, N, T, AH, L]",0.4
653,assignation,"[AH, S, AY, N, EY, SH, AH, N]","[AE, S, AH, G, N, EY, SH, AH, N]",0.375
984,accelerator,"[AE, K, S, EH, L, ER, EY, T, ER]","[AE, K, S, EH, L, ER, AH, T, AO, R]",0.333333
397,individually,"[IH, N, D, IH, V, IH, JH, AH, L, IY]","[IH, N, D, AH, V, IH, JH, AH, W, AH, L, IY]",0.3
836,depredators,"[D, IH, P, R, EH, D, AH, T, ER, Z]","[D, EH, P, R, AH, D, EY, T, ER, Z]",0.3
527,assurbanipal,"[AH, S, ER, B, AH, N, AH, P, AH, L]","[AH, S, ER, B, AE, N, IH, P, AA, L]",0.3
1334,plaintiffs,"[P, L, EY, N, IH, F, S]","[P, L, EY, N, T, AH, F, S]",0.285714
1277,phosphoric,"[F, AA, S, F, ER, IH, K]","[F, AA, S, F, AO, R, IH, K]",0.285714


# LM Perplexity