# Goal of this notebook

- use certain metrics (LM perplexity, G2P error) to identify wordtypes that are more likely to be mispronounced by our grapheme-input TTS system

# automatic reloading magic

In [51]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Imports 

In [52]:
import json
from datasets import load_dataset
import torch
import jiwer
from tqdm import tqdm

# Check that we are running on an allowed node

In [53]:
import socket

# Machines this notebook must never be executed on (short node names).
disallowed_nodes = ['escience6']

# Show the full hostname so the executing machine is visible in the cell output.
hostname = socket.gethostname()
print(hostname)

# The node name is everything before the first dot of the hostname.
node = hostname.partition('.')[0]
if node in disallowed_nodes:
    raise ValueError(f"Running on disallowed node {node}!")

levi.inf.ed.ac.uk


In [54]:
# Select the compute device, then fail fast (with an explanatory message) when
# no GPU is visible. The former bare `torch.cuda.is_available()` statement was
# dropped: its result was discarded and it was not the cell's last expression.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
assert torch.cuda.is_available(), "CUDA is not available: this notebook expects to run on a GPU node"

# Load OOV list

(wordtypes not seen in the half of LJSpeech used to train TTS and ASR)

In [55]:
# Load the OOV wordtype list (wordtypes that are not seen in TTS training).
# The JSON object's keys are the wordtypes; judging by the variable name its
# values are frequencies, but only the keys are used here.
oov_wordlist_path = '/home/s1785140/data/ljspeech_fastpitch/oov_list.json'
with open(oov_wordlist_path, 'r') as f:
    oovs_and_freqs = json.loads(f.read())

# De-duplicated wordtypes with surrounding whitespace removed.
oovs = {w.strip() for w in oovs_and_freqs}
print(f'original before cleaning/sampling {len(oovs)=}')

original before cleaning/sampling len(oovs)=8343


# Load G2P pronunciation lexicon

In [56]:
# load lexicon that G2P model was trained on
# (the result is indexed by split name; this notebook reads the 'lexicon_train',
#  'lexicon_valid' and 'lexicon_test' entries)
dataset_dict = load_dataset("flexthink/librig2p-nostress")

Found cached dataset librig2p-nostress (/home/s1785140/.cache/huggingface/datasets/flexthink___librig2p-nostress/default/0.0.0/95c204c6be42796a753ef410b5dfce2bfa21d61b51f0c3ffe85cf6e3a4dee65f)


  0%|          | 0/6 [00:00<?, ?it/s]

In [57]:
type(dataset_dict['lexicon_train'])

datasets.arrow_dataset.Dataset

In [58]:
# The G2P model is not trained in this notebook, so the train/valid/test
# partition of the lexicon is irrelevant: merge the three lexicon splits.
from datasets import concatenate_datasets
datasets = [dataset_dict[f'lexicon_{split}'] for split in ('train', 'valid', 'test')]
dataset = concatenate_datasets(datasets)

In [59]:
# Map each wordtype (lowercased, surrounding whitespace removed) to its
# pronunciation. When a wordtype occurs more than once, the last entry wins.
lexicon = {
    char.lower().strip(): phn
    for char, phn in zip(dataset['char'], dataset['phn'])
}

# only consider the OOV words that are in the pronunciation lexicon


In [60]:
len(oovs)

8343

In [61]:
oovs_in_lexicon = oovs.intersection(lexicon.keys())
len(oovs_in_lexicon)

7966

In [62]:
oovs_not_in_lexicon = oovs - oovs_in_lexicon
len(oovs_not_in_lexicon)

377

# Compare LibriG2P coverage against CMUdict

In [63]:
import cmudict as cmudict_module

In [64]:
cmudict = cmudict_module.dict()

## normalise cmudict entries

- key: all lowercase
- values: phone strings should not have any stress markers

In [65]:
# Sanity check: no CMUdict key may contain a capital letter.
for wordtype in cmudict.keys():
    # str.isupper() is True only when *every* cased character is uppercase, so it
    # would miss mixed-case keys such as "Content". Comparing against lower()
    # catches any capital letter.
    if wordtype != wordtype.lower():
        raise ValueError(f"cmudict key {wordtype!r} contains capital letters")

# Reached only when the loop above raised nothing.
print("Keys do not contain any capital letters")

Keys do not contain any capital letters


In [66]:
prons = cmudict['content']

In [67]:
prons

[['K', 'AA1', 'N', 'T', 'EH0', 'N', 'T'],
 ['K', 'AH0', 'N', 'T', 'EH1', 'N', 'T']]

In [68]:
def strip_stress(pron):
    """Return a copy of ``pron`` with stress digits removed from each phone.

    Args:
        pron: iterable of phone strings, e.g. ``['K', 'AA1', 'N']``.

    Returns:
        A new list in which every phone has had leading/trailing characters
        from ``0123456789`` stripped, e.g. ``['K', 'AA', 'N']``.
    """
    return [phn.strip('0123456789') for phn in pron]

strip_stress(['K', 'AA1', 'N', 'T', 'EH0', 'N', 'T'])

['K', 'AA', 'N', 'T', 'EH', 'N', 'T']

In [69]:
# Rebuild cmudict with stress markers stripped from every pronunciation variant
# of every wordtype.
new_cmudict = {
    wordtype: [strip_stress(pron) for pron in prons]
    for wordtype, prons in cmudict.items()
}

In [70]:
cmudict = new_cmudict

In [71]:
cmudict['content']

[['K', 'AA', 'N', 'T', 'EH', 'N', 'T'], ['K', 'AH', 'N', 'T', 'EH', 'N', 'T']]

## find words in OOV list that are in CMUDict

In [72]:
len(oovs)

8343

In [73]:
oovs_in_cmudict = oovs.intersection(cmudict.keys())
len(oovs_in_cmudict)

7233

In [74]:
oovs_not_in_cmudict = oovs - oovs_in_cmudict
len(oovs_not_in_cmudict)

1110

## find OOVs not in LibriG2P or cmudict

In [75]:
oovs_not_in_cmudict_and_librig2p = oovs_not_in_cmudict.intersection(oovs_not_in_lexicon)
len(oovs_not_in_cmudict_and_librig2p)

274

In [76]:
oovs_not_in_cmudict_and_librig2p

{'accordez',
 'adaptively',
 'affectionless',
 'afterwork',
 'agardh',
 'agencys',
 'agonal',
 'akermans',
 'akkad',
 'aldermens',
 'alinement',
 'aloman',
 'amuhia',
 'amylaceous',
 'anabolism',
 'arachtu',
 'argool',
 'arke',
 'armys',
 'arthor',
 'askern',
 'baddow',
 'bagne',
 'bamell',
 'barbariously',
 'bashour',
 'batess',
 'bbl',
 'belian',
 'billfolds',
 'bodoni',
 'bouhes',
 'bradawls',
 'bringuiers',
 'bubbletop',
 'buranelli',
 'busdriver',
 'buxtons',
 'cabmans',
 'caducibranchs',
 'cannings',
 'cardiotachyscope',
 'catwalks',
 'caunts',
 'centurys',
 'cerebrospinal',
 'chaldasan',
 'charae',
 'chiselers',
 'chromatin',
 'chromatophores',
 'chummage',
 'cissian',
 'citys',
 'cleancutness',
 'clipperton',
 'collaborates',
 'colsman',
 'compters',
 'conceptualisation',
 'condigne',
 'connallys',
 'conveners',
 'countrys',
 'courtmartialed',
 'crosshair',
 'cutdowns',
 'cyruss',
 'daulby',
 'daulbys',
 'delarues',
 'delustered',
 'deoxidation',
 'detre',
 'diemens',
 'dixblan

In [77]:
import num2words
from num2words import num2words

In [78]:
num2words(25)

'twenty-five'

In [79]:
# Spelled-out numbers 0-99 with hyphens removed ('twenty-five' -> 'twentyfive').
# str.strip('-') only trims hyphens at the two ends of the string, so it left
# compound numbers such as 'twenty-five' untouched; replace() removes the
# interior hyphen, which is presumably what was intended.
nums = [num2words(i).replace('-', '') for i in range(100)]

In [80]:
def does_not_contain_num(s):
    """Return True when none of the number words in ``nums`` occurs inside ``s``.

    This is a plain substring test against the module-level ``nums`` list, so a
    word that merely contains a number word (e.g. 'stone' contains 'one') also
    yields False.
    """
    return not any(num in s for num in nums)

In [81]:
## filter this list

# Step 1 - word length: keep only wordtypes with at least 4 characters.
# The numbered prints report the set size before and after each filter step.
print(f"111 {len(oovs_not_in_cmudict_and_librig2p)=}")
oovs_not_in_cmudict_and_librig2p = {
    w for w in oovs_not_in_cmudict_and_librig2p if len(w) > 3
}
print(f"222 {len(oovs_not_in_cmudict_and_librig2p)=}")

# Step 2 - not a number: drop wordtypes that contain a spelled-out number.
oovs_not_in_cmudict_and_librig2p = set(
    filter(does_not_contain_num, oovs_not_in_cmudict_and_librig2p)
)
print(f"333 {len(oovs_not_in_cmudict_and_librig2p)=}")

111 len(oovs_not_in_cmudict_and_librig2p)=274
222 len(oovs_not_in_cmudict_and_librig2p)=268
333 len(oovs_not_in_cmudict_and_librig2p)=246


# generate pronunciation for each word in the data used to train ASR/TTS

In [82]:
# Read the metadata file of the data used to train ASR/TTS, one line per entry.
# The path has its own name so that `train_data` only ever refers to the list
# of lines (previously it was first the path string and then the lines).
train_data_path = '/home/s1785140/data/ljspeech_fastpitch/train_meta_half.txt'
with open(train_data_path, 'r') as f:
    train_data = f.readlines()

In [83]:
# The utterance text is the last '|'-separated field of each metadata line
# (the line's trailing newline, if any, is still attached at this point).
train_utts = [line.rsplit('|', 1)[-1] for line in train_data]

In [84]:
# use text cleaner to clean each utt
# NOTE(review): judging by its name, lowercase_no_punc lowercases the text and
# removes punctuation — confirm in fastpitch.common.text.cleaners.

from fastpitch.common.text.cleaners import lowercase_no_punc as text_cleaner

# Rebinds train_utts: from here on it holds the cleaned utterances.
train_utts = [text_cleaner(utt) for utt in train_utts]


In [85]:
# Collect the distinct wordtypes of the training utterances.
# split() with no argument splits on any run of whitespace and drops leading and
# trailing whitespace, so repeated spaces or a trailing newline cannot produce an
# empty-string or newline-suffixed "wordtype". split(' ') could, and such a token
# would then be sent to G2P and written to the training lexicon.
train_wordtypes = {token for utt in train_utts for token in utt.split()}
len(train_wordtypes)

5612

In [86]:
train_wordtypes

{'meanwhile',
 'confronted',
 'floors',
 'ear',
 'silence',
 'pilot',
 'residents',
 'committal',
 'ship',
 'society',
 'cuba',
 'physiology',
 'influences',
 'successively',
 'progress',
 'proper',
 'jail',
 'convinced',
 'rare',
 'veil',
 'school',
 'accused',
 'total',
 'tablespoonful',
 'excluded',
 'communicate',
 'contracted',
 'stars',
 'conceived',
 'tyburn',
 'mine',
 'worked',
 'talked',
 'singing',
 'short',
 'delay',
 'wait',
 'rome',
 'states',
 'contemplated',
 'connally',
 'marksman',
 'reply',
 'holster',
 'hall',
 'revealed',
 'july',
 'sovereign',
 'cooperate',
 'highly',
 'fines',
 'prominently',
 'colored',
 'freedom',
 'latest',
 'connor',
 'sport',
 'pence',
 'strangers',
 'detain',
 'safeguards',
 'ground',
 'negative',
 'stone',
 'infirmaries',
 'learned',
 'appeared',
 'dioxide',
 'realized',
 'seemingly',
 'killing',
 'thereafter',
 'defense',
 'establishments',
 'locate',
 'greatest',
 'gathered',
 'missing',
 'aperture',
 'provides',
 'hid',
 'emerged',
 'qu

In [87]:
# Number of training wordtypes that have an entry in cmudict.
n_train_wordtypes_in_cmudict = len(train_wordtypes & set(cmudict))
print("there are ", n_train_wordtypes_in_cmudict, " training wordtypes in cmudict")

there are  5431  training wordtypes in cmudict


In [88]:
# use cmudict to get pronunciation for train wordtypes
# (only the first listed pronunciation variant of each wordtype is kept)

train_wordtype2pron = {
    w: cmudict[w][0]
    for w in train_wordtypes.intersection(set(cmudict))
}

In [89]:
# use G2P to predict pronunciations for wordtypes in training set but not in cmudict

# Load the pretrained grapheme-to-phoneme model identified by
# "speechbrain/soundchoice-g2p"; `g2p` is subsequently called with one word string.
from speechbrain.pretrained import GraphemeToPhoneme
g2p = GraphemeToPhoneme.from_hparams("speechbrain/soundchoice-g2p")

In [90]:
# Fill in pronunciations for the training wordtypes that cmudict does not cover,
# adding them to the dict that already holds the cmudict pronunciations.
# NOTE(review): g2p(w) is assumed to return a sequence of phone strings, because
# the stored value is later joined with ' '.join(...) — confirm against speechbrain.
for w in tqdm(train_wordtypes - set(cmudict.keys())):
    train_wordtype2pron[w] = g2p(w)

  0%|                                                             | 0/181 [00:00<?, ?it/s]Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|███████████████████████████████████████████████████| 181/181

## save pron dict to disk

in two formats
- generic
- for training G2P model (https://pypi.org/project/phonetisaurus/)

In [91]:
# generic
# Serialise the wordtype -> pronunciation mapping as a single JSON object.
with open('/home/s1785140/data/ljspeech_fastpitch/train_meta_half_pron_dict.json', 'w') as f:
    f.write(json.dumps(train_wordtype2pron))

In [92]:
# for G2P training
#/path/to/lexicon.dict
#lexicon format:
#word1 phoneme1 phoneme2 ...
#word2 phoneme1 phoneme2 phoneme3 ...
# One lexicon entry per wordtype: the word, a space, then its phones joined by spaces.
g2p_training_lexicon_lines = [
    f"{w} {' '.join(pron)}"
    for w, pron in train_wordtype2pron.items()
]

In [93]:
g2p_training_lexicon_lines[0]

'meanwhile M IY N W AY L'

In [94]:
# Write one "word phone phone ..." entry per line. Every entry, including the
# last, is terminated by a newline; the previous '\n'.join(...) left the final
# line unterminated, which line-oriented tools and later appends handle badly.
with open('/home/s1785140/data/ljspeech_fastpitch/train_meta_half_prons_for_training_g2p.dict', 'w') as f:
    f.writelines(f"{line}\n" for line in g2p_training_lexicon_lines)

# Train G2P model using only wordtypes contained in data used to train TTS/ASR

In [95]:
import subprocess

# Train a Phonetisaurus G2P model from the lexicon file and store it at the
# --model path.
# check=True makes a non-zero exit status raise subprocess.CalledProcessError,
# so a failed training run stops the notebook here instead of letting later
# cells use a missing or stale model file.
subprocess.run(
    [
        "phonetisaurus",
        "train",
        "--model",
        "/home/s1785140/data/ljspeech_fastpitch/train_meta_half_phonetisaurusG2P_model.fst",
        "/home/s1785140/data/ljspeech_fastpitch/train_meta_half_prons_for_training_g2p.dict",
    ],
    check=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[94mINFO:phonetisaurus-train:2023-03-16 12:02:32[0m:  Checking command configuration...
[94mDEBUG:phonetisaurus-train:2023-03-16 12:02:32[0m:  Directory does not exist.  Trying to create.
[94mINFO:phonetisaurus-train:2023-03-16 12:02:32[0m:  Checking lexicon for reserved characters: '}', '|', '_'...
[94mDEBUG:phonetisaurus-train:2023-03-16 12:02:32[0m:  arpa_path:  train/model.o8.arpa
[94mDEBUG:phonetisaurus-train:2023-03-16 12:02:32[0m:  corpus_path:  train/model.corpus
[94mDEBUG:phonetisaurus-train:2023-03-16 12:02:32[0m:  dir_prefix:  train
[94mDEBUG:phonetisaurus-train:2023-03-16 12:02:32[0m:  grow:  False
[94mDEBUG:phonetisaurus-train:2023-03-16 12:02:32[0m:  lexicon_file:  /tmp/tmp3u_61cdi.txt
[94mDEBUG:phonetisaurus-train:2023-03-16 12:02:32[0m:  logger:  <Logger phonetisaurus-train (DEBUG)>
[94mDEBUG:phonetisaurus-train:2023-03-16 12:02:32[0m:  makeJointNgramCommand:  <bound method G2PModelTrainer._mitlm of <__main__.G2PModelTrainer object at 0x7f5112c47580>

CompletedProcess(args=['phonetisaurus', 'train', '--model', '/home/s1785140/data/ljspeech_fastpitch/train_meta_half_phonetisaurusG2P_model.fst', '/home/s1785140/data/ljspeech_fastpitch/train_meta_half_prons_for_training_g2p.dict'], returncode=0)

# demonstrate how to run trained G2P model

In [106]:
# to stop warning
# (huggingface/tokenizers warns when the process is forked after parallelism
#  has been used; setting this variable explicitly silences it)
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Command prefix for prediction; the words to pronounce are appended as arguments.
shell_cmd = [
    "phonetisaurus",
    "predict",
    "--model", 
    "/home/s1785140/data/ljspeech_fastpitch/train_meta_half_phonetisaurusG2P_model.fst"
]
words = ['hello', 'world']
# stdout/stderr are captured as text. check=True raises CalledProcessError on a
# non-zero exit status, so a failed prediction is not silently parsed as
# "no pronunciations" from an empty stdout.
shell_result = subprocess.run(shell_cmd + words, capture_output=True, text=True, check=True)

In [109]:
lines = shell_result.stdout.split('\n')

In [111]:
# remove any empty strings
# (filter(None, ...) keeps only the truthy, i.e. non-empty, lines)
lines = list(filter(None, lines))

In [115]:
# Map each predicted spelling to its phone list: the first space-separated
# token of a line is the spelling, the remaining tokens are its phones.
spelling2g2p_pron = {}
for l in lines:
    spelling, *phones = l.split(" ")
    spelling2g2p_pron[spelling] = phones

In [114]:
'hello HH EH L OW'.split(" ")[1:]

['HH', 'EH', 'L', 'OW']

In [116]:
spelling2g2p_pron

{'hello': ['HH', 'EH', 'L', 'OW'], 'world': ['W', 'ER', 'L', 'D']}