# select utts to do various things based on a corpus

50% fastpitch
    90% train
    10% dev 
50% respeller
    80% train
    10% dev
    10% test
    
The utterances used to train fastpitch should have more prototypical english spellings

Method 1:
* Add utts to respeller corpus that have words that have freq of 1
* then 2
* then 3... 
* Until we form 50% of the total number of graphemes

Method 2:
* use G2P model + phoneme dict

Method 3:
* use avg perplexity to rank words/utts from pretrained grapheme based LM

In [162]:
import string
from collections import Counter

In [163]:
# load text data
with open('/home/s1785140/data/ljspeech_fastpitch/wav_text_filelist.txt') as f:
    lines = f.readlines()

In [164]:
def only_alphabet(s):
    rv = []
    for c in s:
        if c in string.ascii_lowercase + ' ':
            rv.append(c)
    return ''.join(rv)

def replace(s1, c1, c2):
    s2 = []
    for c in s1:
        if c == c1:
            s2.append(c2)
        else:
            s2.append(c)
    return ''.join(s2)        

def tokenise(s):
    # replace hyphen with space to treat word as two separate words rather than one word
    return only_alphabet(replace(s, '-', ' ')).split(' ')

word_freq = Counter()
uttid2words = {}

for l in lines:
    utt_id = l.split('.wav')[0].split('/')[-1]
    text = l.split('|')[-1].rstrip('\n')
    
    words = set(tokenise(text)) # remove duplicates
    uttid2words[utt_id] = words
    
    for word in words:
        word_freq[word] += 1

In [166]:
# most common
word_freq.most_common(10)

[('the', 9641),
 ('of', 6379),
 ('and', 5384),
 ('to', 5037),
 ('in', 4099),
 ('a', 3600),
 ('was', 3311),
 ('that', 2611),
 ('he', 2136),
 ('his', 1693)]

In [167]:
# least common
# sorted_ascending = sorted(word_freq.items(), key=lambda x: (x[1], x[0]))
sorted_ascending = sorted(word_freq.items(), key=lambda x: x[1])
sorted_ascending[:10]

[('differs', 1),
 ('netherlands', 1),
 ('chinese', 1),
 ('woodcutters', 1),
 ('letterpress', 1),
 ('productions', 1),
 ('craftsmen', 1),
 ('shapeliness', 1),
 ('calligraphy', 1),
 ('perfection', 1)]

In [168]:
def utts_with_word(uttid2words, word):
    uttids = []
    for uttid, words in uttid2words.items():
        if word in words:
            uttids.append(uttid)
    return uttids

In [169]:
# respeller utterances
TOTAL_UTTS = 13100
respeller_corpus_ratio = 0.5
respeller_corpus_N = int(TOTAL_UTTS*respeller_corpus_ratio)
respeller_utt_to_lowfreq_word = dict()
for word, freq in sorted_ascending:
    utt_ids = utts_with_word(uttid2words, word)
    for utt_id in utt_ids:
        respeller_utt_to_lowfreq_word[utt_id] = (word, freq)
        del uttid2words[utt_id] # remove these utts from candidate list    
        
    if len(respeller_utt_to_lowfreq_word) > respeller_corpus_N:
        break

In [170]:
# for utt, tup in respeller_utt_to_lowfreq_word.items():
#     print(utt, tup)

LJ001-0001 ('differs', 1)
LJ001-0003 ('netherlands', 1)
LJ001-0010 ('letterpress', 1)
LJ001-0014 ('productions', 1)
LJ001-0015 ('shapeliness', 1)
LJ001-0016 ('calligraphy', 1)
LJ001-0022 ('ecclesiastical', 1)
LJ001-0023 ('psalters', 1)
LJ001-0024 ('maintz', 1)
LJ001-0025 ('rounder', 1)
LJ001-0026 ('ne', 1)
LJ001-0028 ('basle', 1)
LJ001-0031 ('sweynheim', 1)
LJ001-0033 ('twelfth', 1)
LJ001-0036 ('mentelin', 1)
LJ001-0037 ('zeiner', 1)
LJ001-0038 ('gering', 1)
LJ001-0040 ('legible', 1)
LJ001-0044 ('vindelin', 1)
LJ001-0049 ('eighties', 1)
LJ001-0050 ('aldus', 1)
LJ001-0052 ('artistically', 1)
LJ001-0054 ('contemporaries', 1)
LJ001-0055 ('jacobus', 1)
LJ001-0056 ('venetian', 1)
LJ001-0057 ('classics', 1)
LJ001-0058 ('epoch', 1)
LJ001-0059 ('frenchmen', 1)
LJ001-0062 ('theological', 1)
LJ001-0063 ('formally', 1)
LJ001-0064 ('transitional', 1)
LJ001-0065 ('ulm', 1)
LJ001-0066 ('schussler', 1)
LJ001-0068 ('uncompromising', 1)
LJ001-0069 ('worde', 1)
LJ001-0071 ('flemish', 1)
LJ001-0073 ('apa

In [172]:
len(respeller_utt_to_lowfreq_word.keys())

6551

In [173]:
len(uttid2words.keys())

6549

In [174]:
assert TOTAL_UTTS == len(respeller_utt_to_lowfreq_word.keys()) + len(uttid2words.keys())

In [186]:
import random

# [train, dev, test]
respeller_utt_ids = list(respeller_utt_to_lowfreq_word.keys())
random.shuffle(respeller_utt_ids)
N = len(respeller_utt_ids)
respeller_ratios = [0.8, 0.1, 0.1]
assert sum(respeller_ratios) == 1
train_ratio, dev_ratio, test_ratio = respeller_ratios 
train = respeller_utt_ids[:int(train_ratio*N)]
dev = respeller_utt_ids[int(train_ratio*N):int(train_ratio*N)+int(dev_ratio*N)]
test = respeller_utt_ids[int(train_ratio*N)+int(dev_ratio*N):]
assert N == len(train) + len(dev) + len(test), f"{N} == {len(train)} + {len(dev)} + {len(test)}"
print(f"{N} == {len(train)} + {len(dev)} + {len(test)}")

6551 == 5240 + 655 + 656


In [176]:
# [train, dev, test]
tts_utt_ids = list(uttid2words.keys())
tts_ratios = [0.9, 0.1, 0.0]