In [1]:
import csv
from pathlib import Path
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from glob import glob

# Find words in test set that are not in train set

In [7]:
# load tsv texts
def load_tsv_texts(tsv_path, encoding="utf-8"):
    """Return the ``tgt_text`` column of a tab-separated manifest file.

    The file is parsed with quoting disabled, so quote characters inside a
    field are kept literally.

    Args:
        tsv_path: Path (``str`` or ``Path``) to a TSV file whose header row
            contains a ``tgt_text`` column.
        encoding: Text encoding used to read the file. Defaults to UTF-8 so
            the result does not depend on the machine's locale.

    Returns:
        A list with the ``tgt_text`` value of every data row, in file order.

    Raises:
        FileNotFoundError: If ``tsv_path`` is not an existing file.
        ValueError: If the file has no data rows.
        KeyError: If a row has no ``tgt_text`` column.
    """
    tsv_path = Path(tsv_path)
    if not tsv_path.is_file():
        raise FileNotFoundError(f"Dataset not found: {tsv_path}")
    # newline="" hands raw line endings to the csv module, as its docs advise.
    with open(tsv_path, newline="", encoding=encoding) as f:
        reader = csv.DictReader(
            f,
            delimiter="\t",
            quotechar=None,
            doublequote=False,
            lineterminator="\n",
            quoting=csv.QUOTE_NONE,
        )
        texts = [row["tgt_text"] for row in reader]
    if not texts:
        raise ValueError(f"Empty manifest: {tsv_path}")
    return texts

In [None]:
def load_lines_from_files(path, file_ext="txt", encoding="utf-8"):
    """(for VCTK txt) recursively search for files and load one line from each.

    Args:
        path: Directory to search.
        file_ext: File extension to match, without the leading dot.
        encoding: Text encoding used to read each file.

    Returns:
        A list with the stripped single line of every matching file, ordered
        by sorted file path so repeated runs give the same order.

    Raises:
        AssertionError: If a matching file does not contain exactly one line.
    """
    # The "**" component is what makes recursive=True search at every depth;
    # sorting removes the dependence on the file system's listing order.
    files = sorted(glob(f'{path}/**/*.{file_ext}', recursive=True))
    lines = []
    for file_path in files:
        with open(file_path, 'r', encoding=encoding) as f:
            file_lines = f.readlines()
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        if len(file_lines) != 1:
            raise AssertionError(
                f"Expected exactly one line in {file_path}, found {len(file_lines)}"
            )
        lines.append(file_lines[0].strip())
    return lines

# Directory holding the VCTK txt files (absolute path on the author's machine).
path = "/home/s1785140/data/VCTK-Corpus/txt/"
lines = load_lines_from_files(path, "txt")
# Preview the first ten loaded lines.
lines[:10]

In [12]:
# `txt_files` is not defined anywhere else in this notebook, so build it here
# from `path`; sorting makes "the first file" the same on every run.
txt_files = sorted(glob(f'{path}/*/*.txt'))
with open(txt_files[0], 'r') as f:
    l = f.readlines()

# Each txt file is expected to hold exactly one line.
assert len(l) == 1
l[0].strip()


'Please call Stella.'

In [14]:
# NOTE: tokens_from_lines is defined in the cell below; on a fresh kernel that
# cell must be run before this one.
tokens_from_lines([l[0].strip()])

['please', 'call', 'stella']

In [13]:
def tokens_from_lines(lines):
    """Tokenize every line and return all tokens, lower-cased, in one flat list.

    A token is a maximal run of word characters, so punctuation and
    whitespace never appear in the result.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    # Tokenize everything first, then flatten and lower-case.
    tokenized = list(map(word_tokenizer.tokenize, lines))
    flat_tokens = []
    for words in tokenized:
        flat_tokens.extend(word.lower() for word in words)
    return flat_tokens

In [9]:
def get_token_counter(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: Any iterable of hashable tokens.

    Returns:
        A ``collections.Counter`` mapping each distinct token to its number
        of occurrences.
    """
    # iter() makes Counter count the elements one by one even when `tokens`
    # is a mapping (Counter would otherwise copy the mapping's values as counts).
    return Counter(iter(tokens))

In [10]:
# Train split: read the tgt_text column, tokenize into lower-cased words, count them.
lines = load_tsv_texts("/home/s1785140/data/LJSpeech-1.1/feature_manifest/train.tsv")
train_tokens = tokens_from_lines(lines)
train_counter = get_token_counter(train_tokens)

In [11]:
# Dev split: same pipeline as the train split. Note that `lines` is overwritten.
lines = load_tsv_texts("/home/s1785140/data/LJSpeech-1.1/feature_manifest/dev.tsv")
dev_tokens = tokens_from_lines(lines)
dev_counter = get_token_counter(dev_tokens)

In [12]:
# Test split: same pipeline again. After this cell `lines` holds the test-split texts.
lines = load_tsv_texts("/home/s1785140/data/LJSpeech-1.1/feature_manifest/test.tsv")
test_tokens = tokens_from_lines(lines)
test_counter = get_token_counter(test_tokens)

In [13]:
# Out-of-vocabulary words: present in the dev/test counts but absent from train.
# Subtracting dict key views yields a plain set.
dev_oovs = dev_counter.keys() - train_counter.keys()
test_oovs = test_counter.keys() - train_counter.keys()

# check which words are in the sets

In [14]:
"said" in dev_counter

True

# count the number of word tokens in the dev and test sets

In [18]:
# Summing the per-word counts gives the total number of tokens in the test set.
sum(test_counter.values())

9038

In [19]:
# Summing the per-word counts gives the total number of tokens in the dev set.
sum(dev_counter.values())

6046

In [27]:
# (character length, text) pairs, shortest first; the sort is stable, so texts
# of equal length keep their order from `lines`.
[(len(text), text) for text in sorted(lines, key=len)]

[(20, 'In eighteen thirteen'),
 (22, 'eight. The press yard.'),
 (25, 'has never been surpassed.'),
 (29, 'In the yard behind the prison'),
 (30, 'in being comparatively modern.'),
 (30, 'to improve the letter in form.'),
 (30, 'Italy is contentedly stagnant.'),
 (30, 'being thin, tough, and opaque.'),
 (31, 'into holes marked with numbers.'),
 (33, 'After that they fell as steadily,'),
 (34, 'the whole about fifteen feet wide.'),
 (38, 'fourteen sixty-nine, fourteen seventy;'),
 (38, 'and the disuse of the inferior courts.'),
 (39, 'One fruitful source of badness in paper'),
 (39, 'weighted by treble the amount of costs.'),
 (39, 'The shameful malpractices of Bambridge,'),
 (40, "These were: one. The male debtors' side."),
 (41, 'or female convicts ordered for execution.'),
 (41, 'three times the amount would be expended.'),
 (42, 'The office of marshal had been hereditary,'),
 (43, 'than in the same operations with ugly ones.'),
 (43, 'On the female side matters were much worse;'),
 

# generate stimuli to feed to the model for testing SAC

In [15]:
# For every dev-set OOV word (alphabetical order) print two carrier sentences:
# one with the bare word and one with the word wrapped in angle brackets.
for oov_word in sorted(dev_oovs):
    print(
        f"how is {oov_word} pronounced",
        f"how is <{oov_word}> pronounced",
        sep="\n",
    )

how is abominable pronounced
how is <abominable> pronounced
how is abundantly pronounced
how is <abundantly> pronounced
how is adjudged pronounced
how is <adjudged> pronounced
how is allowances pronounced
how is <allowances> pronounced
how is amateurs pronounced
how is <amateurs> pronounced
how is annex pronounced
how is <annex> pronounced
how is arcade pronounced
how is <arcade> pronounced
how is aristocracy pronounced
how is <aristocracy> pronounced
how is attracting pronounced
how is <attracting> pronounced
how is auditors pronounced
how is <auditors> pronounced
how is beam pronounced
how is <beam> pronounced
how is bribe pronounced
how is <bribe> pronounced
how is bumble pronounced
how is <bumble> pronounced
how is carousing pronounced
how is <carousing> pronounced
how is cashman pronounced
how is <cashman> pronounced
how is catholics pronounced
how is <catholics> pronounced
how is certify pronounced
how is <certify> pronounced
how is charitable pronounced
how is <charitable> prono

In [16]:
# For every test-set OOV word (alphabetical order) print two carrier sentences:
# one with the bare word and one with the word wrapped in angle brackets.
for oov_word in sorted(test_oovs):
    print(
        f"how is {oov_word} pronounced",
        f"how is <{oov_word}> pronounced",
        sep="\n",
    )
    

how is accountant pronounced
how is <accountant> pronounced
how is aggregate pronounced
how is <aggregate> pronounced
how is akerman pronounced
how is <akerman> pronounced
how is aldus pronounced
how is <aldus> pronounced
how is allnutt pronounced
how is <allnutt> pronounced
how is alms pronounced
how is <alms> pronounced
how is animadversion pronounced
how is <animadversion> pronounced
how is apace pronounced
how is <apace> pronounced
how is arabic pronounced
how is <arabic> pronounced
how is arcade pronounced
how is <arcade> pronounced
how is artistically pronounced
how is <artistically> pronounced
how is attends pronounced
how is <attends> pronounced
how is augsburg pronounced
how is <augsburg> pronounced
how is avoidance pronounced
how is <avoidance> pronounced
how is awkward pronounced
how is <awkward> pronounced
how is badness pronounced
how is <badness> pronounced
how is bambridge pronounced
how is <bambridge> pronounced
how is bart pronounced
how is <bart> pronounced
how is bar