In [143]:
# Notebook inputs. The trailing `# @param` markers look like Colab form-field
# annotations (TODO confirm), so every assignment line is left exactly as-is.
#   input_textgrid   - TextGrid file handed to read_praat_utterances()
#   input_json       - JSON file handed to read_huggingface_json()
#   pronounce_as_tsv - TSV file handed to read_pronounce_as()
#   language         - key into STRIP_PUNCTUATION and the voice passed to piper_phonemize
input_textgrid = "/content/alcott-male-kobietki_001_rozdzial-i-pielgrzymki.textgrid" # @param {"type":"string"}
input_json = "/content/alcott-male-kobietki_001_rozdzial-i-pielgrzymki.json" # @param {"type":"string"}
pronounce_as_tsv = "/content/pronounce-as.tsv" # @param {"type":"string"}
language = "pl" # @param ["pl", "en", "sv"]

In [20]:
%%capture
%pip install praatio

In [142]:
%%capture
%pip install piper_phonemize

Collecting piper_phonemize
  Downloading piper_phonemize-1.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (282 bytes)
Downloading piper_phonemize-1.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (25.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.0/25.0 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: piper_phonemize
Successfully installed piper_phonemize-1.1.0


In [None]:
# Characters removed from both ends of every whitespace-separated token by
# get_normalised_words() (via str.strip), keyed by the `language` parameter.
# All languages currently share one character set.
_DEFAULT_STRIP_PUNCTUATION = ",;:!?—…„”\"“.«»*()[]‘/\\"

# Every value offered by the `language` selector needs an entry here, because
# the lookup is a plain STRIP_PUNCTUATION[language]; "sv" was previously
# missing and raised KeyError.
STRIP_PUNCTUATION = {
    "pl": _DEFAULT_STRIP_PUNCTUATION,
    "en": _DEFAULT_STRIP_PUNCTUATION,
    "sv": _DEFAULT_STRIP_PUNCTUATION,
}

In [19]:
# read huggingface json
import json

def read_huggingface_json(input_json):
    """Load word timestamps from a JSON transcript file.

    The file must hold an object with a "chunks" list; each chunk needs a
    "timestamp" sequence (start at index 0, end at index 1) and a "text" value.

    Args:
        input_json: path of the JSON file to read.

    Returns:
        A list of ``(start, end, text)`` tuples in file order.

    Raises:
        KeyError: if "chunks", "timestamp" or "text" is missing.
    """
    # Decode explicitly as UTF-8: the transcripts contain non-ASCII text and
    # the platform's default encoding is not guaranteed to be UTF-8.
    with open(input_json, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return [
        (chunk["timestamp"][0], chunk["timestamp"][1], chunk["text"])
        for chunk in data["chunks"]
    ]

In [18]:
# read pronounce-as data
def read_pronounce_as(pronounce_as_tsv):
    """Read a tab-separated "word<TAB>pronounce-as" table.

    The first column is lower-cased and used as the key; the second column is
    collected into a set, so one word may map to several alternatives. Any
    further columns are ignored.

    Args:
        pronounce_as_tsv: path of the TSV file to read.

    Returns:
        A dict mapping each lower-cased word to a set of replacement strings,
        or ``None`` when the file yields no entries.
    """
    pronounce_as = {}
    # Decode explicitly as UTF-8 so non-ASCII entries do not depend on the
    # platform's default encoding.
    with open(pronounce_as_tsv, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or single-column line: nothing to map, so skip it
                # instead of failing with IndexError.
                continue
            pronounce_as.setdefault(fields[0].lower(), set()).add(fields[1])
    return pronounce_as or None

In [17]:
# read praat utterances
from praatio import textgrid

def read_praat_utterances(input_textgrid):
    """Read the non-empty intervals of the "utterances" tier of a TextGrid.

    Args:
        input_textgrid: path of the TextGrid file to open with praatio.

    Returns:
        A list of ``(start, end, label)`` tuples, in tier order, for every
        interval whose label is not blank.

    Raises:
        ValueError: if the TextGrid has no tier named "utterances".
    """
    tg = textgrid.openTextgrid(input_textgrid, includeEmptyIntervals=False)

    # Take the first tier called "utterances". Fail loudly when there is none,
    # rather than hitting an unbound local further down.
    tier = next((t for t in tg.tiers if t.name == "utterances"), None)
    if tier is None:
        found = [t.name for t in tg.tiers]
        raise ValueError(
            f'no tier named "utterances" in {input_textgrid!r}; found tiers: {found}'
        )

    return [
        (interval.start, interval.end, interval.label)
        for interval in tier.entries
        if interval.label.strip() != ""
    ]

In [23]:
# Load the three inputs named in the parameter cell:
#   utterances   - (start, end, label) tuples from the TextGrid "utterances" tier
#   words        - (start, end, text) tuples from the JSON "chunks" list
#   pronounce_as - dict of word -> set of alternatives, or None if the TSV is empty
utterances = read_praat_utterances(input_textgrid)
words = read_huggingface_json(input_json)
pronounce_as = read_pronounce_as(pronounce_as_tsv)

In [24]:
# Lookup tables from a word's start / end timestamp to the word tuple.
# If several words share a timestamp, the last one in `words` wins.
# NOTE(review): neither table is referenced in the visible part of the notebook.
words_by_start = {x[0]: x for x in words}
words_by_end = {x[1]: x for x in words}

In [58]:
def group_words_by_utterances(words, utterances):
    """Assign timestamped words to the utterances that fully contain them.

    Args:
        words: sequence of ``(start, end, text)`` entries. The early ``break``
            below assumes they are ordered by time.
        utterances: sequence of ``(start, end, text)`` entries.

    Returns:
        A pair ``(new_utterances, not_collected)``:

        * ``new_utterances`` has one dict per utterance with the keys
          "start", "end", "text" and "words" (the words lying entirely
          inside the utterance). "maybe_start" / "maybe_end" are added when a
          word straddles the utterance's start / end boundary.
        * ``not_collected`` lists, in input order, every word that was not
          placed in any utterance's "words" list.
    """
    # A set keeps the final "was this word collected?" check linear; the
    # previous list lookup was quadratic in the number of words. Entries are
    # keyed by tuple(word) so list-shaped words stay supported.
    collected = set()
    new_utterances = []
    for utterance in utterances:
        start = utterance[0]
        end = utterance[1]

        ut_dict = {
            "start": start,
            "end": end,
            "text": utterance[2],
            "words": []
        }
        for word in words:
            if word[0] < start:
                # Begins before the utterance; if it also reaches into it,
                # remember it as a possible first word.
                if word[1] > start:
                    ut_dict["maybe_start"] = word
                continue
            elif word[1] > end:
                # Ends after the utterance; if it begins inside, remember it
                # as a possible last word. Later words cannot belong here.
                if word[0] < end:
                    ut_dict["maybe_end"] = word
                break
            elif word[0] >= start and word[1] <= end:
                ut_dict["words"].append(word)
                collected.add(tuple(word))
            else:
                # Only reachable when the comparisons above are all false
                # (e.g. NaN timestamps); surface the odd entry.
                print(word)
        new_utterances.append(ut_dict)

    not_collected = [word for word in words if tuple(word) not in collected]

    return new_utterances, not_collected

In [123]:
# grouped: one dict per utterance with the words that lie entirely inside it;
# uncollected: the words that ended up in no utterance's "words" list.
grouped, uncollected = group_words_by_utterances(words, utterances)

In [118]:
def get_normalised_words(text):
    """Split ``text`` into lower-cased tokens with edge punctuation removed.

    The text is lower-cased and split on whitespace. Each piece then has the
    characters in ``STRIP_PUNCTUATION[language]`` stripped from both ends, and
    pieces left empty by the stripping are dropped.

    Reads the module-level ``language`` and ``STRIP_PUNCTUATION``.
    """
    stripped = (
        piece.strip(STRIP_PUNCTUATION[language])
        for piece in text.lower().split()
    )
    return [token for token in stripped if token != ""]


In [124]:
# Attach to every utterance dict the normalised tokens of its reference text
# (lower-cased, edge punctuation stripped), for comparison with the
# recognised words.
for item in grouped:
    item["normalised_words"] = get_normalised_words(item["text"])

In [126]:
# Compare each utterance's recognised words with its normalised reference
# words, and set item["ok"] = True on an exact match.
#
# A recognised word that straddles an utterance boundary ("maybe_start" /
# "maybe_end") is handled like this:
#   * at the end: if it starts with the last reference word, that word is
#     appended and the remaining characters are carried over in `leftover`;
#   * at the start: a non-empty `leftover` from the previous utterance is
#     prepended; otherwise, if the straddling word ends with the first
#     reference word, that reference word is prepended.
leftover = ''
for item in grouped:
    norm_words = item["normalised_words"]
    w2v_words = [x[2] for x in item["words"]]
    if leftover != "":
        w2v_words = [leftover] + w2v_words
        item["maybe_start_word"] = leftover
        leftover = ''
    elif "maybe_start" in item and norm_words:
        # `norm_words` can be empty (e.g. a label made only of punctuation);
        # the guard avoids an IndexError on norm_words[0].
        if item["maybe_start"][2].endswith(norm_words[0]):
            w2v_words = [norm_words[0]] + w2v_words
            item["maybe_start_word"] = norm_words[0]
    if "maybe_end" in item and norm_words:
        # Same guard for norm_words[-1].
        if item["maybe_end"][2].startswith(norm_words[-1]):
            item["maybe_end_word"] = norm_words[-1]
            w2v_words = w2v_words + [norm_words[-1]]
            # Whatever follows the matched prefix belongs to the next utterance.
            leftover = item["maybe_end"][2][len(item["maybe_end_word"]):]
    if w2v_words == norm_words:
        item["ok"] = True

In [112]:
from difflib import SequenceMatcher

# Diff two word lists with difflib and label each span, treating a
# replacement that differs only in where the word breaks fall as a match.
def filter_smatcher(ref, hyp):
    """Label the differences between ``ref`` and ``hyp``.

    Returns a list with one tuple per difflib opcode:

    * ``("OK", ref_slice)`` for equal spans, and for replaced spans whose
      concatenated words are identical (only the spacing differs);
    * ``("INSERT", hyp_slice)`` for words present only in ``hyp``;
    * ``("DELETE", ref_slice)`` for words present only in ``ref``;
    * ``("REPLACE", ref_slice, hyp_slice)`` for every other replacement.
    """
    labelled = []
    matcher = SequenceMatcher(None, ref, hyp)

    for tag, ref_lo, ref_hi, hyp_lo, hyp_hi in matcher.get_opcodes():
        ref_part = ref[ref_lo:ref_hi]
        hyp_part = hyp[hyp_lo:hyp_hi]

        if tag == 'equal':
            labelled.append(("OK", ref_part))
        elif tag == 'insert':
            labelled.append(("INSERT", hyp_part))
        elif tag == 'delete':
            labelled.append(("DELETE", ref_part))
        elif tag == 'replace':
            same_letters = "".join(ref_part) == "".join(hyp_part)
            if same_letters:
                labelled.append(("OK", ref_part))
            else:
                labelled.append(("REPLACE", ref_part, hyp_part))
    return labelled

In [110]:
def run_filter_smatcher(item):
    """Diff an utterance's reference words against its recognised words.

    The recognised sequence is the text of every entry in ``item["words"]``,
    with ``item["maybe_start_word"]`` in front and ``item["maybe_end_word"]``
    at the back when those keys exist. The result is whatever
    ``filter_smatcher(reference, recognised)`` returns.
    """
    reference = item["normalised_words"]

    recognised = []
    if "maybe_start_word" in item:
        recognised.append(item["maybe_start_word"])
    recognised.extend(entry[2] for entry in item["words"])
    if "maybe_end_word" in item:
        recognised.append(item["maybe_end_word"])

    return filter_smatcher(reference, recognised)

In [185]:
# Start times of utterances that the reporting loop force-marks as ok: it sets
# item["ok"] = True whenever item["start"] is found in this list, so those
# utterances are skipped without being diffed.
# NOTE(review): presumably these were verified by hand — confirm. Membership is
# an exact float comparison, so values must match the TextGrid start times
# digit for digit.
CHECKED_STARTS = [
    1148.4643,
    1165.464,
    1089.572,
    1050.643,
    1009.3364,
    1039.4474,
    1262.991,
    5.4285,
    10.4302,
    28.4102,
    229.2161,
    260.7154,
    302.2812,
    1121.9926,
    1225.599,
    1401.72,
    1064.1156,
    1222.379,
    760.1107,
    801.1269,
    156.831,
    1356.0282,
    1445.424,
    1458.4299,
    1462.4391,
    1481.8289,
    1491.5354,
    1423.2975,
    1407.9742,
    58.7359,
    67.4604,
    191.8702,
    209.3544,
    233.9202,
    630.2735,
    269.192,
    608.6451,
    554.6953,
    539.9961,
    626.63,
    306.1663,
    681.7677,
]

In [173]:
import piper_phonemize

def run_piper(text, accents = False):
    """Phonemise ``text`` with ``piper_phonemize.phonemize_espeak``.

    The module-level ``language`` is used as the espeak voice.

    Args:
        text: the string to phonemise.
        accents: when true, the phonemiser's result is returned untouched
            (a nested sequence; presumably one inner list per sentence —
            TODO confirm). When false (the default), the nested result is
            flattened into one list and the stress marks "ˈ" and "ˌ" are
            removed.

    Returns:
        The raw nested result if ``accents`` is true, else a flat list.
    """
    phonemes = piper_phonemize.phonemize_espeak(text=text, voice=language)
    if accents:
        return phonemes

    stress_marks = ("ˈ", "ˌ")
    flat = []
    for group in phonemes:
        flat.extend(symbol for symbol in group if symbol not in stress_marks)
    return flat

def check_phonemised(a, b):
    """Tell whether two word lists are phonemically equivalent.

    Args:
        a: list of words (e.g. the reference side of a REPLACE diff).
        b: list of words (e.g. the recognised side of a REPLACE diff).

    Returns:
        ``True`` if ``run_piper`` gives the same phonemes for both sides,
        ``False`` otherwise. Single-word pairs are compared directly. Longer
        sides are compared joined with spaces and, only if that differs,
        joined without spaces, so that a pair such as ["stym"] vs
        ["z", "tym"] still matches although the spaced form contains a word
        gap.
    """
    if len(a) == len(b) == 1:
        return run_piper(text=a[0]) == run_piper(text=b[0])

    # Try the spaced form first and stop as soon as it matches; the
    # space-free form costs two more phonemiser calls and is only needed
    # when the word boundaries differ.
    if run_piper(text=" ".join(a)) == run_piper(text=" ".join(b)):
        return True
    return run_piper(text="".join(a)) == run_piper(text="".join(b))


In [167]:
# Sanity check: a single word against a two-word split of the same sounds.
# The recorded cell output for this call is True.
check_phonemised(["stym"], ["z", "tym"])

['s', 't', 'ɨ', 'm'] ['s', ' ', 't', 'ɨ', 'm'] ['s', 't', 'ɨ', 'm'] ['s', 't', 'ɨ', 'm']


True

In [176]:
def get_counts(diffs):
    """Tally diff entries by their label.

    Args:
        diffs: iterable of entries whose first element is one of
            "OK", "INSERT", "DELETE" or "REPLACE".

    Returns:
        A dict with exactly those four keys, each mapped to the number of
        entries carrying that label.

    Raises:
        KeyError: if an entry carries any other label.
    """
    tally = dict.fromkeys(("OK", "INSERT", "DELETE", "REPLACE"), 0)
    for entry in diffs:
        label = entry[0]
        tally[label] = tally[label] + 1
    return tally

In [177]:
def counts_ok(counts):
    """Return True when ``counts`` records no INSERT, DELETE or REPLACE.

    The keys are checked in that order and the check stops at the first
    non-zero count.
    """
    return all(counts[label] == 0 for label in ("INSERT", "DELETE", "REPLACE"))

In [186]:
# Report, for every utterance not yet marked ok, the differences between its
# reference words and the recognised words.
#
# * Utterances whose start time is in CHECKED_STARTS are marked ok and skipped.
# * A REPLACE whose two sides phonemise identically counts as resolved. When
#   that leaves no INSERT / DELETE / REPLACE outstanding, the utterance is
#   marked ok.
# * Every unresolved difference is printed under a one-line header holding the
#   start time and reference text. The header carries "INS" / "DEL" when the
#   first printed difference is an insertion / deletion.
#
# Resolved REPLACE diffs are no longer printed. They used to fall through to
# the print statements, so an utterance could be listed as a mismatch and then
# be marked ok by a later diff.
for item in grouped:
    item_printed = False
    if item["start"] in CHECKED_STARTS:
        item["ok"] = True
    if "ok" in item and item["ok"]:
        continue
    diffs = run_filter_smatcher(item)
    counts = get_counts(diffs)
    for diff in diffs:
        kind = diff[0]
        if kind == "OK":
            continue
        if kind == "REPLACE":
            if check_phonemised(diff[1], diff[2]):
                counts["REPLACE"] -= 1
                if counts_ok(counts):
                    item["ok"] = True
                    break
                # Phonemically equivalent: not a real mismatch, so do not
                # report it.
                continue
            if not item_printed:
                print(item["start"], item["text"])
                item_printed = True
            print(diff[1], diff[2])
        elif kind == "INSERT":
            if not item_printed:
                print(item["start"], "INS", item["text"])
                item_printed = True
            print(diff[1])
        elif kind == "DELETE":
            if not item_printed:
                print(item["start"], "DEL", item["text"])
                item_printed = True
            print(diff[1])

280.0684 DEL — Ona używa takich gminnych słów — zauważyła Amelka, rzucając gromiące spojrzenie na długą postać rozciągniętą na dywanie.
['na', 'dywanie']
296.4915 DEL — Dlatego też gwiżdżę.
['gwiżdżę']
298.7279 DEL — Nic cierpię ordynarnych, nieeleganckich dziewcząt.
['nic', 'cierpię']
343.9231 — Nie jestem! I jeżeli upinanie włosów czyni mnie doroślejszą, to będę nosiła warkocze do dwudziestu lat! — zawołała, zrzucając siatkę i rozwieszając ciemnoblond grzywę.
['mnie'] ['nie']
['ciemnoblond', 'grzywę'] ['ciemno', 'bląd']
356.1204 — Nie cierpię tej myśli, że dorosnę, zostanę panną March, będę nosić długie suknie i wyglądać tak sztywno jak aster chiński.
['march'] ['marcz']
['suknie'] ['sukni']
391.3708 Lecz nie ma na to rady; staraj się więc poprzestać na odgrywaniu roli naszego brata — rzekła Eliza, głaszcząc szorstkie włosy na głowie wspartej o jej kolana, rączką przyjemną w dotknięciu, pomimo zmywania talerzy i okurzania sprzętów.
['okurzania'] ['odkórzenia']
420.096 Lubię twoje dob