In [4]:
# Input file paths for this run. The trailing "# @param" markers are presumably
# Colab form annotations, so they stay on the same line as each assignment.
input_textgrid = "/content/alcott-male-kobietki_001_rozdzial-i-pielgrzymki.textgrid" # @param {"type":"string"}
input_json = "/content/alcott-male-kobietki_001_rozdzial-i-pielgrzymki.json" # @param {"type":"string"}
pronounce_as_tsv = "/content/pronounce-as.tsv" # @param {"type":"string"}

In [20]:
%%capture
# Install praatio, which a later cell imports to read the TextGrid file;
# %%capture keeps the installer output out of the notebook.
%pip install praatio

In [19]:
# read huggingface json
import json

def read_huggingface_json(input_json):
    """Load timestamped chunks from a Hugging Face style JSON file.

    The file must contain an object with a "chunks" list; every chunk needs a
    "timestamp" pair (start, end) and a "text" string.

    Args:
        input_json: Path to the JSON file.

    Returns:
        A list of ``(start, end, text)`` tuples in file order.

    Raises:
        KeyError: If "chunks", "timestamp" or "text" is missing.
    """
    # Decode explicitly as UTF-8 so non-ASCII text (e.g. Polish diacritics)
    # does not depend on the platform's default locale encoding.
    with open(input_json, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return [
        (chunk["timestamp"][0], chunk["timestamp"][1], chunk["text"])
        for chunk in data["chunks"]
    ]

In [18]:
# read pronounce-as data
def read_pronounce_as(pronounce_as_tsv):
    """Read a tab-separated mapping file into a dict of sets.

    Each usable row has at least two tab-separated columns. The first column
    is lower-cased and used as the key; the second column is added to the set
    stored under that key (presumably the form the word is pronounced as --
    confirm against the data file). Rows with fewer than two columns, such as
    blank lines, are skipped instead of raising ``IndexError``.

    Args:
        pronounce_as_tsv: Path to the TSV file.

    Returns:
        A dict mapping each lower-cased first-column value to the set of
        second-column values seen for it, or ``None`` when the file yields no
        usable rows.
    """
    pronounce_as = {}
    # Explicit UTF-8 keeps non-ASCII entries independent of the locale.
    with open(pronounce_as_tsv, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or malformed row: nothing to map.
                continue
            pronounce_as.setdefault(fields[0].lower(), set()).add(fields[1])
    if not pronounce_as:
        return None
    return pronounce_as

In [17]:
# read praat utterances
from praatio import textgrid

def read_praat_utterances(input_textgrid):
    """Read the non-empty intervals of the "utterances" tier of a TextGrid.

    Args:
        input_textgrid: Path to the TextGrid file.

    Returns:
        A list of ``(start, end, label)`` tuples, one per interval whose label
        is not blank, in tier order.

    Raises:
        ValueError: If the TextGrid has no tier named "utterances".
    """
    tg = textgrid.openTextgrid(input_textgrid, includeEmptyIntervals=False)

    # Take the first tier with the expected name; fail loudly rather than
    # hitting an unbound local when the tier is missing.
    tier = next((t for t in tg.tiers if t.name == "utterances"), None)
    if tier is None:
        raise ValueError(f'No tier named "utterances" in {input_textgrid}')

    return [
        (interval.start, interval.end, interval.label)
        for interval in tier.entries
        if interval.label.strip() != ""
    ]

In [23]:
# Load the three inputs named in the parameter cell.
utterances = read_praat_utterances(input_textgrid)
words = read_huggingface_json(input_json)
pronounce_as = read_pronounce_as(pronounce_as_tsv)

In [24]:
# Index the word tuples by start and by end timestamp. If two words share a
# timestamp, the one appearing later in `words` overwrites the earlier one.
words_by_start = {x[0]: x for x in words}
words_by_end = {x[1]: x for x in words}

In [58]:
def group_words_by_utterances(words, utterances):
    """Assign timestamped words to the utterance interval that contains them.

    For every utterance the word list is scanned from the beginning, so
    ``words`` is expected to be sorted by time: the scan stops at the first
    word that ends after the utterance.

    Args:
        words: Sequence of ``(start, end, text)`` records.
        utterances: Sequence of ``(start, end, text)`` records.

    Returns:
        A ``(new_utterances, not_collected)`` pair. ``new_utterances`` holds
        one dict per utterance with the keys "start", "end", "text" and
        "words" (the records lying fully inside the interval), plus:

        * "maybe_start" -- a word that begins before the utterance and ends
          after its start (the last such word scanned), if any;
        * "maybe_end" -- the word that begins before the utterance's end but
          finishes after it, if any.

        ``not_collected`` lists, in input order, the words that were not
        placed in the "words" list of any utterance.
    """
    # Track collected words by position: set membership keeps the final pass
    # linear and works even when the word records are not hashable.
    collected_indices = set()
    new_utterances = []
    for utterance in utterances:
        start = utterance[0]
        end = utterance[1]

        ut_dict = {
            "start": start,
            "end": end,
            "text": utterance[2],
            "words": []
        }
        for index, word in enumerate(words):
            if word[0] < start:
                # Starts before the utterance; remember it only if it
                # overlaps the utterance's start.
                if word[1] > start:
                    ut_dict["maybe_start"] = word
                continue
            elif word[1] > end:
                # Ends after the utterance; remember it if it overlaps the
                # end, then stop scanning this utterance.
                if word[0] < end:
                    ut_dict["maybe_end"] = word
                break
            elif word[0] >= start and word[1] <= end:
                ut_dict["words"].append(word)
                collected_indices.add(index)
            else:
                # Only reachable when a timestamp does not order normally
                # (e.g. NaN); print it so it can be inspected.
                print(word)
        new_utterances.append(ut_dict)

    not_collected = [
        word for index, word in enumerate(words)
        if index not in collected_indices
    ]

    return new_utterances, not_collected

In [123]:
# Attach the recognised words to their utterances; `uncollected` keeps the
# words that did not fall fully inside any utterance.
grouped, uncollected = group_words_by_utterances(words, utterances)

In [118]:
def get_normalised_words(text):
    """Lower-case ``text`` and split it into punctuation-trimmed words.

    The text is split on whitespace; each token has the listed punctuation
    characters removed from both ends (characters inside a token are kept),
    and tokens that end up empty are dropped.

    Args:
        text: The string to normalise.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    strip_chars = ",;:!?—…„”\"“.«»*()[]‘/\\"
    normalised = []
    for token in text.lower().split():
        trimmed = token.strip(strip_chars)
        if trimmed:
            normalised.append(trimmed)
    return normalised


In [124]:
# Store the normalised form of each utterance's text on the item itself
# (this mutates the dicts in `grouped` in place).
for item in grouped:
    item["normalised_words"] = get_normalised_words(item["text"])

In [126]:
# Compare the recognised words of each utterance with its normalised text,
# trying to account for recognised words that straddle an utterance boundary.
# Items whose word lists match exactly are flagged with item["ok"] = True.
#
# `leftover` carries the tail of a boundary word from one utterance to the
# next: when the word overlapping an utterance's end starts with that
# utterance's last normalised word, the remainder of it is prepended to the
# following utterance's word list.
leftover = ''
for item in grouped:
    # Clear results of an earlier run of this cell so re-running it cannot
    # leave stale flags behind.
    for stale_key in ("maybe_start_word", "maybe_end_word", "ok"):
        item.pop(stale_key, None)

    normalised = item["normalised_words"]
    w2v_words = [x[2] for x in item["words"]]
    if leftover != "":
        w2v_words = [leftover] + w2v_words
        item["maybe_start_word"] = leftover
        leftover = ''
    elif "maybe_start" in item:
        # Guard against utterances that normalise to no words at all
        # (e.g. punctuation-only labels), which would fail on indexing.
        if normalised and item["maybe_start"][2].endswith(normalised[0]):
            w2v_words = [normalised[0]] + w2v_words
            item["maybe_start_word"] = normalised[0]
    if "maybe_end" in item:
        if normalised and item["maybe_end"][2].startswith(normalised[-1]):
            item["maybe_end_word"] = normalised[-1]
            w2v_words = w2v_words + [normalised[-1]]
            # Whatever follows the matched prefix belongs to the next item.
            leftover = item["maybe_end"][2][len(item["maybe_end_word"]):]
    if w2v_words == normalised:
        item["ok"] = True

In [127]:
# Spot-check one grouped utterance (the 20th from the end).
grouped[-20]

{'start': 1337.8755,
 'end': 1349.0268,
 'text': 'Gdyby nie to, że jestem za duża na takie rzeczy, chętnie bym się jeszcze w to bawiła — rzekła Amelka, która zaczynała mówić o porzuceniu dziecinnych rozrywek w poważnym wieku lat dwunastu.',
 'words': [(1337.96, 1338.18, 'gdyby'),
  (1338.22, 1338.32, 'nie'),
  (1338.38, 1338.46, 'to'),
  (1338.56, 1338.62, 'że'),
  (1338.66, 1338.9, 'jestem'),
  (1338.96, 1339.04, 'za'),
  (1339.08, 1339.3, 'duża'),
  (1339.36, 1339.44, 'na'),
  (1339.48, 1339.68, 'takie'),
  (1339.72, 1340.0, 'rzeczy'),
  (1340.28, 1340.6, 'chętnie'),
  (1340.62, 1340.74, 'bym'),
  (1340.8, 1340.88, 'się'),
  (1340.92, 1341.1, 'jeszcze'),
  (1341.14, 1341.26, 'wto'),
  (1341.3, 1341.66, 'bawiła'),
  (1342.5, 1342.78, 'rzekła'),
  (1342.84, 1343.26, 'amelka'),
  (1343.32, 1343.56, 'która'),
  (1343.64, 1344.06, 'zaczynała'),
  (1344.12, 1344.4, 'mówić'),
  (1344.5, 1344.52, 'o'),
  (1344.58, 1345.1, 'porzuceniu'),
  (1345.18, 1345.68, 'dziecinnych'),
  (1345.74, 1346.2

In [112]:
from difflib import SequenceMatcher

# run a sequence matcher, but filter for the common
# case of a difference in spacing.
# Also, extract the relevant pieces.
def filter_smatcher(ref, hyp):
    """Diff ``ref`` against ``hyp`` and label each differing or equal span.

    A 'replace' span whose two sides concatenate to the same string (i.e. the
    sides differ only in where the items are split) is reported as "OK".

    Args:
        ref: Reference sequence of strings.
        hyp: Hypothesis sequence of strings.

    Returns:
        A list of tuples, one per opcode: ``("OK", ref_slice)``,
        ``("INSERT", hyp_slice)``, ``("DELETE", ref_slice)`` or
        ``("REPLACE", ref_slice, hyp_slice)``.
    """
    matcher = SequenceMatcher(None, ref, hyp)
    labelled = []
    for tag, ref_lo, ref_hi, hyp_lo, hyp_hi in matcher.get_opcodes():
        ref_part = ref[ref_lo:ref_hi]
        hyp_part = hyp[hyp_lo:hyp_hi]
        if tag == 'equal':
            labelled.append(("OK", ref_part))
        elif tag == 'insert':
            labelled.append(("INSERT", hyp_part))
        elif tag == 'delete':
            labelled.append(("DELETE", ref_part))
        elif tag == 'replace':
            # Same characters on both sides means only the spacing differs.
            same_text = "".join(ref_part) == "".join(hyp_part)
            if same_text:
                labelled.append(("OK", ref_part))
            else:
                labelled.append(("REPLACE", ref_part, hyp_part))
    return labelled

In [110]:
def run_filter_smatcher(item):
    """Diff an item's normalised words against its recognised words.

    The hypothesis is the text of every entry in ``item["words"]``, with
    ``item["maybe_start_word"]`` put in front and ``item["maybe_end_word"]``
    appended when those keys are present.

    Args:
        item: Dict with "normalised_words" and "words", and optionally
            "maybe_start_word" / "maybe_end_word".

    Returns:
        The result of ``filter_smatcher(reference, hypothesis)``.
    """
    reference = item["normalised_words"]
    hypothesis = []
    if "maybe_start_word" in item:
        hypothesis.append(item["maybe_start_word"])
    hypothesis.extend(entry[2] for entry in item["words"])
    if "maybe_end_word" in item:
        hypothesis.append(item["maybe_end_word"])
    return filter_smatcher(reference, hypothesis)

In [129]:
# Show the reference-vs-recognised diff for one utterance.
run_filter_smatcher(grouped[12])
#grouped[12]

[('OK', ['bardzo', 'jest']),
 ('OK', ['nieładnie']),
 ('OK',
  ['że',
   'niektóre',
   'dziewczęta',
   'mają',
   'mnóstwo',
   'pięknych',
   'rzeczy',
   'a']),
 ('REPLACE', ['inne'], ['inny']),
 ('OK', ['nie', 'mają', 'nic', 'dodała', 'amelka', 'z', 'gniewną', 'minką'])]