In [20]:
TOK_JSON = """
{"\u0251": 1, "\u00e6": 2, "\u028c": 3, "a\u028a": 4, "\u0259": 5, "\u025d": 6, "a\u026a": 7, "b": 8, "\u02a7": 9, "d": 10, "\u00f0": 11, "\u027e": 12, "\u025b": 13, "l\u0329": 14, "m\u0329": 15, "n\u0329": 16, "\u014b\u0329": 17, "\u025a": 18, "e\u026a": 19, "f": 20, "g": 21, " ": 22, "h": 24, "\u026a": 26, "i\u02d0": 27, "\u02a4": 28, "k": 29, "l": 30, "m": 31, "n": 32, "\u014b": 33, "\u027e\u0303": 34, "o\u028a": 35, "\u0254\u026a": 36, "p": 37, "ʔ": 38, "\u0279": 39, "s": 40, "\u0283": 41, "t": 42, "\u03b8": 43, "\u028a": 44, "u\u02d0": 45, "v": 46, "w": 47, "j": 48, "z": 49, "\u0292": 50, "|": 0, "[UNK]": 51, "[PAD]": 52}
"""

In [21]:
import json

TOKENS = json.loads(TOK_JSON)
TOKLIST = ["\[UNK\]"]
for key in TOKENS:
    if "[" in key or key == "|" or key == " ":
        continue
    TOKLIST.append(key)
# sort TOKLIST by length, longest first
TOKLIST.sort(key=len, reverse=True)
TOK_REGEX_INNER = "|".join(TOKLIST)
TOK_REGEX = fr"({TOK_REGEX_INNER})"

In [25]:
import re

def tokenise(text):
    tokens = []
    text = text.strip()
    while text:
        match = re.match(TOK_REGEX, text)
        if not match:
            raise ValueError(f"Could not match token in text: {text}")
        token = match.group(0)
        if token == "[UNK]":
            tokens.append("ɪ")
        else:
            tokens.append(token)
        text = text[len(token):].strip()
    return tokens

In [23]:
SAMPLE_TIMIT = "doʊn t biːɪf ɹeɪʔ l[UNK]ɾl̩wʌn ð[UNK]biːs kæn t[UNK]ɚʔʧuːwɑlʔaɪmɚɹaʊn dɪnðɛnhiːt[UNK]tɪz kæp ʔoʊvɝhɪz lɛf tʔiːɝ ʔɪnʃʊk[UNK]z k lʌb ʔɪʔðɪp ɹ[UNK]n ʔ s"

In [45]:
EQUIVALENT_TOKENS = {
    "əl": "l̩",
    "tʃ": "ʧ",
}
STOPS = ["b", "p", "d", "t", "g", "k"]

In [46]:
import difflib

def make_equivalent(a, b):
    sm = difflib.SequenceMatcher(a=a, b=b, autojunk=False)
    ops = sm.get_opcodes()
    out_ops = []
    for op in ops:
        if op[0] == "equal":
            out_ops.append(op)
            continue
        elif op[0] == "replace":
            left = " ".join(a[op[1]:op[2]])
            right = " ".join(b[op[3]:op[4]])
            if left in EQUIVALENT_TOKENS and EQUIVALENT_TOKENS[left] == right:
                out_ops.append(("equal", op[1], op[2], op[3], op[4]))
            elif right == "ʔ" and left in STOPS:
                out_ops.append(("equal", op[1], op[2], op[3], op[4]))
            elif left in ["ɐ", "ə"] and right in ["ə", "ɪ"]:
                out_ops.append(("equal", op[1], op[2], op[3], op[4]))
            else:
                out_ops.append(op)
        else:
            out_ops.append(op)
    return ops
            


In [31]:
A = "d oʊ t b iː ɐ f ɹ eɪ d l ɪ ɾ əl w ʌ n ð ə b iː s t k æ n t h ɜː tʃ uː w ɑː l aɪ m ɐ ɹ aʊ n d æ n d ð ɛ n h iː t ɪ p t ɪ z k æ p oʊ v ɚ h ɪ z l ɛ f t ɪ æ n d ʃ ʊ k h ɪ z k l ʌ b æ t ð ə p ɹ ɪ n s".split(" ")
B = tokenise(SAMPLE_TIMIT)

In [39]:
def print_equivalents(A, B, ops):
    for op in ops:
        print(f"{op[0]}: A[{op[1]}:{op[2]}] = {A[op[1]:op[2]]}, B[{op[3]}:{op[4]}] = {B[op[3]:op[4]]}")

In [43]:
ops = make_equivalent(A, B)

In [44]:
print_equivalents(A, B, ops)

equal: A[0:2] = ['d', 'oʊ'], B[0:2] = ['d', 'oʊ']
insert: A[2:2] = [], B[2:3] = ['n']
equal: A[2:5] = ['t', 'b', 'iː'], B[3:6] = ['t', 'b', 'iː']
replace: A[5:6] = ['ɐ'], B[6:7] = ['ɪ']
equal: A[6:9] = ['f', 'ɹ', 'eɪ'], B[7:10] = ['f', 'ɹ', 'eɪ']
replace: A[9:10] = ['d'], B[10:11] = ['ʔ']
equal: A[10:13] = ['l', 'ɪ', 'ɾ'], B[11:14] = ['l', 'ɪ', 'ɾ']
replace: A[13:14] = ['əl'], B[14:15] = ['l̩']
equal: A[14:18] = ['w', 'ʌ', 'n', 'ð'], B[15:19] = ['w', 'ʌ', 'n', 'ð']
replace: A[18:19] = ['ə'], B[19:20] = ['ɪ']
equal: A[19:22] = ['b', 'iː', 's'], B[20:23] = ['b', 'iː', 's']
delete: A[22:23] = ['t'], B[23:23] = []
equal: A[23:27] = ['k', 'æ', 'n', 't'], B[23:27] = ['k', 'æ', 'n', 't']
replace: A[27:30] = ['h', 'ɜː', 'tʃ'], B[27:31] = ['ɪ', 'ɚ', 'ʔ', 'ʧ']
equal: A[30:32] = ['uː', 'w'], B[31:33] = ['uː', 'w']
replace: A[32:33] = ['ɑː'], B[33:34] = ['ɑ']
equal: A[33:34] = ['l'], B[34:35] = ['l']
insert: A[34:34] = [], B[35:36] = ['ʔ']
equal: A[34:36] = ['aɪ', 'm'], B[36:38] = ['aɪ', 'm']
repl