# First attempts at alignment (take 2)

> "Wish I could find the first version"

- toc: false
- hidden: true
- branch: master
- categories: [riksdag, wav2vec, alignment]

In [2]:
TEST_A = "/Users/joregan/Playing/rd_ctm_edit/H5C120171011va"

In [3]:
lines = []
with open(TEST_A) as f:
    for line in f.readlines():
        lines.append(line.strip())

In [4]:
def accept_all(lines):
    outlines = []
    for line in lines:
        parts = line.split(" ")
        if parts[-1] == "cor":
            outlines.append(line)
        elif parts[-1] == "sub":
            parts[4] = parts[6]
            parts[-1] = "cor"
            outlines.append(" ".join(parts))
    return outlines

In [5]:
accept_all(lines)

['2442205210012872721 1 27.86 0.06 Herr 1.0 Herr cor',
 '2442205210012872721 1 28.0 0.48 talman! 1.0 talman! cor',
 '2442205210012872721 1 28.72 0.96 Riksdagsledamöter! 1.0 Riksdagsledamöter! cor',
 '2442205210012872721 1 30.16 0.82 Allianspartierna 1.0 Allianspartierna cor',
 '2442205210012872721 1 31.08 0.56 Moderaterna, 1.0 Moderaterna, cor',
 '2442205210012872721 1 32.08 0.74 Centerpartiet, 1.0 Centerpartiet, cor',
 '2442205210012872721 1 33.02 0.579 Liberalerna 1.0 Liberalerna cor',
 '2442205210012872721 1 33.74 0.079 och 1.0 och cor',
 '2442205210012872721 1 33.88 0.759 Kristdemokraterna 1.0 Kristdemokraterna cor',
 '2442205210012872721 1 34.76 0.399 föreslår 1.0 föreslår cor',
 '2442205210012872721 1 35.22 0.079 som 1.0 som cor',
 '2442205210012872721 1 35.36 0.579 riksdagens 1.0 riksdagens cor',
 '2442205210012872721 1 36.02 0.239 förste 1.0 förste cor',
 '2442205210012872721 1 36.32 0.179 vice 1.0 vice cor',
 '2442205210012872721 1 36.58 0.34 talman 1.0 talman cor',
 '24422052

In [6]:
def ctm_to_timed(lines):
    output = []
    for line in lines:
        parts = line.split(" ")
        start = float(parts[2])
        dur = float(parts[3])
        output.append({
            "start": start,
            "end": start + dur,
            "text": parts[6]
        })
    return output

In [7]:
side_a = ctm_to_timed(accept_all(lines))

In [8]:
phonfile = "/Users/joregan/Playing/rd_phonetic/2442205210012872721_480p.json"

In [9]:
import json
with open(phonfile) as f:
    pieces = json.load(f)

In [10]:
def hf_json_to_timed(data):
    output = []
    for chunk in data["chunks"]:
        output.append({
            "start": chunk["timestamp"][0],
            "end": chunk["timestamp"][1],
            "text": chunk["text"]
        })
    return output

In [11]:
side_b = hf_json_to_timed(pieces)

In [12]:
def prune_to_other(left, right, fudge=0.5):
    output = []
    for item in right:
        if item["start"] < left[0]["start"] - fudge:
            continue
        elif item["end"] > left[-1]["end"] + fudge:
            continue
        else:
            output.append(item)

    return left, output

In [13]:
new_a, new_b = prune_to_other(side_a, side_b)

In [14]:
def end_cost(a, b):
    return abs(a["end"] - b["end"])

def start_cost(a, b):
    return abs(a["start"] - b["start"])

def cost(a, b):
    starts = start_cost(a, b)
    ends = end_cost(a, b)
    return starts + ends

In [15]:
def in_start_range(a, b, range=0.2):
    return abs(a["start"] - b["start"]) <= range

def in_end_range(a, b, range=0.2):
    return abs(a["end"] - b["end"]) <= range

def in_range(a, b, range=0.2):
    r_start = in_start_range(a, b, range)
    r_end = in_end_range(a, b, range)
    return r_start or r_end


In [16]:
def falls_between(a1, a2, b):
    if b["end"] <= a2["start"] and b["start"] >= a1["end"]:
        return True
    return False

In [17]:
import numpy as np



In [18]:
def approx_eq(start1, start2, factor=0.04):
    return start1 == start2 or abs(start1 - start2) < factor

In [74]:
def align_times(new_a, new_b, merge_end_flexibility=0.06):
    s1 = len(new_a)
    s2 = len(new_b)

    additionals = []
    merges = {}

    dist_matrix = np.matrix(np.ones((s1, s2)))
    DEBUG = True
    pair_cost = 0.0

    for i in range(s1):
        for j in range(s2):
            if not in_range(new_a[i], new_b[j]):
                continue

            if i == 0 and new_b[j]["end"] < new_a[0]["start"]:
                additionals.append((-1, 0, j))
                dist_matrix[i, j] = 1.0
                continue
            elif i < (s1 - 1) and falls_between(new_a[i], new_a[i + 1], new_b[j]):
                additionals.append((i, i + 1, j))
                dist_matrix[i, j] = 1.0
                continue
            elif i == s1 and new_b[j]["start"] >= new_a[i]["end"]:
                additionals.append((i, -1, j))
                dist_matrix[i, j] = 1.0
                continue

            if approx_eq(new_a[i]["start"], new_b[j]["start"]):
                tmp_j = j
                fwd = []
                extent = new_b[tmp_j]
                if i < (s1 - 2) and new_b[tmp_j]["end"] < new_a[i + 1]["end"]:
                    extent = new_a[i + 1]
                while tmp_j < (s2 - 1) and not in_end_range(new_a[i], extent, merge_end_flexibility):
                    fwd.append((end_cost(new_a[i], new_b[tmp_j]), tmp_j))
                    tmp_j += 1
                if len(fwd) > 1:
                    sfwd = sorted(fwd)
                    new_j = sfwd[0][1]
                    if new_j != j:
                        pair_cost = sfwd[0][0]
                        merges[i] = [x for x in range(j, new_j + 1)]
                        j = new_j
            print(i, j, pair_cost)
            if pair_cost != 1.:
                pair_cost = cost(new_a[i], new_b[j])
            dist_matrix[i, j] = pair_cost
            print("second", i, j, pair_cost == dist_matrix[i, j])
    return dist_matrix, additionals, merges

In [75]:
dist_m, additions, mrg = align_times(new_a, new_b)

0 0 0.0
second 0 0 True
0 1 0.019999999999999574
second 0 1 True
1 0 0.8200000000000038
second 1 0 True
1 1 0.7200000000000024
second 1 1 True
2 3 0.10000000000000142
second 2 3 True
2 3 0.620000000000001
second 2 3 True
3 4 0.620000000000001
second 3 4 True
3 6 0.0
second 3 6 True
3 6 0.16000000000000014
second 3 6 True
4 7 0.16000000000000014
second 4 7 True
5 10 0.03999999999999915
second 5 10 True
5 10 0.3200000000000003
second 5 10 True
6 11 0.3200000000000003
second 6 11 True
6 12 0.02099999999999369
second 6 12 True
7 11 0.9009999999999962
second 7 11 True
7 12 0.919000000000004
second 7 12 True
7 13 0.03900000000000148
second 7 13 True
8 12 0.32099999999999795
second 8 12 True
8 14 0.0009999999999976694
second 8 14 True
8 14 0.2009999999999934
second 8 14 True
9 15 0.2009999999999934
second 9 15 True
9 16 0.0589999999999975
second 9 16 True
10 15 0.6010000000000062
second 10 15 True
10 16 0.6589999999999989
second 10 16 True
10 17 0.04099999999999682
second 10 17 True
11 16 0.7

In [76]:
new_b[24]

{'start': 38.14, 'end': 38.32, 'text': 'fɪne'}

In [77]:
additions

[(4, 5, 8), (14, 15, 21)]

In [52]:
dist_m.shape

(18, 25)

In [121]:
def walk_matrix(dist_matrix, additions):
    i = 0
    j = 0

    s1 = dist_matrix.shape[0]
    s2 = dist_matrix.shape[1]

    path = []
    def do_additions(i, j):
        if (i-1, i, j) in additions:
            return True
        if i+1 < s1 and (i, i + 1, j) in additions:
            return True
        if i == s1 and (i, -1, j) in additions:
            return True
        return False

    while i < s1:
        while j < s2:
            if not i in mrg:
                if do_additions(i, j):
                    j += 1
                    continue
                pairs = []
                tmpj = j
                while tmpj < s2 - 1 and dist_matrix[i,tmpj] != 1.0:
                    pairs.append((dist_matrix[i,tmpj], tmpj))
                    tmpj += 1
                if pairs != []:
                    spairs = sorted(pairs)
                    j = spairs[0][1]
                path.append((i, j))
                i += 1
                j += 1
                continue
            else:
                path += [(i, x) for x in mrg[i]]
                j = mrg[i][-1] + 1
                i += 1
                continue
    return path

In [115]:
additions

[(4, 5, 8), (14, 15, 21)]

In [122]:
path = walk_matrix(dist_m, additions)

In [119]:
for pp in path:
    print(new_a[pp[0]]["text"], new_b[pp[1]]["text"])

Herr ɑː
talman! tɑːlman
Riksdagsledamöter! rɪksasleːda
Riksdagsledamöter! møːtœ̞
Allianspartierna al
Allianspartierna aspatiːæɳa
Moderaterna, mʊdɑːtœ̞ɔɳa
Centerpartiet, sentə
Centerpartiet, patiːət
Liberalerna lɪbɑːlɔɳa
och oː
Kristdemokraterna kɪs
Kristdemokraterna demɔkɑːtɔɳa
föreslår fœ̞ːesoː
som sɔm
riksdagens rɪksdɑːɡəns
förste fœ̞st
vice viːsə
talman tɑːlman
Ewa eva
Thalén tareːn
Finné. fɪne


In [35]:
mrg

{2: [2, 3], 3: [5, 6], 5: [9, 10], 8: [13, 14]}

In [36]:
for aa in mrg:
    for bb in mrg[aa]:
        print(new_a[aa], new_b[bb])

{'start': 28.72, 'end': 29.68, 'text': 'Riksdagsledamöter!'} {'start': 28.7, 'end': 29.18, 'text': 'rɪksasleːda'}
{'start': 28.72, 'end': 29.68, 'text': 'Riksdagsledamöter!'} {'start': 29.24, 'end': 29.58, 'text': 'møːtœ̞'}
{'start': 30.16, 'end': 30.98, 'text': 'Allianspartierna'} {'start': 30.16, 'end': 30.26, 'text': 'al'}
{'start': 30.16, 'end': 30.98, 'text': 'Allianspartierna'} {'start': 30.32, 'end': 30.98, 'text': 'aspatiːæɳa'}
{'start': 32.08, 'end': 32.82, 'text': 'Centerpartiet,'} {'start': 32.06, 'end': 32.3, 'text': 'sentə'}
{'start': 32.08, 'end': 32.82, 'text': 'Centerpartiet,'} {'start': 32.36, 'end': 32.86, 'text': 'patiːət'}
{'start': 33.88, 'end': 34.639, 'text': 'Kristdemokraterna'} {'start': 33.84, 'end': 34.04, 'text': 'kɪs'}
{'start': 33.88, 'end': 34.639, 'text': 'Kristdemokraterna'} {'start': 34.08, 'end': 34.64, 'text': 'demɔkɑːtɔɳa'}


In [124]:
import pandas as pd
df = pd.DataFrame(data=dist_m,index=[x["text"] for x in new_a], columns=[x["text"] for x in new_b])

In [None]:
from phonemizer import phonemize

In [None]:
for it_a in new_a:
    print(it_a["text"], phonemize(it_a["text"], language='sv'))

Herr hɛr 
talman! tɑːlman 
Riksdagsledamöter! rɪksdɑːɡsleːdamøːtər 
Allianspartierna aliːanspatiːərna 
Moderaterna, muːdeːratərna 
Centerpartiet, sɛntərpatiːət 
Liberalerna liːbəralərna 
och ɔk 
Kristdemokraterna kriːstdəmɔkrɑːtɛrna 
föreslår føːrəsloːr 
som sɔm 
riksdagens rɪksdɑːɡɛns 
förste fœʂtə 
vice viːsə 
talman tɑːlman 
Ewa eːva 
Thalén thɑːleːn 
Finné. fɪneː 


In [39]:
CHECK_MERGED = [
    ("rɪksasleːdamøːtœ̞", "rɪksdɑːɡsleːdamøːtər"),
    ("alaspatiːæɳa", "aliːanspatiːərna"),
    ("sentəpatiːət", "sɛntərpatiːət"),
    ("kɪsdemɔkɑːtɔɳa", "kriːstdəmɔkrɑːtɛrna")
]

In [40]:
from difflib import SequenceMatcher

for mpair in CHECK_MERGED:
    a = mpair[0]
    b = mpair[1]
    s = SequenceMatcher(None, a, b)
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        print('{:7}   a[{}:{}] --> b[{}:{}] {!r:>8} --> {!r}'.format(
            tag, i1, i2, j1, j2, a[i1:i2], b[j1:j2]))
    print(s.ratio())
    print()

equal     a[0:4] --> b[0:4]   'rɪks' --> 'rɪks'
replace   a[4:5] --> b[4:8]      'a' --> 'dɑːɡ'
equal     a[5:15] --> b[8:18] 'sleːdamøːt' --> 'sleːdamøːt'
replace   a[15:17] --> b[18:20]     'œ̞' --> 'ər'
0.7567567567567568

equal     a[0:2] --> b[0:2]     'al' --> 'al'
insert    a[2:2] --> b[2:4]       '' --> 'iː'
equal     a[2:3] --> b[4:5]      'a' --> 'a'
insert    a[3:3] --> b[5:6]       '' --> 'n'
equal     a[3:9] --> b[6:12] 'spatiː' --> 'spatiː'
replace   a[9:11] --> b[12:15]     'æɳ' --> 'ərn'
equal     a[11:12] --> b[15:16]      'a' --> 'a'
0.7142857142857143

equal     a[0:1] --> b[0:1]      's' --> 's'
replace   a[1:2] --> b[1:2]      'e' --> 'ɛ'
equal     a[2:5] --> b[2:5]    'ntə' --> 'ntə'
insert    a[5:5] --> b[5:6]       '' --> 'r'
equal     a[5:12] --> b[6:13] 'patiːət' --> 'patiːət'
0.88

equal     a[0:1] --> b[0:1]      'k' --> 'k'
replace   a[1:2] --> b[1:4]      'ɪ' --> 'riː'
equal     a[2:3] --> b[4:5]      's' --> 's'
insert    a[3:3] --> b[5:6]       '' --> 't

In [125]:
df

Unnamed: 0,ɑː,tɑːlman,rɪksasleːda,møːtœ̞,<pa>,al,aspatiːæɳa,mʊdɑːtœ̞ɔɳa,<pa>.1,sentə,...,fœ̞ːesoː,sɔm,rɪksdɑːɡəns,fœ̞st,viːsə,tɑːlman.1,<pa>.2,eva,tareːn,fɪne
Herr,0.02,0.82,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
talman!,0.72,0.12,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Riksdagsledamöter!,1.0,1.0,1.0,0.62,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Allianspartierna,1.0,1.0,1.0,1.0,1.1,1.0,0.16,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Moderaterna,",1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.02,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Centerpartiet,",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Liberalerna,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
och,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Kristdemokraterna,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
föreslår,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.059,0.601,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
