# First attempts at alignment (take 2)

> "Wish I could find the first version"

- toc: false
- hidden: true
- branch: master
- categories: [riksdag, wav2vec, alignment]

In [22]:
# Input file read into `lines` below (presumably a CTM-edit file, judging by
# the directory name and the cor/sub labels). Absolute local path: adjust it
# when running on another machine.
TEST_A = "/Users/joregan/Playing/rd_ctm_edit/H5C120171011va"

In [23]:
# Read the input file into a list of whitespace-stripped lines.
# The encoding is pinned to UTF-8 because the data contains non-ASCII
# characters (e.g. "ö"); the default encoding of open() depends on the locale.
with open(TEST_A, encoding="utf-8") as f:
    lines = [line.strip() for line in f]

In [24]:
def accept_all(lines):
    """Accept every correct or substituted entry, dropping everything else.

    Each line is split on single spaces and its last field is the edit label.
    Lines labelled "cor" are kept unchanged. Lines labelled "sub" have field 4
    overwritten with field 6 and are relabelled "cor". Lines with any other
    label are omitted from the result.

    Returns a new list of line strings; the input list is not modified.
    """
    accepted = []
    for entry in lines:
        fields = entry.split(" ")
        label = fields[-1]
        if label == "cor":
            accepted.append(entry)
            continue
        if label != "sub":
            continue
        # Accept the substitution: copy field 6 over field 4 and relabel.
        fields[4] = fields[6]
        fields[-1] = "cor"
        accepted.append(" ".join(fields))
    return accepted

In [25]:
# Preview the accepted lines; the returned list is only displayed, not stored.
accept_all(lines)

['2442205210012872721 1 27.86 0.06 Herr 1.0 Herr cor',
 '2442205210012872721 1 28.0 0.48 talman! 1.0 talman! cor',
 '2442205210012872721 1 28.72 0.96 Riksdagsledamöter! 1.0 Riksdagsledamöter! cor',
 '2442205210012872721 1 30.16 0.82 Allianspartierna 1.0 Allianspartierna cor',
 '2442205210012872721 1 31.08 0.56 Moderaterna, 1.0 Moderaterna, cor',
 '2442205210012872721 1 32.08 0.74 Centerpartiet, 1.0 Centerpartiet, cor',
 '2442205210012872721 1 33.02 0.579 Liberalerna 1.0 Liberalerna cor',
 '2442205210012872721 1 33.74 0.079 och 1.0 och cor',
 '2442205210012872721 1 33.88 0.759 Kristdemokraterna 1.0 Kristdemokraterna cor',
 '2442205210012872721 1 34.76 0.399 föreslår 1.0 föreslår cor',
 '2442205210012872721 1 35.22 0.079 som 1.0 som cor',
 '2442205210012872721 1 35.36 0.579 riksdagens 1.0 riksdagens cor',
 '2442205210012872721 1 36.02 0.239 förste 1.0 förste cor',
 '2442205210012872721 1 36.32 0.179 vice 1.0 vice cor',
 '2442205210012872721 1 36.58 0.34 talman 1.0 talman cor',
 '24422052

In [26]:
def ctm_to_timed(lines):
    """Convert space-separated lines into a list of timed-text dicts.

    For every line, field 2 is parsed as the start time and field 3 as the
    duration (both as floats); field 6 is used as the text. Each result dict
    has the keys "start", "end" (start + duration) and "text".
    """
    def to_item(entry):
        fields = entry.split(" ")
        begin = float(fields[2])
        length = float(fields[3])
        return {"start": begin, "end": begin + length, "text": fields[6]}

    return [to_item(entry) for entry in lines]

In [27]:
# Side A: the accepted input lines converted to start/end/text dicts.
side_a = ctm_to_timed(accept_all(lines))

In [28]:
# JSON file that is loaded into `pieces` below. Absolute local path: adjust it
# when running on another machine.
phonfile = "/Users/joregan/Playing/rd_phonetic/2442205210012872721_480p.json"

In [29]:
import json
# Parse the JSON file. The encoding is pinned to UTF-8 because the text holds
# IPA characters and the default encoding of open() depends on the locale.
with open(phonfile, encoding="utf-8") as f:
    pieces = json.load(f)

In [30]:
def hf_json_to_timed(data):
    """Convert the entries of data["chunks"] into timed-text dicts.

    Every chunk must provide a "timestamp" sequence, whose first two elements
    become "start" and "end", and a "text" value, which is copied as is.
    """
    return [
        {
            "start": piece["timestamp"][0],
            "end": piece["timestamp"][1],
            "text": piece["text"],
        }
        for piece in data["chunks"]
    ]

In [31]:
# Side B: the chunks of the loaded JSON converted to start/end/text dicts.
side_b = hf_json_to_timed(pieces)

In [32]:
def prune_to_other(left, right, fudge=0.5):
    """Restrict `right` to the time span covered by `left`.

    An item of `right` is dropped when it starts more than `fudge` before the
    start of the first `left` item, or ends more than `fudge` after the end of
    the last `left` item. `left` is returned untouched alongside the filtered
    copy of `right`.
    """
    kept = [
        item
        for item in right
        if not (item["start"] < left[0]["start"] - fudge)
        and not (item["end"] > left[-1]["end"] + fudge)
    ]
    return left, kept

In [33]:
# Keep side A as is and drop side-B items lying more than the default fudge
# (0.5) outside the time span covered by side A.
new_a, new_b = prune_to_other(side_a, side_b)

In [86]:
def end_cost(a, b):
    """Return the absolute difference between the "end" values of a and b."""
    delta = a["end"] - b["end"]
    return abs(delta)

def start_cost(a, b):
    """Return the absolute difference between the "start" values of a and b."""
    delta = a["start"] - b["start"]
    return abs(delta)

def cost(a, b):
    """Return the total timing mismatch: start_cost(a, b) + end_cost(a, b)."""
    return start_cost(a, b) + end_cost(a, b)

In [35]:
def in_start_range(a, b, range=0.2):
    """Tell whether the "start" values of a and b differ by at most `range`.

    Note that the `range` parameter shadows the built-in of the same name
    inside this function.
    """
    gap = abs(a["start"] - b["start"])
    return gap <= range

def in_end_range(a, b, range=0.2):
    """Tell whether the "end" values of a and b differ by at most `range`.

    Note that the `range` parameter shadows the built-in of the same name
    inside this function.
    """
    gap = abs(a["end"] - b["end"])
    return gap <= range

def in_range(a, b, range=0.2):
    """Tell whether a and b are close at either boundary.

    Both the start check and the end check are always evaluated; the result
    is true when at least one of them is within `range`.
    """
    start_ok, end_ok = in_start_range(a, b, range), in_end_range(a, b, range)
    return start_ok or end_ok


In [36]:
def falls_between(a1, a2, b):
    """Tell whether b lies in the gap between a1 and a2.

    True when b ends no later than a2 starts and b starts no earlier than
    a1 ends; False otherwise.
    """
    return bool(b["end"] <= a2["start"] and b["start"] >= a1["end"])

In [38]:
import numpy as np

In [57]:
def approx_eq(start1, start2, factor=0.04):
    """Tell whether two values are equal or differ by less than `factor`.

    Exact equality is checked first, so equal values match even when
    `factor` is zero.
    """
    identical = start1 == start2
    if identical:
        return identical
    return abs(start1 - start2) < factor

In [139]:
def align_times(new_a, new_b, merge_end_flexibility=0.06, debug=True):
    """Compare two timed sequences and build a timing-distance matrix.

    Both inputs are sequences of dicts with "start" and "end" values. Only
    pairs accepted by in_range() are examined; every other cell of the matrix
    keeps its initial value of 1.0.

    Args:
        new_a: reference sequence (rows of the matrix).
        new_b: sequence compared against it (columns of the matrix).
        merge_end_flexibility: tolerance passed to in_end_range() when
            deciding whether to scan forward for a better end match.
        debug: when true (the default, matching the previous unconditional
            behaviour) the sorted merge candidates are printed.

    Returns:
        A tuple (dist_matrix, additionals, merges):
        - dist_matrix: np.matrix of shape (len(new_a), len(new_b)).
        - additionals: list of (previous_a, next_a, b_item) triples for
          new_b items lying before the first, between two consecutive, or
          after the last new_a item; a missing neighbour is an empty dict.
        - merges: dict mapping an index into new_a to the list of
          consecutive new_b indices to be merged onto that item.
    """
    s1 = len(new_a)
    s2 = len(new_b)

    additionals = []
    merges = {}

    dist_matrix = np.matrix(np.ones((s1, s2)))

    for i in range(s1):
        for j in range(s2):
            if not in_range(new_a[i], new_b[j]):
                continue

            # Reset for every pair. This used to be initialised once outside
            # both loops, so after the first "additional" hit every later
            # cell was stuck at 1.0.
            pair_cost = 0.0
            # Column that receives the cost; moved forward when a merge is found.
            cell_j = j

            if i == 0 and new_b[j]["end"] < new_a[0]["start"]:
                additionals.append(({}, new_a[i], new_b[j]))
                pair_cost = 1.
            elif i < (s1 - 1) and falls_between(new_a[i], new_a[i + 1], new_b[j]):
                additionals.append((new_a[i], new_a[i + 1], new_b[j]))
                pair_cost = 1.
            # The last valid index is s1 - 1; the previous test `i == s1`
            # could never be true inside range(s1), so trailing items were
            # never recorded.
            elif i == (s1 - 1) and new_b[j]["start"] >= new_a[i]["end"]:
                additionals.append((new_a[i], {}, new_b[j]))
                pair_cost = 1.

            if approx_eq(new_a[i]["start"], new_b[j]["start"]):
                tmp_j = j
                fwd = []
                extent = new_b[tmp_j]
                if i < (s1 - 2) and new_b[tmp_j]["end"] < new_a[i + 1]["end"]:
                    extent = new_a[i + 1]
                # `extent` is fixed before the loop and never updated inside
                # it, so once entered the scan only stops when tmp_j reaches
                # s2 - 1, collecting the end cost of each new_b item on the way.
                while tmp_j < (s2 - 1) and not in_end_range(new_a[i], extent, merge_end_flexibility):
                    fwd.append((end_cost(new_a[i], new_b[tmp_j]), tmp_j))
                    tmp_j += 1
                if len(fwd) > 1:
                    # Lowest end cost first; ties resolve to the lower index.
                    sfwd = sorted(fwd)
                    if debug:
                        print(sfwd)
                    new_j = sfwd[0][1]
                    if new_j != j:
                        # NOTE(review): unless this value is exactly 1.0 it is
                        # replaced by cost(...) below - confirm that is intended.
                        pair_cost = sfwd[0][0]
                        merges[i] = [x for x in range(j, new_j + 1)]
                        cell_j = new_j

            if pair_cost != 1.:
                pair_cost = cost(new_a[i], new_b[cell_j])
            dist_matrix[i, cell_j] = pair_cost
    return dist_matrix, additionals, merges

In [None]:
# Run the alignment: distance matrix, the "additional" side-B items, and the
# merge map (index into new_a -> list of new_b indices).
dist_matrix, adds, mrg = align_times(new_a, new_b)

In [141]:
# Show the merge map: keys index new_a, values list the new_b indices merged
# onto that entry.
mrg

{2: [2, 3], 3: [5, 6], 5: [9, 10], 8: [13, 14]}

In [142]:
# Print each side-A item next to every side-B piece merged onto it.
for aa, merged_indices in mrg.items():
    for bb in merged_indices:
        print(new_a[aa], new_b[bb])

{'start': 28.72, 'end': 29.68, 'text': 'Riksdagsledamöter!'} {'start': 28.7, 'end': 29.18, 'text': 'rɪksasleːda'}
{'start': 28.72, 'end': 29.68, 'text': 'Riksdagsledamöter!'} {'start': 29.24, 'end': 29.58, 'text': 'møːtœ̞'}
{'start': 30.16, 'end': 30.98, 'text': 'Allianspartierna'} {'start': 30.16, 'end': 30.26, 'text': 'al'}
{'start': 30.16, 'end': 30.98, 'text': 'Allianspartierna'} {'start': 30.32, 'end': 30.98, 'text': 'aspatiːæɳa'}
{'start': 32.08, 'end': 32.82, 'text': 'Centerpartiet,'} {'start': 32.06, 'end': 32.3, 'text': 'sentə'}
{'start': 32.08, 'end': 32.82, 'text': 'Centerpartiet,'} {'start': 32.36, 'end': 32.86, 'text': 'patiːət'}
{'start': 33.88, 'end': 34.639, 'text': 'Kristdemokraterna'} {'start': 33.84, 'end': 34.04, 'text': 'kɪs'}
{'start': 33.88, 'end': 34.639, 'text': 'Kristdemokraterna'} {'start': 34.08, 'end': 34.64, 'text': 'demɔkɑːtɔɳa'}


In [99]:
import pandas as pd
# Label the distance matrix for inspection: side-A texts as the row index,
# side-B texts as the column names.
df = pd.DataFrame(data=dist_matrix,index=[x["text"] for x in new_a], columns=[x["text"] for x in new_b])

In [79]:
from phonemizer import phonemize

In [41]:
# Print every side-A text next to its phonemization (language 'sv').
for it_a in new_a:
    word = it_a["text"]
    print(word, phonemize(word, language='sv'))

Herr hɛr 
talman! tɑːlman 
Riksdagsledamöter! rɪksdɑːɡsleːdamøːtər 
Allianspartierna aliːanspatiːərna 
Moderaterna, muːdeːratərna 
Centerpartiet, sɛntərpatiːət 
Liberalerna liːbəralərna 
och ɔk 
Kristdemokraterna kriːstdəmɔkrɑːtɛrna 
föreslår føːrəsloːr 
som sɔm 
riksdagens rɪksdɑːɡɛns 
förste fœʂtə 
vice viːsə 
talman tɑːlman 
Ewa eːva 
Thalén thɑːleːn 
Finné. fɪneː 
