In [1]:
import pocketsphinx

In [4]:
textgrids="/Users/joregan/Playing/hsi/audio/whisperx-textgrids/"
ctmdir="/Users/joregan/Playing/hsi/ctc_prefix_beam_search/"
ctmeditdir="/Users/joregan/Playing/hsi/whisper_reverb_ctmedit/"

In [6]:
from pathlib import Path

def read_ctm(filename):
    if type(filename) is Path:
        filename = str(filename)

    ctmlines = []
    with open(filename) as inf:
        for line in inf.readlines():
            line = line.strip()
            ctmlines.append(line.split(" "))
    return ctmlines

In [5]:
from praatio import textgrid

def read_textgrid(filename, tiername="words"):
    if type(filename) is Path:
        filename = str(filename)
    tg = textgrid.openTextgrid(filename, False)

    utterances = []
    tier = tg.getTier(tiername)
    for entry in tier.entries:
        text = entry.label.strip()
        utterances.append((entry.start, entry.end, text))
    return utterances

In [8]:
tgpath = Path(textgrids)
ctmpath = Path(ctmdir)
ctmeditpath = Path(ctmeditdir)

In [9]:
EG = "hsi_5_0718_210_001_main"
CTMEDATA = read_ctm(ctmeditpath / f"{EG}.ctmedit")
TGDATA = read_textgrid(tgpath / f"{EG}.TextGrid")

In [18]:
def clean_word(word):
    return word.lower().strip(".,;:!?")

def map_words_to_tg(tgdata):
    mappings = []
    for num, item in enumerate(tgdata):
        for word in item[2].split(" "):
            clean = clean_word(word)
            mappings.append([clean, word, num])
    return mappings

In [20]:
def ctm_excerpt(ctmdata, start, end, fluff=0.6):
    if type(start) is str:
        start = float(start)
    if type(end) is str:
        end = float(end)
    
    excerpt = []
    for ctmline in ctmdata:
        cstart = float(ctmline[2])
        cdur = float(ctmline[3])
        cend = cstart + cdur
        if cstart < (start - fluff):
            continue
        if cend > (end + fluff):
            break
        excerpt.append(ctmline)
    return excerpt

In [33]:
def index_ctmdata_by_start(ctmdata):
    return {float(x[2]): n for n, x in enumerate(ctmdata)}

In [34]:
index_ctmdata_by_start(CTMEDATA)

{7.23: 0,
 7.43: 1,
 7.75: 2,
 8.47: 3,
 9.43: 4,
 9.59: 5,
 9.83: 6,
 10.39: 7,
 13.19: 8,
 13.39: 9,
 23.42: 10,
 23.7: 11,
 39.22: 12,
 40.06: 13,
 40.34: 14,
 40.46: 15,
 40.58: 16,
 41.33: 17,
 41.49: 18,
 41.85: 19,
 42.09: 20,
 42.25: 21,
 42.49: 22,
 44.05: 23,
 44.25: 24,
 44.41: 25,
 44.61: 26,
 44.85: 27,
 55.33: 28,
 55.93: 29,
 56.29: 30,
 56.57: 31,
 56.71: 32,
 56.77: 33,
 56.91: 34,
 57.33: 35,
 57.57: 36,
 57.85: 37,
 58.01: 38,
 58.17: 39,
 58.53: 40,
 58.77: 41,
 58.93: 42,
 59.03: 43,
 59.37: 44,
 59.65: 45,
 59.85: 46,
 60.13: 47,
 61.13: 48,
 61.52: 49,
 62.84: 50,
 63.28: 51,
 63.48: 52,
 63.96: 53,
 64.16: 54,
 64.32: 55,
 64.72: 56,
 65.52: 57,
 65.96: 58,
 66.32: 59,
 66.6: 60,
 66.88: 61,
 67.12: 62,
 67.24: 63,
 67.36: 64,
 67.6: 65,
 67.72: 66,
 68.04: 67,
 68.4: 68,
 68.52: 69,
 68.92: 70,
 69.32: 71,
 69.76: 72,
 70.24: 73,
 70.52: 74,
 71.0: 75,
 71.4: 76,
 71.72: 77,
 72.92: 78,
 73.52: 79,
 73.72: 80,
 73.96: 81,
 74.08: 82,
 74.28: 83,
 74.52: 84,
 74

In [36]:
ctm_excerpt(CTMEDATA, 57.734, 60.576)

[['hsi_5_0718_210_001_main.wav', '1', '57.33', '0.1', 'i', '0.00', '-', 'ins'],
 ['hsi_5_0718_210_001_main.wav', '1', '57.57', '0.1', 'i', '0.00', 'i', 'cor'],
 ['hsi_5_0718_210_001_main.wav',
  '1',
  '57.85',
  '0.1',
  'really',
  '0.00',
  'really',
  'cor'],
 ['hsi_5_0718_210_001_main.wav',
  '1',
  '58.01',
  '0.1',
  'think',
  '0.00',
  'think',
  'cor'],
 ['hsi_5_0718_210_001_main.wav',
  '1',
  '58.17',
  '0.26',
  "i've",
  '0.00',
  "i've",
  'cor'],
 ['hsi_5_0718_210_001_main.wav',
  '1',
  '58.53',
  '0.1',
  'made',
  '0.00',
  'made',
  'cor'],
 ['hsi_5_0718_210_001_main.wav',
  '1',
  '58.77',
  '0.1',
  'the',
  '0.00',
  'the',
  'cor'],
 ['hsi_5_0718_210_001_main.wav',
  '1',
  '58.93',
  '0.1',
  'best',
  '0.00',
  'best',
  'cor'],
 ['hsi_5_0718_210_001_main.wav',
  '1',
  '59.03',
  '0.0',
  '-',
  '1.0',
  'out',
  'del'],
 ['hsi_5_0718_210_001_main.wav',
  '1',
  '59.37',
  '0.1',
  'outta',
  '0.00',
  'of',
  'sub'],
 ['hsi_5_0718_210_001_main.wav',
  '1',
 

In [42]:
def namethis(ctmedit, start, end, consumed_from=0):
    index = index_ctmdata_by_start(ctmedit)
    excerpt = ctm_excerpt(ctmedit, start, end)
    ex_start = float(excerpt[0][2])
    ex_start_ptr = index[ex_start]
    # if (ex_start_ptr - 1) > consumed_from:
    #     # extend here?
    #     pass
    prev_start = float(CTMEDATA[ex_start_ptr - 1][2])
    print(prev_start, ex_start, CTMEDATA[ex_start_ptr - 1])

In [44]:
namethis(CTMEDATA, 57.734, 60.576)

56.91 57.33 ['hsi_5_0718_210_001_main.wav', '1', '56.91', '0.04', 'think', '0.00', 'think', 'cor']


In [23]:
TGDATA

[(7.489, 7.709, 'Yeah.'),
 (7.729, 8.19, 'Yeah.'),
 (8.21, 8.49, 'Yes.'),
 (40.339, 41.12, 'Oh, that was nice.'),
 (41.461, 42.622, 'Nice idea to have that.'),
 (44.164, 45.025, 'Oh, I like this place.'),
 (55.393, 57.114, 'Yeah, so what do you think?'),
 (57.734,
  60.576,
  "I really think I've made the best out of this place actually."),
 (62.837, 63.038, 'Sorry?'),
 (63.918,
  72.263,
  "Yeah, it does and I really like what I did with the flowers up there because it's like the hanging gardens of Babylon."),
 (73.604, 77.026, 'I would like to have more of course, but I think in time'),
 (78.106, 82.527, 'They will hang down like a waterfall of green leaves.'),
 (83.368,
  88.609,
  "That's my intention anyway because I'm really into plants."),
 (90.149,
  98.892,
  "What's nice with these plants too, that's called the tongue of mother-in-law, the one by the window in Swedish."),
 (100.792, 103.194, "Because it's called Dragon Tongue."),
 (104.174,
  110.498,
  'And your mother-in-la

In [45]:
def partition_ctm(ctmdata, amount=1.0):
    segments = []
    current = []

    prev = 0.0
    for ctmline in ctmdata:
        start = float(ctmline[2])
        end = float(ctmline[3]) + start
        if prev > 0.0 and (end - prev) > amount:
            segments.append(current)
            current = []
        current.append(ctmline)
    segments.append(current)
    return segments

In [47]:
partitioned = partition_ctm(CTMEDATA)
for partition in partitioned:
    edit_type = set()
    for ctmline in partition:
        edit_type.add(ctmline[-1])
    if len(edit_type) == 1:
        print(partition)

In [49]:
partitioned[0][-1]

['hsi_5_0718_210_001_main.wav',
 '1',
 '734.04',
 '0.1',
 'yeah',
 '0.00',
 'yeah',
 'cor']