In [6]:
from pathlib import Path
import json



In [5]:
# Known locations of the analysis results, in order of preference: the path
# this cell used to hard-code first, then the alternative that was previously
# kept as a commented-out line.
ANALYSIS_PATH_CANDIDATES = (
    Path("/Users/joregan/Playing/hsi/analysis_results"),
    Path("/shared/mm_conv/analysis/analysis_results/"),
)
# Use the first candidate that exists on this machine, so the cell no longer
# has to be edited by hand when switching machines. If none exists, fall back
# to the first entry, which is the value the cell assigned before.
ANALYSIS_PATH = next(
    (candidate for candidate in ANALYSIS_PATH_CANDIDATES if candidate.exists()),
    ANALYSIS_PATH_CANDIDATES[0],
)

In [8]:
# Build a nested mapping: file id -> utterance id -> list of phrase timings,
# from every "*_object_analysis.json" file found one level below ANALYSIS_PATH.
object_timings = {}
for object_file in ANALYSIS_PATH.glob("hsi_*/*_object_analysis.json"):
    # The file id is the file stem without the "_object_analysis" suffix.
    file_id = object_file.stem.replace("_object_analysis", "")
    # Reuse the per-file mapping if this id was already seen, else create it.
    per_utterance = object_timings.setdefault(file_id, {})
    with open(object_file) as inf:
        data = json.load(inf)
    for item in data:
        for one_item in data[item]:
            # Each utterance id accumulates one record per referring phrase.
            per_utterance.setdefault(one_item["utterance_id"], []).append(
                dict(
                    phrase=one_item["phrase"],
                    start=one_item["timing"][0],
                    end=one_item["timing"][1],
                    topic=one_item["topic"],
                )
            )

In [9]:
# Show the collected mapping: file id -> utterance id -> list of
# {"phrase", "start", "end", "topic"} records.
object_timings

{'hsi_4_0717_209_001': {'0': [{'phrase': 'computer',
    'start': 0.407,
    'end': 0.567,
    'topic': 'Laptop_c535f2bc'}],
  '1': [{'phrase': 'It',
    'start': 4.313,
    'end': 4.393,
    'topic': 'Laptop_c535f2bc'},
   {'phrase': 'the table',
    'start': 4.313,
    'end': 4.513,
    'topic': 'Laptop_c535f2bc'}],
  '2': [{'phrase': 'it',
    'start': 6.212,
    'end': 6.692,
    'topic': 'Laptop_c535f2bc'}],
  '3': [{'phrase': 'it',
    'start': 12.403,
    'end': 12.643,
    'topic': 'Laptop_c535f2bc'},
   {'phrase': 'this one',
    'start': 12.643,
    'end': 13.583,
    'topic': 'Laptop_c535f2bc'}],
  '5': [{'phrase': 'painting',
    'start': 25.092,
    'end': 25.232,
    'topic': 'Painting_0db0fb84'},
   {'phrase': 'it',
    'start': 25.232,
    'end': 25.632,
    'topic': 'Painting_0db0fb84'}],
  '6': [{'phrase': 'It',
    'start': 28.863,
    'end': 29.243,
    'topic': 'Painting_0db0fb84'}],
  '7': [{'phrase': 'It',
    'start': 30.893,
    'end': 30.993,
    'topic': 'Pai

In [10]:
import re

def slice_tsv_data(data, start, end):
    """Collect the rows of ``data`` that lie entirely inside ``[start, end]``.

    String values under ``"start"`` / ``"end"`` are converted to ``float`` and
    written back into the row, so the caller's rows are modified in place.

    Scanning stops at the first row that is not selected and whose end is
    greater than ``end``; presumably rows are ordered by time, so nothing
    later could fit (verify against the data source).

    Args:
        data: Iterable of dict-like rows with ``"start"`` and ``"end"`` entries.
        start: Lower time bound (inclusive).
        end: Upper time bound (inclusive).

    Returns:
        List of the selected row objects (the same objects as in ``data``).
    """
    selected = []
    for row in data:
        # Coerce textual timestamps once, storing the numeric value in the row.
        for key in ("start", "end"):
            if type(row[key]) is str:
                row[key] = float(row[key])
        row_end = row["end"]
        if row["start"] >= start and row_end <= end:
            selected.append(row)
            continue
        if row_end > end:
            # This row already reaches past the window: stop scanning.
            break
    return selected

def load_tsv(filename):
    """Read a tab-separated timing file into a list of row dicts.

    Each non-blank line is stripped of surrounding whitespace and split on
    tabs; the first two fields are parsed as floats and the third is kept as
    the word. Blank lines are skipped instead of raising ``ValueError``.

    Args:
        filename: Path of the file to read (anything ``open`` accepts).

    Returns:
        List of ``{"start": float, "end": float, "word": str}`` dicts in file
        order.

    Raises:
        ValueError: If one of the first two fields is not a valid float.
        IndexError: If a non-blank line has fewer than three fields.
    """
    data = []
    with open(filename) as inf:
        # Iterate the file object directly instead of loading every line
        # into memory with readlines().
        for line in inf:
            stripped = line.strip()
            if not stripped:
                # A blank line carries no record; skip it.
                continue
            parts = stripped.split("\t")
            data.append({
                "start": float(parts[0]),
                "end": float(parts[1]),
                "word": parts[2]
            })
    return data

def norm_spaces(text):
    """Strip surrounding whitespace and collapse runs of 2+ spaces into one.

    Only the space character is collapsed; interior tabs and newlines are
    left as they are.
    """
    trimmed = text.strip()
    return re.sub("  +", " ", trimmed)

def clean_text(text):
    """Normalise ``text`` for comparison.

    Spaces are normalised with ``norm_spaces``; each space-separated word is
    then lower-cased and stripped of leading/trailing ``.,;?!`` characters,
    and the words are joined back with single spaces.
    """
    words = norm_spaces(text).split(" ")
    return " ".join(word.lower().strip(".,;?!") for word in words)

def get_indices(needle, haystack, checkpos=True):
    """Find word-level occurrences of ``needle`` inside ``haystack``.

    Both strings are split on single spaces and lower-cased. Needle words are
    stripped of ``,?.;:()`` at their edges, haystack words of ``,?.;:``.

    Args:
        needle: Phrase to look for.
        haystack: Text to search in.
        checkpos: When true, also accept a match in which the last needle
            word is followed by ``'s`` in the haystack.

    Returns:
        List of ``(start, end)`` word-index pairs (end exclusive), in order
        of occurrence.
    """
    needle_words = [word.lower().strip(",?.;:()") for word in needle.split(" ")]
    haystack_words = [word.lower().strip(",?.;:") for word in haystack.split(" ")]
    # Variant of the needle whose final word carries a possessive "'s".
    possessive_words = needle_words[:-1] + [f"{needle_words[-1]}'s"]
    width = len(needle_words)

    def is_match(window):
        # Plain match first; the possessive form only counts if requested.
        return window == needle_words or (checkpos and window == possessive_words)

    return [
        (pos, pos + width)
        for pos in range(len(haystack_words))
        if is_match(haystack_words[pos:pos + width])
    ]

In [None]:
def get_tsv_for_segment(segment, tsv_data, filename=None, segment_id=None):
    """Return the TSV word rows that line up with a segment's snippet.

    The rows of ``tsv_data`` lying inside the segment's time span are selected
    with ``slice_tsv_data``. If their joined words differ from
    ``segment["snippet"]``, both texts are normalised with ``clean_text2`` and
    the rows are narrowed to the single run of words matching the snippet.

    Args:
        segment: Mapping with a ``"general"`` mapping holding ``"start"`` and
            ``"end"``, and a ``"snippet"`` string.
        tsv_data: Rows as accepted by ``slice_tsv_data``; each row used here
            has a ``"word"`` entry.
        filename: Optional key into the module-level ``manual_segments``
            mapping; also used to label mismatch reports.
        segment_id: Optional key looked up in ``manual_segments[filename]``;
            also used to label mismatch reports.

    Returns:
        The list of matching rows, or an empty list (after printing a report)
        when the normalised snippet does not occur in the normalised TSV text.

    Raises:
        AssertionError: If required keys are missing, if the snippet does not
            match exactly one run of words, or if the narrowed rows do not
            reproduce the snippet after normalisation.
    """
    assert "general" in segment, "Missing key 'general'"
    assert "start" in segment["general"], "Missing key 'start'"
    assert "end" in segment["general"], "Missing key 'end'"

    start = segment["general"]["start"]
    end = segment["general"]["end"]

    tsv = slice_tsv_data(tsv_data, start, end)
    tsv_words = " ".join([x["word"] for x in tsv])

    # Segments listed in manual_segments are returned as sliced, without any
    # text verification. The guard uses `is not None` (matching the report
    # branch below) so that falsy-but-valid ids such as 0 are still looked up.
    # NOTE(review): manual_segments is not defined in the visible part of the
    # notebook; it must exist before this function is called.
    if (
        filename is not None
        and segment_id is not None
        and filename in manual_segments
        and segment_id in manual_segments[filename]
    ):
        return tsv

    if segment["snippet"] != tsv_words:
        # NOTE(review): clean_text2 is not defined in the visible part of the
        # notebook; it is assumed to be defined elsewhere.
        cleaned_snippet = clean_text2(segment["snippet"])
        cleaned_text = clean_text2(tsv_words)

        if cleaned_snippet not in cleaned_text:
            if filename is not None and segment_id is not None:
                print(f"{filename}\t{segment_id}\t{segment['snippet']}\t{tsv_words}")
            else:
                print("🙀 mismatch:", "🖇️", segment["snippet"], "🎧", tsv_words, cleaned_text.find(cleaned_snippet))
            return []
        else:
            idxes = get_indices(cleaned_snippet, cleaned_text)
            assert len(idxes) == 1, f"expected exactly one match, got {len(idxes)}: {idxes}"
            # The word indices are applied directly to the rows; this assumes
            # one row per space-separated word of cleaned_text — TODO confirm.
            tsv = tsv[idxes[0][0]:idxes[0][1]]
            tsv_words = " ".join([x["word"] for x in tsv])
            # Re-normalise with the same function that produced
            # cleaned_snippet, so the equality check compares like with like.
            cleaned_text = clean_text2(tsv_words)
            assert cleaned_snippet == cleaned_text, f"🖇️ {cleaned_snippet} 🎧 {cleaned_text}"
    return tsv