In [6]:
from pathlib import Path
import json



In [None]:
# Root directory that is globbed for "hsi_*/*_object_analysis.json" files.
# NOTE(review): both candidates are hard-coded absolute paths; the active one
# is chosen by (un)commenting a line.
#ANALYSIS_PATH = Path("/shared/mm_conv/analysis/analysis_results/")
ANALYSIS_PATH = Path("/Users/joregan/Playing/hsi/analysis_results")

In [8]:
# Nested lookup built from the analysis files:
#   recording id -> utterance id -> list of {"phrase", "start", "end", "topic"}.
object_timings = {}
for object_file in ANALYSIS_PATH.glob("hsi_*/*_object_analysis.json"):
    # The recording id is the file stem with the "_object_analysis" suffix removed.
    file_id = object_file.stem.replace("_object_analysis", "")
    per_utterance = object_timings.setdefault(file_id, {})
    with open(object_file) as inf:
        data = json.load(inf)
    for group_key in data:
        for entry in data[group_key]:
            # Several phrases may belong to one utterance, so collect them in a list.
            per_utterance.setdefault(entry["utterance_id"], []).append({
                "phrase": entry["phrase"],
                "start": entry["timing"][0],
                "end": entry["timing"][1],
                "topic": entry["topic"]
            })

In [11]:
import re

def slice_tsv_data(data, start, end):
    """Collect the rows of ``data`` lying entirely inside ``[start, end]``.

    Rows are visited in order. String-valued "start"/"end" fields are
    converted to float *in place* on the visited rows. Scanning stops at the
    first row that is not fully inside the window and whose "end" is greater
    than ``end``; rows after it are never looked at.

    Returns the list of matching row dicts (the same objects as in ``data``).
    """
    selected = []
    for row in data:
        # Coerce textual timestamps so the comparisons below are numeric.
        for field in ("start", "end"):
            if type(row[field]) is str:
                row[field] = float(row[field])
        inside = row["start"] >= start and row["end"] <= end
        if inside:
            selected.append(row)
            continue
        if row["end"] > end:
            # Past the window: nothing later is collected.
            break
    return selected

def load_tsv(filename):
    """Read a tab-separated word-timing file into a list of dicts.

    Each non-blank line must hold at least three tab-separated fields:
    start time, end time and the word. Blank lines (for example a trailing
    empty line) are skipped instead of crashing on ``float("")``.

    Args:
        filename: path of the TSV file; it is read as UTF-8.

    Returns:
        A list of ``{"start": float, "end": float, "word": str}`` dicts in
        file order.

    Raises:
        ValueError: if a start/end field is not a valid float.
        IndexError: if a non-blank line has fewer than three fields.
    """
    rows = []
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(filename, encoding="utf-8") as inf:
        for line in inf:
            stripped = line.strip()
            if not stripped:
                continue
            parts = stripped.split("\t")
            rows.append({
                "start": float(parts[0]),
                "end": float(parts[1]),
                "word": parts[2]
            })
    return rows

def norm_spaces(text):
    """Trim ``text`` and squeeze each run of two or more spaces to one space.

    Only the space character is squeezed; tabs and newlines inside the text
    are left as they are.
    """
    trimmed = text.strip()
    squeezed = re.sub("  +", " ", trimmed)
    return squeezed

def clean_text(text):
    """Lower-case ``text`` and strip ``.,;?!`` from both ends of every word.

    Spaces are normalised with ``norm_spaces`` first; the words are then
    re-joined with single spaces.
    """
    cleaned = []
    for token in norm_spaces(text).split(" "):
        cleaned.append(token.lower().strip(".,;?!"))
    return " ".join(cleaned)

def get_indices(needle, haystack, checkpos=True):
    """Find every occurrence of the word sequence ``needle`` in ``haystack``.

    Both strings are split on single spaces (so indices line up with
    ``haystack.split(" ")``), lower-cased, and stripped of the same set of
    surrounding punctuation. Previously the needle and haystack used different
    strip sets and neither removed ``!``, so e.g. "that!" or "(bin)" in the
    haystack could never match.

    Args:
        needle: phrase to look for.
        haystack: text to search.
        checkpos: when true, also accept a match whose last word carries a
            possessive/clitic ``'s`` (e.g. needle "that" matches "that's").

    Returns:
        A list of ``(start, end)`` word-index pairs (end exclusive), in order
        of appearance. Empty if the needle is blank or does not occur.
    """
    strip_chars = ",?.;:()!"

    # A blank needle would otherwise "match" the empty tokens that double
    # spaces leave behind in the haystack.
    if not needle.strip():
        return []

    nwords = [x.lower().strip(strip_chars) for x in needle.split(" ")]
    hwords = [x.lower().strip(strip_chars) for x in haystack.split(" ")]
    nwordspos = nwords[:-1] + [f"{nwords[-1]}'s"]
    nlen = len(nwords)

    ret = []
    # Windows shorter than the needle can never match, so stop early.
    for i in range(len(hwords) - nlen + 1):
        window = hwords[i:i+nlen]
        if window == nwords or (checkpos and window == nwordspos):
            ret.append((i, i+nlen))
    return ret

def clean_text2(text):
    """Normalise ``text`` for comparison between transcript and TSV words.

    Steps, per space-separated word (after ``norm_spaces``):
      * lower-case and strip ``.,;?!`` from both ends;
      * drop words fully wrapped in ``[...]`` or ``{...}``;
      * spell out the handful of numerals listed below;
      * turn any remaining ``.`` or ``,`` inside the word into a space.

    Returns the surviving words joined with single spaces.
    """
    spelled_out = {
        "60": "sixty",
        "1": "one",
        "20th": "twentieth",
        "9th": "ninth",
        "5": "five"
    }
    kept = []
    for raw in norm_spaces(text).split(" "):
        token = raw.lower().strip(".,;?!")
        in_square = token.startswith("[") and token.endswith("]")
        in_curly = token.startswith("{") and token.endswith("}")
        if in_square or in_curly:
            # Bracketed tokens are skipped entirely.
            continue
        token = spelled_out.get(token, token)
        kept.append(token.replace(".", " ").replace(",", " "))
    return " ".join(kept)

In [12]:
MANUAL = """
hsi_5_0718_210_001	17
hsi_5_0718_210_001	18
hsi_5_0718_210_001	114
hsi_4_0717_211_003	36
hsi_4_0717_211_003	42
hsi_3_0715_210_010	89
hsi_3_0715_209_008	31
hsi_3_0715_210_011	48
hsi_4_0717_211_002	6
hsi_5_0718_210_001	49
hsi_5_0718_209_003	7
hsi_6_0718_227_002	63
hsi_5_0718_209_001	1
hsi_6_0718_210_002	102
hsi_6_0718_210_002	33
hsi_6_0718_210_002	18
hsi_6_0718_209_001	95
hsi_3_0715_209_006	18
hsi_3_0715_227_001	21
hsi_4_0717_210_001	47
hsi_3_0715_210_010	87
hsi_3_0715_210_010	15
hsi_3_0715_209_006	30
hsi_3_0715_209_006	43
hsi_6_0718_211_002	14
"""

# Group the segment ids listed in MANUAL by recording id:
#   recording id -> list of segment ids (kept as strings).
manual_segments = {}
for line in MANUAL.split("\n"):
    if line != "":
        parts = line.split("\t")
        manual_segments.setdefault(parts[0], []).append(parts[1])

In [26]:
def get_tsv_for_segment(segment, tsv_data, filename=None, segment_id=None):
    """Return the TSV word rows that correspond to ``segment``'s utterance.

    The rows inside the segment's ``phrase_start``/``phrase_end`` window are
    selected with ``slice_tsv_data``. If their words do not reproduce the
    utterance verbatim, both sides are normalised with ``clean_text2`` and the
    utterance is located inside the window; the rows are then narrowed to that
    match.

    Args:
        segment: dict with ``"utterance"`` and
            ``"timing" -> {"phrase_start", "phrase_end"}``.
        tsv_data: list of ``{"start", "end", "word"}`` rows for the recording.
        filename: optional recording id; together with ``segment_id`` it is
            looked up in ``manual_segments`` and, when listed there, the raw
            window is returned without any text check.
        segment_id: optional segment id (string, as stored in
            ``manual_segments``).

    Returns:
        The list of matching rows, or ``[]`` when the normalised utterance is
        not contained in the normalised window text (a mismatch line is
        printed in that case).

    Raises:
        AssertionError: if the utterance is not found exactly once as a word
            sequence, or if the narrowed rows do not normalise to the
            utterance.
    """
    start = segment["timing"]["phrase_start"]
    end = segment["timing"]["phrase_end"]

    tsv = slice_tsv_data(tsv_data, start, end)
    tsv_words = " ".join([x["word"] for x in tsv])
    print("TSV words", tsv_words)

    if filename and filename in manual_segments and segment_id and segment_id in manual_segments[filename]:
        return tsv

    if segment["utterance"] == tsv_words:
        return tsv

    cleaned_snippet = clean_text2(segment["utterance"])
    cleaned_text = clean_text2(tsv_words)

    if cleaned_snippet not in cleaned_text:
        if filename is not None and segment_id is not None:
            print(f"{filename}\t{segment_id}\t{segment['utterance']}\t{tsv_words}")
        else:
            print("🙀 mismatch:", "🖇️", segment["utterance"], "🎧", tsv_words, cleaned_text.find(cleaned_snippet))
        return []

    idxes = get_indices(cleaned_snippet, cleaned_text)
    assert len(idxes) == 1, f"expected exactly one match, got {idxes}"
    tsv = tsv[idxes[0][0]:idxes[0][1]]
    tsv_words = " ".join([x["word"] for x in tsv])
    # Normalise with the SAME function used for the utterance; comparing a
    # clean_text2() string against a clean_text() string fails spuriously
    # whenever numerals or bracketed tokens are involved.
    cleaned_text = clean_text2(tsv_words)
    assert cleaned_snippet == cleaned_text, f"🖇️ {cleaned_snippet} 🎧 {cleaned_text}"
    return tsv

In [16]:
# Segment metadata JSON. The shared location is kept for reference but
# commented out (it used to be assigned and then immediately overwritten).
#INPUT_FILE = "/shared/mm_conv/meta_final_set/meta_pronomial_single.json"
INPUT_FILE = "/tmp/meta_pronomial_single.json"
# Explicit encoding so the load does not depend on the platform default.
with open(INPUT_FILE, encoding="utf-8") as inf:
    data = json.load(inf)

In [20]:
# Peek at the phrase timings stored for utterance id "0" of one recording.
object_timings["hsi_4_0717_222_003"]["0"]

[{'phrase': 'that', 'start': 6.112, 'end': 6.152, 'topic': 'room'}]

In [25]:
# Show the metadata record stored under key "0" to see which fields a segment carries.
data["0"]

{'recording_id': 'hsi_4_0717_222_003',
 'segment_id': '33',
 'plurality': 'single',
 'ref_type': 'pronominal',
 'phrase': 'that',
 'utterance': "I don't think it goes very well together with the old Farmers. I don't, I don't remember what that one's called.",
 'object_id': 'Painting_eda6d5e1',
 'object_name': '',
 'image_paths': {'color': 'images/color/hsi_4_0717_222_003_33_002_color.png',
  'depth': 'images/depth/hsi_4_0717_222_003_33_002_depth.png',
  'overlay': 'images/overlay/hsi_4_0717_222_003_33_002_overlay.png',
  'mask': 'images/masks/hsi_4_0717_222_003_33_002_mask.png'},
 'topic': 'painting',
 'timing': {'phrase_start': 168.86508987625018,
  'phrase_end': 174.22052980009644}}

In [30]:
# Directory of per-recording word TSVs; files are addressed as "<recording id>_main.tsv".
# NOTE(review): hard-coded absolute local path.
TSVS = Path("/Users/joregan/Playing/hsi/word_annotations/")
base = "hsi_4_0717_222_003"
tsv_data = load_tsv(str(TSVS / f"{base}_main.tsv"))
# Word rows matched to the utterance of record "0".
a = get_tsv_for_segment(data["0"], tsv_data)

TSV words I don't think it goes very well together with the old Farmers. I don't, I don't remember what that one's called.


In [31]:
# Display the rows returned by get_tsv_for_segment for record "0".
a

[{'start': 169.005, 'end': 169.025, 'word': 'I'},
 {'start': 169.025, 'end': 169.265, 'word': "don't"},
 {'start': 169.265, 'end': 169.525, 'word': 'think'},
 {'start': 169.525, 'end': 169.665, 'word': 'it'},
 {'start': 169.665, 'end': 169.945, 'word': 'goes'},
 {'start': 169.945, 'end': 170.185, 'word': 'very'},
 {'start': 170.185, 'end': 170.405, 'word': 'well'},
 {'start': 170.405, 'end': 170.805, 'word': 'together'},
 {'start': 170.805, 'end': 170.965, 'word': 'with'},
 {'start': 170.985, 'end': 171.065, 'word': 'the'},
 {'start': 171.065, 'end': 171.485, 'word': 'old'},
 {'start': 171.485, 'end': 172.205, 'word': 'Farmers.'},
 {'start': 172.205, 'end': 172.305, 'word': 'I'},
 {'start': 172.305, 'end': 172.585, 'word': "don't,"},
 {'start': 172.585, 'end': 172.745, 'word': 'I'},
 {'start': 172.765, 'end': 172.865, 'word': "don't"},
 {'start': 172.865, 'end': 173.185, 'word': 'remember'},
 {'start': 173.185, 'end': 173.365, 'word': 'what'},
 {'start': 173.365, 'end': 173.545, 'word'

In [None]:
def find_collisions(timings, key):
    """List the phrases in ``timings`` that contain ``key`` without equalling it.

    ``key`` and every ``timing["phrase"]`` are normalised with ``clean_text2``
    before comparison; containment is tested on whole words via
    ``get_indices``. A ``timings`` list with exactly one entry cannot collide
    with anything, so it yields ``[]``.

    Returns the original (un-normalised) colliding phrases, in input order.
    """
    clean_key = clean_text2(key)
    if len(timings) == 1:
        return []

    collisions = []
    for timing in timings:
        original_phrase = timing["phrase"]
        phrase = clean_text2(original_phrase)
        # Skip the key itself; keep only longer phrases that embed it.
        if phrase != clean_key and get_indices(clean_key, phrase) != []:
            collisions.append(original_phrase)
    return collisions

In [83]:
def _span_times(tsv, span):
    """Return ``(start, end)`` times of the tsv rows in word ``span``, or None if it selects no rows."""
    rows = tsv[span[0]:span[1]]
    if not rows:
        return None
    return rows[0]["start"], rows[-1]["end"]


def _drop_shadowed(indices, collision_spans, tsv):
    """Return the spans of ``indices`` whose time range is not inside any collision span's time range."""
    collision_times = [
        times
        for times in (_span_times(tsv, span) for span in collision_spans)
        if times is not None
    ]
    kept = []
    for span in indices:
        times = _span_times(tsv, span)
        # Spans without TSV rows cannot be compared; keep them.
        shadowed = times is not None and any(
            times[0] >= c_start and times[1] <= c_end
            for c_start, c_end in collision_times
        )
        if not shadowed:
            kept.append(span)
    return kept


def process_segment(segment, mark_start = '<span style="background-color: yellow;">', mark_end = '</span>'):
    """Wrap the target phrase of ``segment`` in highlight markup.

    The phrase is located in the utterance with ``get_indices``. Occurrences
    that fall inside a longer annotated phrase of the same utterance (a
    "collision" as reported by ``find_collisions``) are discarded by comparing
    the TSV time ranges of the two word spans.

    Args:
        segment: metadata record with ``"recording_id"``, ``"segment_id"``,
            ``"phrase"``, ``"utterance"`` and ``"timing"``.
        mark_start: text inserted before the phrase.
        mark_end: text inserted after the phrase.

    Returns:
        The utterance with the phrase wrapped in ``mark_start``/``mark_end``
        when exactly one occurrence remains; otherwise diagnostic information
        is printed and ``None`` is returned.

    Reads the module-level ``object_timings`` and ``TSVS``.
    """
    rec_id = segment["recording_id"]
    seg_id = segment["segment_id"]
    this_segment = object_timings[rec_id][seg_id]
    phrase = clean_text2(segment["phrase"])

    collisions = find_collisions(this_segment, phrase)

    tsv_data = load_tsv(str(TSVS / f"{rec_id}_main.tsv"))
    tsv = get_tsv_for_segment(segment, tsv_data)

    # Timing entries for exactly this phrase; only used in the diagnostics below.
    filtered = [item for item in this_segment if clean_text2(item["phrase"]) == phrase]

    indices = get_indices(phrase, segment["utterance"])

    utt_words = segment["utterance"].split(" ")

    if collisions:
        collision_indices = []
        for collision in collisions:
            collision_indices += get_indices(collision, segment["utterance"])
        # Build a new list instead of calling indices.remove() while iterating
        # over indices: that skipped elements and raised ValueError when two
        # collision spans covered the same occurrence.
        indices = _drop_shadowed(indices, collision_indices, tsv)

    if len(indices) == 1:
        pre = " ".join(utt_words[0:indices[0][0]])
        inner = " ".join(utt_words[indices[0][0]:indices[0][1]])
        end = " ".join(utt_words[indices[0][1]:])

        inner = mark_start + inner + mark_end
        return " ".join([pre, inner, end])

    # Zero or several candidates left: dump what we know for manual inspection.
    print(utt_words)
    print([w["word"] for w in tsv])
    print(filtered)
    print(this_segment)
    for i in indices:
        print(tsv[i[0]:i[1]])
    return None


In [84]:
# Try the highlighter on record "10" with the default markup.
process_segment(data["10"])

TSV words And what about that waste bin over there? What's that doing there?


'And what about that waste bin over there? What\'s <span style="background-color: yellow;">that</span> doing there?'

In [37]:
# Scratch replay of the lookup steps for record "0": fetch the TSV rows for the
# utterance and the timing entries whose phrase equals the target phrase.
segment = data["0"]

rec_id, seg_id = segment["recording_id"], segment["segment_id"]
this_segment = object_timings[rec_id][seg_id]
phrase = clean_text2(segment["phrase"])

tsv_data = load_tsv(str(TSVS / f"{rec_id}_main.tsv"))
tsv = get_tsv_for_segment(segment, tsv_data)

filtered = [item for item in this_segment if clean_text2(item["phrase"]) == phrase]
tsv, filtered

TSV words I don't think it goes very well together with the old Farmers. I don't, I don't remember what that one's called.


([{'start': 169.005, 'end': 169.025, 'word': 'I'},
  {'start': 169.025, 'end': 169.265, 'word': "don't"},
  {'start': 169.265, 'end': 169.525, 'word': 'think'},
  {'start': 169.525, 'end': 169.665, 'word': 'it'},
  {'start': 169.665, 'end': 169.945, 'word': 'goes'},
  {'start': 169.945, 'end': 170.185, 'word': 'very'},
  {'start': 170.185, 'end': 170.405, 'word': 'well'},
  {'start': 170.405, 'end': 170.805, 'word': 'together'},
  {'start': 170.805, 'end': 170.965, 'word': 'with'},
  {'start': 170.985, 'end': 171.065, 'word': 'the'},
  {'start': 171.065, 'end': 171.485, 'word': 'old'},
  {'start': 171.485, 'end': 172.205, 'word': 'Farmers.'},
  {'start': 172.205, 'end': 172.305, 'word': 'I'},
  {'start': 172.305, 'end': 172.585, 'word': "don't,"},
  {'start': 172.585, 'end': 172.745, 'word': 'I'},
  {'start': 172.765, 'end': 172.865, 'word': "don't"},
  {'start': 172.865, 'end': 173.185, 'word': 'remember'},
  {'start': 173.185, 'end': 173.365, 'word': 'what'},
  {'start': 173.365, 'en

In [38]:
# Sanity check: a needle that occurs twice back-to-back yields one (start, end) word span per occurrence.
get_indices("the thing", "the thing the thing is in")

[(0, 2), (2, 4)]