In [22]:
import json

In [23]:
def load_tsv(filename):
    """Read a three-column, tab-separated word-timing file.

    Every non-blank line is expected to look like ``start<TAB>end<TAB>word``.

    Args:
        filename: Path of the TSV file to read.

    Returns:
        A list of dicts, in file order, each with the keys ``"start"``
        (float), ``"end"`` (float) and ``"word"`` (str).

    Raises:
        ValueError: If a non-blank line has fewer than three fields, or if
            one of the first two fields is not a valid float.
    """
    data = []
    with open(filename) as inf:
        # Iterate over the file object directly; there is no need to load
        # every line into memory first.
        for lineno, line in enumerate(inf, start=1):
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing newline at EOF) carry no
                # data and used to crash on float("").
                continue
            parts = line.split("\t")
            if len(parts) < 3:
                # Fail with file/line context instead of a bare IndexError.
                raise ValueError(
                    f"{filename}:{lineno}: expected 3 tab-separated fields, "
                    f"got {len(parts)}: {line!r}"
                )
            data.append({
                "start": float(parts[0]),
                "end": float(parts[1]),
                "word": parts[2]
            })
    return data

In [24]:
def slice_tsv_data(data, start, end):
    """Collect the entries of ``data`` that lie fully inside ``[start, end]``.

    Entries are visited in order. Any ``"start"``/``"end"`` value stored as a
    ``str`` is converted to ``float`` in place on the entry itself. Scanning
    stops at the first entry whose end time is greater than ``end``, so
    entries after that point are neither examined nor converted.

    Args:
        data: Iterable of dicts with ``"start"`` and ``"end"`` keys.
        start: Lower bound (inclusive) for an entry's start time.
        end: Upper bound (inclusive) for an entry's end time.

    Returns:
        A list holding the selected entries (the same dict objects as in
        ``data``).
    """
    selected = []
    for entry in data:
        # Normalise string timestamps before comparing them.
        for key in ("start", "end"):
            if type(entry[key]) is str:
                entry[key] = float(entry[key])

        begins, finishes = entry["start"], entry["end"]
        if finishes > end:
            # First entry running past the window: stop scanning.
            break
        if begins >= start and finishes <= end:
            selected.append(entry)
    return selected

In [25]:
import re
def norm_spaces(text):
    """Trim surrounding whitespace and collapse runs of spaces to one space.

    Only runs of two or more literal space characters are collapsed; other
    whitespace inside the text (tabs, newlines) is left as it is.
    """
    trimmed = text.strip()
    return re.sub("  +", " ", trimmed)

In [26]:
def clean_text(text):
    """Return ``text`` lower-cased, with edge punctuation removed per word.

    The text is first passed through ``norm_spaces`` and split on single
    spaces; each word is lower-cased and stripped of leading/trailing
    ``. , ; ? !`` characters, then the words are re-joined with single spaces.
    """
    cleaned = []
    for token in norm_spaces(text).split(" "):
        cleaned.append(token.lower().strip(".,;?!"))
    return " ".join(cleaned)

In [27]:
from pathlib import Path

# Locations used by the batch cell further down:
#   TSVS - directory holding the word-level "<stem>_main.tsv" files
#   JSON - directory holding the annotation "*.json" files to process
#   OUTP - directory the processed JSON files are written to
# NOTE(review): these are absolute, machine-specific paths; adjust them
# before running the notebook anywhere else.
TSVS = Path("/Users/joregan/Playing/hsi/word_annotations/")
JSON = Path("/Users/joregan/Playing/merged_annotations/")
OUTP = Path("/Users/joregan/Playing/timed_annotations/")
# Create the output directory in one step: exist_ok avoids the separate
# check-then-create, and parents=True also creates missing parent folders.
OUTP.mkdir(parents=True, exist_ok=True)

In [41]:
def get_indices(needle, haystack, checkpos=True):
    """Find every word-level occurrence of ``needle`` inside ``haystack``.

    Both strings are split on single spaces and each word is lower-cased and
    stripped of edge punctuation before comparison.

    Args:
        needle: Phrase to look for.
        haystack: Text to search in.
        checkpos: When true, a window also counts as a match if it equals the
            needle with ``'s`` appended to its last word.

    Returns:
        A list of ``(start, end)`` word-index pairs (end exclusive), in order
        of appearance.
    """
    # NOTE(review): parentheses are stripped from the needle's words but not
    # from the haystack's words - confirm this asymmetry is intended.
    needle_words = [w.lower().strip(",?.;:()") for w in needle.split(" ")]
    hay_words = [w.lower().strip(",?.;:") for w in haystack.split(" ")]
    possessive = needle_words[:-1] + [f"{needle_words[-1]}'s"]
    width = len(needle_words)

    matches = []
    for pos, _ in enumerate(hay_words):
        window = hay_words[pos:pos + width]
        if window == needle_words or (checkpos and window == possessive):
            matches.append((pos, pos + width))
    return matches

In [38]:
def clean_text2(text):
    """Normalise ``text`` for comparison between snippet and TSV wording.

    Steps, applied per space-separated word after ``norm_spaces``:
      * lower-case and strip leading/trailing ``. , ; ? !``;
      * drop the word entirely if it is wrapped in ``[...]`` or ``{...}``;
      * replace a handful of known numerals with their spelled-out form;
      * turn any remaining ``.`` or ``,`` inside the word into a space.

    Returns:
        The surviving words joined by single spaces.
    """
    number_words = {
        "60": "sixty",
        "1": "one",
        "20th": "twentieth",
        "9th": "ninth",
        "5": "five"
    }
    kept = []
    for raw in norm_spaces(text).split(" "):
        token = raw.lower().strip(".,;?!")
        in_square = token.startswith("[") and token.endswith("]")
        in_curly = token.startswith("{") and token.endswith("}")
        if in_square or in_curly:
            # Fully bracketed tokens are left out of the comparison text.
            continue
        token = number_words.get(token, token)
        kept.append(token.replace(".", " ").replace(",", " "))
    return " ".join(kept)

In [50]:
def get_tsv_for_segment(segment, tsv_data, filename=None, segment_id=None):
    """Return the TSV word entries that correspond to ``segment``'s snippet.

    The TSV entries falling inside the segment's ``general.start`` /
    ``general.end`` window are selected with ``slice_tsv_data``. If their
    words, joined by spaces, equal the snippet exactly, they are returned
    as-is. Otherwise both texts are normalised with ``clean_text2`` and the
    snippet is located inside the TSV text; the matching sub-list of entries
    is returned.

    Args:
        segment: Dict with a ``"snippet"`` string and a ``"general"`` dict
            holding ``"start"`` and ``"end"``.
        tsv_data: Word entries as produced by ``load_tsv``.
        filename: Optional label used when reporting a mismatch.
        segment_id: Optional label used when reporting a mismatch.

    Returns:
        A list of TSV entries, or ``[]`` (after printing a report line) when
        the cleaned snippet does not occur in the cleaned TSV text.

    Raises:
        AssertionError: If required keys are missing, if the snippet is not
            found exactly once at word level, or if the selected entries do
            not reproduce the snippet after cleaning.
    """
    assert "general" in segment, "Missing key 'general'"
    assert "start" in segment["general"], "Missing key 'start'"
    assert "end" in segment["general"], "Missing key 'end'"

    start = segment["general"]["start"]
    end = segment["general"]["end"]

    tsv = slice_tsv_data(tsv_data, start, end)
    tsv_words = " ".join([x["word"] for x in tsv])

    if segment["snippet"] == tsv_words:
        # Exact match: nothing to align.
        return tsv

    cleaned_snippet = clean_text2(segment["snippet"])
    cleaned_text = clean_text2(tsv_words)

    if cleaned_snippet not in cleaned_text:
        if filename is not None and segment_id is not None:
            print(f"{filename}\t{segment_id}\t{segment['snippet']}\t{tsv_words}")
        else:
            print("🙀 mismatch:", "🖇️", segment["snippet"], "🎧", tsv_words, cleaned_text.find(cleaned_snippet))
        return []

    # NOTE(review): the indices below refer to words of the *cleaned* text and
    # are applied to the uncleaned `tsv` list; the two line up only when
    # clean_text2 neither drops nor splits a TSV word. The final assertion
    # is the safety net for that case.
    idxes = get_indices(cleaned_snippet, cleaned_text)
    assert len(idxes) == 1, (
        f"expected exactly one word-level match, found {len(idxes)}: "
        f"🖇️ {cleaned_snippet} 🎧 {cleaned_text}"
    )
    first, last = idxes[0]
    tsv = tsv[first:last]
    tsv_words = " ".join([x["word"] for x in tsv])
    # Use the same normaliser as for the snippet; comparing clean_text2 output
    # against clean_text output fails spuriously (e.g. "60" vs "sixty").
    cleaned_text = clean_text2(tsv_words)
    assert cleaned_snippet == cleaned_text, f"🖇️ {cleaned_snippet} 🎧 {cleaned_text}"
    return tsv

In [31]:
def is_skippable(segment, strict=True):
    """Decide whether ``segment`` should be left out of processing.

    Side effects on ``segment["high_level"]``:
      * a ``"current_topic"`` key is renamed to ``"topic_name"`` when
        ``"topic_name"`` is absent;
      * the topic value ``"reference_unreal"`` is rewritten to
        ``"reference_imaginary"``.

    Args:
        segment: Dict with ``"high_level"`` and ``"low_level"`` sub-dicts.
        strict: When true, ``"reference_imaginary"`` topics are skipped too.

    Returns:
        True if the topic is one of the skipped topics or the segment's
        ``resolved_references`` is an empty dict; False otherwise.
    """
    high = segment["high_level"]

    # Normalise the legacy key and topic name before testing them.
    if "topic_name" not in high and "current_topic" in high:
        high["topic_name"] = high.pop("current_topic")
    if high["topic_name"] == "reference_unreal":
        high["topic_name"] = "reference_imaginary"

    skip_topics = ["conversation_generic"]
    if strict:
        skip_topics.append("reference_imaginary")

    return (
        high["topic_name"] in skip_topics
        or segment["low_level"]["resolved_references"] == {}
    )

In [32]:
# Example: "that weird waste bin" is found at [(1, 5)] and "that" at
# [(1, 2), (8, 9)]; the span (1, 2) sits inside (1, 5).
def skip_overlapped_index(a, b):
    """Return True when span ``a`` lies entirely within span ``b``.

    Both arguments are ``(start, end)`` pairs; identical spans count as
    contained.
    """
    starts_inside = a[0] >= b[0]
    return bool(starts_inside and a[1] <= b[1])

assert skip_overlapped_index((1, 2), (1, 5)) == True
assert skip_overlapped_index((1, 5), (1, 2)) == False

In [33]:
def prune_manual_index(indices, manual):
    """Keep only the spans whose start position is listed in ``manual``.

    Args:
        indices: Sequence of ``(start, end)`` pairs.
        manual: Container of accepted start positions.

    Returns:
        A new list with the retained pairs, in their original order.
    """
    return [span for span in indices if span[0] in manual]

assert prune_manual_index([(1, 3), (5, 7)], [1]) == [(1, 3)]
assert prune_manual_index([(1, 3), (5, 7)], [1, 5]) == [(1, 3), (5, 7)]

In [34]:
def prune_dict_for_overlap(segments):
    """Remove spans that are covered by a span belonging to a different key.

    ``segments`` maps a key to a list of ``(start, end)`` spans. A span is
    dropped when ``skip_overlapped_index`` reports it as lying inside any
    span listed under another key; otherwise it is kept, in its original
    order. Spans under the same key are never compared with each other.

    The comparison uses a snapshot of the input, so the result does not
    depend on key order, on span order, or on whether other keys have an
    empty list.

    NOTE(review): identical spans listed under two different keys each count
    as covered by the other and are both dropped - confirm that is wanted.

    Args:
        segments: Dict of key -> list of ``(start, end)`` pairs. It is
            updated in place.

    Returns:
        The same ``segments`` dict, with each list replaced by its pruned
        version.
    """
    # Freeze the input first: lists pruned earlier in the loop must not
    # influence the verdict for keys handled later.
    snapshot = {key: list(spans) for key, spans in segments.items()}
    for key, spans in snapshot.items():
        rivals = [
            span
            for other, other_spans in snapshot.items()
            if other != key
            for span in other_spans
        ]
        segments[key] = [
            a for a in spans
            if not any(skip_overlapped_index(a, b) for b in rivals)
        ]
    return segments

test = {
    "1": [(1, 3), (5, 7)],
    "2": [(9, 11)],
    "3": [(1, 4)]
}
exp = {
    "1": [(5, 7)],
    "2": [(9, 11)],
    "3": [(1, 4)]
}
assert prune_dict_for_overlap(test) == exp
# A key whose competitors have no spans must keep its own spans.
assert prune_dict_for_overlap({"1": [(1, 3)], "2": []}) == {"1": [(1, 3)], "2": []}
# The outcome must not depend on the order in which keys are visited.
assert prune_dict_for_overlap(
    {"3": [(1, 4)], "1": [(1, 3), (5, 7)], "2": [(9, 11)]}
) == exp

In [46]:
def process_segment(segment, tsv_data, filename=None, segment_id=None):
    """Attach word-level timings for each resolved reference of ``segment``.

    Segments for which ``is_skippable`` returns true are left without a
    ``reference_times`` entry. Otherwise every key of
    ``low_level.resolved_references`` is located in the snippet, optionally
    restricted by ``low_level.resolved_references_indices``, pruned for
    overlap, and mapped onto the TSV entries returned by
    ``get_tsv_for_segment``. The result is stored in
    ``segment["low_level"]["reference_times"]``.

    Args:
        segment: Segment dict; modified in place.
        tsv_data: Word entries as produced by ``load_tsv``.
        filename: Passed through to ``get_tsv_for_segment`` for reporting.
        segment_id: Passed through to ``get_tsv_for_segment`` for reporting.

    Returns:
        None.
    """
    if is_skippable(segment):
        return

    tsv = get_tsv_for_segment(segment, tsv_data, filename, segment_id)
    low_level = segment["low_level"]
    references = low_level["resolved_references"]
    manual_idx = low_level.get("resolved_references_indices", {})

    # Word positions of every reference text inside the snippet.
    indices = {}
    for ref in references:
        found = get_indices(ref, segment["snippet"])
        if ref in manual_idx:
            found = prune_manual_index(found, manual_idx[ref])
        indices[ref] = found
    indices = prune_dict_for_overlap(indices)

    reftimes = []
    for ref in references:
        for first, last in indices[ref]:
            words = tsv[first:last]
            if not words:
                # No TSV entries at these positions (e.g. mismatch above).
                continue
            reftimes.append({
                "start": words[0]["start"],
                "end": words[-1]["end"],
                "text": ref,
                "reference": references[ref]
            })
    low_level["reference_times"] = reftimes

In [51]:
# Batch driver: for each "*.json" file in JSON, load the annotations and the
# matching word-level TSV, run process_segment on every entry, and write the
# updated annotations to a file of the same name in OUTP.
# NOTE: the loop variables (jsonfile, base, data, rawtsv, ...) stay defined at
# notebook scope after the loop; a later cell prints `jsonfile`.
for jsonfile in JSON.glob("*.json"):
    base = jsonfile.stem
    with open(jsonfile) as jsf:
        data = json.load(jsf)
    # The TSV for "<stem>.json" is looked up as "<stem>_main.tsv" inside TSVS.
    rawtsv = load_tsv(str(TSVS / f"{base}_main.tsv"))
    outfile = OUTP / f"{base}.json"
    # `data` is iterated by key; presumably a mapping of segment id -> segment.
    for seg in data:
        # process_segment modifies data[seg] in place.
        process_segment(data[seg], rawtsv, base, seg)
    with open(str(outfile), 'w') as f:
        json.dump(data, f, indent=2)

hsi_6_0718_227_002	63	Like if someone wants to walk around your, your, your sofa,it's, it's going to be in their way, right?	Like if someone wants to walk around your, your, your sofa, it's, it's gonna be in their way, right?
hsi_5_0718_209_003	7	The desk, you can't have that here because it's ruining the whole atmosphere.	The desk, you you can't have that here because it's eh ruining the whole atmosphere.
hsi_6_0718_222_001	47	uh that that	
hsi_6_0718_222_001	87	it's	
hsi_5_0718_210_001	17	eh, So it's brought here by a friend as just a little, little cutling.	eh, So it's brought here by a friend as just a little, little cutting..
hsi_5_0718_210_001	18	And now it looks like that.	And now it's looks like that.
hsi_5_0718_210_001	49	Because, you know, every time I was with my... We didn't watch much TV in those days because I'm 60 years old.	Because, you know, every time I was with my... We didn't watch much eh TV in those days because I'm sixty years old.
hsi_5_0718_210_001	114	They're 

In [37]:
# Debugging aid: show which file `jsonfile` was last bound to by the batch
# loop. Depends on that loop having been run in this kernel.
print(jsonfile)

/Users/joregan/Playing/merged_annotations/hsi_6_0718_227_001.json


In [None]:
import json 
import csv      
    
def update_json_snippets_from_csv(json_path, csv_path, output_path):
    """Overwrite JSON snippets with text from a tab-separated file.

    The tab-separated file supplies ``start``, ``end`` and ``snippet``
    columns. Rows that do not have exactly three fields, or whose times are
    not valid floats, are ignored. A JSON entry gets its ``"snippet"``
    replaced when its ``general.start`` / ``general.end`` match a row's times
    after both are rounded to three decimals. If several rows share the same
    rounded times, the last one wins.

    Args:
        json_path: Path of the JSON file to read (a dict of entries).
        csv_path: Path of the tab-separated file with replacement snippets.
        output_path: Path the updated JSON is written to.

    Returns:
        None. A confirmation line is printed once the file is written.
    """
    # NOTE(review): csv.reader runs with its default quoting rules, so a
    # field that begins with a double quote is parsed as a quoted field -
    # confirm that suits the input files.
    with open(json_path, 'r') as json_file:
        annotations = json.load(json_file)

    # Replacement text keyed by (rounded start, rounded end).
    replacements = {}
    with open(csv_path, 'r') as tsv_file:
        for fields in csv.reader(tsv_file, delimiter='\t'):
            if len(fields) != 3:
                continue
            raw_start, raw_end, text = fields
            try:
                key = (round(float(raw_start), 3), round(float(raw_end), 3))
            except ValueError:
                # Non-numeric times (e.g. a header row): skip the row.
                continue
            replacements[key] = text

    for item in annotations.values():
        span = item['general']
        key = (round(span['start'], 3), round(span['end'], 3))
        if key in replacements:
            item['snippet'] = replacements[key]

    with open(output_path, 'w') as out_file:
        json.dump(annotations, out_file, indent=2)

    print(f"Updated JSON saved to: {output_path}")