In [22]:
def slurpfile(filename) -> str:
    """Return the full text of ``filename`` with surrounding whitespace removed.

    The file is opened in text mode with the platform default encoding and is
    closed before the stripped contents are returned.
    """
    with open(filename) as handle:
        contents = handle.read()
    return contents.strip()

In [23]:
# Base URL that the request helpers in this notebook prefix to endpoint names
# (e.g. f"{host}projects"), so it must keep its trailing slash.
host = "http://130.237.3.107:8080/api/"
# API token read from a local file so that it is not written into the notebook.
api_token: str = slurpfile("label_studio_mine")
# Directory holding the TextGrid files; later cells concatenate file names
# directly onto it, so it must also end with a slash.
input_dir = "/Users/joregan/Playing/hsi_ctmedit/textgrid/"

In [6]:
import requests
import json
from pathlib import Path

# Authorization header shared by the request helpers in this notebook;
# the value is the literal word "Token" followed by api_token.
headers = {
    "Authorization": f"Token {api_token}"
}

In [7]:
def get_projects():
    """Fetch the project listing from the API.

    Sends a GET request to ``{host}projects`` with the module-level
    ``headers`` and returns the decoded JSON body.

    Raises:
        AssertionError: if the response status code is not 200.
    """
    response = requests.get(f"{host}projects", headers=headers)
    assert response.status_code == 200
    return json.loads(response.text)

In [8]:
def get_project_id_from_name(name):
    """Look up a project's id by its title.

    Both the project titles and ``name`` are compared after stripping
    surrounding whitespace.

    Returns:
        The ``"id"`` of the first entry in ``get_projects()["results"]`` whose
        title matches, or ``None`` when no title matches.
    """
    matching_ids = (
        project["id"]
        for project in get_projects()["results"]
        if project["title"].strip() == name.strip()
    )
    return next(matching_ids, None)

In [36]:
# Resolve the id of the project titled "Main 6"; the cell displays the result.
get_project_id_from_name("Main 6")

7

In [10]:
def get_tasks(projectid):
    """Fetch the task listing for one project.

    Sends a GET request to ``{host}tasks`` with ``projectid`` passed as the
    ``project`` query parameter and returns the decoded JSON body.

    Raises:
        AssertionError: if the response status code is not 200.
    """
    query = {"project": projectid}
    response = requests.get(f"{host}tasks", headers=headers, params=query)
    assert response.status_code == 200
    return json.loads(response.text)

In [11]:
def index_task_filestem_to_id(tasks_data):
    """Build a lookup from each task's audio file name to its task id.

    For every entry of ``tasks_data["tasks"]`` the source path is taken from
    ``storage_filename`` when that field holds a non-empty value, and from
    ``data["audio"]`` otherwise.  The key stored in the result is the last
    ``/``-separated component of that path (extension included).

    Args:
        tasks_data: dict with a ``"tasks"`` list of task dicts, in the shape
            this notebook gets back from ``get_tasks``.

    Returns:
        dict mapping file name to task ``"id"``.  Tasks with no usable path
        are left out; if two tasks share a file name the later one wins.
    """
    mapping = {}
    for task in tasks_data["tasks"]:
        # A storage_filename that is present but None/empty must not hide a
        # usable data["audio"] path, so fall back instead of dropping the task.
        task_raw_path = task.get("storage_filename") or (task.get("data") or {}).get("audio")
        if not task_raw_path:
            continue
        mapping[task_raw_path.split("/")[-1]] = task["id"]
    return mapping

In [22]:
# Resolve the id of the project titled "Speaker 3"; the cell displays the result.
get_project_id_from_name("Speaker 3")

1

In [37]:
# Fetch the task listing for project id 7 (the id displayed for "Main 6" in an earlier cell).
tasks = get_tasks(7)

In [38]:
# Audio file name -> task id lookup for the tasks fetched above.
mapping = index_task_filestem_to_id(tasks)

In [39]:
# Display the lookup to check it by eye.
mapping

{'hsi_6_0718_209_001_main.wav': 96,
 'hsi_6_0718_209_002_main.wav': 97,
 'hsi_6_0718_209_003_main.wav': 98,
 'hsi_6_0718_210_001_main.wav': 99,
 'hsi_6_0718_210_002_main.wav': 100,
 'hsi_6_0718_211_001_main.wav': 101,
 'hsi_6_0718_211_002_main.wav': 102,
 'hsi_6_0718_222_001_main.wav': 103,
 'hsi_6_0718_222_003_main.wav': 104,
 'hsi_6_0718_227_001_main.wav': 105,
 'hsi_6_0718_227_002_main.wav': 106}

In [12]:
import json
from praatio import textgrid

# Tag names grouped by the annotation label each one is mapped to.
# tg_to_result looks these up after removing the enclosing square brackets.
noises = [
    "smack", "spn", "mic_click", "labial_trill", "sniff", "click",
    "vocal_clicks", "throat_noise", "vocal_noise", "lip_trill", "lip_noise",
    "noise", "flapping_noises", "cough", "grumble", "skip", "creak", "s",
]
breath = [
    "breath", "inhale", "exhale", "sigh",
    "breath_noise", "breath_noises", "blow", "suck",
]
laugh = ["laugh", "laughter"]

# tag -> label table; groups are inserted in the order Noise, Laughter, Breath.
labels = {}
for category, members in (("Noise", noises), ("Laughter", laugh), ("Breath", breath)):
    for noise in members:
        labels[noise] = category

def tg_to_result(tgfile):
    """Convert the labelled intervals of a TextGrid into a flat list of result dicts.

    The "utterances" tier is used when the TextGrid has one, otherwise the
    "words" tier.  Entries whose stripped label is empty are skipped.  Every
    remaining entry yields two consecutive dicts covering the same start/end
    span on channel 0: a "labels" item carrying the category and a "textarea"
    item carrying the text.

    Category rules, applied to the stripped label:
      * ends with "crosstalk]"  -> "Cross-talk"; enclosing brackets are
        removed from the text if the label also starts with "[".
      * "[tag]" where tag is a key of the module-level ``labels`` table
        -> ``labels[tag]``, and the text becomes the bare tag.
      * anything else -> "Speech", text unchanged.

    Args:
        tgfile: path handed to ``textgrid.openTextgrid``.

    Returns:
        list of dicts, two per retained entry, in tier order.
    """
    # Second positional argument is False as in the original call
    # (presumably praatio's includeEmptyIntervals — confirm against the installed version).
    grid = textgrid.openTextgrid(tgfile, False)
    chosen_tier = "utterances" if "utterances" in grid.tierNames else "words"

    results = []
    for interval in grid.getTier(chosen_tier).entries:
        content = interval.label.strip()
        if not content:
            continue

        bracketed = content.startswith("[") and content.endswith("]")
        inner = content[1:-1]
        if content.endswith("crosstalk]"):
            category = "Cross-talk"
            if bracketed:
                content = inner
        elif bracketed and inner in labels:
            category = labels[inner]
            content = inner
        else:
            category = "Speech"

        # Both items describe the same region, so they share these fields.
        span = {"start": interval.start, "end": interval.end, "channel": 0}
        results.append({
            "value": {**span, "labels": [category]},
            "from_name": "labels",
            "to_name": "audio",
            "type": "labels",
        })
        results.append({
            "value": {**span, "text": [content]},
            "from_name": "transcription",
            "to_name": "audio",
            "type": "textarea",
        })

    return results

In [13]:
def post_results(id, task, project, results):
    """Send ``results`` to an existing annotation with an HTTP PATCH.

    Args:
        id: annotation id; becomes part of the URL path.
        task: task id, sent as the ``taskID`` query parameter.
        project: project id, sent both as a query parameter and in the body.
        results: value stored under the ``"result"`` key of the JSON body.

    Returns:
        The ``requests`` response object.  The status code is not checked
        here, so callers must inspect it themselves.
    """
    url = f"{host}annotations/{id}/?taskID={task}&project={project}"

    # Copy the shared headers so that the module-level dict is not modified.
    request_headers = {**headers, "Content-type": "application/json"}

    payload = json.dumps({
        "was_cancelled": False,
        "ground_truth": False,
        "project": project,
        "draft_id": 0,
        "parent_prediction": None,
        "parent_annotation": None,
        "result": results,
    })
    return requests.patch(url, data=payload, headers=request_headers)

In [41]:
# Convert a single TextGrid from input_dir into the result list.
file = f"{input_dir}hsi_6_0718_211_001_main.TextGrid"
data = tg_to_result(file)

In [42]:
# PATCH annotation 229 (task 101, project 7) with the converted results.
# NOTE(review): all three ids are hard-coded; task 101 matches the
# hsi_6_0718_211_001_main.wav entry in the mapping displayed earlier.
r = post_results(229, 101, 7, data)
# Print the raw response body to check what was stored.
print(r.text)

{"id":229,"result":[{"value":{"start":10.504962206088036,"end":11.022981308483644,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":10.504962206088036,"end":11.022981308483644,"channel":0,"text":["Or..."]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"start":14.416670556229224,"end":15.485915113738105,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":14.416670556229224,"end":15.485915113738105,"channel":0,"text":["Yeah, yeah, yeah."]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"start":15.738283394392376,"end":16.243019955700916,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":15.738283394392376,"end":16.243019955700916,"channel":0,"text":["Ah yeah"]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"start":16.317896740469397,"end"

In [40]:
from pathlib import Path

# List the tasks for which a TextGrid with the matching name exists in input_dir.
for fn in mapping:
    # The TextGrid shares the audio file's name, with the extension swapped.
    tgfile = fn.replace(".wav", ".TextGrid")
    if (Path(input_dir) / tgfile).exists():
        print(mapping[fn], fn)


101 hsi_6_0718_211_001_main.wav


In [35]:
# Annotation id used for the first PATCH; incremented after every posted file.
# NOTE(review): this presumes the annotation ids are consecutive and follow
# the iteration order of `mapping` — confirm before re-running.
count = 223
for task in mapping:
    # Despite the variable name, this is the TextGrid file name matching the task's .wav.
    jsonfile = task.replace(".wav", ".TextGrid")
    file = f"{input_dir}{jsonfile}"
    # Skip tasks with no TextGrid in input_dir; count is not advanced for these.
    if not (Path(input_dir) / jsonfile).exists():
        continue
    data = tg_to_result(file)
    # NOTE(review): the project id is hard-coded to 6 here.
    r = post_results(count, mapping[task], 6, data)
    count += 1
    print(r.text)

{"id":223,"result":[{"value":{"start":2.2153651153560876,"end":3.0056763100365647,"channel":0,"labels":["Noise"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":2.2153651153560876,"end":3.0056763100365647,"channel":0,"text":["cough"]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"start":8.328011942989301,"end":10.99710868893097,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":8.328011942989301,"end":10.99710868893097,"channel":0,"text":["Yeah, but we we can't walk around in the entire room."]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"start":13.188360706760257,"end":13.563592492469896,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":13.188360706760257,"end":13.563592492469896,"channel":0,"text":["Yeah."]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"st

In [136]:
# Give every key of `mapping` a consecutive number starting at 99,
# in the dict's iteration order.
tmap = {}
count = 99
for task in mapping:
    tmap[task] = count
    count += 1


In [140]:
from pathlib import Path

# Walk the TextGrid files on disk and print those that have a task in `mapping`.
# NOTE(review): the directory is spelled out literally here; it is the same
# location as input_dir, without the trailing slash.
for file in Path("/Users/joregan/Playing/hsi_ctmedit/textgrid").glob("*.TextGrid"):
    # Keys of `mapping` are audio file names, so rebuild the .wav name from the stem.
    wavfile = file.stem + ".wav"
    if wavfile in mapping:
        print(wavfile, mapping[wavfile])

hsi_7_0719_227_002_inter.wav 69
hsi_7_0719_209_003_inter.wav 61
hsi_7_0719_227_003_inter.wav 70
hsi_7_0719_211_004_inter.wav 66
hsi_6_0718_209_001_inter.wav 59
hsi_7_0719_222_002_inter.wav 67
hsi_7_0719_209_001_inter.wav 60
hsi_7_0719_222_004_inter.wav 68
hsi_7_0719_211_002_inter.wav 65
hsi_7_0719_210_002_inter.wav 63
hsi_7_0719_210_003_inter.wav 64
hsi_5_0718_209_001_inter.wav 56
hsi_5_0718_209_003_inter.wav 58
hsi_7_0719_210_001_inter.wav 62
hsi_5_0718_209_002_inter.wav 57
