In [146]:
def slurpfile(filename) -> str:
    """Return the entire text of ``filename`` with surrounding whitespace removed.

    The file is opened in text mode with the platform default encoding and is
    closed before the stripped contents are returned.
    """
    with open(filename) as handle:
        contents = handle.read()
    return contents.strip()

In [147]:
# Base URL of the REST API. Endpoint paths are appended directly by f-string,
# so the trailing slash is required.
host = "http://130.237.3.107:8080/api/"
# The API token is read from a local file (path relative to the working
# directory) instead of being written into the notebook.
api_token = slurpfile("label_studio_whisperx")
# Directory containing the transcript JSON files to upload. File names are
# appended directly by f-string, so the trailing slash is required.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_dir = "/Users/joregan/Playing/hsi/audio/whisperx-json/"

In [35]:
import requests
import json
from pathlib import Path

# Shared request headers: every API call authenticates with "Token <api_token>".
headers = {
    "Authorization": f"Token {api_token}"
}

In [11]:
def get_projects():
    """Fetch the project listing from the API.

    Sends ``GET {host}projects`` with the shared ``headers`` and returns the
    decoded JSON body. Raises ``AssertionError`` if the status code is not 200.
    """
    response = requests.get(f"{host}projects", headers=headers)
    assert response.status_code == 200
    return json.loads(response.text)

In [21]:
def get_project_id_from_name(name):
    """Look up a project's id by its title.

    Both the stored title and ``name`` are compared with surrounding
    whitespace stripped. Returns the ``"id"`` of the first matching entry in
    the ``"results"`` list from ``get_projects()``, or ``None`` if no title
    matches.
    """
    matching_ids = (
        project["id"]
        for project in get_projects()["results"]
        if project["title"].strip() == name.strip()
    )
    return next(matching_ids, None)

In [148]:
# Look up the numeric id of the project titled "Main 5".
get_project_id_from_name("Main 5")

6

In [31]:
def get_tasks(projectid):
    """Fetch the task listing for one project.

    Sends ``GET {host}tasks`` with ``project=<projectid>`` as a query parameter
    and the shared ``headers``, and returns the decoded JSON body. Raises
    ``AssertionError`` if the status code is not 200.
    """
    query = {"project": projectid}
    response = requests.get(f"{host}tasks", headers=headers, params=query)
    assert response.status_code == 200
    return json.loads(response.text)

In [45]:
def index_task_filestem_to_id(tasks_data):
    """Map each task's audio file name to its task id.

    Args:
        tasks_data: Decoded task-listing response. Must contain a ``"tasks"``
            list of task dicts.

    Returns:
        dict mapping the last ``/``-separated component of the task's path
        (the file name, extension included) to the task's ``"id"``. If two
        tasks share a file name, the later one in the list wins.

    The path is taken from ``task["storage_filename"]`` when that holds a
    non-empty value and otherwise from ``task["data"]["audio"]``. Tasks for
    which neither yields a path are skipped.
    """
    mapping = {}
    for task in tasks_data["tasks"]:
        # Fall back to the audio path whenever storage_filename is missing *or*
        # null/empty: a present-but-null storage_filename must not hide a
        # usable data["audio"] value.
        task_raw_path = task.get("storage_filename") or (task.get("data") or {}).get("audio")
        if not task_raw_path:
            continue
        # Keep only the file name; the extension is retained on purpose because
        # callers match on names such as "xyz.wav".
        mapping[task_raw_path.split("/")[-1]] = task["id"]
    return mapping

In [22]:
# Look up the numeric id of the project titled "Speaker 3".
get_project_id_from_name("Speaker 3")

1

In [149]:
# Fetch the task listing for project 6 (the id returned for "Main 5" earlier in this notebook).
tasks = get_tasks(6)

In [150]:
# Build the audio-file-name -> task-id lookup from the fetched task listing.
mapping = index_task_filestem_to_id(tasks)

In [151]:
# Display the lookup table for a visual sanity check.
mapping

{'hsi_5_0718_209_001_main.wav': 84,
 'hsi_5_0718_209_002_main.wav': 85,
 'hsi_5_0718_209_003_main.wav': 86,
 'hsi_5_0718_210_001_main.wav': 87,
 'hsi_5_0718_210_002_main.wav': 88,
 'hsi_5_0718_210_003_main.wav': 89,
 'hsi_5_0718_211_002_main.wav': 90,
 'hsi_5_0718_211_003_main.wav': 91,
 'hsi_5_0718_222_002_main.wav': 92,
 'hsi_5_0718_222_003_main.wav': 93,
 'hsi_5_0718_227_001_main.wav': 94,
 'hsi_5_0718_227_002_main.wav': 95}

ID: 71

In [153]:
import json

def convert_json(filename, include_labels=False):
    """Convert a segment-level transcript JSON file into annotation results.

    Args:
        filename: Path to a JSON file with a top-level ``"segments"`` list.
            Each segment must provide ``"start"``, ``"end"`` and ``"text"``.
        include_labels: When true, a ``"Speech"`` ``labels`` region with the
            same start/end is emitted immediately before each transcription
            entry. Defaults to ``False``, which yields only the ``textarea``
            entries.

    Returns:
        list of result dicts, all with ``"to_name": "audio"``: one
        ``"transcription"`` textarea per segment (text stripped of surrounding
        whitespace), each optionally preceded by a ``"labels"`` region.

    Raises:
        KeyError: If ``"segments"`` or a required segment field is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # text encoding when reading it.
    with open(filename, encoding="utf-8") as inf:
        data = json.load(inf)

    outputs = []
    for segment in data["segments"]:
        start = segment["start"]
        end = segment["end"]
        text = segment["text"]

        if include_labels:
            outputs.append({
                "value": {
                    "start": start,
                    "end": end,
                    "channel": 0,
                    "labels": ["Speech"]
                },
                "from_name": "labels",
                "to_name": "audio",
                "type": "labels",
            })
        outputs.append({
            "value": {
                "start": start,
                "end": end,
                "channel": 0,
                "text": [text.strip()]
            },
            "from_name": "transcription",
            "to_name": "audio",
            "type": "textarea",
        })

    return outputs

In [96]:
def post_results(id, task, project, results):
    """PATCH an existing annotation so that its ``result`` becomes ``results``.

    Args:
        id: Id of the annotation to update (part of the URL path).
        task: Task id, sent as the ``taskID`` query parameter.
        project: Project id, sent as a query parameter and in the JSON body.
        results: List of result dicts stored under ``"result"`` in the body.

    Returns:
        The response object from ``requests.patch``. The status code is not
        checked here.
    """
    url = f"{host}annotations/{id}/?taskID={task}&project={project}"

    # Work on a copy so the shared module-level headers are not mutated.
    request_headers = dict(headers)
    request_headers["Content-type"] = "application/json"

    payload = json.dumps({
        "was_cancelled": False,
        "ground_truth": False,
        "project": project,
        "draft_id": 0,
        "parent_prediction": None,
        "parent_annotation": None,
        "result": results
    })
    return requests.patch(url, data=payload, headers=request_headers)

In [129]:
# Convert one transcript file; the result is uploaded by the next cell.
file = f"{input_dir}hsi_3_0715_209_006_main.json"
data = convert_json(file)

In [130]:
# Arguments are (annotation id, task id, project id, results). Project 1 is the
# id returned for "Speaker 3" earlier in this notebook; annotation 98 and
# task 22 are hard-coded.
# NOTE(review): this PATCH modifies data on the server — confirm the ids before re-running.
r = post_results(98, 22, 1, data)
# Print the raw response body to inspect what the server returned.
print(r.text)

{"id":98,"result":[{"value":{"start":9.994,"end":10.839,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":9.994,"end":10.839,"channel":0,"text":["Comprehensible."]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"start":57.443,"end":58.064,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":57.443,"end":58.064,"channel":0,"text":["Yeah, further."]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"start":58.264,"end":58.444,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":58.264,"end":58.444,"channel":0,"text":["Yeah."]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"start":73.675,"end":74.576,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":73.675,"end":74.576,"channel":0,"text":

In [152]:
# Upload the converted transcript for every file in `mapping` to project 6.
# NOTE(review): assumes the target annotations have consecutive ids starting at
# 179, in the same order in which `mapping` iterates — confirm before re-running,
# since each PATCH modifies data on the server.
count = 179
for task in mapping:
    # The transcript shares the audio file's name, with .json instead of .wav.
    jsonfile = task.replace(".wav", ".json")
    file = f"{input_dir}{jsonfile}"
    data = convert_json(file)
    # Arguments are (annotation id, task id, project id, results).
    r = post_results(count, mapping[task], 6, data)
    count += 1
    print(r.text)

{"id":179,"result":[{"value":{"start":8.359,"end":10.862,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":8.359,"end":10.862,"channel":0,"text":["Yeah, but we can't walk around in the entire room."]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"start":13.226,"end":13.466,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":13.226,"end":13.466,"channel":0,"text":["Yeah."]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"start":14.167,"end":14.808,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start":14.167,"end":14.808,"channel":0,"text":["Oh, there you are."]},"from_name":"transcription","to_name":"audio","type":"textarea"},{"value":{"start":14.888,"end":15.228,"channel":0,"labels":["Speech"]},"from_name":"labels","to_name":"audio","type":"labels"},{"value":{"start"

In [136]:
# Assign consecutive numbers starting at 99 to the file names in `mapping`,
# in iteration order.
# NOTE(review): presumably these are annotation ids for an earlier batch; `tmap`
# is not read by any of the cells shown here — confirm it is still needed.
tmap = {}
count = 99
for task in mapping:
    tmap[task] = count
    count += 1


In [140]:
from pathlib import Path

# For each *.TextGrid file in the directory, print the matching .wav name and
# its task id when that name is a key of `mapping`.
# NOTE(review): absolute, machine-specific path. The recorded output lists
# "_inter.wav" names, so it was presumably produced with a different `mapping`
# than the one displayed earlier in this notebook — confirm.
for file in Path("/Users/joregan/Playing/hsi_ctmedit/textgrid").glob("*.TextGrid"):
    wavfile = file.stem + ".wav"
    if wavfile in mapping:
        print(wavfile, mapping[wavfile])

hsi_7_0719_227_002_inter.wav 69
hsi_7_0719_209_003_inter.wav 61
hsi_7_0719_227_003_inter.wav 70
hsi_7_0719_211_004_inter.wav 66
hsi_6_0718_209_001_inter.wav 59
hsi_7_0719_222_002_inter.wav 67
hsi_7_0719_209_001_inter.wav 60
hsi_7_0719_222_004_inter.wav 68
hsi_7_0719_211_002_inter.wav 65
hsi_7_0719_210_002_inter.wav 63
hsi_7_0719_210_003_inter.wav 64
hsi_5_0718_209_001_inter.wav 56
hsi_5_0718_209_003_inter.wav 58
hsi_7_0719_210_001_inter.wav 62
hsi_5_0718_209_002_inter.wav 57
