In [1]:
def slurpfile(filename) -> str:
    """Return the whole text of ``filename`` with surrounding whitespace removed."""
    with open(filename) as handle:
        contents = handle.read()
    return contents.strip()

In [2]:
# Input directory (wrapped in a Path as `timecode_path`); presumably holds the
# timecode CSV files read by get_timecode_offsets — TODO confirm.
timecode_dir = "/Users/joregan/timecode_cut"
# Output directory (wrapped in a Path as `output_path`); presumably where the
# cut TextGrids are written — TODO confirm.
output_dir = "/tmp/textgrid_cut"
# Base URL of the annotation API; get_annotation appends "annotations/<id>".
host = "http://130.237.3.107:8080/api/"
# API token, read from the local file "label_studio_mine" so that it is not
# hard-coded here; it is placed in the Authorization request header.
api_token: str = slurpfile("label_studio_mine")

In [3]:
from pathlib import Path

# Path objects for the configured input and output directories.
timecode_path = Path(timecode_dir)
output_path = Path(output_dir)

In [4]:
def get_timecode_offsets(filename):
    """Return the first and last "Time (Seconds)" values of a timecode CSV.

    The file must start with the header ``,Frame,Time (Seconds),TimeCode``;
    the value in the third column of the first and of the last data row is
    parsed as a float.

    Args:
        filename: path of the CSV file, as ``str`` or ``Path``.

    Returns:
        A ``(start, end)`` tuple of floats.

    Raises:
        AssertionError: if the file is empty or the header is not the
            expected one.
        ValueError: if there are no data rows, or a time value is not a
            valid float.
    """
    # isinstance rather than `type(...) == Path`: Path(...) instances are
    # PosixPath/WindowsPath, so the exact-type comparison never matched.
    if isinstance(filename, Path):
        filename = str(filename)

    with open(filename) as inf:
        # Drop blank lines so that extra empty lines at the end of the file
        # are not mistaken for the last data row.
        lines = [stripped for stripped in (raw.strip() for raw in inf) if stripped]

    assert lines and lines[0] == ",Frame,Time (Seconds),TimeCode", f"CSV file ({filename}) seems to be incorrect"
    if len(lines) < 2:
        raise ValueError(f"CSV file ({filename}) has a header but no data rows")

    start = float(lines[1].split(",")[2])
    end = float(lines[-1].split(",")[2])
    return start, end

In [5]:
import requests
import json
from pathlib import Path

# Request headers passed to requests.get in get_annotation: token
# authentication built from `api_token`.
headers = {
    "Authorization": f"Token {api_token}"
}

In [None]:
# Ids of the annotations to process; presumably the values handed to
# get_annotation — TODO confirm.
IDS = [
    264,
]

In [10]:
def get_annotation(annot_it, timeout=30.0):
    """Fetch one annotation from the API and return its decoded JSON body.

    Args:
        annot_it: id of the annotation; interpolated into the endpoint URL
            ``{host}annotations/{annot_it}``.
        timeout: seconds passed to ``requests.get`` as its ``timeout``, so an
            unresponsive server cannot block the notebook forever.

    Returns:
        The response body parsed from JSON.

    Raises:
        AssertionError: if the server does not answer with HTTP 200.
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    ep = f"{host}annotations/{annot_it}"
    req = requests.get(ep, headers=headers, timeout=timeout)
    # Include the URL and status in the failure so it can be diagnosed.
    assert req.status_code == 200, f"GET {ep} returned HTTP {req.status_code}"
    return json.loads(req.text)

In [15]:
# Fetch the first configured annotation; the id comes from IDS instead of
# being repeated here as a literal.
data = get_annotation(IDS[0])

In [20]:
# Maps region id -> a single merged result entry.
combined = {}

# The same region id can appear several times in data["result"] (e.g. one
# entry carrying "text" and another carrying "labels"); fold them together.
for res in data.get("result", []):
    region_id = res["id"]
    if region_id not in combined:
        # Copy the entry and its "value" dict so that the merge below does not
        # modify `data` in place.
        combined[region_id] = {**res, "value": dict(res["value"])}
        continue

    merged_value = combined[region_id]["value"]
    # Take over every key that is present; an if/elif here would drop "labels"
    # whenever the same entry also carried "text".
    for key in ("text", "labels"):
        if key in res["value"]:
            merged_value[key] = res["value"][key]

In [22]:
# (start, end, text) tuples for every region labelled "Speech".
# NOTE(review): the loop previously computed these values without storing
# them; adjust the tuple shape if downstream code expects something else.
results = []

# `combined` maps region id -> entry, so iterate over the values; iterating
# the dict itself yields the id strings, which cannot be indexed by "value".
for item in combined.values():
    val = item["value"]
    if "labels" not in val:
        print("NO LABELS", item)
        continue
    if "Speech" not in val["labels"]:
        print("NO SPEECH", item)
        continue

    texts = val.get("text", [])
    if isinstance(texts, str):
        texts = [texts]
    # Entries wrapped in slashes appear to be IPA-style transcriptions
    # (e.g. "/fˈɪŋɡɚz./"); keep only the plain-text ones.
    plain = [t for t in texts if not (t.startswith("/") and t.endswith("/"))]
    if not plain:
        print("NO TEXT", item)
        continue

    # When several plain-text entries exist, the last one wins.
    results.append((val["start"], val["end"], plain[-1]))

{'KmYVSVjAS5': {'original_length': 341.376,
  'value': {'start': 16.924884235915957,
   'end': 18.024884235915955,
   'channel': 0,
   'text': ['What looks strange?', '/wˌʌts lˈʊks tɹˈeɪndʒ?/'],
   'labels': ['Speech']},
  'id': 'KmYVSVjAS5',
  'from_name': 'transcription',
  'to_name': 'audio',
  'type': 'textarea',
  'origin': 'manual'},
 'roPZSFDWDP': {'original_length': 341.376,
  'value': {'start': 19.19713218349019,
   'end': 19.69713218349019,
   'channel': 0,
   'text': ['Fingers.', '/fˈɪŋɡɚz./'],
   'labels': ['Speech']},
  'id': 'roPZSFDWDP',
  'from_name': 'transcription',
  'to_name': 'audio',
  'type': 'textarea',
  'origin': 'manual'},
 'TmbbCc257S': {'original_length': 341.376,
  'value': {'start': 25.524884235915955,
   'end': 28.43302462910922,
   'channel': 0,
   'text': ["But they're all stuck though. I mean the things are there.",
    '/bˈʌ ðɛɹ ˈɔːl stˈʌk ðˌoʊ. aɪ mˈiːn ðə θˈɪŋz əɹ ðˈɛɹ./'],
   'labels': ['Speech']},
  'id': 'TmbbCc257S',
  'from_name': 'transcripti

In [19]:

# Peek at the first two raw result entries: the same region id can appear in
# separate "transcription" and "labels" entries.
data["result"][:2]

[{'original_length': 341.376,
  'value': {'start': 16.924884235915957,
   'end': 18.024884235915955,
   'channel': 0,
   'text': ['What looks strange?', '/wˌʌts lˈʊks tɹˈeɪndʒ?/']},
  'id': 'KmYVSVjAS5',
  'from_name': 'transcription',
  'to_name': 'audio',
  'type': 'textarea',
  'origin': 'manual'},
 {'original_length': 341.376,
  'value': {'start': 16.924884235915957,
   'end': 18.024884235915955,
   'channel': 0,
   'labels': ['Speech']},
  'id': 'KmYVSVjAS5',
  'from_name': 'labels',
  'to_name': 'audio',
  'type': 'labels',
  'origin': 'manual'}]

In [40]:
from praatio import textgrid
from praatio.utilities.constants import Interval

def modify_textgrid(filename, start, end, output=None):
    """Cut a TextGrid down to the window [start, end] and rebase it to zero.

    Every tier is copied into a new TextGrid. Entries overlapping the window
    are kept, clipped to the window, and shifted so that ``start`` becomes
    0.0; entries entirely outside the window are dropped. A warning is
    printed whenever a non-empty entry has to be clipped.

    Assumes every tier holds ``(start, end, label)`` interval entries —
    TODO confirm that point tiers never occur in the input.

    Args:
        filename: path (``str`` or ``Path``) of the TextGrid to read.
        start: start of the window, in the TextGrid's time units.
        end: end of the window, same units; must be greater than ``start``.
        output: path to write to; when omitted, ``filename`` is overwritten.

    Raises:
        ValueError: if ``end`` is not greater than ``start``.
    """
    # isinstance rather than `type(...) == Path`: Path(...) instances are
    # PosixPath/WindowsPath, so the exact-type comparison never matched.
    if isinstance(filename, Path):
        filename = str(filename)
    if not output:
        output = filename
    elif isinstance(output, Path):
        output = str(output)

    if end <= start:
        raise ValueError(f"end ({end}) must be greater than start ({start})")
    duration = end - start

    tg = textgrid.openTextgrid(filename, True)
    new_tg = textgrid.Textgrid()
    for tiername in tg.tierNames:
        tier = tg.getTier(tiername)
        new_entries = []
        for entry in tier.entries:
            e_start = entry[0]
            e_end = entry[1]
            e_text = entry[2]

            # Clip the entry to the window; nothing left means no overlap
            # (this also covers entries that merely touch a window edge).
            clipped_start = max(e_start, start)
            clipped_end = min(e_end, end)
            if clipped_end <= clipped_start:
                continue

            was_clipped = clipped_start != e_start or clipped_end != e_end
            if was_clipped and e_text != "":
                print("Warning: truncating entry", filename, tiername, entry)

            # Rebase so that the window starts at 0.0.
            new_entries.append(
                Interval(clipped_start - start, clipped_end - start, e_text)
            )

        if new_entries:
            tier_start = new_entries[0][0]
            tier_end = new_entries[-1][1]
        else:
            # No entry overlaps the window: keep the tier, spanning the whole
            # window, instead of failing on new_entries[0].
            tier_start = 0.0
            tier_end = duration
        new_tier = textgrid.IntervalTier(tiername, new_entries, tier_start, tier_end)
        new_tg.addTier(new_tier)

    new_tg.save(output, format="long_textgrid", includeBlankSpaces=True)