# Extract labeled data
Extract labeled data from a LabelStudio export and write it out as paragraphs tagged with segment numbers.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from datetime import datetime
import json
import os
import pandas as pd
from pprint import pprint
import re
from typing import NamedTuple

In [None]:
# notebook settings

# run date (YYYY-MM-DD)
today = datetime.now().strftime('%Y-%m-%d')

# source: directory and file name of the LabelStudio export
input_dir = '../data/segment/from_labelstudio/'
in_filename = '2023-03-17.json'

# destination directory for the labeled output
output_dir = '../data/segment/labeled/'

# NOTE(review): not referenced by any cell visible in this notebook --
# confirm whether it is still needed
ngram_size = 5

In [None]:
class Annotation(NamedTuple):
    """A labeled span: start/end offsets into the text plus its label."""

    start: int
    end: int
    label: str


def get_annotations(annotations_in):
    """Merge LabelStudio results into start-ordered, label-homogeneous spans.

    All results from every entry of ``annotations_in`` are pooled, sorted by
    their start offset, and consecutive results that carry the same label are
    merged into a single ``Annotation``.

    Args:
        annotations_in: iterable of dicts, each with a ``'result'`` list whose
            items hold ``value.start``, ``value.end`` and ``value.labels``.

    Returns:
        List of ``Annotation`` tuples ordered by start offset.

    Raises:
        ValueError: if any result does not carry exactly one label.
    """
    # gather all results
    results = []
    for annotation in annotations_in:
        for result in annotation['result']:
            labels = result['value']['labels']
            if len(labels) != 1:
                raise ValueError('should be one label', len(labels))
            results.append(result)
    # create annotations from sorted results
    annotations = []
    start = None
    end = None
    label = None
    for result in sorted(results, key=lambda result: result['value']['start']):
        value = result['value']
        if label != value['labels'][0]:
            if label is not None:
                # save previous segment
                annotations.append(Annotation(start, end, label))
            start = value['start']
            end = value['end']
            label = value['labels'][0]
        else:
            # Same label as the running segment: extend it, but never shrink
            # it. A result nested inside (or overlapping) an earlier one can
            # end before the segment's current end.
            end = max(end, value['end'])
    if label is not None:
        # save final segment
        annotations.append(Annotation(start, end, label))
    return annotations


def get_label_for_paragraph(start, end, annotations):
    """Return the label of the first annotation that contains [start, end].

    Containment is checked with a slack of 3 on either side, so a paragraph
    whose boundaries are slightly outside the annotation still matches.

    Raises:
        ValueError: if no annotation contains the paragraph.
    """
    slack = 3  # fuzzy match in case boundaries are off by a bit
    containing = next(
        (
            candidate
            for candidate in annotations
            if candidate.start - slack <= start and end <= candidate.end + slack
        ),
        None,
    )
    if containing is None:
        raise ValueError('annotation not found', start, end, annotations)
    return containing.label

    
def get_paragraph_segments(annotations, text):
    """Split ``text`` into paragraphs and tag each with a segment number.

    Paragraphs are the pieces of ``text`` separated by runs of two or more
    newlines. Each paragraph's label is looked up from ``annotations`` by its
    offsets in ``text``; the segment counter starts at 1 and is incremented
    every time a paragraph's label differs from the previous paragraph's.

    Args:
        annotations: spans with ``start``, ``end`` and ``label`` attributes.
        text: the full text the annotation offsets refer to.

    Returns:
        List of ``{'text': <stripped paragraph>, 'segment': <int>}`` dicts in
        document order.

    Raises:
        ValueError: propagated from ``get_label_for_paragraph`` when no
            annotation covers a paragraph.
    """
    paragraph_segments = []
    start = 0
    segment = 0
    prev_label = None
    # The capturing group makes re.split return the separators as well, at the
    # odd indices, so their length can be added to the running offset.
    for ix, paragraph in enumerate(re.split(r'(\n{2,})', text)):
        if ix % 2 == 1:
            # separator: only advances the offset
            start += len(paragraph)
            continue
        # offsets are computed on the unstripped paragraph; stripping happens
        # only on the text that is emitted
        end = start + len(paragraph)
        label = get_label_for_paragraph(start, end, annotations)
        if label != prev_label:
            prev_label = label
            segment += 1
        paragraph_segments.append({
            'text': paragraph.strip(),
            'segment': segment,
        })
        start = end
    return paragraph_segments

In [None]:
# read data exported from LabelStudio
# Decode explicitly as UTF-8 (the encoding the output file is written with)
# instead of relying on the platform's default text encoding.
with open(os.path.join(input_dir, in_filename), encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# build one record per exported instance: its talk, section and the
# paragraphs tagged with segment numbers
labeled_data = []
for instance in data:
    task = instance['data']
    try:
        annotations = get_annotations(instance['annotations'])
        paragraph_segments = get_paragraph_segments(annotations, task['text'])
    except ValueError as e:
        # report the instance that could not be processed and move on
        print(task['talk'], task['section'], e, '\n')
        continue
    labeled_data.append({
        'talk': task['talk'],
        # fall back to '-' when the section is empty/None
        'section': task['section'] or '-',
        'paragraphs': paragraph_segments
    })
# number of instances successfully processed
len(labeled_data)

In [None]:
# peek at the first record as a sanity check (IndexError if nothing was labeled)
labeled_data[0]

In [None]:
# save the labeled records as JSON, in a file named after the run date
filename = os.path.join(output_dir, f"{today}.json")
with open(filename, mode='w', encoding='utf-8') as out_file:
    # ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes
    json.dump(labeled_data, out_file, indent=2, ensure_ascii=False)