# Gather labeled data
Gather labeled data from a LabelStudio export and write it to a file in `split/labeled`.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from datetime import datetime
import json
import os
import re

import pandas as pd
from pprint import pprint

from models.split_utils import get_annotations, get_paragraph_splits

In [None]:
# configure
# -- source: directory and file name of the LabelStudio export to read
input_dir = '../data/split/from_labelstudio/'
in_filename = '2023-03-17.json'

# -- destination: output directory plus a YYYY-MM-DD stamp of the current date
output_dir = '../data/split/labeled/'
today = f"{datetime.today():%Y-%m-%d}"

# NOTE(review): not referenced by the other cells visible here -- confirm it is still needed
ngram_size = 5

In [None]:
# read data exported from LabelStudio
# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, so non-ASCII text loads the same way on every machine
# (this also matches the UTF-8 encoding used when the results are written).
with open(os.path.join(input_dir, in_filename), encoding='utf-8') as f:
    # `data` holds the parsed JSON document; later cells iterate over it.
    data = json.load(f)

In [None]:
# Build one output record per exported instance: talk, section and the
# paragraph splits derived from that instance's annotations.
labeled_data = []
for instance in data:
    try:
        parsed = get_annotations(instance['annotations'])
        splits = get_paragraph_splits(parsed, instance['data']['text'])
        entry = dict(
            talk=instance['data']['talk'],
            # a falsy section (e.g. None or '') is stored as '-'
            section=instance['data']['section'] or '-',
            paragraphs=splits,
        )
        labeled_data.append(entry)
    except ValueError as err:
        # Best effort: report the instance that failed and continue with the rest.
        print(instance['data']['talk'], instance['data']['section'], err, '\n')
# notebook output: number of instances that were processed successfully
len(labeled_data)

In [None]:
# Show the first labeled record as a quick sanity check of its structure.
labeled_data[0]

In [None]:
# Save the labeled records as indented UTF-8 JSON in a file named after
# today's date; non-ASCII characters are written as-is, not \u-escaped.
filename = os.path.join(output_dir, f"{today}.json")
with open(filename, mode='w', encoding='utf-8') as out_file:
    json.dump(
        labeled_data,
        out_file,
        indent=2,
        ensure_ascii=False,
    )