In [1]:
import requests

# Endpoints queried by this notebook: one for documents, one for annotations.
# The annotation query asks for results sorted by `_createdOn`.
# NOTE(review): `limit=0` presumably means "no limit" on this API — confirm.
document_url = "https://annotator.jakobkoehler.de/jsonbox/documents/?limit=0"
annotation_url = "https://annotator.jakobkoehler.de/jsonbox/annotations/?sort=_createdOn&limit=0"

def get(url):
    """Fetch ``url`` with HTTP basic auth and return the decoded JSON body.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server stops responding for 60 seconds.
    """
    # NOTE(review): the credentials are hard-coded in the notebook; consider
    # loading them from the environment before sharing this file.
    # Without an explicit timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, auth=('jsonbox', 'i2pe2019'), timeout=60)
    # Fail loudly on an error status instead of treating an error payload as data.
    response.raise_for_status()
    return response.json()

# Download both collections as parsed JSON.
documents_data = get(document_url)
annotations_data = get(annotation_url)

In [2]:
from collections import defaultdict

# Group the annotation records by the id of the document they refer to.
annotation_map = defaultdict(list)
for annotation in annotations_data:
    doc_id = annotation['documentId']
    annotation_map[doc_id].append(annotation)

# Pair each document's text with the 'annotations' payload of every record
# stored for it (an empty list when the document has no records).
documents = []
for doc_record in documents_data:
    documents.append({
        'text': doc_record['text'],
        'annotations': [
            record['annotations']
            for record in annotation_map[doc_record['_id']]
        ],
    })

In [3]:
def get_annotation_vector(text, annotations):
    """Build a 0/1 label for every space-separated token of ``text``.

    A token is labelled 1 when at least one annotation span touches it.

    Args:
        text: The document string; tokens are ``text.split(' ')``.
        annotations: Iterable of dicts carrying ``'start'`` (a character
            offset into ``text``) and ``'text'`` (the annotated substring).

    Returns:
        A list of ints (0 or 1) with one entry per token of ``text``.
    """
    num_tokens = len(text.split(' '))
    annotation_vector = [0] * num_tokens
    for annotation in annotations:
        span = annotation['text']
        # Ignore spaces at the edges of the span so they are not counted
        # as extra tokens (which could run past the end of the vector).
        leading_spaces = len(span) - len(span.lstrip(' '))
        span = span.strip(' ')
        if not span:
            continue
        begin = annotation['start'] + leading_spaces
        # Under split(' '), the index of the token holding character p is
        # exactly the number of spaces before p. This stays correct for
        # repeated spaces, newlines and spans that start mid-word.
        first = text[:begin].count(' ')
        # Clamp so a span that does not match the text cannot index out of range.
        last = min(first + span.count(' '), num_tokens - 1)
        for i in range(first, last + 1):
            annotation_vector[i] = 1
    return annotation_vector

In [4]:
# Keep only the documents that carry exactly two annotation entries.
fully_annotated_documents = list(
    filter(lambda doc: len(doc['annotations']) == 2, documents)
)
len(fully_annotated_documents)

100

In [5]:
# Tokenise every fully annotated document and turn each of its two
# annotation entries into a per-token 0/1 vector (H0 and H1).
documents_train = []
for source_doc in fully_annotated_documents:
    raw_text = source_doc['text']
    documents_train.append({
        'text': raw_text.split(' '),
        'H0': get_annotation_vector(raw_text, source_doc['annotations'][0]),
        'H1': get_annotation_vector(raw_text, source_doc['annotations'][1]),
    })

# Overwrite H1 with the element-wise union of H0 and H1.
for document in documents_train:
    document['H1'] = [
        int(bool(sum(pair)))
        for pair in zip(document['H0'], document['H1'])
    ]

In [6]:
import json
# Write the training documents to a JSON file named after their count.
train_export_path = f'documents_{len(documents_train)}_annotated.json'
with open(train_export_path, 'w') as out_file:
    json.dump(documents_train, out_file)

In [11]:
# Load the previous export from disk.
with open('documents_96_annotated.json', 'r') as previous_file:
    documents_train_old = json.load(previous_file)

# Fingerprint each previously exported document by its first 100 tokens
# joined with single spaces; the result is a set of those strings.
presence_map = set(
    map(lambda old_doc: ' '.join(old_doc['text'][:100]), documents_train_old)
)

In [13]:
# Keep the training documents whose 100-token fingerprint is absent from
# the set built from the previous export.
documents_new = list(
    filter(
        lambda doc: ' '.join(doc['text'][:100]) not in presence_map,
        documents_train,
    )
)

In [14]:
# Number of training documents that were not found in the previous export.
len(documents_new)

4

In [15]:
# Write the newly found documents to their own JSON file. The count in the
# filename is derived from the data (as the export of documents_train does)
# so the name cannot go stale when the fetched data changes.
new_export_path = f'documents_{len(documents_new)}_annotated.json'
with open(new_export_path, 'w') as f:
    json.dump(documents_new, f)