In [154]:
import os
import re

def split_span(s):
    """Tokenize ``s`` and report where each token sits inside the string.

    A token is a maximal run of word characters, a maximal run of characters
    that are neither word characters nor whitespace, or a maximal run of
    whitespace. Whitespace runs are returned as tokens too.

    Returns a list of ``(token_text, start, end)`` tuples in order of
    appearance; ``start``/``end`` are the match offsets into ``s`` (``end`` is
    exclusive, as reported by the ``re`` match object).
    """
    pattern = r"[\w\d]+|[^\w\d\s]+|\s+"
    return [(m.group(0), m.start(), m.end()) for m in re.finditer(pattern, s)]

In [155]:
# Directory whose entries are listed and read as the raw document texts.
# The value is concatenated directly with a file name, so it must end with '/'.
path_documents_text = '../text/'
# Directory from which '<document_id>.tsv' is opened for every document.
# Also concatenated directly with the file name, so it must end with '/'.
path_mer_annotations = '../mer_annotations/annotations/'

In [156]:
# Convert every document plus its MER annotation file into a WebAnno TSV 3 file
# written to 'annotations/<document_id>.tsv'.
document_ids = os.listdir(path_documents_text)

for document_id in document_ids:
    # Newlines are swapped for single spaces (a one-for-one character
    # replacement), so character offsets are unchanged while the whole document
    # fits on the single '#Text=' line written below.
    with open(path_documents_text + document_id) as f:
        document_text = f.read().replace('\n', ' ')

    # Each annotation line is tab-separated: begin offset, end offset and a
    # third column that is kept as-is (it is not used further down).
    # `sorted` over a generator works on both Python 2 and Python 3, whereas
    # `map(...).sort()` fails on Python 3 because `map` returns an iterator.
    # Blank lines are skipped instead of crashing on `int('')`.
    with open(path_mer_annotations + document_id + '.tsv') as f:
        mer_annotations = sorted(
            [int(fields[0]), int(fields[1]), fields[2]]
            for fields in (line.strip().split('\t') for line in f if line.strip())
        )

    # Split text into tokens and indexes
    splitted_text = split_span(document_text)

    # Mark the tokens that are named entities. See the WebAnno documentation for how the WebAnno TSV 3 format works:
    # https://webanno.github.io/webanno/releases/3.2.0/docs/user-guide.html#sect_webannotsv
    # A token is labelled when it lies entirely inside the annotation span; a
    # token inside several annotations accumulates one label per annotation.
    for i, annotation in enumerate(mer_annotations):
        annotation_index_begin = annotation[0]
        annotation_index_end = annotation[1]

        for j, token in enumerate(splitted_text):
            token_index_begin = token[1]
            token_index_end = token[2]

            if token_index_begin >= annotation_index_begin and token_index_end <= annotation_index_end:
                splitted_text[j] += ('HPO[{}]'.format(i),)

    # If the token does not correspond to any named entity, mark it with an underscore
    for j, token in enumerate(splitted_text):
        if len(token) == 3:
            splitted_text[j] += ('_',)

    # Prepare WebAnno TSV 3 annotation lines to write to file
    webanno_annotations = []
    for j, token in enumerate(splitted_text):
        text = token[0]
        index_begin = token[1]
        index_end = token[2]
        entity_type = '|'.join(token[3:])

        # Whitespace tokens are not written. split_span yields whole whitespace
        # runs, so comparing against a single ' ' would let '  ' or a tab
        # through (and a tab would corrupt the TSV columns).
        if text.isspace():
            continue

        # The token number is the position in the full token list (whitespace
        # included), so skipped whitespace tokens leave gaps in the numbering.
        webanno_annotations.append(
            '1-{}\t{}-{}\t{}\t{}\n'.format(j + 1, index_begin, index_end, text, entity_type)
        )

    # Write to file (the 'annotations/' directory must already exist)
    with open('annotations/{}.tsv'.format(document_id), 'w') as webanno_file:
        webanno_file.write('#FORMAT=WebAnno TSV 3\n')
        webanno_file.write('#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|value\n\n\n')

        webanno_file.write('#Text={}\n'.format(document_text))

        for webanno_annotation in webanno_annotations:
            webanno_file.write(webanno_annotation)