In [None]:
'''
Given brat output from TEES, generate split.txt, split.ann, split.rel.ann, split.ner.ann
Input: xml, a1 files, a2 files
'''

import xml.etree.ElementTree as ET

# file = '/Users/kurt/Desktop/dataset/tees/EPI11/EPI11-devel.xml'
# output_dir = "/Users/kurt/Desktop/dataset/tees/EPI11/dev/"


def generate_split_txt(predxml, output_dir):
    '''
    Write one <origId>.split.txt file per document found in a TEES XML file.

    Every ``sentence`` element of a document contributes two lines to the
    output file: its ``charOffset`` rewritten from "start-end" to
    "start:end", followed by the ``text`` of its tokens joined by single
    spaces.

    predxml    -- XML source handed to ``ET.parse``
    output_dir -- prefix concatenated verbatim with the file name, so it
                  must already end with a path separator

    Documents sharing an ``origId`` (or sentences sharing a ``charOffset``
    inside one document) overwrite each other; the last one wins.
    '''
    root = ET.parse(predxml).getroot()

    # Retrieve the tokenised sentences and store them per document id.
    files = dict()
    for doc in root.iter('document'):
        sentences = dict()
        for sentence in doc.iter('sentence'):
            span = sentence.get('charOffset').split("-")
            boundary = span[0] + ":" + span[1]
            tokens = sentence.find('analyses').find('tokenization').findall('token')
            sentences[boundary] = [token.get('text') for token in tokens]
        files[doc.get('origId')] = sentences

    # Write the sentences; the context manager closes each file even when
    # a write fails part-way through.
    for filename, sentences in files.items():
        with open(output_dir + filename + ".split.txt", 'w') as filewrite:
            for boundary, words in sentences.items():
                filewrite.write(boundary + "\n")
                filewrite.write(' '.join(words) + "\n")
# generate_split_txt(file, output_dir)

In [6]:
import os
def generate_split_ann(input_dir):
    '''
    Generate a <id>.split.ann file for every <id>.a1 file in input_dir.

    Each output file contains, in this order:
      1. the lines of <id>.a1, unchanged;
      2. the lines of <id>.a2, where ids on "T" lines and the trigger id on
         "E" lines are rewritten to start with "TR" instead of "T";
      3. one "R<n>" relation line per event argument, of the form
         R<n><TAB><role> Arg1:<trigger> Arg2:<argument>

    input_dir is concatenated verbatim with the file names, so it must end
    with a path separator. A missing <id>.a2 file raises the error from
    ``open``; an argument naming an event that is not defined in the a2
    file raises KeyError.

    NOTE(review): every "T" line of the a2 file is renamed to "TR", but
    non-event arguments that point at such a line keep their "T" prefix
    (both on the "E" lines and in the relations) -- confirm that this is
    what the consumers of the .split.ann files expect.
    '''
    def read_lines(path):
        # Context manager so the handle is released even if reading fails.
        with open(path, 'r') as handle:
            return handle.readlines()

    def as_trigger_id(entity_id):
        '''
        Replace the leading "T" by "TR": T12 -> TR12
        '''
        return "TR" + entity_id[1:]

    def extract_trigger(line):
        '''
        Trigger id of an event line: "E1<TAB>Type:T5 Theme:T1" -> "T5"
        '''
        return line.split()[1].split(":")[1]

    def extract_relations(a2lines):
        '''
        Return one (trigger, role, argument) tuple per event argument.
        An argument that is itself an event is replaced by the trigger of
        that event.
        '''
        # Mapping between event id and trigger id, e.g. E1 -> T5
        event_triggers = {
            line.split()[0]: extract_trigger(line)
            for line in a2lines
            if line.startswith("E")
        }
        relations = []
        for line in a2lines:
            if not line.startswith("E"):
                continue
            trigger = as_trigger_id(extract_trigger(line))
            for argument in line.split()[2:]:
                pieces = argument.split(":")
                role, arg = pieces[0], pieces[1]
                if arg.startswith("E"):
                    arg = as_trigger_id(event_triggers[arg])
                relations.append((trigger, role, arg))
        return relations

    def form_relations(relations):
        '''
        Format the relation lines, removing the numbering in roles.
        '''
        rels = []
        for rel_id, (trigger, role, arg) in enumerate(relations, 1):
            # Strip the whole numeric suffix, so Theme10 -> Theme rather
            # than Theme1.
            role = role.rstrip("0123456789")
            rels.append("R" + str(rel_id) + "\t" + role
                        + " Arg1:" + trigger + " Arg2:" + arg + "\n")
        return rels

    def reformat_a2(a2lines):
        '''
        Change the ids of "T" lines and the trigger ids of "E" lines to
        start with "TR"; every returned line ends with exactly one newline.
        '''
        reformatted = []
        for line in a2lines:
            if line.startswith("T"):
                line = "TR" + line[1:]
            elif line.startswith("E"):
                fields = line.split()
                trigword, trigger = fields[1].split(":")[:2]
                head = fields[0] + "\t" + trigword + ":" + as_trigger_id(trigger)
                line = " ".join([head] + fields[2:])
            reformatted.append(line.rstrip() + "\n")
        return reformatted

    for file in os.listdir(input_dir):
        if not file.endswith(".a1"):
            continue
        file_id = file[:-3]
        a1lines = read_lines(input_dir + file_id + ".a1")
        a2lines = read_lines(input_dir + file_id + ".a2")
        # Build everything before opening the output, so a malformed a2
        # file does not leave a half-written .split.ann behind.
        event_lines = reformat_a2(a2lines)
        relation_lines = form_relations(extract_relations(a2lines))
        with open(input_dir + file_id + ".split.ann", 'w') as annfile:
            # Entities first; a last line without a newline must not run
            # into the first a2 line.
            for line in a1lines:
                annfile.write(line if line.endswith("\n") else line + "\n")
            annfile.writelines(event_lines)
            annfile.writelines(relation_lines)
# Notebook driver: build the .split.ann files for the directory below.
# generate_split_ann concatenates this path with the file names as-is,
# hence the trailing "/".
input_dir = "/Users/kurt/Desktop/sbnn_ace/data/bratace/train/"            
generate_split_ann(input_dir)

In [2]:
'''
Generate split.rel.ann and split.ner.ann given split.ann
'''
def generate_ner_rel_ann(input_dir):
    '''
    Split every <id>.split.ann file in input_dir into two files:
      <id>.split.ner.ann -- the lines starting with "T"
      <id>.split.rel.ann -- the lines starting with "R"
    All other lines are dropped. Both files are created even when they
    end up empty.

    input_dir is concatenated verbatim with the file names, so it must end
    with a path separator.
    '''
    suffix = ".split.ann"
    for file in os.listdir(input_dir):
        if not file.endswith(suffix):
            continue
        file_id = file[:-len(suffix)]
        # Context managers close all three handles, also on errors.
        with open(input_dir + file, 'r') as annfile:
            lines = annfile.readlines()
        with open(input_dir + file_id + ".split.ner.ann", 'w') as nerfile, \
                open(input_dir + file_id + ".split.rel.ann", 'w') as relfile:
            nerfile.writelines(line for line in lines if line.startswith("T"))
            relfile.writelines(line for line in lines if line.startswith("R"))
# Notebook driver: split the .split.ann files of the directory below into
# .split.ner.ann and .split.rel.ann. The path is concatenated with the
# file names as-is, hence the trailing "/".
input_dir = "/Users/kurt/Desktop/dataset/tees/CG13/devpred/"                             
generate_ner_rel_ann(input_dir)

In [None]:
# Generate all split files for the gold datasets: every corpus below is
# processed once per partition.
rootdir = '/Users/kurt/Desktop/dataset/tees/'
datasets = ['EPI11', 'GE09', 'GE11', 'GE13', 'ID11', 'PC13']
parts = ['train', 'dev', 'test']

for d in datasets:
    for p in parts:
        # <rootdir><corpus>/<corpus>-<part>.xml is the input XML,
        # <rootdir><corpus>/<part>/ the working directory.
        xml = '{0}{1}/{1}-{2}.xml'.format(rootdir, d, p)
        temp = '{0}{1}/{2}/'.format(rootdir, d, p)
        # Progress output: XML path, then the directory on the next line;
        # the line is completed by "...finished." below.
        print(xml + "\n" + temp, end='')
        # split.txt from the XML
        generate_split_txt(xml, temp)
        # split.ann from the a1 and a2 files
        generate_split_ann(temp)
        # ner.ann and rel.ann from the split.ann files
        generate_ner_rel_ann(temp)
        print("...finished.")

In [None]:
#generate for preds
'''
1. generate split.txt (comment out the split_ann and ner_rel_ann calls)
2. run the xml conversion to generate the a1 and a2 files
3. generate split.ann, ner.ann and rel.ann (comment out the split_txt call)

NOTE: for permission denied error
sudo chmod -R 777 .

'''
# Prediction run: one edge-prediction XML per corpus, only the partitions
# listed in `parts`.
rootdir = '/Users/kurt/Desktop/dataset/tees/preds/'
datasets = ['EPI11', 'GE09', 'GE11', 'GE13', 'ID11', 'PC13', 'CG13']
parts = ['test']

for d in datasets:
    for p in parts:
        xml = '{0}{1}/{1}-edge-pred.xml'.format(rootdir, d)
        temp = '{0}{1}/{2}/'.format(rootdir, d, p)
        # Progress output: XML path, then the directory on the next line;
        # the line is completed by "...finished." below.
        print(xml + "\n" + temp, end='')
        # Currently disabled: split.txt generation from the XML.
#         generate_split_txt(xml, temp)
        # split.ann from the a1 and a2 files
        generate_split_ann(temp)
        # ner.ann and rel.ann from the split.ann files
        generate_ner_rel_ann(temp)
        print("...finished.")


In [None]:
from datetime import datetime
# Scratch cell: take two timestamps back to back and render the elapsed
# time between them as a string.
one = datetime.now()
two = datetime.now()
dif = two - one  # difference of the two timestamps
s = str(dif)
s  # bare expression; as the last statement of a notebook cell its value is displayed