In [1]:
'''
Run this first. This aggregates human annotations
and generates gold annotations.

New version that selects only parts of the union
that correspond to at least half of the phrases
that make up the union.
'''

import os
from collections import defaultdict

# Directory for annotations
directory = 'PICO-annotations/interventions_batch5k'

# Minimum number of overlapping phrases that an interval must be
# built from to be included in the gold annotations
threshold = 3

# Suffixes for the generated files
machine_suffix = '_gold.ann'
human_suffix = '_gold_human_readable.ann'

# Abstract files whose name contains this suffix are skipped
tokens_suffix = '_tokens.txt'

# When DEBUG is True, the intermediate dictionaries of the abstract
# named by DEBUG_ABSTRACT are printed.
DEBUG = False
DEBUG_ABSTRACT = '10492627'


def parse_annotation_lines(lines):
    '''
    Yield (pico_type, start, end) for every usable annotation line.

    Each line is split on whitespace; the second token is the pico type
    (e.g. 'Participants') and the third and fourth tokens are the start
    and end offsets of the phrase.

    Skipped lines:
      * blank or truncated lines (fewer than four tokens),
      * 'AnnotatorNotes' lines, which do not provide pico tags,
      * lines whose end token contains ';' (bad formatting).

    Raises ValueError if an offset on a kept line is not an integer.
    '''
    for line in lines:
        tokens = line.split()

        # Without four tokens there is no [start, end] pair to read;
        # skip the line rather than fail with an IndexError.
        if len(tokens) < 4:
            continue

        pico_type = tokens[1]

        # Sometimes this comes up. It does not provide pico tags so we skip it.
        if pico_type == 'AnnotatorNotes':
            continue

        # Bad formatting that comes up
        if ';' in tokens[3]:
            continue

        yield pico_type, int(tokens[2]), int(tokens[3])


def gather_annotations(subdir_path, abstract_index):
    '''
    Step 1: Gather all annotations of one abstract.

    Reads every '.ann' file in subdir_path whose name starts with
    abstract_index, except the gold files generated by this script.

    Returns a defaultdict with entries of the form
    ('Participants', [[34, 65], [344, 375], ...])
    where [34, 65] means a Participants phrase starting at 34 and ending
    at 65. Each list is sorted by start value.
    '''
    dictionary = defaultdict(list)

    for annotation in os.listdir(subdir_path):
        # NOTE(review): the prefix match also accepts annotation files of
        # any abstract whose index merely starts with abstract_index --
        # confirm that the indices cannot be prefixes of one another.
        if not (annotation.endswith('.ann') and annotation.startswith(abstract_index)):
            continue

        # Don't include our gold annotations!
        if machine_suffix in annotation or human_suffix in annotation:
            continue

        # The with-block closes the file even if a line fails to parse.
        with open(os.path.join(subdir_path, annotation)) as f:
            for pico_type, start, end in parse_annotation_lines(f):
                dictionary[pico_type].append([start, end])

    # Sort each list in dictionary according to start values
    for instance_list in dictionary.values():
        instance_list.sort(key=lambda pair: pair[0])

    return dictionary


def aggregate_intervals(dictionary, min_phrases):
    '''
    Step 2: Aggregate the annotations.

    Combines all [start, end] pairs that overlap into the same "interval"
    and counts the number of overlapped pairs. Intervals built from fewer
    than min_phrases phrases are dropped.

    dictionary must map each pico type to a list of [start, end] pairs
    sorted by start value.

    Returns a defaultdict with entries of the form
    ('Participants', [[20, 120, 3], [143, 165, 2], ...])
    where [20, 120, 3] means an interval starting at 20 and ending at 120
    which is built from combining 3 different phrases.
    '''
    aggregate_dict = defaultdict(list)

    for pico_type, instance_list in dictionary.items():
        # Keep track of the start and end of current interval,
        # and the number of phrases the interval is made up of
        curr_start = None
        curr_end = None
        num_phrases = 0

        for start, end in instance_list:
            if num_phrases == 0:
                # There is no current interval
                curr_start, curr_end, num_phrases = start, end, 1
            elif start < curr_end:
                # This phrase overlaps with the current interval,
                # so update the current interval.
                curr_end = max(curr_end, end)
                num_phrases += 1
            else:
                # This phrase does not overlap with the current interval,
                # so store the current interval and start over.
                if num_phrases >= min_phrases:
                    aggregate_dict[pico_type].append([curr_start, curr_end, num_phrases])

                curr_start, curr_end, num_phrases = start, end, 1

        # Store the last interval
        if num_phrases >= min_phrases:
            aggregate_dict[pico_type].append([curr_start, curr_end, num_phrases])

    return aggregate_dict


def filter_gold_intervals(dictionary, aggregate_dict):
    '''
    Step 3: Filter each interval to get gold annotations.

    For each interval in aggregate_dict that is made up of num_phrases
    phrases, select only the parts that are contained in at least
    (num_phrases + 1) // 2 phrases, i.e. half of them rounded up.

    Returns a defaultdict in the same format as aggregate_dict. A pico
    type only gets an entry if at least one gold interval was found.
    '''
    gold_dict = defaultdict(list)

    for pico_type, instance_list in dictionary.items():
        # First, build a sorted list of start and end points of all phrases
        boundaries = sorted(set(point for pair in instance_list for point in pair))
        position = dict((boundary, i) for i, boundary in enumerate(boundaries))

        # Now tally the number of times each part appears
        # ex. boundaries = [20, 25, 32, 37, 45, ...]
        # If [25, 37] appears, we add 1 to the tally of 25 and of 32.
        # Both ends of a phrase are boundaries, so the parts it covers are
        # exactly the slice between their positions.
        tally = [0] * len(boundaries)

        for start, end in instance_list:
            for i in range(position[start], position[end]):
                tally[i] += 1

        # Iterate through all intervals
        for start_interval, end_interval, num_phrases in aggregate_dict.get(pico_type, []):
            # Floor division keeps the integer result on Python 2 and 3.
            required = (num_phrases + 1) // 2

            # Keep track of the start of the current gold interval
            curr_start = None

            for i, boundary in enumerate(boundaries):
                # Consider only boundaries within the interval
                if boundary < start_interval:
                    continue
                if boundary > end_interval:
                    # boundaries is sorted, so nothing further can match
                    break

                if tally[i] >= required and boundary < end_interval:
                    # This part should be included, so set it to be the
                    # current start of the gold interval if not exist.
                    if curr_start is None:
                        curr_start = boundary
                elif curr_start is not None:
                    # This part should not be included, so store
                    # the current gold interval and reset.
                    gold_dict[pico_type].append([curr_start, boundary, num_phrases])
                    curr_start = None

    return gold_dict


def format_machine_readable(gold_dict):
    '''
    Return the gold annotations for system input, one line per pico type.
    Format: Participants 20 120 345 678 ...
    '''
    lines = []

    for pico_type, instance_list in gold_dict.items():
        spans = ''.join(str(start) + ' ' + str(end) + ' '
                        for start, end, num_phrases in instance_list)
        lines.append(pico_type + ' ' + spans + '\n')

    return ''.join(lines)


def format_human_readable(gold_dict, abstract_text):
    '''
    Return the gold annotations in human readable form.
    Format: Participants [start] [end] [num_phrases]
    [corresponding text]
    '''
    lines = []

    for pico_type, instance_list in gold_dict.items():
        for start, end, num_phrases in instance_list:
            lines.append(pico_type + ' ' + str(start) + ' ' + str(end) + ' ' + str(num_phrases) + '\n')
            # NOTE(review): offsets are applied to the text exactly as
            # open() returned it -- confirm they line up for abstracts
            # containing non-ASCII characters.
            lines.append(abstract_text[start:end] + '\n')

    return ''.join(lines)


def process_abstract(subdir_path, abstract):
    '''
    Generate both gold annotation files for one abstract.

    abstract is the file name of the abstract text ('<index>.txt') inside
    subdir_path. The results are written next to it as
    '<index>' + machine_suffix and '<index>' + human_suffix.
    '''
    abstract_index = abstract[:-4]
    show_debug = DEBUG and abstract_index == DEBUG_ABSTRACT

    dictionary = gather_annotations(subdir_path, abstract_index)
    if show_debug:
        print(dictionary)

    aggregate_dict = aggregate_intervals(dictionary, threshold)
    if show_debug:
        print(aggregate_dict)

    gold_dict = filter_gold_intervals(dictionary, aggregate_dict)
    if show_debug:
        print(gold_dict)

    '''Step 4: Write out results'''

    output_base = os.path.join(subdir_path, abstract_index)

    # Write gold annotations for system input
    with open(output_base + machine_suffix, 'w') as f:
        f.write(format_machine_readable(gold_dict))

    # Now write a human readable one. First get the abstract text
    with open(os.path.join(subdir_path, abstract)) as abstract_file:
        abstract_text = abstract_file.read()

    with open(output_base + human_suffix, 'w') as f:
        f.write(format_human_readable(gold_dict, abstract_text))


def main():
    '''Generate gold annotations for every abstract below `directory`.'''
    # For each subdirectory
    for subdir in os.listdir(directory):
        subdir_path = os.path.join(directory, subdir)

        # Not a directory
        if not os.path.isdir(subdir_path):
            continue

        # For each abstract in subdirectory
        for abstract in os.listdir(subdir_path):
            if abstract.endswith('.txt') and tokens_suffix not in abstract:
                process_abstract(subdir_path, abstract)


# Runs when executed as a script or notebook cell, but not on import.
if __name__ == '__main__':
    main()