In [None]:
import tarfile
import urllib.request
import os
import codecs

In [None]:
import zipfile

In [None]:
class Annotation(object):
    """One typed span of text inside a document.

    Attributes:
        start_index: start offset of the span; -1 until assigned.
        end_index: end offset of the span; -1 until assigned.
        type: annotation type name; empty until assigned.
        spanned_text: the text covered by the span; empty until assigned.
    """

    def __init__(self, start_index=-1, end_index=-1, anno_type='', spanned_text=''):
        """Create an annotation; every argument is optional.

        Calling ``Annotation()`` with no arguments yields the same
        "unset" placeholder values as before (-1 offsets, empty strings).

        Args:
            start_index: start offset of the span.
            end_index: end offset of the span.
            anno_type: annotation type name (stored as ``self.type``).
            spanned_text: text covered by the span.
        """
        self.start_index = start_index
        self.end_index = end_index
        # the parameter is called anno_type only to avoid shadowing the builtin
        self.type = anno_type
        self.spanned_text = spanned_text

    def __repr__(self):
        """Return a readable representation for debugging/notebook display."""
        return '{0}(type={1!r}, start_index={2!r}, end_index={3!r}, spanned_text={4!r})'.format(
            type(self).__name__,
            self.type,
            self.start_index,
            self.end_index,
            self.spanned_text,
        )

class AnnotatedDocument(object):
    """A document's text together with its annotations and a document label.

    Attributes:
        text: the document text; empty until assigned.
        annotations: list of annotation objects attached to the document.
        positive_label: document-level label; -1 until assigned.
    """

    def __init__(self, text='', annotations=None, positive_label=-1):
        """Create a document; every argument is optional.

        Calling ``AnnotatedDocument()`` with no arguments yields the same
        empty/unset state as before.

        Args:
            text: the document text.
            annotations: optional iterable of annotations; it is copied into
                a new list so instances never share list state.
            positive_label: document-level label (-1 means "not yet set").
        """
        self.text = text
        # a None default (rather than []) guarantees a fresh list per instance
        self.annotations = [] if annotations is None else list(annotations)
        self.positive_label = positive_label

    def __repr__(self):
        """Return a short representation for debugging/notebook display."""
        return '{0}(positive_label={1!r}, annotations={2})'.format(
            type(self).__name__,
            self.positive_label,
            len(self.annotations),
        )
        
def read_brat_annotations(lines):
    """Parse BRAT standoff lines into a list of Annotation objects.

    Each non-blank line is expected to look like
    ``ID[TAB]TYPE[SPACE]START_INDEX[SPACE]END_INDEX[TAB]SPANNED_TEXT``.
    Discontinuous spans (``START END;START END``) are collapsed to the
    start of the first fragment and the end of the last fragment.

    Args:
        lines: iterable of lines (str or UTF-8 encoded bytes), with or
            without trailing newlines.

    Returns:
        A list with one Annotation per non-blank line, in input order.

    Raises:
        IndexError: if a non-blank line lacks the TYPE/offset field.
        ValueError: if the offsets are not integers.
    """
    annotations = []
    for line in lines:
        # str(bytes) would give the "b'...'" repr, so decode bytes explicitly
        if isinstance(line, bytes):
            line = line.decode('utf8')
        else:
            line = str(line)

        # drop the line terminator so it does not leak into spanned_text
        line = line.rstrip('\r\n')
        if not line.strip():
            # blank lines (e.g. a trailing empty line) carry no annotation
            continue

        # at most three fields: id, "TYPE START END", spanned text
        tab_tokens = line.split('\t', 2)
        space_tokens = tab_tokens[1].split()

        # offsets may hold several ';'-separated fragments: "0 5;10 15"
        fragments = ' '.join(space_tokens[1:]).split(';')

        anno = Annotation()
        anno.type = space_tokens[0]
        anno.start_index = int(fragments[0].split()[0])
        anno.end_index = int(fragments[-1].split()[1])
        # a line without a third field has no spanned text at all
        anno.spanned_text = tab_tokens[2] if len(tab_tokens) > 2 else ''
        annotations.append(anno)
    return annotations
        
def read_annotations(archive_file, force_redownload = False, positive_type = 'DOCUMENT_PNEUMONIA_YES'):
    """Load BRAT-annotated documents from a zip archive and label them.

    The archive is cached in the current working directory under the last
    path component of ``archive_file``; it is downloaded only when that
    local file is missing or ``force_redownload`` is true.  Every ``.txt``
    member becomes a document's text and the ``.ann`` member with the same
    name (minus extension) supplies its annotations.

    Args:
        archive_file: URL of the zip archive.
        force_redownload: download again even if the local copy exists.
        positive_type: annotation type that marks a document as positive.

    Returns:
        A list of AnnotatedDocument objects, each with ``positive_label``
        set to 1 if any of its annotations has ``positive_type``, else 0.
    """
    print('Reading annotations from file : ' + archive_file)
    filename = archive_file.split('/')[-1]

    if force_redownload or not os.path.isfile(filename):
        print('Downloading remote file : '+ archive_file)
        urllib.request.urlretrieve(archive_file, filename)

    annotated_doc_map = {}

    print('Opening local file : ' + filename)
    # the context manager closes the archive even if parsing fails
    with zipfile.ZipFile(filename, "r") as z:
        for name in z.namelist():
            is_text = name.endswith('.txt')
            if not (is_text or name.endswith('.ann')):
                continue

            # strip only the extension; splitting on the first '.' would
            # merge distinct members whose path contains another dot
            basename = os.path.splitext(name)[0]
            if basename not in annotated_doc_map:
                annotated_doc_map[basename] = AnnotatedDocument()
            anno_doc = annotated_doc_map[basename]

            # handle text and BRAT annotation files (.ann) differently
            with z.open(name) as f1:
                if is_text:
                    # decode so the text is str like the annotations;
                    # undecodable bytes become U+FFFD instead of aborting
                    anno_doc.text = f1.read().decode('utf8', errors='replace')
                else:
                    # handle this as utf8 or we get back byte arrays
                    anno_doc.annotations = read_brat_annotations(codecs.iterdecode(f1, 'utf8'))

    # assign a 0 or 1 to each document based on whether the expected
    # positive annotation type is present
    for anno_doc in annotated_doc_map.values():
        has_positive = any(anno.type == positive_type for anno in anno_doc.annotations)
        anno_doc.positive_label = 1 if has_positive else 0

    # a list (not a live dict view) supports len(), iteration and indexing
    return list(annotated_doc_map.values())
    
# Load the annotated test documents from the BRAT archive.
annotated_docs = read_annotations(
    'https://github.com/burgersmoke/DeCART_2017_rulebased_NLP/raw/master/data/BRAT/BratTestArchive.zip'
)
print('Total Annotated Documents : {0}'.format(len(annotated_docs)))

# Count the documents whose label is truthy, i.e. flagged as positive.
total_positives = 0
for anno_doc in annotated_docs:
    total_positives += 1 if anno_doc.positive_label else 0

print('Total Positive Pneumonia Documents : {0}'.format(total_positives))