# Genia Generator
For each abstract, generate the following files in the same folder as the abstract:
1. **[abstract_index]_tokens.txt**: the abstract's tokens as produced by geniatagger, separated by single spaces.
2. **[abstract_index]_genia.tag**: a pickled list of tuples (word, base_form, pos, chunk, named_entity), one per token, as produced by geniatagger.

In [None]:
import os
import pickle
import sys

from geniatagger import GeniaTagger

In [None]:
# Suffixes appended to the abstract index to name the generated files
tokens_suffix, genia_tags_suffix = '_tokens.txt', '_genia.tag'

# Root directory of the annotated corpus; its subdirectories hold the abstracts
# (alternative corpus: 'PICO-annotations/batch5k')
directory = 'bibm2011corpus-master'

# Directory for geniatagger, handed to the GeniaTagger wrapper
genia_directory = 'geniatagger-3.0.2/geniatagger'

# One tagger instance, created once and reused for every abstract
tagger = GeniaTagger(genia_directory)

In [None]:
# When True, only the single abstract named in the loop below is processed
# and every intermediate value is echoed.
DEBUG = False

# Let you view progress!
DISPLAY = True

# Number of abstracts processed
count = 0

# For each entry of the corpus directory; anything that is not a
# subdirectory is skipped.
for subdir in os.listdir(directory):
    subdir_path = os.path.join(directory, subdir)

    if not os.path.isdir(subdir_path):
        continue

    # For each abstract in subdirectory
    for abstract in os.listdir(subdir_path):
        # Inputs are the raw '*.txt' abstracts. The token files written below
        # also end in '.txt', so they are excluded to keep a re-run from
        # tagging its own output.
        if not abstract.endswith('.txt') or abstract.endswith(tokens_suffix):
            continue

        abstract_index = os.path.splitext(abstract)[0]

        if DEBUG and abstract_index != '7540705':
            continue

        # First get the abstract text; the context manager closes the handle
        # even if the read fails.
        with open(os.path.join(subdir_path, abstract)) as abstract_file:
            abstract_text = abstract_file.read()

        if DEBUG:
            sys.stdout.write('{0}\n'.format(abstract_text))

        # Now get tags from genia tagger
        # Format: [(Association, Association, NN, B-NP, O), ...]
        # Visualization here: http://nactem7.mib.man.ac.uk/geniatagger/
        genia_tags = tagger.parse(abstract_text)

        if DEBUG:
            sys.stdout.write('{0}\n'.format(genia_tags))

        # List of tokens from geniatagger (first field of every tag tuple)
        # Format: ['Randomised', ',', 'placebo-controlled', ...]
        tokens_text = ' '.join(tags[0] for tags in genia_tags)

        if DEBUG:
            sys.stdout.write('{0}\n'.format(tokens_text))

        # Both outputs share the abstract's folder and index as a prefix
        output_base = os.path.join(subdir_path, abstract_index)

        # Write genia tags to file. Pickle data is bytes, so the file is
        # opened in binary mode; read it back with open(..., 'rb').
        with open(output_base + genia_tags_suffix, 'wb') as tags_file:
            pickle.dump(genia_tags, tags_file)

        # Write tokens to file
        with open(output_base + tokens_suffix, 'w') as tokens_file:
            tokens_file.write(tokens_text)

        count += 1

        if DISPLAY:
            # Print progress: '\r' returns to the start of the line so each
            # update overwrites the previous one; flush so it shows up
            # immediately instead of waiting in the buffer.
            sys.stdout.write('\r{0}: {1}'.format(count, abstract_text[:30]))
            sys.stdout.flush()