In [1]:
from geniatagger import GeniaTagger
from preprocess_data import get_all_data

In [2]:
# Directory for geniatagger
genia_directory = 'geniatagger-3.0.2/geniatagger'

tagger = GeniaTagger(genia_directory)

In [17]:
# Get all data
# Format of word_array: [[word1, word2, ...], ...]
word_array, tag_array = get_all_data()

# Combine all words in the same abstract into one text
abstracts = [' '.join(abstract_words) for abstract_words in word_array]

In [18]:
# Print elements of a list with spaces
# Element l[i] is padded to have length space[i]
def print_with_spaces(l, spaces):
    # This pads strings to be of space length and aligned left
    formatter = lambda space: '{:' + str(space) + '}'
    
    print ''.join([formatter(space).format(string) for string, space in zip(l, spaces)])

In [21]:
DEBUG = False

'''
Clean up genia tags
Format: [[feature 1, feature 2, ...], ...]
[feature 1, feature 2, ...] are features of each word

Features: word, base form, POS, chunk, IOB, named entity,
whether word inside parentheses.

Result must be of length length.
'''
def clean_tags(genia_tags, length):
    cleaned_tags = []
    
    # Keep track of whether word is inside parantheses
    inside_paren = False
    
    for word, base_form, pos, chunk, named_entity in genia_tags:
        # ';' has POS ':'
        if word == pos or pos == ':' or pos == '(' or pos == ')' or word == '%':
            # This means the word is puctuation, parentheses, etc.,
            # so we do not make features for it.
            if pos == '(':
                inside_paren = True
            elif pos == ')':
                inside_paren = False
            elif len(word) > 1 and word not in ['``',  '\'\'', '--', 'TO', '...']:
                # This shouldn't happen
                raise ValueError('Unidentified word: ' + word)
            continue
        
        # Now build tags for word
        word_tags = []
        
        word_tags.append(word)
        word_tags.append(base_form)
        word_tags.append(pos)
        
        # Strip out IOB from chunk
        if chunk == 'O':
            word_tags.append(chunk)
        elif len(chunk) > 2:
            word_tags.append(chunk[2:])
        else:
            raise ValueError('Unidentified chunk: ' + chunk)
        
        # Get IOB
        iob = chunk[0]
        if iob == 'O' or iob == 'I' or iob == 'B':
            word_tags.append(iob)
        else:
            raise ValueError('Unidentified chunk:s ' + chunk)
        
        # Strip out IOB from named_entity
        if named_entity == 'O':
            word_tags.append(named_entity)
        elif len(named_entity) > 2:
            word_tags.append(named_entity[2:])
        else:
            raise ValueError('Unidentified named entity: ' + named_entity)
        
        word_tags.append(str(inside_paren))
        
        cleaned_tags.append(word_tags)
    
    for word_tags in cleaned_tags:
                print_with_spaces(word_tags, [25, 25, 5, 5, 5, 10, 5])
    if len(cleaned_tags) != length:
        raise ValueError('There are ' + str(len(cleaned_tags)) +
                         ' tags but there should be ' + str(length))
    
    return cleaned_tags

if DEBUG:
    for abstract, words in zip(abstracts, word_array):
        if abstract.startswith('Association of efavirenz'):
            print abstract
            genia_tags = tagger.parse(abstract)
            cleaned_tags = clean_tags(genia_tags, len(words))
            
            for word_tags in genia_tags:
                print_with_spaces(word_tags, [25, 25, 5, 10, 5])
            for word_tags in cleaned_tags:
                print_with_spaces(word_tags, [25, 25, 5, 5, 5, 10, 5])

In [6]:
DEBUG = False

# Number of words on each side whose features we take into account
num_surrounding = 4

# Placeholders for out-of-bound indices
start_token = 'START'
stop_token = 'STOP'

'''
Get features from cleaned tags
Same format as cleaned tags
'''
def tags2features(cleaned_tags):
    # Features for whole abstract
    abstract_features = []
    
    for i in range(len(cleaned_tags)):
        word, base_form, pos, chunk, iob, named_entity, inside_paren = cleaned_tags[i]

        word_features = []
            
        '''Features based on the word itself'''
            
        # Feature: POS tag
        word_features.append(pos)
        # Feature: whether word is inside parentheses
        word_features.append(str(inside_paren))

        '''Features based on the phrase containing the word'''
            
        # Feature: type of phrase containing the word
        word_features.append(chunk)
        # Feature: named entity of the word, if any
        word_features.append(named_entity)

        # Feature: whether word is the beginning of its phrase
        word_features.append(str(iob == 'B'))

        # Feature: whether word is the last of its phrase
        # IOB must be I or B and the next IOB must not be I
        if (iob == 'I' or iob == 'B') and \
        (i == len(cleaned_tags)-1 or cleaned_tags[i+1][4] != 'I'):
            word_features.append('True')
        else:
            word_features.append('False')
            
        '''Features based on surrounding words'''
            
        for offset in range(-num_surrounding, num_surrounding + 1):
            if offset != 0:
                new_index = i + offset
                if new_index < 0:
                    # Out of bound to the left
                    word_features.append(start_token)
                    word_features.append(start_token)
                    word_features.append(start_token)
                elif new_index >= len(cleaned_tags):
                    # Out of bound to the right
                    word_features.append(stop_token)
                    word_features.append(stop_token)
                    word_features.append(stop_token)
                else:
                    # Features: POS tags of surrounding words
                    new_pos = cleaned_tags[new_index][2]
                    word_features.append(new_pos)
                        
                    # Features: named entity of surrounding words, if any
                    new_entity = cleaned_tags[new_index][5]
                    word_features.append(new_entity)
            
                    # Features: whether surrounding words are in the same phrase as this word
                    start = min(new_index, i)
                    stop = max(new_index, i)
                    
                    in_same_phrase = True
                    
                    # Check that IOBs from start+1 to stop are all I's
                    for j in range(start+1, stop+1):
                        if cleaned_tags[j][4] != 'I':
                            in_same_phrase = False
                    
                    word_features.append(str(in_same_phrase))
                    
        abstract_features.append(word_features)
    
    return abstract_features

if DEBUG:
    for abstract, words in zip(abstracts, word_array):
        if abstract.startswith('Association of efavirenz'):
            genia_tags = tagger.parse(abstract)
            cleaned_tags = clean_tags(genia_tags, len(words))
            abstract_features = tags2features(cleaned_tags)
            
            for word_tags in cleaned_tags:
                print_with_spaces(word_tags, [25, 25, 5, 5, 5, 10, 5])
            for word_tags in abstract_features:
                print_with_spaces(word_tags, [4, 6, 4, 10, 6, 6] + [6, 10, 6]*24)

In [22]:
DEBUG = False

DISPLAY = True

'''
Get features from a list of abstracts
Format: [abstract features 1, abstract features 2, ...]
abstract features i = [word features 1, word features 2, ...]
word features i = [feature 1, feature 2, ...]

Length of abstract features i must match length_list[i]
'''
def abstracts2features(abstract_list, length_list):
    # Features of all abstracts
    all_abstracts_features = []
    
    for i in range(len(abstract_list)):
        abstract = abstract_list[i]
        
        if DEBUG:
            if not abstract.startswith('Association of efavirenz'):
                continue
            print abstract
        
        if DISPLAY:
            # Print progress
            print '\r{0}: {1}'.format(i, abstract[:30]),
    
        # Get tags from genia tagger
        # Format: [(Association, Association, NN, B-NP, O), ...]
        # Visualization here: http://nactem7.mib.man.ac.uk/geniatagger/
        genia_tags = tagger.parse(abstract)

        '''Step 1: Clean up genia tags'''

        cleaned_tags = clean_tags(genia_tags, length_list[i])

        '''Step 2: Get features from these tags'''

        abstract_features = tags2features(cleaned_tags)
                
        all_abstracts_features.append(abstract_features)
        
    return all_abstracts_features

if DEBUG:
    lengths = [len(words) for words in word_array]
    
    features = abstracts2features(abstracts, lengths)
    print features

In [23]:
lengths = [len(words) for words in word_array]

all_features = abstracts2features(abstracts, lengths)

0: Ipilimumab efficacy and safety Ipilimumab               Ipilimumab               NN   NP   B    O         False
efficacy                 efficacy                 NN   NP   I    O         False
and                      and                      CC   O    O    O         False
safety                   safety                   NN   NP   B    O         False
in                       in                       IN   PP   B    O         False
patients                 patient                  NNS  NP   B    O         False
with                     with                     IN   PP   B    O         False
advanced                 advance                  VBN  NP   B    O         False
melanoma                 melanoma                 NN   NP   I    O         False
a                        a                        DT   NP   B    O         False
retrospective            retrospective            JJ   NP   I    O         False
analysis                 analysis                 NN   NP   I    O         

ValueError: There are 224 tags but there should be 222