## CRF BIBM

In [1]:
from crf import *
from crf_support import compare_tags, filter_phrase

from preproccess_data_bibm2011 import get_all_data, get_all_data_train, get_all_data_dev, get_all_data_test

import os, time

from features_generator import abstracts2features, sanity_check

from gensim.models import Word2Vec



In [2]:
# Label this notebook works on; also passed to compare_tags in the
# interval-analysis cell.
tag = 'P'
# Labels handed to evaluate_prediction in the evaluation cells.
eval_tags = [tag]

Get data

In [3]:
def get_genia_tags(data_set, abstracts_list_path='./bibm2011corpus-master/abstracts_2.txt'):
    """Load the pickled ``_genia.tag`` object of every abstract in one split.

    The list file holds one abstract path per line. A split is a fixed,
    half-open range of line positions in that file.

    Args:
        data_set: one of 'train', 'dev', 'test' or 'all'.
        abstracts_list_path: path of the file listing the abstract paths.
            Defaults to the location that was previously hard-coded.

    Returns:
        A list with one unpickled object per selected abstract, in list-file
        order. The structure of each object is whatever was pickled.

    Raises:
        KeyError: if ``data_set`` is not one of the four known split names.
        IOError/OSError: if the list file or a ``_genia.tag`` file is missing.

    NOTE(review): ``pickle`` is not imported explicitly in this notebook;
    presumably it comes in through ``from crf import *`` - confirm.
    """
    # Half-open [start, end) ranges of line positions in the list file.
    switcher = {
        'train': (0, 95),
        'dev': (95, 122),
        'test': (122, 135),
        'all': (0, 135)
    }
    start, end = switcher[data_set]

    # `with` guarantees the handle is released even if reading fails.
    with open(abstracts_list_path, 'r') as list_file:
        abstract_paths = [line.strip() for line in list_file]

    genia_tags = []
    for abstract_path in abstract_paths[start:end]:
        # Replace the last four characters of the path (presumably a
        # three-letter extension such as ".txt") with the "_genia.tag" suffix.
        pickle_path = abstract_path[:-4] + '_genia.tag'
        # SECURITY: pickle.load can execute arbitrary code while loading;
        # only point this at tag files you produced yourself.
        with open(pickle_path, 'rb') as pickle_file:
            genia_tags.append(pickle.load(pickle_file))
    return genia_tags

In [4]:
# Get all data
# get_all_data() supplies the tokens and the gold tags; get_genia_tags('all')
# supplies the pickled GENIA tags for the full range of listed abstracts.
# All three are later indexed in parallel (features, prediction, evaluation).
all_tokens, all_tags = get_all_data()
all_genia_tags = get_genia_tags('all')



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Compute features

In [None]:
# Optional PubMed-only embeddings. This cell carries no execution count and
# pubmed_w2v is not read by any cell visible in this notebook (the options
# cell selects pubmed_wiki_w2v instead).
# NOTE(review): newer gensim releases expose this loader as
# KeyedVectors.load_word2vec_format - confirm against the installed version.
pubmed_w2v_name = 'PubMed-w2v.bin'
pubmed_w2v = Word2Vec.load_word2vec_format(pubmed_w2v_name, binary=True)

In [5]:
# Wikipedia + PubMed + PMC embeddings in binary word2vec format; this is the
# model assigned to `w2v` in the options cell and passed to abstracts2features.
# NOTE(review): newer gensim releases expose this loader as
# KeyedVectors.load_word2vec_format - confirm against the installed version.
pubmed_wiki_w2v_name = 'wikipedia-pubmed-and-PMC-w2v.bin'
pubmed_wiki_w2v = Word2Vec.load_word2vec_format(pubmed_wiki_w2v_name, binary=True)

In [6]:
# Set options
# Both values are single-line, space-separated lists of feature switches.
# Adjacent string literals inside the parentheses are concatenated by the
# parser, so every piece except the last ends with the separating space.

# Alternative configuration (PubMed-only vectors, w2v_size=10); not referenced
# by any other cell visible in this notebook.
big_options_string = (
    'left_neighbors=3 right_neighbors=3 inside_paren pos chunk iob named_entity '
    'inside_paren_neighbors pos_neighbors chunk_neighbors iob_neighbors named_entity_neighbors '
    'chunk_end chunk_end_neighbors same_chunk_neighbors '
    'one_hot one_hot_neighbors w2v_model=pubmed w2v w2v_neighbors w2v_size=10 cosine_simil cosine_simil_neighbors '
    'isupper isupper_neighbors istitle istitle_neighbors'
)

# Configuration actually passed to abstracts2features.
options_string = (
    'left_neighbors=3 right_neighbors=3 one_hot one_hot_neighbors '
    'inside_paren pos chunk iob named_entity '
    'inside_paren_neighbors pos_neighbors chunk_neighbors iob_neighbors named_entity_neighbors '
    'chunk_end chunk_end_neighbors same_chunk_neighbors '
    'w2v_model=pubmed_wiki w2v w2v_neighbors w2v_size=30 '
    'cosine_simil cosine_simil_neighbors '
    'isupper isupper_neighbors istitle istitle_neighbors'
)

# Embedding model handed to abstracts2features; keep it in step with the
# `w2v_model=...` entry of options_string (currently pubmed_wiki).
w2v=pubmed_wiki_w2v

# Name of this feature configuration; used to build the model path
# 'crf_results/<features_name>' in the CRF options cell.
features_name = 'P8'

In [7]:
# Compute features for all
# Inputs: the tokens, their GENIA tags, the selected embedding model (w2v) and
# the feature switches in options_string. The result feeds sanity_check and
# predict_tags below.
all_features = abstracts2features(all_tokens, all_genia_tags, w2v=w2v, options_string=options_string)

134: ['OBJECTIVE', ':', 'To']


In [8]:
# For debug
# Prints summary counts for the computed features (abstracts, tokens and
# features per token), as shown in the output below this cell.
sanity_check(all_features)

Number of abstracts: 135
Number of tokens:    45133
Number of features:  12839548 

Avg tokens per abstract: 334
Avg features per token:  284 

Max features per token:  286
Min features per token:  161


Grid search

Run CRF

In [9]:
# Set options
# NOTE(review): num_iters, l1 and l2 are not read by any cell visible in this
# notebook (the tagger is obtained with get_tagger below) - presumably they
# record the settings the stored model was trained with; confirm.
num_iters = 200
l1 = 1
l2 = 0.01
# Path passed to get_tagger; derived from features_name set in the
# feature-options cell.
file_name = 'crf_results/{}'.format(features_name)

Get model

In [10]:
# Get model from file
# file_name is 'crf_results/<features_name>' (see the CRF options cell); the
# returned tagger is used by print_model_info and predict_tags below.
tagger = get_tagger(file_name)

In [11]:
# For debug
# Prints the transition weights plus the highest- and lowest-weighted state
# features of the tagger, as shown in the output below this cell.
print_model_info(tagger)

Top likely transitions:
None   -> None    2.526950
P      -> P       2.104650
None   -> P       -1.951732
P      -> None    -3.203641

Top positive:
5.959447 None   word[0]:DESIGN
5.786099 None   word[0]:BACKGROUND
4.432196 P      word[0]:nonsmokers.
4.417129 None   word[0]:PARTICIPANTS
4.292859 P      word[-1]:PBSCT.
4.074433 P      word[-1]:rowers
4.040106 None   word[0]:INTERVENTIONS
3.606371 None   word[1]:Eleven
3.551399 None   word[1]:Young
3.496381 None   word[0]:PURPOSE
3.492651 P      word[0]:Norway
3.345511 None   word[0]:SUBJECTS
3.087589 P      word[-2]:Guillian-Barre
3.016858 None   word[1]:Men
2.871961 P      word[1]:Sprague-Dawley
2.752129 None   word[-1]:osteoporosis.
2.682785 P      word[0]:Third-
2.673258 P      word[1]:alpacas
2.671603 None   word[0]:AIM
2.592882 P      word[0]:evaluable.

Top negative:
-1.678454 None   word[-1]:Girls
-1.703361 None   word[-3]:Medicaid
-1.714073 None   word[0]:students
-1.726519 None   word[-2]:supervisor
-1.748018 P      word[0]:The

Predict tags

In [13]:
# Predict all tags
pred_all_tags = predict_tags(tagger, all_features)

# Evaluate all tags
all_result = evaluate_prediction(pred_all_tags, all_tags, eval_tags)
# Single-argument print(...) prints the same text on Python 2 and is also
# valid Python 3, unlike the bare print statement.
print('all:')
print_result(all_result)
# NOTE(review): `sys` is not imported explicitly in this notebook; presumably
# it comes in through `from crf import *` - confirm.
sys.stdout.flush()

all:
P: (0.48249170659786217, 0.3821897810218978, 0.4265232974910395)


Print a sample prediction for an abstract

In [None]:
def print_with_spaces(l, spaces):
    """Print every row of ``l`` as a line of fixed-width columns.

    Args:
        l: iterable of rows; each row is an iterable of values to print.
        spaces: column widths, paired positionally with the items of each row.
            Pairing uses ``zip``, so surplus items or surplus widths are
            silently ignored.

    Each item is rendered with the format spec ``'{:N}'`` where ``N`` is its
    column width: strings are padded on the right (left-aligned) up to ``N``
    characters and are never truncated. Columns are concatenated without any
    extra separator.
    """
    # Build the per-column format strings once rather than once per cell.
    column_formats = ['{:' + str(width) + '}' for width in spaces]

    for row in l:
        cells = [fmt.format(item) for item, fmt in zip(row, column_formats)]
        # Single-argument print(...) works on both Python 2 and Python 3.
        print(''.join(cells))

In [None]:
# Index of the abstract to display. It must be a valid index into all_tokens;
# the previous value (300) exceeded the 135 abstracts loaded above.
i = 0
# Columns: token, gold tag, predicted tag. The all_* / pred_all_tags lists are
# the ones this notebook actually defines; the dev_* names used here before
# are not defined by any cell.
print_with_spaces(zip(all_tokens[i], all_tags[i], pred_all_tags[i]), [25, 5, 5])

Analyze intervals

In [15]:
# Compare predicted and gold intervals for `tag`; the report below classifies
# intervals and tokens as identical, sub-/super-interval, overlapping or
# non-overlapping.
compare_tags(pred_all_tags, all_tags, tag)

There are 209 predicted intervals:
Number of type Identical      : 39
Number of type Subinterval    : 22
Number of type Superinterval  : 25
Number of type Overlapping    : 8
Number of type Non-overlapping: 115

There are 2713 predicted tokens:
Number of type Identical      : 626
Number of type Subinterval    : 287
Number of type Superinterval  : 518
Number of type Overlapping    : 166
Number of type Non-overlapping: 1116

There are 111 gold intervals:
Number of type Identical      : 39
Number of type Subinterval    : 25
Number of type Superinterval  : 20
Number of type Overlapping    : 7
Number of type Non-overlapping: 20

There are 3425 gold tokens:
Number of type Identical      : 626
Number of type Subinterval    : 287
Number of type Superinterval  : 1023
Number of type Overlapping    : 897
Number of type Non-overlapping: 592



Restrict evaluation to noun phrases

In [16]:
# Evaluate all tags
# Predictions and gold tags are both passed through filter_phrase with the
# GENIA tags before scoring. Note that this overwrites the all_result computed
# in the earlier evaluation cell.
all_result = evaluate_prediction(
    filter_phrase(pred_all_tags, all_genia_tags),
    filter_phrase(all_tags, all_genia_tags),
    eval_tags,
)
# Single-argument print(...) prints the same text on Python 2 and is also
# valid Python 3, unlike the bare print statement.
print('all:')
print_result(all_result)
# NOTE(review): `sys` is not imported explicitly in this notebook; presumably
# it comes in through `from crf import *` - confirm.
sys.stdout.flush()

all:
P: (0.4624800425758382, 0.40064545873674506, 0.42934782608695654)
