## CRF: What's Wrong with Participants?

In [1]:
from crf import *
from crf_support import get_all_data, compare_tags, filter_phrase

import os, time
# sys.stdout.flush() is called in the evaluation cells; import sys explicitly
# instead of relying on `from crf import *` to leak it into the namespace.
import sys

from features_generator import abstracts2features, get_genia_tags, sanity_check

from gensim.models import Word2Vec



In [2]:
# Label under study. Elsewhere in this notebook each token carries either this
# label or the string 'None'. Presumably 'P' stands for Participants (per the
# notebook title) — confirm against the data helpers.
tag = 'P'
# Labels handed to evaluate_prediction when scoring; only the label above.
eval_tags = [tag]

Get data

In [3]:
# Get train data
# get_all_data yields a (tokens, tags) pair for the requested split and label.
train_tokens, train_tags = get_all_data('train', tag)
# Per-split GENIA annotations; presumably the source of the pos/chunk/iob/
# named_entity feature options used later — confirm in features_generator.
train_genia_tags = get_genia_tags('train')

In [4]:
# Get dev data
# get_all_data yields a (tokens, tags) pair for the requested split and label.
dev_tokens, dev_tags = get_all_data('dev', tag)
# Per-split GENIA annotations; presumably the source of the pos/chunk/iob/
# named_entity feature options used later — confirm in features_generator.
dev_genia_tags = get_genia_tags('dev')

In [5]:
# Get test data
# get_all_data yields a (tokens, tags) pair for the requested split and label.
test_tokens, test_tags = get_all_data('test', tag)
# Per-split GENIA annotations; presumably the source of the pos/chunk/iob/
# named_entity feature options used later — confirm in features_generator.
test_genia_tags = get_genia_tags('test')

Compute features

In [None]:
# Load word2vec vectors from a binary word2vec-format file; the path is
# relative, so it is resolved against the kernel's working directory.
# NOTE(review): this cell has no execution count, and the active configuration
# binds w2v to pubmed_wiki_w2v; only big_options_string names w2v_model=pubmed.
# NOTE(review): Word2Vec.load_word2vec_format is only available in old gensim
# releases (newer ones expose it on KeyedVectors) — confirm the pinned version.
pubmed_w2v_name = 'PubMed-w2v.bin'
pubmed_w2v = Word2Vec.load_word2vec_format(pubmed_w2v_name, binary=True)

In [6]:
# Load word2vec vectors from a binary word2vec-format file; the path is
# relative, so it is resolved against the kernel's working directory.
# This is the model later bound to `w2v` and passed to abstracts2features.
# NOTE(review): Word2Vec.load_word2vec_format is only available in old gensim
# releases (newer ones expose it on KeyedVectors) — confirm the pinned version.
pubmed_wiki_w2v_name = 'wikipedia-pubmed-and-PMC-w2v.bin'
pubmed_wiki_w2v = Word2Vec.load_word2vec_format(pubmed_wiki_w2v_name, binary=True)

In [7]:
# Set options
# Both option strings are space-separated lists of feature switches consumed
# by abstracts2features. Adjacent string literals are concatenated by the
# parser, so every piece except the last ends with the separating space.

# Alternative configuration naming the PubMed-only vectors (w2v_size=10).
# NOTE(review): not referenced anywhere else in the visible notebook.
big_options_string = (
    'left_neighbors=3 right_neighbors=3 inside_paren pos chunk iob named_entity '
    'inside_paren_neighbors pos_neighbors chunk_neighbors iob_neighbors named_entity_neighbors '
    'chunk_end chunk_end_neighbors same_chunk_neighbors '
    'one_hot one_hot_neighbors w2v_model=pubmed w2v w2v_neighbors w2v_size=10 cosine_simil cosine_simil_neighbors '
    'isupper isupper_neighbors istitle istitle_neighbors'
)

# Active configuration: names the wikipedia+PubMed+PMC vectors (w2v_size=30).
options_string = (
    'left_neighbors=3 right_neighbors=3 one_hot one_hot_neighbors '
    'inside_paren pos chunk iob named_entity '
    'inside_paren_neighbors pos_neighbors chunk_neighbors iob_neighbors named_entity_neighbors '
    'chunk_end chunk_end_neighbors same_chunk_neighbors '
    'w2v_model=pubmed_wiki w2v w2v_neighbors w2v_size=30 '
    'cosine_simil cosine_simil_neighbors '
    'isupper isupper_neighbors istitle istitle_neighbors'
)

# Word-vector model handed to abstracts2features; keep it in step with the
# w2v_model=... entry of the options string in use.
w2v = pubmed_wiki_w2v

# Name of this feature set; also used to build the saved-model path.
features_name = 'P8bibm'

In [8]:
# Compute features for train
# Inputs: the split's tokens and GENIA annotations, the word vectors bound to
# `w2v`, and the active options_string.
# The single line printed while this runs looks like a progress trace
# (an abstract index and its first tokens) — confirm in features_generator.
train_features = abstracts2features(train_tokens, train_genia_tags, w2v=w2v, options_string=options_string)

3499: ['Pulsed', 'azithromycin', 'treatment']


In [9]:
# Compute features for dev
# Inputs: the split's tokens and GENIA annotations, the word vectors bound to
# `w2v`, and the active options_string.
# The single line printed while this runs looks like a progress trace
# (an abstract index and its first tokens) — confirm in features_generator.
dev_features = abstracts2features(dev_tokens, dev_genia_tags, w2v=w2v, options_string=options_string)

999: ['Serum', 'bactericidal', 'activities']


In [10]:
# Compute features for test
# Inputs: the split's tokens and GENIA annotations, the word vectors bound to
# `w2v`, and the active options_string.
# The single line printed while this runs looks like a progress trace
# (an abstract index and its first tokens) — confirm in features_generator.
test_features = abstracts2features(test_tokens, test_genia_tags, w2v=w2v, options_string=options_string)

500: ['Efficacy', 'and', 'cost-effectiveness']


In [11]:
# For debug
# Prints summary counts for the train features: number of abstracts, tokens
# and features, plus average/max/min features per token.
sanity_check(train_features)

Number of abstracts: 3500
Number of tokens:    927022
Number of features:  263345838 

Avg tokens per abstract: 264
Avg features per token:  284 

Max features per token:  286
Min features per token:  161


Run CRF

In [12]:
# Set options
# NOTE(review): num_iters, l1 and l2 are not used anywhere in the visible
# notebook — the model is loaded from file_name rather than trained here.
# Presumably they record the settings the saved model was trained with; confirm.
num_iters = 200
l1 = 1
l2 = 0.01
# Location of the saved model, keyed by the feature-set name.
file_name = 'crf_results/{}'.format(features_name)

Get model

In [13]:
# Get model from file
# Loads the tagger stored under file_name; no training happens in this notebook.
tagger = get_tagger(file_name)

In [14]:
# For debug
# Prints the model's transition weights between labels and its highest- and
# lowest-weighted (label, feature) pairs.
print_model_info(tagger)

Top likely transitions:
P      -> P       0.041363
None   -> None    -0.097811
P      -> None    -7.264975
None   -> P       -8.233716

Top positive:
4.927603 None   word[-1]:PARTICIPANTS
2.855721 P      word[-2]:PARTICIPANTS
2.396066 P      word[1]:INTERVENTIONS
2.058474 P      word[1]:Group
1.931018 P      word[-3]:NHS
1.811934 None   word[-2]:controlled
1.611998 P      word[1]:INTERVENTION
1.556044 None   word[0]:INTERVENTION
1.485613 P      word[-3]:PARTICIPANTS
1.418782 None   word[0]:INTERVENTIONS
1.407665 P      word[-3]:newspapers
1.369649 None   word[-3]:followed
1.342317 P      word[1]:Training
1.341184 None   word[-1]:randomisation.
1.326098 P      word[3]:clusters
1.316158 P      word[1]:Expectant
1.316158 P      word[-2]:incomplete
1.295212 None   chunk[-2]:PRT
1.272041 None   word[2]:closure
1.271684 P      word[-2]:Oxford

Top negative:
-0.897128 None   word[-1]:clinics.
-0.928468 None   word[-2]:Kingdom
-0.938274 None   word[3]:1
-0.944144 None   word[3]:30
-0.946226 No

Predict tags

In [15]:
def _predict_and_report(label, features, gold_tags):
    """Tag one split with the loaded model, print its scores, and return both.

    Steps run in a fixed order: predict, evaluate against gold_tags for the
    labels in eval_tags, print the heading, print the result, flush stdout.
    Returns a (predicted_tags, result) pair.
    """
    predicted_tags = predict_tags(tagger, features)
    result = evaluate_prediction(predicted_tags, gold_tags, eval_tags)
    # A single parenthesised argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(label)
    print_result(result)
    sys.stdout.flush()
    return predicted_tags, result


# Splits are processed in the order dev, train, test. The printed triple per
# label is presumably (precision, recall, F1) — confirm in crf.
pred_dev_tags, dev_result = _predict_and_report('dev:', dev_features, dev_tags)
pred_train_tags, train_result = _predict_and_report('train:', train_features, train_tags)
pred_test_tags, test_result = _predict_and_report('test:', test_features, test_tags)

dev:
P: (0.10406553398058252, 0.015518957560401773, 0.02701000078746358)
train:
P: (0.10797530553105708, 0.01076714325200392, 0.01958163393540574)
test:
P: (0.10617626648161, 0.013910355486862442, 0.02459807073954984)


Print a sample prediction for an abstract

In [19]:
def print_with_spaces(l, spaces):
    """Print rows as fixed-width, left-aligned columns.

    Args:
        l: iterable of rows; each row is an iterable of values to print
            side by side on one line.
        spaces: column widths, matched positionally with each row's values.
            Pairing stops at the shorter of the two, so surplus values or
            surplus widths are silently ignored.

    Every value is padded on the right up to its column width; a value longer
    than its width is printed in full rather than truncated. Nothing is
    returned; one line is written to stdout per row.
    """
    # Build the per-column format specs once instead of once per cell, and
    # materialise them so a one-shot iterable of widths serves every row.
    # '<' makes left alignment explicit: strings already default to it, but
    # numbers would otherwise be right-aligned.
    column_formats = ['{:<' + str(space) + '}' for space in spaces]

    for sublist in l:
        # A single parenthesised argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(''.join(fmt.format(value) for fmt, value in zip(column_formats, sublist)))

In [41]:
# Index of the dev abstract to inspect. 263 appears in this notebook's listing
# of dev abstracts that received at least one non-'None' predicted tag.
i = 263
# Columns: token, gold tag, predicted tag — padded to widths 25, 5 and 5.
print_with_spaces(zip(dev_tokens[i], dev_tags[i], pred_dev_tags[i]), [25, 5, 5])

Reducing                 None None 
Sitting                  None None 
Time                     None None 
After                    None None 
Stroke                   None None 
:                        None None 
A                        None None 
Phase                    None None 
II                       None None 
Safety                   None None 
and                      None None 
Feasibility              None None 
Randomized               None None 
Controlled               None None 
Trial                    None None 
.                        None None 
OBJECTIVE                None None 
To                       None None 
test                     None None 
the                      None None 
safety                   None None 
,                        None None 
feasibility              None None 
,                        None None 
and                      None None 
effectiveness            None None 
of                       None None 
reducing                 Non

Analyze intervals

In [16]:
# Compare predicted and gold `tag` spans on the dev split. Prints counts of
# predicted and gold intervals/tokens by relation to the other side
# (identical, subinterval, superinterval, overlapping, non-overlapping).
compare_tags(pred_dev_tags, dev_tags, tag)

There are 40 predicted intervals:
Number of type Identical      : 0
Number of type Subinterval    : 1
Number of type Superinterval  : 15
Number of type Overlapping    : 12
Number of type Non-overlapping: 12

There are 3296 predicted tokens:
Number of type Identical      : 0
Number of type Subinterval    : 28
Number of type Superinterval  : 1574
Number of type Overlapping    : 196
Number of type Non-overlapping: 1498

There are 2694 gold intervals:
Number of type Identical      : 0
Number of type Subinterval    : 22
Number of type Superinterval  : 1
Number of type Overlapping    : 13
Number of type Non-overlapping: 2658

There are 22102 gold tokens:
Number of type Identical      : 0
Number of type Subinterval    : 135
Number of type Superinterval  : 30
Number of type Overlapping    : 212
Number of type Non-overlapping: 21725



Restrict evaluation to noun phrases

In [17]:
def _report_filtered_result(label, predicted_tags, gold_tags, genia_tags):
    """Score one split after passing both tag sets through filter_phrase.

    Predicted tags are filtered first, then gold tags, each with the split's
    GENIA annotations; the filtered pair is evaluated for the labels in
    eval_tags. Prints the heading and the result, flushes stdout, and returns
    the result.
    """
    filtered_predicted = filter_phrase(predicted_tags, genia_tags)
    filtered_gold = filter_phrase(gold_tags, genia_tags)
    result = evaluate_prediction(filtered_predicted, filtered_gold, eval_tags)
    # A single parenthesised argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(label)
    print_result(result)
    sys.stdout.flush()
    return result


# Re-score the existing predictions (nothing is re-predicted here), in the
# order dev, train, test. This overwrites the *_result variables from the
# unfiltered evaluation.
dev_result = _report_filtered_result('dev:', pred_dev_tags, dev_tags, dev_genia_tags)
train_result = _report_filtered_result('train:', pred_train_tags, train_tags, train_genia_tags)
test_result = _report_filtered_result('test:', pred_test_tags, test_tags, test_genia_tags)

dev:
P: (0.1174025974025974, 0.014533762057877814, 0.025865522174535048)
train:
P: (0.12414853878268513, 0.010076150732081395, 0.01863948271311692)
test:
P: (0.12672811059907835, 0.014098948987439118, 0.025374855824682817)


In [38]:
# Print the index of every dev abstract in which at least one token was
# predicted as something other than the string 'None' (once per abstract).
for abstract_index, abstract_tags in enumerate(pred_dev_tags):
    if any(predicted != 'None' for predicted in abstract_tags):
        # A single parenthesised argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(abstract_index)

129
254
263
285
293
295
325
367
377
419
424
450
464
469
479
486
490
491
519
547
586
603
605
609
616
636
639
690
709
715
767
783
798
826
838
845
863
975
993
995
