## CRF BIBM

In [1]:
from crf import *
from crf_support import compare_tags, filter_phrase

from preproccess_data_bibm2011 import get_all_data, get_all_data_train, get_all_data_dev, get_all_data_test

# Standard-library imports are kept below the star import so these names are
# always bound to the real modules, whatever `crf` happens to export.
# pickle and sys are used by later cells (pickle.load, sys.stdout.flush) and
# were not imported explicitly anywhere in this notebook.
import os, time
import pickle, sys

from features_generator import abstracts2features, sanity_check

from gensim.models import Word2Vec



In [2]:
# Label this notebook trains and evaluates on. `tag` is passed to the sorting
# and interval-comparison helpers; `eval_tags` is the list form passed to the
# grid-search / evaluation helpers.
# NOTE(review): 'P' presumably stands for the participants label — confirm.
tag = 'P'
eval_tags = [tag]

Get data

In [3]:
def get_genia_tags(data_set, abstracts_file='./bibm2011corpus-master/abstracts_2.txt'):
    """Load the pickled GENIA tagger output for one split of the corpus.

    The listing file holds one abstract path per line. Each split is a fixed,
    contiguous slice of that listing, and every abstract has a companion
    pickle whose path is the abstract path with its last four characters
    (presumably the '.txt' extension) replaced by '_genia.tag'.

    Args:
        data_set: One of 'train', 'dev' or 'test'.
        abstracts_file: Path of the listing file. Defaults to the location
            this notebook has always read from.

    Returns:
        A list with one unpickled object per abstract in the split, in
        listing order.

    Raises:
        KeyError: If data_set is not one of the three known split names.
    """
    # Half-open [start, end) line ranges of the listing file for each split.
    split_bounds = {
        'train': (0, 95),
        'dev': (95, 122),
        'test': (122, 135),
    }
    start, end = split_bounds[data_set]

    # `with` guarantees the listing is closed even if reading fails.
    with open(abstracts_file, 'r') as listing:
        abstract_paths = [line.strip() for line in listing]

    genia_tags = []
    for abstract_path in abstract_paths[start:end]:
        tag_path = abstract_path[:-4] + '_genia.tag'
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # this at tag files produced by a trusted run.
        # Each handle is closed right after loading (previously leaked).
        with open(tag_path, 'rb') as tag_file:
            genia_tags.append(pickle.load(tag_file))
    return genia_tags

In [4]:
# Get train data: a (tokens, tags) pair from the corpus loader, plus the GENIA
# tagger output for the same split (listing lines 0-94, see get_genia_tags).
train_tokens, train_tags = get_all_data_train()
train_genia_tags = get_genia_tags('train')



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [5]:
# Get dev data: a (tokens, tags) pair from the corpus loader, plus the GENIA
# tagger output for the same split (listing lines 95-121, see get_genia_tags).
dev_tokens, dev_tags = get_all_data_dev()
dev_genia_tags = get_genia_tags('dev')

In [6]:
# Get test data: a (tokens, tags) pair from the corpus loader, plus the GENIA
# tagger output for the same split (listing lines 122-134, see get_genia_tags).
test_tokens, test_tags = get_all_data_test()
test_genia_tags = get_genia_tags('test')

Compute features

In [None]:
# Load the binary word2vec vectors stored in 'PubMed-w2v.bin' (path relative to
# the working directory).
# NOTE(review): pubmed_w2v is not referenced by any later cell shown in this
# notebook (the run uses pubmed_wiki_w2v), so this load may be skippable — confirm.
pubmed_w2v_name = 'PubMed-w2v.bin'
pubmed_w2v = Word2Vec.load_word2vec_format(pubmed_w2v_name, binary=True)

In [10]:
# Load the binary word2vec vectors stored in
# 'wikipedia-pubmed-and-PMC-w2v.bin' (path relative to the working directory).
# This is the model the feature cells below receive through `w2v`.
# NOTE(review): newer gensim releases expose this loader on KeyedVectors
# rather than Word2Vec — confirm against the installed gensim version.
pubmed_wiki_w2v_name = 'wikipedia-pubmed-and-PMC-w2v.bin'
pubmed_wiki_w2v = Word2Vec.load_word2vec_format(pubmed_wiki_w2v_name, binary=True)

In [11]:
# Set options
# Each option string is a single space-separated list of feature switches,
# assembled here from grouped pieces. `options_string` is the one passed to
# abstracts2features below; `big_options_string` is an alternative set that no
# cell shown in this notebook passes anywhere.
big_options_string = ' '.join([
    'left_neighbors=3 right_neighbors=3 inside_paren pos chunk iob named_entity',
    'inside_paren_neighbors pos_neighbors chunk_neighbors iob_neighbors named_entity_neighbors',
    'chunk_end chunk_end_neighbors same_chunk_neighbors',
    'one_hot one_hot_neighbors w2v_model=pubmed w2v w2v_neighbors w2v_size=10 cosine_simil cosine_simil_neighbors',
    'isupper isupper_neighbors istitle istitle_neighbors',
])

options_string = ' '.join([
    'left_neighbors=3 right_neighbors=3 one_hot one_hot_neighbors',
    'inside_paren pos chunk iob named_entity',
    'inside_paren_neighbors pos_neighbors chunk_neighbors iob_neighbors named_entity_neighbors',
    'chunk_end chunk_end_neighbors same_chunk_neighbors',
    'w2v_model=pubmed_wiki w2v w2v_neighbors w2v_size=30',
    'cosine_simil cosine_simil_neighbors',
    'isupper isupper_neighbors istitle istitle_neighbors',
])

# Embedding model handed to abstracts2features; it pairs with the
# w2v_model=pubmed_wiki switch in options_string.
w2v=pubmed_wiki_w2v

# Name stem used to build the result file names under crf_results/ below.
features_name = 'P8bibm'

In [12]:
# Compute features for train, using the embedding model (`w2v`) and
# `options_string` selected in the options cell.
train_features = abstracts2features(train_tokens, train_genia_tags, w2v=w2v, options_string=options_string)

94: ['Effect', 'of', 'beta']


In [13]:
# Compute features for dev, with the same embedding model and options as the
# training features.
dev_features = abstracts2features(dev_tokens, dev_genia_tags, w2v=w2v, options_string=options_string)

26: ['Mobilisation', 'with', 'movement']


In [14]:
# Compute features for test, with the same embedding model and options as the
# training features.
test_features = abstracts2features(test_tokens, test_genia_tags, w2v=w2v, options_string=options_string)

12: ['OBJECTIVE', ':', 'To']


In [15]:
# For debug: print summary counts for the training features (abstracts,
# tokens, and features per token, per the recorded output).
sanity_check(train_features)

Number of abstracts: 95
Number of tokens:    31422
Number of features:  8938499 

Avg tokens per abstract: 330
Avg features per token:  284 

Max features per token:  286
Min features per token:  161


Grid search

In [16]:
# Run grid search
# Every (L1, L2) pair from the two lists is tried; train and dev data are both
# handed to grid_search, and the recorded output shows one score line per pair.
num_iters = 200
l1_list = [0.01, 0.1, 1]
l2_list = [0.01, 0.1, 1]
grid_file_name = 'crf_results/{}_grid'.format(features_name)

start_time = time.time()
grid_search_result = grid_search(
    train_features, train_tags,
    dev_features, dev_tags,
    num_iters, l1_list, l2_list, eval_tags,
    file_name=grid_file_name, save=True)
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

# Delete the '<grid_file_name>.model' file once the search is done
# (presumably a leftover model written during the search — confirm).
os.remove(grid_file_name + '.model')

Adding data...
Training model...
Done!
L1: 0.01, L2: 0.01, scores: {'P': (0.9230769230769231, 0.4044943820224719, 0.5625)}
Adding data...
Training model...
Done!
L1: 0.01, L2: 0.1, scores: {'P': (0.9205298013245033, 0.3904494382022472, 0.5483234714003945)}
Adding data...
Training model...
Done!
L1: 0.01, L2: 1, scores: {'P': (0.9145907473309609, 0.3609550561797753, 0.5176233635448138)}
Adding data...
Training model...
Done!
L1: 0.1, L2: 0.01, scores: {'P': (0.9230769230769231, 0.4044943820224719, 0.5625)}
Adding data...
Training model...
Done!
L1: 0.1, L2: 0.1, scores: {'P': (0.9205298013245033, 0.3904494382022472, 0.5483234714003945)}
Adding data...
Training model...
Done!
L1: 0.1, L2: 1, scores: {'P': (0.9205298013245033, 0.3904494382022472, 0.5483234714003945)}
Adding data...
Training model...
Done!
L1: 1, L2: 0.01, scores: {'P': (0.9111111111111111, 0.3455056179775281, 0.5010183299389003)}
Adding data...
Training model...
Done!
L1: 1, L2: 0.1, scores: {'P': (0.9205298013245033, 0.3

In [17]:
# Sort result: order the grid-search outcomes by the F1 metric for `tag` and
# print them (the recorded output lists the highest F1 first).
sorted_result = sort_by_metric(grid_search_result, tag, metric='f1')
print_result(sorted_result)

L1: 0.1, L2: 0.01
P: (0.9230769230769231, 0.4044943820224719, 0.5625)
L1: 0.01, L2: 0.01
P: (0.9230769230769231, 0.4044943820224719, 0.5625)
L1: 0.1, L2: 0.1
P: (0.9205298013245033, 0.3904494382022472, 0.5483234714003945)
L1: 0.1, L2: 1
P: (0.9205298013245033, 0.3904494382022472, 0.5483234714003945)
L1: 1, L2: 0.1
P: (0.9205298013245033, 0.3904494382022472, 0.5483234714003945)
L1: 0.01, L2: 0.1
P: (0.9205298013245033, 0.3904494382022472, 0.5483234714003945)
L1: 0.01, L2: 1
P: (0.9145907473309609, 0.3609550561797753, 0.5176233635448138)
L1: 1, L2: 0.01
P: (0.9111111111111111, 0.3455056179775281, 0.5010183299389003)
L1: 1, L2: 1
P: (0.9036144578313253, 0.3160112359550562, 0.4682622268470344)


Run CRF

In [27]:
# Set options for the single CRF run. l1/l2 match the top entry of the sorted
# grid-search output recorded above (L1: 0.1, L2: 0.01).
num_iters = 200
l1 = 0.1
l2 = 0.01
# Path stem for this run's saved results; also used later to load the tagger.
file_name = 'crf_results/{}'.format(features_name)

In [28]:
# Run CRF
# One run with the l1/l2 chosen in the options cell, given both the train and
# dev data; wall-clock time is printed afterwards.
start_time = time.time()
crf_result = get_crf_results(
    train_features, train_tags,
    dev_features, dev_tags,
    num_iters, l1, l2, eval_tags,
    file_name=file_name, save=True)
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

Adding data...
Training model...
Done!
--- 39.9441611767 seconds ---


In [29]:
# Print result: the scores returned by the CRF run for each evaluated tag.
print_result(crf_result)

P: (0.9230769230769231, 0.4044943820224719, 0.5625)


Get model

In [30]:
# Get model from file: load a tagger using the same `file_name` stem that was
# passed to the CRF run (presumably the model that run saved — confirm).
tagger = get_tagger(file_name)

In [31]:
# For debug: print the tagger's transition weights and its top positive and
# negative feature weights (see the recorded output).
print_model_info(tagger)

Top likely transitions:
P      -> P       0.041363
None   -> None    -0.097811
P      -> None    -7.264975
None   -> P       -8.233716

Top positive:
4.927603 None   word[-1]:PARTICIPANTS
2.855721 P      word[-2]:PARTICIPANTS
2.396066 P      word[1]:INTERVENTIONS
2.058474 P      word[1]:Group
1.931018 P      word[-3]:NHS
1.811934 None   word[-2]:controlled
1.611998 P      word[1]:INTERVENTION
1.556044 None   word[0]:INTERVENTION
1.485613 P      word[-3]:PARTICIPANTS
1.418782 None   word[0]:INTERVENTIONS
1.407665 P      word[-3]:newspapers
1.369649 None   word[-3]:followed
1.342317 P      word[1]:Training
1.341184 None   word[-1]:randomisation.
1.326098 P      word[3]:clusters
1.316158 P      word[1]:Expectant
1.316158 P      word[-2]:incomplete
1.295212 None   chunk[-2]:PRT
1.272041 None   word[2]:closure
1.271684 P      word[-2]:Oxford

Top negative:
-0.897128 None   word[-1]:clinics.
-0.928468 None   word[-2]:Kingdom
-0.938274 None   word[3]:1
-0.944144 None   word[3]:30
-0.946226 No

Predict tags

In [32]:
def _predict_and_report(split_name, features, gold_tags):
    """Predict tags for one split, score them, and print the scores.

    Uses the notebook-level `tagger` and `eval_tags`.

    Args:
        split_name: Label printed before the scores ('dev', 'train', 'test').
        features: Feature sequences for the split, as passed to predict_tags.
        gold_tags: Reference tags the predictions are evaluated against.

    Returns:
        A (predicted_tags, result) pair: the output of predict_tags and the
        output of evaluate_prediction.
    """
    predicted_tags = predict_tags(tagger, features)
    result = evaluate_prediction(predicted_tags, gold_tags, eval_tags)
    # print(...) with a single argument prints the same text on Python 2 and 3.
    print(split_name + ':')
    print_result(result)
    # Flush so each split's scores appear as soon as they are computed.
    sys.stdout.flush()
    return predicted_tags, result


# Same order as before: dev, then train, then test. The pred_*_tags and
# *_result names are kept because later cells read them.
pred_dev_tags, dev_result = _predict_and_report('dev', dev_features, dev_tags)
pred_train_tags, train_result = _predict_and_report('train', train_features, train_tags)
pred_test_tags, test_result = _predict_and_report('test', test_features, test_tags)

dev:
P: (0.9230769230769231, 0.4044943820224719, 0.5625)
train:
P: (1.0, 1.0, 1.0)
test:
P: (1.0, 0.42011834319526625, 0.5916666666666667)


K-fold evaluation

In [33]:
# Run K-fold
# Cross-validation over the training features only, with the same num_iters,
# l1 and l2 as the single CRF run; the recorded output shows five folds.
kfold_file_name = 'crf_results/{}_kfold'.format(features_name)

start_time = time.time()
kfold_result = get_kfold_results(
    train_features, train_tags,
    num_iters, l1, l2, eval_tags,
    file_name=kfold_file_name, save=True)
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

# Delete the '<kfold_file_name>.model' file once the folds are done
# (presumably a leftover model written during the run — confirm).
os.remove(kfold_file_name + '.model')

On fold 0
Adding data...
Training model...
Done!
On fold 1
Adding data...
Training model...
Done!
On fold 2
Adding data...
Training model...
Done!
On fold 3
Adding data...
Training model...
Done!
On fold 4
Adding data...
Training model...
Done!
--- 143.331276894 seconds ---


In [34]:
# Print all results: per-fold scores followed by their average (see the
# recorded output).
print_result(kfold_result)

Fold 0
P: (0.9361702127659575, 0.20657276995305165, 0.3384615384615385)
Fold 1
P: (0.8087248322147651, 0.9377431906614786, 0.8684684684684686)
Fold 2
P: (0.8820754716981132, 0.9211822660098522, 0.9012048192771083)
Fold 3
P: (0.6019417475728155, 0.23076923076923078, 0.3336322869955157)
Fold 4
P: (0.7298578199052133, 0.5992217898832685, 0.6581196581196581)
Average
P: (0.79175401683137281, 0.57909784945537646, 0.61997735426445777)


Print a sample prediction for an abstract

In [None]:
def print_with_spaces(l, spaces):
    """Print each row of `l` as one line of fixed-width columns.

    Args:
        l: Iterable of rows; each row is a sequence of values (strings in
            this notebook's usage).
        spaces: Minimum column widths, one per column.

    Values and widths are paired with zip, so surplus entries on either side
    are silently ignored. Each value is rendered with the format spec
    '{:<width>}', which pads strings on the right (left-aligned).
    """
    # Build the per-column templates once instead of once per printed cell.
    column_formats = ['{:' + str(width) + '}' for width in spaces]

    for row in l:
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(''.join(fmt.format(value) for fmt, value in zip(column_formats, row)))

In [None]:
# Index of the dev abstract to display. The dev split holds 27 abstracts
# (listing lines 95-121), so the previous value of 300 was out of range.
i = 0
assert 0 <= i < len(dev_tokens), 'i must index an abstract in the dev split'
# Columns: token, gold tag, predicted tag.
print_with_spaces(zip(dev_tokens[i], dev_tags[i], pred_dev_tags[i]), [25, 5, 5])

Analyze intervals

In [35]:
# Compare predicted and gold `tag` intervals on the dev split; the recorded
# output reports interval-level and token-level overlap categories.
compare_tags(pred_dev_tags, dev_tags, tag)

There are 18 predicted intervals:
Number of type Identical      : 15
Number of type Subinterval    : 2
Number of type Superinterval  : 1
Number of type Overlapping    : 0
Number of type Non-overlapping: 0

There are 312 predicted tokens:
Number of type Identical      : 210
Number of type Subinterval    : 64
Number of type Superinterval  : 38
Number of type Overlapping    : 0
Number of type Non-overlapping: 0

There are 21 gold intervals:
Number of type Identical      : 15
Number of type Subinterval    : 1
Number of type Superinterval  : 2
Number of type Overlapping    : 0
Number of type Non-overlapping: 3

There are 712 gold tokens:
Number of type Identical      : 210
Number of type Subinterval    : 14
Number of type Superinterval  : 420
Number of type Overlapping    : 0
Number of type Non-overlapping: 68



Restrict evaluation to noun phrases

In [36]:
def _evaluate_filtered(split_name, predicted_tags, gold_tags, genia_tags):
    """Score one split after passing both tag sets through filter_phrase.

    Predicted and gold tags are each filtered with the split's GENIA tags
    before evaluation (per this section's heading, this restricts scoring to
    noun phrases — filter_phrase itself is defined in crf_support). Uses the
    notebook-level `eval_tags`.

    Args:
        split_name: Label printed before the scores ('dev', 'train', 'test').
        predicted_tags: Model predictions for the split.
        gold_tags: Reference tags for the split.
        genia_tags: GENIA tagger output for the split.

    Returns:
        The output of evaluate_prediction on the filtered tags.
    """
    result = evaluate_prediction(filter_phrase(predicted_tags, genia_tags),
                                 filter_phrase(gold_tags, genia_tags),
                                 eval_tags)
    # print(...) with a single argument prints the same text on Python 2 and 3.
    print(split_name + ':')
    print_result(result)
    # Flush so each split's scores appear as soon as they are computed.
    sys.stdout.flush()
    return result


# Same order as before: dev, then train, then test. As in the original cell,
# these assignments overwrite the unfiltered *_result values.
dev_result = _evaluate_filtered('dev', pred_dev_tags, dev_tags, dev_genia_tags)
train_result = _evaluate_filtered('train', pred_train_tags, train_tags, train_genia_tags)
test_result = _evaluate_filtered('test', pred_test_tags, test_tags, test_genia_tags)

dev:
P: (0.9336734693877551, 0.3969631236442516, 0.5570776255707763)
train:
P: (1.0, 1.0, 1.0)
test:
P: (1.0, 0.4474885844748858, 0.6182965299684543)
