## CRF: What's Wrong with Participants?

In [40]:
from crf import *
from crf_support import get_all_data

# `sys` is imported explicitly because later cells call sys.stdout.flush();
# the original relative order of the imports is kept on purpose so that the
# star import above cannot shadow names differently than before.
import os
import pprint
import sys
import time

from features_generator import abstracts2features, get_genia_tags, sanity_check

from gensim.models import Word2Vec

In [2]:
# Label to load data for and to evaluate on.
# NOTE(review): 'P' presumably stands for "Participants" (per the notebook title) — confirm.
tag = 'P'
# List form of the tag; this is what the training/evaluation helpers receive as eval_tags.
eval_tags = [tag]

Get data

In [3]:
# Get train data
# get_all_data is called with the split name and the tag and yields a (tokens, tags) pair.
train_tokens, train_tags = get_all_data('train', tag)
# GENIA tags for the same split; they are passed to abstracts2features when features are built.
train_genia_tags = get_genia_tags('train')

In [4]:
# Get dev data
# get_all_data is called with the split name and the tag and yields a (tokens, tags) pair.
dev_tokens, dev_tags = get_all_data('dev', tag)
# GENIA tags for the same split; they are passed to abstracts2features when features are built.
dev_genia_tags = get_genia_tags('dev')

In [5]:
# Get test data
# get_all_data is called with the split name and the tag and yields a (tokens, tags) pair.
test_tokens, test_tags = get_all_data('test', tag)
# GENIA tags for the same split; they are passed to abstracts2features when features are built.
test_genia_tags = get_genia_tags('test')

Compute features

In [6]:
# Set options
# Larger option set kept alongside the active one. The visible cells only pass
# `options_string` to abstracts2features, so this string is not used by them.
# NOTE(review): it names w2v options while `w2v` below is None — confirm a model
# is loaded before switching to it.
big_options_string = 'left_neighbors=3 right_neighbors=3 inside_paren pos chunk iob named_entity \
inside_paren_neighbors pos_neighbors chunk_neighbors iob_neighbors named_entity_neighbors \
chunk_end chunk_end_neighbors same_chunk_neighbors \
one_hot one_hot_neighbors w2v_model=pubmed w2v w2v_neighbors w2v_size=10 cosine_simil cosine_simil_neighbors \
isupper isupper_neighbors istitle istitle_neighbors'

# Options actually passed to abstracts2features in the feature cells.
options_string = 'left_neighbors=1 right_neighbors=1 one_hot one_hot_neighbors'

# No word2vec model is supplied; this value is forwarded as the w2v= argument.
w2v=None

# Label inserted into the result file names ('crf_results/{}', '..._grid', '..._kfold').
features_name = 'P1'

In [7]:
# Compute features for train
# Built from the train tokens and GENIA tags, using the w2v value and options_string set in the options cell.
train_features = abstracts2features(train_tokens, train_genia_tags, w2v=w2v, options_string=options_string)

3499: ['Pulsed', 'azithromycin', 'treatment']


In [8]:
# Compute features for dev
# Built from the dev tokens and GENIA tags, using the w2v value and options_string set in the options cell.
dev_features = abstracts2features(dev_tokens, dev_genia_tags, w2v=w2v, options_string=options_string)

999: ['Serum', 'bactericidal', 'activities']


In [9]:
# Compute features for test
# Built from the test tokens and GENIA tags, using the w2v value and options_string set in the options cell.
test_features = abstracts2features(test_tokens, test_genia_tags, w2v=w2v, options_string=options_string)

500: ['Efficacy', 'and', 'cost-effectiveness']


In [10]:
# For debug
# Prints summary counts for the train features (abstracts, tokens, features per token).
sanity_check(train_features)

Number of abstracts: 3500
Number of tokens:    927022
Number of features:  2774066 

Avg tokens per abstract: 264
Avg features per token:  2 

Max features per token:  3
Min features per token:  2


Grid search

In [11]:
# Grid search: train on the train split and score on the dev split for every
# (l1, l2) pair drawn from the two candidate lists below.
num_iters = 100
l1_list = [0, 0.0001, 0.001, 0.01, 0.1, 1]
l2_list = [0, 0.0001, 0.001, 0.01, 0.1, 1]
grid_file_name = 'crf_results/{}_grid'.format(features_name)

# Time the whole search; the elapsed wall-clock seconds are printed afterwards.
start_time = time.time()
grid_search_result = grid_search(
    train_features, train_tags,
    dev_features, dev_tags,
    num_iters, l1_list, l2_list, eval_tags,
    file_name=grid_file_name, save=True
)
print("--- %s seconds ---" % (time.time() - start_time))

# Delete the '<grid_file_name>.model' file once the search is finished
# (presumably written by grid_search because save=True — confirm).
os.remove(grid_file_name + '.model')

Adding data...
Training model...
Done!
L1: 0, L2: 0, scores: {'P': (0.7235947258848022, 0.4717672608813682, 0.5711546888694128)}
Adding data...
Training model...
Done!
L1: 0, L2: 0.0001, scores: {'P': (0.6936624085328041, 0.5061080445208579, 0.5852254891702418)}
Adding data...
Training model...
Done!
L1: 0, L2: 0.001, scores: {'P': (0.7200886791103482, 0.4555696317075378, 0.5580712207288346)}
Adding data...
Training model...
Done!
L1: 0, L2: 0.01, scores: {'P': (0.718247560205013, 0.4628540403583386, 0.5629385059843169)}
Adding data...
Training model...
Done!
L1: 0, L2: 0.1, scores: {'P': (0.6990767641552069, 0.4899104153470274, 0.5760953419701524)}
Adding data...
Training model...
Done!
L1: 0, L2: 1, scores: {'P': (0.7190753007976206, 0.4813139082435979, 0.5766478751084129)}
Adding data...
Training model...
Done!
L1: 0.0001, L2: 0, scores: {'P': (0.5994971264367817, 0.45308116912496604, 0.5161057568417255)}
Adding data...
Training model...
Done!
L1: 0.0001, L2: 0.0001, scores: {'P': (

In [12]:
# Sort result
# Order the grid-search results for `tag` by the 'f1' metric, then print them.
sorted_result = sort_by_metric(grid_search_result, tag, metric='f1')
print_result(sorted_result)

L1: 0, L2: 0.0001
P: (0.6936624085328041, 0.5061080445208579, 0.5852254891702418)
L1: 1, L2: 0.01
P: (0.746385029770343, 0.4764274726269116, 0.5816072908036454)
L1: 1, L2: 0.0001
P: (0.7503596087456847, 0.47203872952673964, 0.579514525356885)
L1: 0.1, L2: 0.1
P: (0.7230842197980893, 0.4828522305673695, 0.5790401779658718)
L1: 1, L2: 0.1
P: (0.741732171942497, 0.4738937652701113, 0.5783066007785108)
L1: 1, L2: 0.001
P: (0.7524702121476315, 0.4686001266853678, 0.5775386159594045)
L1: 0, L2: 1
P: (0.7190753007976206, 0.4813139082435979, 0.5766478751084129)
L1: 0, L2: 0.1
P: (0.6990767641552069, 0.4899104153470274, 0.5760953419701524)
L1: 0.01, L2: 0.1
P: (0.7091561938958707, 0.48253551714776943, 0.574297945666514)
L1: 1, L2: 0
P: (0.7492160723401152, 0.46484481042439596, 0.5737260924193774)
L1: 0.0001, L2: 1
P: (0.7388038345972242, 0.46724278345851056, 0.5724501108647451)
L1: 0.01, L2: 1
P: (0.7425478166726813, 0.46547823726359605, 0.5722390633256389)
L1: 0, L2: 0
P: (0.7235947258848022, 

Run CRF

In [19]:
# Set options
num_iters = 100
# NOTE(review): in the sorted grid-search output printed earlier, (L1=1, L2=0.01)
# is the second entry; (L1=0, L2=0.0001) is listed first. Confirm this choice is intentional.
l1 = 1
l2 = 0.01
# Base file name handed to get_crf_results (with save=True) and later to get_tagger.
file_name = 'crf_results/{}'.format(features_name)

In [20]:
# Train a single CRF with the chosen num_iters / l1 / l2 and score it on the dev split.
# The elapsed wall-clock seconds are printed once the call returns.
start_time = time.time()
crf_result = get_crf_results(
    train_features, train_tags,
    dev_features, dev_tags,
    num_iters, l1, l2, eval_tags,
    file_name=file_name, save=True
)
print("--- %s seconds ---" % (time.time() - start_time))

Adding data...
Training model...
Done!
--- 40.5949029922 seconds ---


In [21]:
# Print result
# Shows the per-tag score tuple returned by get_crf_results.
# NOTE(review): the three numbers are presumably (precision, recall, f1) — confirm in crf.py.
print_result(crf_result)

P: (0.746385029770343, 0.4764274726269116, 0.5816072908036454)


Get model

In [22]:
# Get model from file
# Uses the same file_name that was given to get_crf_results with save=True.
tagger = get_tagger(file_name)

In [23]:
# For debug
# Prints the transition weights and the top positive/negative state features of the tagger.
print_model_info(tagger)

Top likely transitions:
P      -> P       1.919151
None   -> None    1.913085
None   -> P       -1.461767
P      -> None    -3.301887

Top positive:
8.067807 None   word[0]:BACKGROUND
5.761039 None   word[0]:PARTICIPANTS
5.751301 None   word[0]:DESIGN
5.359430 None   word[1]:Eleven
5.032509 None   word[0]:PURPOSE
4.428440 None   word[0]:INTERVENTIONS
4.086487 None   word[1]:Nine
4.025399 None   word[0]:Both
4.017348 P      word[0]:nonsmokers.
4.013344 P      word[0]:alcoholics
3.990093 None   word[0]:However
3.986556 P      word[-1]:PBSCT.
3.864085 P      word[-1]:rowers
3.673883 None   word[0]:SUBJECTS
3.661661 None   word[0]:METHODS
3.656098 None   word[1]:Fifty-four
3.602765 None   word[0]:Using
3.554243 None   word[0]:Overall
3.546100 None   word[-1]:hypertension.
3.532938 None   word[1]:Forty-six

Top negative:
-2.143282 None   word[-1]:brucellosis
-2.159246 None   word[-1]:PARTICIPANTS
-2.182490 None   word[0]:students
-2.209057 None   word[-1]:learning.
-2.211110 P      word[0]:

Predict tags

In [24]:
def _predict_and_report(split_name, features, gold_tags):
    """Tag one split with the loaded tagger, score it and print the scores.

    Reads the notebook-level `tagger` and `eval_tags`.

    Args:
        split_name: label printed before the scores ('dev', 'train' or 'test').
        features: features of the split, as produced by abstracts2features.
        gold_tags: reference tags of the split, as returned by get_all_data.

    Returns:
        (predicted_tags, result): the output of predict_tags and the output of
        evaluate_prediction for this split.
    """
    predicted_tags = predict_tags(tagger, features)
    result = evaluate_prediction(predicted_tags, gold_tags, eval_tags)
    # Parenthesised single-argument print behaves the same as a Python 2
    # print statement and as the Python 3 print function.
    print(split_name + ':')
    print_result(result)
    # Flush so each split's scores show up as soon as they are computed.
    sys.stdout.flush()
    return predicted_tags, result


# Same order and same result names as before: dev, train, test.
pred_dev_tags, dev_result = _predict_and_report('dev', dev_features, dev_tags)
pred_train_tags, train_result = _predict_and_report('train', train_features, train_tags)
pred_test_tags, test_result = _predict_and_report('test', test_features, test_tags)

dev:
P: (0.746385029770343, 0.4764274726269116, 0.5816072908036454)
train:
P: (0.8331695473726805, 0.5432444656632409, 0.6576723882242891)
test:
P: (0.7552387740555951, 0.4816801527411583, 0.5882091706450538)


K-fold evaluation

In [25]:
# K-fold evaluation on the train split only, reusing num_iters / l1 / l2 from the CRF options cell.
kfold_file_name = 'crf_results/{}_kfold'.format(features_name)

# Time the whole run; the elapsed wall-clock seconds are printed afterwards.
start_time = time.time()
kfold_result = get_kfold_results(
    train_features, train_tags,
    num_iters, l1, l2, eval_tags,
    file_name=kfold_file_name, save=True
)
print("--- %s seconds ---" % (time.time() - start_time))

# Delete the '<kfold_file_name>.model' file once the folds are done
# (presumably written by get_kfold_results because save=True — confirm).
os.remove(kfold_file_name + '.model')

On fold 0
Adding data...
Training model...
Done!
On fold 1
Adding data...
Training model...
Done!
On fold 2
Adding data...
Training model...
Done!
On fold 3
Adding data...
Training model...
Done!
On fold 4
Adding data...
Training model...
Done!
--- 162.671914816 seconds ---


In [26]:
# Print all results
# Prints the scores of every fold followed by their average.
print_result(kfold_result)

Fold 0
P: (0.743448275862069, 0.4230492510955589, 0.5392471549460169)
Fold 1
P: (0.7338930105427567, 0.4609724691887915, 0.5662636990170602)
Fold 2
P: (0.712115031613298, 0.4341311781162574, 0.5394152408172724)
Fold 3
P: (0.7171599922615592, 0.4529569892473118, 0.5552310342245188)
Fold 4
P: (0.7343669781291714, 0.46014282956958114, 0.565778023890515)
Average
P: (0.72819665768177089, 0.44625054344350013, 0.5531870305790767)


In [41]:
def print_with_spaces(l, spaces):
    """Print every row of `l` as left-aligned, fixed-width columns.

    Args:
        l: iterable of rows; each row is a sequence of values, e.g. the
            (token, gold tag, predicted tag) tuples produced by zip().
        spaces: column widths, one per column. A value longer than its width
            is printed in full (no truncation).

    Each value is converted with str() before padding, so non-string entries
    such as None are printed instead of failing on the format spec, and
    numbers are left-aligned like the other columns. Columns are paired with
    widths via zip(), so extra values or extra widths are ignored.
    """
    for row in l:
        cells = ['{!s:<{width}}'.format(value, width=width)
                 for value, width in zip(row, spaces)]
        # Parenthesised single-argument print works both as a Python 2 print
        # statement and as the Python 3 print function.
        print(''.join(cells))

In [55]:
# Index of the dev abstract to inspect.
i = 2
# One row per token with columns: token, gold tag, predicted tag (widths 25, 5, 5).
print_with_spaces(zip(dev_tokens[i], dev_tags[i], pred_dev_tags[i]), [25, 5, 5])

Netilmicin               None None 
in                       P    None 
the                      P    None 
neonate                  P    None 
:                        None None 
pharmacokinetic          None None 
analysis                 None None 
and                      None None 
influence                None None 
of                       None None 
parenteral               None None 
nutrition                None None 
.                        None None 
OBJECTIVE                None None 
The                      None None 
aim                      None None 
of                       None None 
this                     None None 
study                    None None 
was                      None None 
to                       None None 
investigate              None None 
the                      None None 
impact                   None None 
of                       None None 
parenteral               None None 
nutrition                None None 
on                       Non