## CRF Experiments on Participants

In [None]:
from crf import *
from crf_support import get_all_data

import time

from features_generator import abstracts2features, get_genia_tags, sanity_check

from gensim.models import Word2Vec

In [None]:
# Label this notebook works on; it is passed to get_all_data and sort_by_metric.
# NOTE(review): 'P' presumably stands for "Participants" (see notebook title) — confirm.
tag = 'P'
# List of tags handed to the evaluation helpers; here only the single tag above.
eval_tags = [tag]

Get data

In [None]:
# Get train data: tokens and their labels for `tag`, plus the GENIA tags of the
# same split (both are consumed by abstracts2features further down).
# NOTE(review): assumes the two helpers return sequences aligned abstract by abstract — confirm.
train_tokens, train_tags = get_all_data('train', tag)
train_genia_tags = get_genia_tags('train')

In [None]:
# Get dev data: tokens and their labels for `tag`, plus the GENIA tags of the
# same split. The dev split is used for the quick run, evaluation and grid search.
dev_tokens, dev_tags = get_all_data('dev', tag)
dev_genia_tags = get_genia_tags('dev')

In [None]:
# Get test data: tokens and their labels for `tag`, plus the GENIA tags of the
# same split. Only used by the test prediction/evaluation cells.
test_tokens, test_tags = get_all_data('test', tag)
test_genia_tags = get_genia_tags('test')

Compute features

In [None]:
# Feature options.
# Full feature set, kept for reference: none of the visible cells pass it on.
# Adjacent literals are joined by the parser, so every piece except the last
# carries its own trailing space to keep the options space-separated.
big_options_string = (
    'left_neighbors=1 right_neighbors=0 inside_paren pos chunk iob named_entity '
    'inside_paren_neighbors pos_neighbors chunk_neighbors iob_neighbors named_entity_neighbors '
    'chunk_end chunk_end_neighbors same_chunk_neighbors '
    'one_hot one_hot_neighbors w2v_model=pubmed w2v w2v_neighbors w2v_size=10 cosine_simil cosine_simil_neighbors '
    'isupper isupper_neighbors istitle istitle_neighbors'
)

# Option set actually handed to abstracts2features for every split.
options_string = 'left_neighbors=0 right_neighbors=0 one_hot'

# No word2vec model is supplied to abstracts2features.
w2v = None

In [None]:
# Compute features for train from the tokens and GENIA tags, using the feature
# set selected in options_string (w2v is None, so no embedding model is passed).
train_features = abstracts2features(train_tokens, train_genia_tags, w2v=w2v, options_string=options_string)

In [None]:
# Compute features for dev with the same options as train so the feature
# sets are comparable between the two splits.
dev_features = abstracts2features(dev_tokens, dev_genia_tags, w2v=w2v, options_string=options_string)

In [None]:
# Compute features for test with the same options as train so the feature
# sets are comparable between the two splits.
test_features = abstracts2features(test_tokens, test_genia_tags, w2v=w2v, options_string=options_string)

In [None]:
# For debug: run the project's sanity check on the train features.
# NOTE(review): what exactly is checked is defined in features_generator — confirm there.
sanity_check(train_features)

Quick run :)

In [None]:
# Set training options shared by the quick run, model training and k-fold cells.
# NOTE(review): num_iters is presumably the training iteration cap — confirm in crf.py.
num_iters = 100
# NOTE(review): l1 / l2 are presumably the L1 / L2 regularization coefficients — confirm in crf.py.
l1 = 0.05
l2 = 0
# Name passed as file_name to the CRF helpers and later to get_tagger.
file_name = 'b'

In [None]:
# Run CRF: a single call that takes the train and dev features/tags and returns
# a result object for print_result (presumably train on train, evaluate on dev).
# perf_counter() is a monotonic, high-resolution clock, so the reported duration
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
crf_result = get_crf_results(train_features, train_tags, dev_features, dev_tags, num_iters, l1, l2, eval_tags,
                             file_name=file_name)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# Print the result of the quick run above.
print_result(crf_result)

Train model

In [None]:
# Train model on the train split with the options set above.
# NOTE(review): train_crf presumably persists the model under file_name, since
# the tagger is later loaded with get_tagger(file_name) — confirm in crf.py.
# perf_counter() is a monotonic, high-resolution clock, so the reported duration
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
model = train_crf(train_features, train_tags, num_iters, l1, l2, file_name)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# Get model from file: load the tagger identified by file_name, the same name
# that was passed to train_crf.
tagger = get_tagger(file_name)

In [None]:
# For debug: print information about the loaded tagger.
# NOTE(review): the exact output is defined by print_model_info in crf.py.
print_model_info(tagger)

Predict tags

In [None]:
# Predict dev tags with the loaded tagger.
pred_dev_tags = predict_tags(tagger, dev_features)

In [None]:
# Evaluate dev tags: compare predictions with the gold dev tags for the tags
# listed in eval_tags, then print the result.
dev_result = evaluate_prediction(pred_dev_tags, dev_tags, eval_tags)
print_result(dev_result)

In [None]:
# Predict train tags with the loaded tagger (predictions on the data the model
# was trained on, useful for comparison with the dev scores).
pred_train_tags = predict_tags(tagger, train_features)

In [None]:
# Evaluate train tags: compare predictions with the gold train tags for the
# tags listed in eval_tags, then print the result.
train_result = evaluate_prediction(pred_train_tags, train_tags, eval_tags)
print_result(train_result)

In [None]:
# Predict test tags with the loaded tagger.
pred_test_tags = predict_tags(tagger, test_features)

In [None]:
# Evaluate test tags: compare predictions with the gold test tags for the tags
# listed in eval_tags, then print the result.
test_result = evaluate_prediction(pred_test_tags, test_tags, eval_tags)
print_result(test_result)

K-fold evaluation

In [None]:
# Run K-fold evaluation on the train split only, using a separate file name
# from the single-model cells above.
# NOTE(review): the number of folds is decided inside get_kfold_results — confirm in crf.py.
kfold_file_name = 'kfold'

# perf_counter() is a monotonic, high-resolution clock, so the reported duration
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
kfold_result = get_kfold_results(train_features, train_tags, num_iters, l1, l2, eval_tags, file_name=kfold_file_name)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# Print all results returned by the K-fold run.
print_result(kfold_result)

In [None]:
# Print just the average scores: reduce the K-fold result with average_scores
# before printing.
print_result(average_scores(kfold_result))

Grid search

In [None]:
# Run grid search over the candidate l1 / l2 values below, given the train and
# dev features/tags.
# NOTE(review): presumably every (l1, l2) pair is trained on train and scored on
# dev — confirm in crf.py.
grid_file_name = 'grid_search'
l1_list = [0, 0.001, 0.01, 0.1, 1]
l2_list = [0, 0.001, 0.01, 0.1, 1]

# perf_counter() is a monotonic, high-resolution clock, so the reported duration
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
# The open parenthesis already continues the line; no backslash is needed.
grid_search_result = grid_search(train_features, train_tags, dev_features, dev_tags,
                                 num_iters, l1_list, l2_list, eval_tags, file_name=grid_file_name)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# Sort result: order the grid-search results by the 'f1' metric of `tag`, then
# print them.
# NOTE(review): sort direction (best first or last) is decided by sort_by_metric — confirm in crf.py.
sorted_result = sort_by_metric(grid_search_result, tag, metric='f1')
print_result(sorted_result)