## Linear-Chain CRF

pycrfsuite version 
source: https://github.com/bwallace/Deep-PICO/blob/3152ab3690cad1b6e369be8a8aac27393811341c/crf.py

In [None]:
import sys, time, pickle
from collections import Counter
import numpy as np

from preprocess_data import get_all_data_train, get_all_data_dev, get_all_data_test
from features_generator import abstracts2features, get_genia_tags, sanity_check

from gensim.models import Word2Vec

import pycrfsuite
from sklearn.cross_validation import KFold

### Train CRF
_INPUT_:
- features_list: list of list of features dictionaries
- tags_list: list of list of tags
- num_iters: number of iterations
- l1, l2: regularization parameters
- file_name: file name to write model out; '.model' added automatically

_OUTPUT_:
- The `pycrfsuite.Trainer` object used for training (the trained model itself is written to file_name + '.model')

In [None]:
def train_crf(features_list, tags_list, num_iters, l1, l2, file_name=''):
    """Train a linear-chain CRF with pycrfsuite and write the model to disk.

    Args:
        features_list: list of sequences, each a list of feature dictionaries
            (one feature dictionary per tag of the matching tag sequence).
        tags_list: list of tag sequences, aligned with features_list.
        num_iters: value passed to the trainer as 'max_iterations'.
        l1: coefficient for the L1 penalty ('c1').
        l2: coefficient for the L2 penalty ('c2').
        file_name: output path without extension; '.model' is appended.

    Returns:
        The pycrfsuite.Trainer instance that was used for training.

    Raises:
        ValueError: if features_list and tags_list differ in length, or if a
            feature sequence and its tag sequence differ in length.
    """
    # Validate the outer lengths first so bad input fails before a trainer
    # is built or anything is printed.
    if len(features_list) != len(tags_list):
        raise ValueError('features_list has length {}, while tags_list has length {}'
                         .format(len(features_list), len(tags_list)))

    # Set up the model parameters
    trainer = pycrfsuite.Trainer(verbose=False)
    trainer.set_params({
        'c1': l1,  # Coefficient for L1 penalty
        'c2': l2,  # Coefficient for L2 penalty
        'max_iterations': num_iters,

        # Include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Adding data...')
    sys.stdout.flush()

    for i, (features, tags) in enumerate(zip(features_list, tags_list)):
        if len(features) != len(tags):
            raise ValueError('features_list[{}] has length {}, while tags_list[{}] has length {}'
                             .format(i, len(features), i, len(tags)))

        trainer.append(features, tags)

    print('Training model...')
    sys.stdout.flush()

    trainer.train(file_name + '.model')
    print('Done!')

    return trainer

### Get tagger
Get tagger which opens file_name ('.model' added automatically)

In [None]:
def get_tagger(file_name):
    """Return a pycrfsuite.Tagger with the model at file_name + '.model' opened."""
    crf_tagger = pycrfsuite.Tagger()
    model_path = file_name + '.model'
    crf_tagger.open(model_path)
    return crf_tagger

### Print model info
_INPUT_:
- tagger: pycrfsuite.Tagger class (need to open model with it first)
- num_items: number of top positive/negative state features

In [None]:
def print_model_info(tagger, num_items=20):
    """Print the transition weights and the strongest state features of a model.

    Args:
        tagger: object whose info() result exposes `transitions` and
            `state_features` mappings (a pycrfsuite.Tagger with a model
            already opened).
        num_items: how many of the highest- and lowest-weighted state
            features to print; zero or a negative value prints none.
    """
    # A quick peek at the model
    info = tagger.info()

    def print_transitions(trans_features):
        for (label_from, label_to), weight in trans_features:
            print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

    def print_state_features(state_features):
        for (attr, label), weight in state_features:
            print("%0.6f %-6s %s" % (weight, label, attr))

    # All transitions are listed, ordered from highest to lowest weight.
    print("Top likely transitions:")
    print_transitions(Counter(info.transitions).most_common())

    # Rank the state features once and slice both ends from the same list.
    ranked = Counter(info.state_features).most_common()
    count = max(num_items, 0)

    print("\nTop positive:")
    print_state_features(ranked[:count])

    print("\nTop negative:")
    # ranked[-0:] would be the whole list, so guard the zero case explicitly.
    print_state_features(ranked[-count:] if count else [])

### Predict tags
_INPUT_:
- tagger: pycrfsuite.Tagger class (need to open model with it first)
- features_list: list of list of features dictionaries

_OUTPUT_:
- List of list of predicted tags

In [None]:
def predict_tags(tagger, features_list):
    """Tag every feature sequence with the given tagger.

    Args:
        tagger: object with a tag(features) method (a pycrfsuite.Tagger with
            a model already opened).
        features_list: list of list of features dictionaries.

    Returns:
        List with one predicted tag sequence per entry of features_list,
        in the same order.
    """
    return [tagger.tag(sequence) for sequence in features_list]

### Count tags
_INPUT_:
- pred_tags_list: list of list of predicted tags
- gold_tags_list: list of list of gold tags
- tag_name: tag name to count (e.g. 'P')

_OUTPUT_:
- Number of tags with tag name in predicted tags, gold tags, and intersection of both, respectively

In [None]:
DEBUG = False

def count_tags(pred_tags_list, gold_tags_list, tag_name):
    """Count occurrences of tag_name in predictions, gold tags and both.

    Args:
        pred_tags_list: list of list of predicted tags.
        gold_tags_list: list of list of gold tags, aligned with pred_tags_list.
        tag_name: the tag to count (e.g. 'P').

    Returns:
        Tuple (num_pred, num_gold, num_both): positions predicted as
        tag_name, positions whose gold tag is tag_name, and positions where
        both hold.

    Raises:
        ValueError: if the two lists, or any pair of aligned sequences,
            differ in length.
    """
    if len(pred_tags_list) != len(gold_tags_list):
        raise ValueError('pred_tags_list has length {}, while gold_tags_list has length {}'
                         .format(len(pred_tags_list), len(gold_tags_list)))

    pred_total = 0
    gold_total = 0
    both_total = 0

    for seq_idx, (pred_seq, gold_seq) in enumerate(zip(pred_tags_list, gold_tags_list)):
        if len(pred_seq) != len(gold_seq):
            raise ValueError('pred_tags_list[{}] has length {}, while gold_tags_list[{}] has length {}'
                             .format(seq_idx, len(pred_seq), seq_idx, len(gold_seq)))

        for pred_tag, gold_tag in zip(pred_seq, gold_seq):
            in_gold = gold_tag == tag_name
            in_pred = pred_tag == tag_name

            if in_gold:
                gold_total += 1
            if in_pred:
                pred_total += 1
            if in_gold and in_pred:
                both_total += 1

    return pred_total, gold_total, both_total

if DEBUG:
    # Small hand-checked example: 4 predicted 'P', 3 gold 'P', 2 in common,
    # so this prints (4, 3, 2).
    gold_tags_list = [['None', 'P', 'None'], ['P', 'P', 'None', 'None']]
    pred_tags_list = [['P', 'P', 'None'], ['P', 'None', 'None', 'P']]

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(count_tags(pred_tags_list, gold_tags_list, 'P'))

### Metrics
_INPUT_:
- Number of predicted tags, number of gold tags, number of tags predicted correctly

_OUTPUT_:
- Precision, recall, f1 scores

In [None]:
DEBUG = False

def metrics(num_pred_tags, num_gold_tags, num_both_tags):
    """Compute precision, recall and F1 from tag counts.

    Args:
        num_pred_tags: number of predicted tags.
        num_gold_tags: number of gold tags.
        num_both_tags: number of tags that are both predicted and gold.

    Returns:
        Tuple (precision, recall, f1). A score whose denominator is zero is
        returned as 0, and f1 is 0 whenever precision or recall is 0.

    Raises:
        ValueError: if num_both_tags exceeds num_pred_tags or num_gold_tags.
    """
    # The overlap can never be larger than either of the two totals.
    for label, total in (('num_pred_tags', num_pred_tags),
                         ('num_gold_tags', num_gold_tags)):
        if num_both_tags > total:
            raise ValueError('num_both_tags = {} is greater than {} = {}'
                             .format(num_both_tags, label, total))

    precision = float(num_both_tags)/num_pred_tags if num_pred_tags != 0 else 0
    recall = float(num_both_tags)/num_gold_tags if num_gold_tags != 0 else 0

    if precision == 0 or recall == 0:
        f1 = 0
    else:
        # Harmonic mean of precision and recall
        f1 = 2/(1/precision + 1/recall)

    return precision, recall, f1

if DEBUG:
    # 3 predicted, 4 gold, 2 in common: precision 2/3, recall 1/2.
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(metrics(3,4,2))

### Evaluate prediction
_INPUT_:
- pred_tags_list: list of list of predicted tags
- gold_tags_list: list of list of gold tags
- eval_tags: list of tags to evaluate on

_OUTPUT_:  
- Dictionary of the form {tag: (precision, recall, f1), ...} with one entry for each tag in eval_tags, plus the key 'Overall' holding the precision, recall and f1 computed from the counts summed over all tags in eval_tags.

In [None]:
def evaluate_prediction(pred_tags_list, gold_tags_list, eval_tags):
    """Score predictions per tag and over all evaluated tags together.

    Args:
        pred_tags_list: list of list of predicted tags.
        gold_tags_list: list of list of gold tags.
        eval_tags: list of tags to evaluate on.

    Returns:
        Dictionary {tag: (precision, recall, f1)} for each tag in eval_tags,
        plus the key 'Overall' computed from the counts summed over all tags.
    """
    scores = {}

    # Running (predicted, gold, both) counts summed over every evaluated tag
    totals = [0, 0, 0]

    for tag in eval_tags:
        counts = count_tags(pred_tags_list, gold_tags_list, tag)
        scores[tag] = metrics(*counts)
        totals = [running + count for running, count in zip(totals, counts)]

    # Overall metrics come from the summed counts, not from averaged scores
    scores['Overall'] = metrics(*totals)

    return scores

### Get k-fold results
_INPUT_:
- features_list: list of list of features dictionaries
- tags_list: list of list of tags
- num_iters: number of iterations
- l1, l2: regularization parameters
- eval_tags: list of tags we are evaluating on, e.g. ['P']
- file_name: file name to write model out; '.model' added automatically
- save: whether to save result to file, named (file_name + '.result')
- n_folds: number of folds

_OUTPUT_:
- List of dictionaries, one per fold, each as computed by evaluate_prediction

In [None]:
def get_kfold_results(features_list, tags_list, num_iters, l1, l2, eval_tags, file_name='', save=False, n_folds=5):
    """Train and evaluate a CRF on each fold of a shuffled k-fold split.

    Args:
        features_list: list of list of features dictionaries.
        tags_list: list of list of tags, aligned with features_list.
        num_iters: number of training iterations.
        l1, l2: regularization parameters.
        eval_tags: list of tags to evaluate on.
        file_name: model path without extension; every fold writes to (and
            then reads back from) the same file_name + '.model'.
        save: if true, pickle the list of fold results to
            file_name + '.result'.
        n_folds: number of folds.

    Returns:
        List with one evaluate_prediction dictionary per fold.

    Raises:
        ValueError: if features_list and tags_list differ in length.
    """
    if len(features_list) != len(tags_list):
        raise ValueError('features_list has length {}, while tags_list has length {}'
                         .format(len(features_list), len(tags_list)))

    # Set up the KFold
    # NOTE(review): this is the legacy sklearn.cross_validation.KFold call
    # form (sample count as first argument, n_folds keyword, object iterated
    # directly); it needs the sklearn version the file imports it from.
    num_abstracts = len(tags_list)
    kf = KFold(num_abstracts, random_state=1234, shuffle=True, n_folds=n_folds)

    # Store result of each fold
    fold_result_list = []

    for fold_idx, (train_indices, test_indices) in enumerate(kf):
        print('On fold %s' % fold_idx)

        train_features = [features_list[i] for i in train_indices]
        train_tags = [tags_list[i] for i in train_indices]

        test_features = [features_list[i] for i in test_indices]
        test_tags = [tags_list[i] for i in test_indices]

        # Train model; the trained model is picked up again from disk below,
        # so the returned trainer is not needed here.
        train_crf(train_features, train_tags, num_iters, l1, l2, file_name)

        # Get tagger
        tagger = get_tagger(file_name)

        # Make predictions
        pred_test_tags = predict_tags(tagger, test_features)

        # Compute evaluation metrics
        fold_result_list.append(evaluate_prediction(pred_test_tags, test_tags, eval_tags))

    if save:
        # Binary mode is what pickle expects, and the context manager closes
        # the file even if dumping fails.
        with open(file_name + '.result', 'wb') as f:
            pickle.dump(fold_result_list, f)

    return fold_result_list

### Average scores
Compute average scores from the result returned by get_kfold_results

In [None]:
def average_scores(result):
    """Average the per-fold scores returned by get_kfold_results.

    Args:
        result: list of dictionaries {tag: (precision, recall, f1)}, one per
            fold. The tags are taken from the first fold, so every fold must
            contain them.

    Returns:
        Dictionary {tag: (mean precision, mean recall, mean f1)}; an empty
        dictionary when result contains no folds.

    Raises:
        ValueError: if result is not a list.
    """
    if not isinstance(result, list):
        raise ValueError('result must be of type list')

    # Nothing to average; avoids an IndexError on result[0].
    if not result:
        return {}

    avg_dict = {}

    for tag in result[0]:
        # Positions 0, 1, 2 of each score tuple are precision, recall, f1.
        avg_dict[tag] = tuple(
            np.mean([fold_result[tag][i] for fold_result in result])
            for i in range(3)
        )

    return avg_dict

### Print result
Can print result of either evaluate_prediction, average_scores (a single dictionary) or get_kfold_results (list of dictionaries)

In [None]:
def print_result(result):
    """Print a result dictionary, or a list of per-fold result dictionaries.

    Args:
        result: either a single dictionary (as returned by
            evaluate_prediction or average_scores), printed as one
            'tag: value' line per entry, or a list of such dictionaries (as
            returned by get_kfold_results), printed fold by fold and
            followed by the averaged scores.

    Raises:
        ValueError: if result is neither a dict nor a list.
    """
    if isinstance(result, dict):
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        for tag, value in result.items():
            print('{}: {}'.format(tag, value))
    elif isinstance(result, list):
        for fold_idx, fold_result in enumerate(result):
            print('Fold {}'.format(fold_idx))
            print_result(fold_result)

        # Also print out average
        print('Average')
        print_result(average_scores(result))
    else:
        raise ValueError('result must be of type dict or list')

In [47]:
def grid_search(features_list, tags_list, num_iters, l1_list, l2_list, eval_tags, n_folds=5):
    """Run k-fold evaluation for every (l1, l2) combination.

    Args:
        features_list: list of list of features dictionaries.
        tags_list: list of list of tags.
        num_iters: number of training iterations.
        l1_list: L1 regularization values to try.
        l2_list: L2 regularization values to try.
        eval_tags: list of tags to evaluate on.
        n_folds: number of folds used for each combination.

    Returns:
        Dictionary mapping (l1, l2) to the averaged scores returned by
        average_scores for that combination.
    """
    grid_search_result = {}
    for l1 in l1_list:
        for l2 in l2_list:
            # Run k-fold
            result = get_kfold_results(features_list, tags_list, num_iters, l1, l2, eval_tags, n_folds=n_folds)

            # Keep the average scores
            avg_scores = average_scores(result)

            print('L1: {}, L2: {}, scores: {}'.format(l1, l2, avg_scores))
            grid_search_result[l1, l2] = avg_scores

    return grid_search_result

### Running area

Get data

In [None]:
# Get train data: tokens and tags, plus the output of get_genia_tags for the
# 'train' split (passed to abstracts2features further down)
train_tokens, train_tags = get_all_data_train()
train_genia_tags = get_genia_tags('train')

In [None]:
# Get dev data: tokens and tags, plus the output of get_genia_tags for the
# 'dev' split (passed to abstracts2features further down)
dev_tokens, dev_tags = get_all_data_dev()
dev_genia_tags = get_genia_tags('dev')

Compute features

In [None]:
# Set options
# Both values are space-separated option strings meant for the options_string
# argument of abstracts2features.
# big_options_string lists many options (neighbor windows, pos/chunk/iob/
# named_entity, one_hot, w2v, cosine_simil, isupper/istitle, ...); it is
# defined here, but the feature cells in this notebook pass
# small_options_string.
# NOTE(review): the meaning of each option is defined in features_generator --
# confirm there.
big_options_string = 'left_neighbors=1 right_neighbors=0 inside_paren pos chunk iob named_entity \
inside_paren_neighbors pos_neighbors chunk_neighbors iob_neighbors named_entity_neighbors \
chunk_end chunk_end_neighbors same_chunk_neighbors \
one_hot one_hot_neighbors w2v_model=pubmed w2v w2v_neighbors w2v_size=10 cosine_simil cosine_simil_neighbors \
isupper isupper_neighbors istitle istitle_neighbors'

# Minimal option set: no neighbors, one_hot only
small_options_string = 'left_neighbors=0 right_neighbors=0 one_hot'

In [None]:
# Compute features for train, using small_options_string and no word2vec
# model (w2v=None)
train_features = abstracts2features(train_tokens, train_genia_tags, w2v=None, options_string=small_options_string)

In [None]:
# Compute features for dev, with the same options as for train
# (small_options_string, w2v=None)
dev_features = abstracts2features(dev_tokens, dev_genia_tags, w2v=None, options_string=small_options_string)

In [None]:
# For debug: run the features_generator sanity check on the train features
sanity_check(train_features)

Train model

In [None]:
# Train on the full train set: 100 iterations, l1 = l2 = 0.1; the model is
# written to 'b.model'
model = train_crf(train_features, train_tags, 100, 0.1, 0.1, 'b')

In [None]:
# Get model from file: opens 'b.model', the file written by the training cell
tagger = get_tagger('b')

In [None]:
# For debug: print the transitions and the top 20 (default num_items)
# positive/negative state features
print_model_info(tagger)

Predict tags

In [None]:
# Predict one tag sequence per dev feature sequence
pred_dev_tags = predict_tags(tagger, dev_features)

In [None]:
# Evaluate a single tag ('P') on the dev set
num_pred, num_gold, num_both = count_tags(pred_dev_tags, dev_tags, 'P')
p, r, f1 = metrics(num_pred, num_gold, num_both)
# Single-argument print(...) with format() gives the same text as the old
# comma-separated print statement and also runs on Python 3.
print('{} {} {}'.format(num_pred, num_gold, num_both))
print('Precision: {} Recall: {} F1: {}'.format(p, r, f1))

In [None]:
# Evaluate all tags at once (here only 'P'); the result also contains the
# 'Overall' entry
result = evaluate_prediction(pred_dev_tags, dev_tags, ['P'])
print_result(result)

K-fold evaluation

In [None]:
# Run K-fold on the train set (default 5 folds, 100 iterations,
# l1 = l2 = 0.1, evaluating 'P') and report the elapsed wall-clock time.
# Every fold writes its model to 'base.model'.
start_time = time.time()
kfold_result = get_kfold_results(train_features, train_tags, 100, 0.1, 0.1, ['P'], 'base')
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Print all results: each fold, followed by the average over folds
print_result(kfold_result)

In [None]:
# Print just the average scores over the folds
print_result(average_scores(kfold_result))

Grid search

In [48]:
# Grid search over l1 and l2 in {0, 0.1, 0.2} (9 combinations, each with the
# default 5 folds), evaluating 'P'; report the elapsed wall-clock time
start_time = time.time()
grid_search_result = grid_search(train_features, train_tags, 100, [0,0.1,0.2], [0,0.1,0.2], ['P'])
print("--- %s seconds ---" % (time.time() - start_time))

On fold 0
Adding data...
Training model...
Done!
On fold 1
Adding data...
Training model...
Done!
On fold 2
Adding data...
Training model...
Done!
On fold 3
Adding data...
Training model...
Done!
On fold 4
Adding data...
Training model...
Done!
L1: 0, L2: 0, scores: {'P': (0.69074569376883965, 0.33279688828277443, 0.4483076485862994), 'Overall': (0.69074569376883965, 0.33279688828277443, 0.4483076485862994)}
On fold 0
Adding data...
Training model...
Done!
On fold 1
Adding data...
Training model...
Done!
On fold 2
Adding data...
Training model...
Done!
On fold 3
Adding data...
Training model...
Done!
On fold 4
Adding data...
Training model...
Done!
L1: 0, L2: 0.1, scores: {'P': (0.67840548284887525, 0.36793663268027083, 0.47660282738274251), 'Overall': (0.67840548284887525, 0.36793663268027083, 0.47660282738274251)}
On fold 0
Adding data...
Training model...
Done!
On fold 1
Adding data...
Training model...
Done!
On fold 2
Adding data...
Training model...
Done!
On fold 3
Adding data...


In [49]:
# Show the averaged scores for every (l1, l2) pair.
# Single-argument print(...) gives the same output on Python 2 and 3.
print(grid_search_result)

{(0.2, 0.1): {'P': (0.67542108391370292, 0.40599626271257661, 0.50694325154864717), 'Overall': (0.67542108391370292, 0.40599626271257661, 0.50694325154864717)}, (0, 0): {'P': (0.69074569376883965, 0.33279688828277443, 0.4483076485862994), 'Overall': (0.69074569376883965, 0.33279688828277443, 0.4483076485862994)}, (0.1, 0.1): {'P': (0.66840635704718032, 0.41287717337282004, 0.51032647244391049), 'Overall': (0.66840635704718032, 0.41287717337282004, 0.51032647244391049)}, (0, 0.2): {'P': (0.67443233429015259, 0.34814018930422863, 0.45826956504366512), 'Overall': (0.67443233429015259, 0.34814018930422863, 0.45826956504366512)}, (0.1, 0.2): {'P': (0.67912409058685408, 0.40156458228089686, 0.50456310428978135), 'Overall': (0.67912409058685408, 0.40156458228089686, 0.50456310428978135)}, (0.1, 0): {'P': (0.63981752128399116, 0.43025100983333725, 0.51434361622346536), 'Overall': (0.63981752128399116, 0.43025100983333725, 0.51434361622346536)}, (0.2, 0): {'P': (0.65414343746159775, 0.421597222