### Linear-Chain CRF

pycrfsuite version 
source: https://github.com/bwallace/Deep-PICO/blob/3152ab3690cad1b6e369be8a8aac27393811341c/crf.py

In [None]:
import sys
from collections import Counter

from preprocess_data import get_all_data_train, get_all_data_dev, get_all_data_test
from features_generator import abstracts2features, get_genia_tags, sanity_check
from evaluation import eval_abstracts, eval_abstracts_avg

from gensim.models import Word2Vec

from sklearn_crfsuite import metrics
import pycrfsuite
import sklearn_crfsuite
import scipy

from sklearn.cross_validation import KFold
from sklearn.grid_search import RandomizedSearchCV
from sklearn.metrics import make_scorer

import numpy as np

In [None]:
def train_crf(features_list, tags_list, num_iters, l1, l2, file_name=''):
    """Train a CRF with pycrfsuite and write the model to ``file_name``.

    Args:
        features_list: one feature sequence per training item, aligned
            index-by-index with ``tags_list``.
        tags_list: one tag sequence per training item.
        num_iters: value used for the trainer's 'max_iterations' parameter.
        l1: coefficient for the L1 penalty ('c1').
        l2: coefficient for the L2 penalty ('c2').
        file_name: path handed to ``Trainer.train``.

    Returns:
        The ``pycrfsuite.Trainer`` that was used for training.

    Raises:
        ValueError: if ``features_list`` and ``tags_list`` differ in length.
    """
    # Misaligned inputs would otherwise either raise an opaque IndexError or
    # silently leave part of the features unused.
    if len(features_list) != len(tags_list):
        raise ValueError(
            'features_list and tags_list must have the same length '
            '(got %d and %d)' % (len(features_list), len(tags_list)))

    # Set up the model parameters
    model = pycrfsuite.Trainer(verbose=False)
    model.set_params({
        'c1': l1,  # Coefficient for L1 penalty
        'c2': l2,  # Coefficient for L2 penalty
        'max_iterations': num_iters,  # Stop earlier

        # Include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    print('Adding data...')
    sys.stdout.flush()

    for features, tags in zip(features_list, tags_list):
        model.append(features, tags)

    print('Training model...')
    sys.stdout.flush()

    model.train(file_name)
    print('Done!')

    return model

In [None]:
def print_model_info(tagger, num_items=20):
    """Print a quick peek at a trained model's weights.

    All transitions are printed, ordered from highest to lowest weight,
    followed by the ``num_items`` highest-weighted and the ``num_items``
    lowest-weighted state features.

    Args:
        tagger: object whose ``info()`` result exposes ``transitions``
            (keyed by ``(label_from, label_to)``) and ``state_features``
            (keyed by ``(attr, label)``) mappings to weights.
        num_items: number of state features shown at each end of the
            ranking; zero or a negative value prints none.
    """
    info = tagger.info()

    def print_transitions(trans_features):
        for (label_from, label_to), weight in trans_features:
            print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

    def print_state_features(state_features):
        for (attr, label), weight in state_features:
            print("%0.6f %-6s %s" % (weight, label, attr))

    print("Top likely transitions:")
    print_transitions(Counter(info.transitions).most_common())

    # Rank once and slice both ends from the same list.
    ranked = Counter(info.state_features).most_common()
    count = max(num_items, 0)

    print("\nTop positive:")
    print_state_features(ranked[:count])

    print("\nTop negative:")
    # ranked[-0:] would be the whole list, so the zero case is handled
    # explicitly.
    print_state_features(ranked[-count:] if count else [])

In [None]:
def predict_tags(tagger, features_list):
    """Return ``tagger.tag(...)`` for every feature sequence, in input order."""
    return [tagger.tag(sequence) for sequence in features_list]

In [None]:
# Switch for the small hand-made check that follows evaluate() in this cell.
DEBUG = True

def evaluate(pred_tags_list, gold_tags_list, tag_name):
    """Count token-level occurrences of ``tag_name``.

    Sequences are paired by index, and so are the tokens within each pair;
    only positions present in the gold sequences are inspected.

    Returns:
        A tuple ``(num_pred, num_gold, num_both)``: how many inspected
        positions carry ``tag_name`` in the prediction, in the gold
        annotation, and in both at once.
    """
    predicted_total = 0
    gold_total = 0
    agreed_total = 0

    for seq_idx, gold_seq in enumerate(gold_tags_list):
        pred_seq = pred_tags_list[seq_idx]

        for tok_idx, gold_tag in enumerate(gold_seq):
            gold_hit = gold_tag == tag_name
            pred_hit = pred_seq[tok_idx] == tag_name

            if gold_hit:
                gold_total += 1
            if pred_hit:
                predicted_total += 1
                if gold_hit:
                    agreed_total += 1

    return predicted_total, gold_total, agreed_total

if DEBUG:
    # Tiny hand-checked example. Expected output: (4, 3, 2), i.e. four
    # predicted 'P' tags, three gold 'P' tags, two positions where both agree.
    gold_tags_list = [['None', 'P', 'None'], ['P', 'P', 'None', 'None']]
    pred_tags_list = [['P', 'P', 'None'], ['P', 'None', 'None', 'P']]

    # Parenthesised single-argument form prints the same text whether print
    # is a statement or a function.
    print(evaluate(pred_tags_list, gold_tags_list, tag_name='P'))

In [None]:
# Switch for the sample call that follows metrics() in this cell (off here).
DEBUG = False

def metrics(num_pred_tags, num_gold_tags, num_both_tags):
    """Turn tag counts into ``(precision, recall, f1)``.

    Precision is ``num_both_tags / num_pred_tags`` and recall is
    ``num_both_tags / num_gold_tags``; either one is 0 when its denominator
    is 0. F1 is their harmonic mean, or 0 when precision or recall is 0.
    """
    precision = float(num_both_tags) / num_pred_tags if num_pred_tags != 0 else 0
    recall = float(num_both_tags) / num_gold_tags if num_gold_tags != 0 else 0

    # The harmonic mean is undefined when either component is zero.
    if precision == 0 or recall == 0:
        return precision, recall, 0

    harmonic_mean = 2 / (1 / precision + 1 / recall)
    return precision, recall, harmonic_mean

if DEBUG:
    # Sample call: 3 predicted, 4 gold, 2 in common.
    # Parenthesised single-argument form prints the same text whether print
    # is a statement or a function.
    print(metrics(3, 4, 2))

In [53]:
def train_crf_kfold(features_list, tags_list, num_iters, l1, l2, tag_name, file_name='', n_folds=5):
    """Cross-validate a CRF and report precision/recall/F1 for one tag.

    The items are split into ``n_folds`` shuffled folds (fixed random state).
    For every fold a model is trained with ``train_crf`` on the remaining
    items, saved as ``file_name + '_model <fold index>'``, reloaded into a
    ``pycrfsuite.Tagger`` and scored on the held-out items. Per-fold scores
    are printed and written to ``file_name + '_results.txt'``; the averages
    over all folds are printed at the end.

    Args:
        features_list: one feature sequence per item.
        tags_list: one gold tag sequence per item, aligned with
            ``features_list``.
        num_iters: maximum number of training iterations.
        l1: coefficient for the L1 penalty.
        l2: coefficient for the L2 penalty.
        tag_name: the tag whose precision/recall/F1 is measured.
        file_name: prefix for the model files and the results file.
        n_folds: number of cross-validation folds.
    """
    # Set up the KFold
    kf = KFold(len(tags_list), random_state=1234, shuffle=True, n_folds=n_folds)

    recall_scores = []
    precision_scores = []
    f1_scores = []

    # Open the results file once so that every fold's scores are kept (a
    # per-fold 'w+' open truncates the previous folds) and the handle is
    # always closed.
    with open(file_name + '_results.txt', 'w') as results_file:
        for fold_idx, (train_indices, test_indices) in enumerate(kf):
            print('on fold %s' % fold_idx)

            train_x = [features_list[i] for i in train_indices]
            train_y = [tags_list[i] for i in train_indices]

            test_x = [features_list[i] for i in test_indices]
            test_y = [tags_list[i] for i in test_indices]

            # Train the model for this fold; it is saved under model_name.
            model_name = file_name + '_model {}'.format(fold_idx)
            train_crf(train_x, train_y, num_iters, l1, l2, model_name)

            tagger = pycrfsuite.Tagger()
            tagger.open(model_name)

            # Make predictions on the held-out items.
            test_pred = predict_tags(tagger, test_x)

            # Compute evaluation metrics for the requested tag.
            num_pred, num_gold, num_both = evaluate(test_pred, test_y, tag_name)
            fold_precision, fold_recall, fold_f1_score = metrics(num_pred, num_gold, num_both)

            recall_scores.append(fold_recall)
            precision_scores.append(fold_precision)
            f1_scores.append(fold_f1_score)

            fold_recall_results = "Fold recall: {}".format(fold_recall)
            fold_precision_results = "Fold precision: {}".format(fold_precision)
            fold_f1_results = "Fold F1 Score: {}".format(fold_f1_score)
            print(fold_recall_results)
            print(fold_precision_results)
            print(fold_f1_results)

            results_file.write(fold_recall_results + '\n')
            results_file.write(fold_precision_results + '\n')
            results_file.write(fold_f1_results + '\n')

    recall_average = np.mean(recall_scores)
    precision_average = np.mean(precision_scores)
    f1_average = np.mean(f1_scores)

    print("Recall Average: {}".format(recall_average))
    print("Precision Average: {}".format(precision_average))
    print("F1 Average: {}".format(f1_average))

In [None]:
def crf(l2, l1, iters, grid_search, modelname, train_features, train_tag_array):
    """Run 5-fold cross-validation of a CRF and report precision/recall/F1.

    For every fold a model is fitted on the training part, its strongest
    and weakest transition and state-feature weights are printed, and the
    held-out part is scored with ``eval_abstracts``. Per-fold scores are
    printed and written to ``modelname + '_results.txt'``; the averages over
    all folds are printed at the end.

    Args:
        l2: coefficient for the L2 penalty ('c2'). Note that it comes
            before ``l1`` in the signature.
        l1: coefficient for the L1 penalty ('c1').
        iters: maximum number of training iterations.
        grid_search: when true, c1/c2 are tuned per fold with a
            ``RandomizedSearchCV`` over an ``sklearn_crfsuite.CRF``;
            otherwise a ``pycrfsuite.Trainer`` is trained with ``l1``/``l2``
            and saved as ``modelname + '_model <fold index>'``.
        modelname: prefix for the model files and the results file.
        train_features: one feature sequence per item.
        train_tag_array: one tag sequence per item, aligned with
            ``train_features``.
    """
    top_n = 80  # entries shown at each end of the weight rankings

    def print_transitions(trans_features):
        for (label_from, label_to), weight in trans_features:
            print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

    def print_state_features(state_features):
        for (attr, label), weight in state_features:
            print("%0.6f %-6s %s" % (weight, label, attr))

    n_folds = 5
    kf = KFold(len(train_tag_array), random_state=1234, shuffle=True, n_folds=n_folds)

    if grid_search:
        # `import scipy` on its own does not guarantee that the stats
        # subpackage is loaded, so import what is needed explicitly.
        from scipy.stats import expon

        # Score over every label that occurs in the data (as a list), not
        # only over the labels of the first sequence.
        labels = sorted(set(tag for tags in train_tag_array for tag in tags))

    recall_scores = []
    precision_scores = []
    f1_scores = []

    # Open the results file once so that every fold's scores are kept and
    # the handle is always closed.
    with open(modelname + '_results.txt', 'w') as results_file:
        for fold_idx, (train, test) in enumerate(kf):
            print("on fold %s" % fold_idx)
            print('loading data...')
            train_x = [train_features[i] for i in train]
            train_y = [train_tag_array[i] for i in train]

            test_x = [train_features[i] for i in test]
            test_y = [train_tag_array[i] for i in test]

            # train the model
            if grid_search:
                # NOTE(review): all_possible_transitions is False here while
                # the non-search branch enables possible transitions --
                # confirm this difference is intended.
                estimator = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=l1, c2=l2,
                                                 max_iterations=iters,
                                                 all_possible_transitions=False)

                params_space = {
                    'c1': expon(scale=0.5),
                    'c2': expon(scale=0.05),
                }

                # use the same metric for evaluation; the module is reached
                # through its package because the bare name `metrics` is
                # rebound to a function elsewhere in this file
                f1_scorer = make_scorer(sklearn_crfsuite.metrics.flat_f1_score,
                                        average='weighted', labels=labels)

                # search
                rs = RandomizedSearchCV(estimator, params_space,
                                        cv=3,
                                        verbose=1,
                                        n_jobs=-1,
                                        n_iter=50,
                                        scoring=f1_scorer)
                rs.fit(train_x, train_y)
                tagger = rs.best_estimator_.tagger_
                params = rs.best_params_
            else:
                # A fresh trainer per fold: reusing one trainer would keep
                # the items appended for earlier folds, including items that
                # are held out in the current fold.
                model = pycrfsuite.Trainer(verbose=False)
                model.set_params({
                    'c1': l1,   # coefficient for L1 penalty
                    'c2': l2,  # coefficient for L2 penalty
                    'max_iterations': iters,  # stop earlier

                    # include transitions that are possible, but not observed
                    'feature.possible_transitions': True
                })
                for x, y in zip(train_x, train_y):
                    model.append(x, y)

                model_name = modelname + '_model {}'.format(fold_idx)
                print('training model...')
                model.train(model_name)
                print('done...')
                tagger = pycrfsuite.Tagger()
                tagger.open(model_name)
                params = model.get_params()

            info = tagger.info()

            # a quick peek at the model
            ranked_transitions = Counter(info.transitions).most_common()
            ranked_state_features = Counter(info.state_features).most_common()

            print("Top likely transitions:")
            print_transitions(ranked_transitions[:top_n])

            print("\nTop unlikely transitions:")
            print_transitions(ranked_transitions[-top_n:])

            print("Top positive:")
            print_state_features(ranked_state_features[:top_n])

            print("\nTop negative:")
            print_state_features(ranked_state_features[-top_n:])

            print("parameters")
            print(params)

            # make predictions
            test_pred = [tagger.tag(x) for x in test_x]

            # compute evaluation metrics
            fold_precision, fold_recall, fold_f1 = eval_abstracts(test_y, test_pred)

            recall_scores.append(fold_recall)
            precision_scores.append(fold_precision)
            f1_scores.append(fold_f1)

            fold_recall_results = "Fold recall: {}".format(fold_recall)
            fold_precision_results = "Fold precision: {}".format(fold_precision)
            fold_f1_results = "Fold F1 Score: {}".format(fold_f1)
            print(fold_recall_results)
            print(fold_precision_results)
            print(fold_f1_results)

            results_file.write(fold_recall_results + '\n')
            results_file.write(fold_precision_results + '\n')
            results_file.write(fold_f1_results + '\n')

    recall_average = np.mean(recall_scores)
    precision_average = np.mean(precision_scores)
    f1_average = np.mean(f1_scores)

    print("Recall Average: {}".format(recall_average))
    print("Precision Average: {}".format(precision_average))
    print("F1 Average: {}".format(f1_average))

Get data

In [None]:
# Get train data: token sequences and their tags, plus the output of
# get_genia_tags for the same split (presumably GENIA tagger annotations --
# confirm in features_generator).
train_tokens, train_tags = get_all_data_train()
train_genia_tags = get_genia_tags('train')

In [None]:
# Get dev data: token sequences and their tags, plus the output of
# get_genia_tags for the same split (presumably GENIA tagger annotations --
# confirm in features_generator).
dev_tokens, dev_tags = get_all_data_dev()
dev_genia_tags = get_genia_tags('dev')

Compute features

In [None]:
# Space-separated feature options in the format handed to abstracts2features
# through its options_string argument. This large set is not referenced by
# the feature cells visible in this notebook.
big_options_string = 'left_neighbors=1 right_neighbors=0 inside_paren pos chunk iob named_entity \
inside_paren_neighbors pos_neighbors chunk_neighbors iob_neighbors named_entity_neighbors \
chunk_end chunk_end_neighbors same_chunk_neighbors \
one_hot one_hot_neighbors w2v_model=pubmed w2v w2v_neighbors w2v_size=10 cosine_simil cosine_simil_neighbors \
isupper isupper_neighbors istitle istitle_neighbors'

# Minimal option set; this is the one used to build the train and dev features.
small_options_string = 'left_neighbors=0 right_neighbors=0 one_hot'

In [None]:
# Build the training features with the small option set and no word2vec model.
train_features = abstracts2features(train_tokens, train_genia_tags, w2v=None, options_string=small_options_string)

In [None]:
# Build the dev features with the same options as the training features.
dev_features = abstracts2features(dev_tokens, dev_genia_tags, w2v=None, options_string=small_options_string)

In [None]:
# For debug: run the features_generator sanity check on the training features.
sanity_check(train_features)

Train model

In [None]:
# Train on the full training set: at most 100 iterations, L1 = L2 = 0.1;
# the model is written to the file 'b'.
model = train_crf(train_features, train_tags, 100, 0.1, 0.1, 'b')

In [None]:
# Get model from file: 'b' is the file name given to train_crf in the
# training cell.
tagger = pycrfsuite.Tagger()
tagger.open('b')

In [None]:
# For debug: print the transition weights and the top/bottom state features.
print_model_info(tagger)

Predict tags

In [None]:
# Tag every dev feature sequence with the loaded model.
pred_dev_tags = predict_tags(tagger, dev_features)

In [None]:
# Score the dev-set predictions for the 'P' tag.
num_pred, num_gold, num_both = evaluate(pred_dev_tags, dev_tags, 'P')
p, r, f1 = metrics(num_pred, num_gold, num_both)
# Each print gets one pre-formatted argument, so the output is the same
# whether print is a statement or a function.
print("%s %s %s" % (num_pred, num_gold, num_both))
print("Precision: %s Recall: %s F1: %s" % (p, r, f1))

In [54]:
import time
start_time = time.time()

# The sixth positional parameter of train_crf_kfold is tag_name and the
# seventh is file_name: evaluate the 'P' tag and use 'Base_crf' as the prefix
# for the model and results files.
train_crf_kfold(train_features, train_tags, 100, 0.1, 0.1, 'P', 'Base_crf')
print("--- %s seconds ---" % (time.time() - start_time))

on fold 0
Adding data...
Training model...
Done!
Fold recall: 0.402249983648
Fold precision: 0.65488233415
Fold F1 Score: 0.498379254457
on fold 1
Adding data...
Training model...
Done!
Fold recall: 0.421791648783
Fold precision: 0.662844478705
Fold F1 Score: 0.515531906921
on fold 2
Adding data...
Training model...
Done!
Fold recall: 0.394653403792
Fold precision: 0.674601487779
Fold F1 Score: 0.497979996078
on fold 3
Adding data...
Training model...
Done!
Fold recall: 0.428396871945
Fold precision: 0.670043000478
Fold F1 Score: 0.522640032795
on fold 4
Adding data...
Training model...
Done!
Fold recall: 0.417293958695
Fold precision: 0.679660484124
Fold F1 Score: 0.517101171968
Recall Average: 0.412877173373
Precision Average: 0.668406357047
F1 Average: 0.510326472444
--- 80.889747858 seconds ---
