### Linear-Chain CRF

pycrfsuite version 
source: https://github.com/bwallace/Deep-PICO/blob/3152ab3690cad1b6e369be8a8aac27393811341c/crf.py

In [20]:
# from features_generator import abstracts2features

from preprocess_data import get_all_data_train, get_all_data_dev, get_all_data_test
from gensim.models import Word2Vec
from features_generator import abstracts2features
from features_generator import get_genia_tags
from sklearn_crfsuite import metrics
import pycrfsuite
import sklearn_crfsuite
import scipy

from evaluation import eval_abstracts, eval_abstracts_avg

from collections import Counter

from sklearn.cross_validation import KFold
from sklearn.grid_search import RandomizedSearchCV
from sklearn.metrics import make_scorer

import numpy as np

In [2]:
# Full feature-option string, grouped by feature family. Presumably parsed by
# abstracts2features through its options_string argument (as
# fewer_options_string is) -- TODO confirm against features_generator.
default_options_string = (
    # context window and per-token tagger features
    'left_neighbors=1 right_neighbors=0 '
    'inside_paren pos chunk iob named_entity '
    # the same tagger features taken from neighbouring tokens
    'inside_paren_neighbors pos_neighbors chunk_neighbors '
    'iob_neighbors named_entity_neighbors '
    # chunk-boundary features
    'chunk_end chunk_end_neighbors same_chunk_neighbors '
    # lexical and embedding features
    'one_hot one_hot_neighbors '
    'w2v_model=pubmed w2v w2v_neighbors w2v_size=10 '
    'cosine_simil cosine_simil_neighbors '
    # capitalisation features
    'isupper isupper_neighbors istitle istitle_neighbors'
)

In [2]:
# Reduced feature-option string: zero left/right neighbours and only the
# one_hot option. Passed to abstracts2features as options_string below.
fewer_options_string = ' '.join((
    'left_neighbors=0',
    'right_neighbors=0',
    'one_hot',
))

In [9]:
def _print_transitions(trans_features):
    """Print one ``label_from -> label_to  weight`` line per transition."""
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))


def _print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature."""
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))


def crf(l2, l1, iters, grid_search, modelname, train_features, train_tag_array):
    """Cross-validate a linear-chain CRF over 5 shuffled folds and report scores.

    For every fold a model is fitted on the training split, its strongest and
    weakest transition/state features are printed, the held-out split is
    tagged, and precision/recall/F1 from ``eval_abstracts`` are printed and
    appended to ``<modelname>_results.txt``. The averages over all folds are
    printed at the end.

    Parameters
    ----------
    l2 : float
        Coefficient of the L2 penalty (CRFsuite ``c2``).
    l1 : float
        Coefficient of the L1 penalty (CRFsuite ``c1``).
    iters : int
        Maximum number of training iterations.
    grid_search : bool
        If true, fit a ``sklearn_crfsuite.CRF`` through a randomized search
        over ``c1``/``c2``; otherwise train a ``pycrfsuite.Trainer`` with the
        given penalties and save it as ``<modelname>_model <fold>``.
    modelname : str
        Prefix used for the saved model files and the results file.
    train_features : sequence
        One feature sequence per item, indexable by the fold indices.
    train_tag_array : sequence
        One tag sequence per item, aligned with ``train_features``.

    Returns
    -------
    None
        Results are printed and written to disk only.
    """
    n_folds = 5
    kf = KFold(len(train_tag_array), random_state=1234, shuffle=True,
               n_folds=n_folds)

    recall_scores = []
    precision_scores = []
    f1_scores = []

    # Gather the labels from every sequence (the first sequence alone may not
    # contain all of them) and hand the scorer a deterministic list.
    labels = sorted({tag for tags in train_tag_array for tag in tags})

    # Start from an empty results file once; each fold then appends to it, so
    # the file ends up holding every fold instead of only the last one.
    results_path = modelname + '_results.txt'
    with open(results_path, 'w'):
        pass

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        print('loading data...')
        train_x = [train_features[i] for i in train]
        train_y = [train_tag_array[i] for i in train]

        test_x = [train_features[i] for i in test]
        test_y = [train_tag_array[i] for i in test]

        # train the model
        if grid_search:
            # Import the submodule explicitly: a bare ``import scipy`` does
            # not guarantee that ``scipy.stats`` has been loaded.
            from scipy.stats import expon

            # NOTE(review): all_possible_transitions=False differs from the
            # 'feature.possible_transitions': True used by the non-search
            # branch -- confirm which one is intended.
            estimator = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                             c1=l1,
                                             c2=l2,
                                             max_iterations=iters,
                                             all_possible_transitions=False)

            params_space = {
                'c1': expon(scale=0.5),
                'c2': expon(scale=0.05),
            }

            # use the same metric for evaluation
            f1_scorer = make_scorer(metrics.flat_f1_score,
                                    average='weighted', labels=labels)

            # search
            rs = RandomizedSearchCV(estimator, params_space,
                                    cv=3,
                                    verbose=1,
                                    n_jobs=-1,
                                    n_iter=50,
                                    scoring=f1_scorer)
            rs.fit(train_x, train_y)
            tagger = rs.best_estimator_.tagger_
            fold_params = rs.best_params_
        else:
            # A fresh trainer per fold: reusing one trainer would keep the
            # sequences appended in earlier folds, which include items that
            # are held out in this fold.
            trainer = pycrfsuite.Trainer(verbose=False)
            for x, y in zip(train_x, train_y):
                trainer.append(x, y)

            trainer.set_params({
                'c1': l1,   # coefficient for L1 penalty
                'c2': l2,  # coefficient for L2 penalty
                'max_iterations': iters,  # stop earlier

                # include transitions that are possible, but not observed
                'feature.possible_transitions': True
            })
            model_name = modelname + '_model {}'.format(fold_idx)
            print('training model...')
            trainer.train(model_name)
            print('done...')
            tagger = pycrfsuite.Tagger()
            tagger.open(model_name)
            fold_params = trainer.get_params()

        info = tagger.info()

        # a quick peek at the model
        transitions = Counter(info.transitions)
        print("Top likely transitions:")
        _print_transitions(transitions.most_common(80))

        print("\nTop unlikely transitions:")
        _print_transitions(transitions.most_common()[-80:])

        state_features = Counter(info.state_features)
        print("Top positive:")
        _print_state_features(state_features.most_common(80))

        print("\nTop negative:")
        _print_state_features(state_features.most_common()[-80:])

        print("parameters")
        print(fold_params)

        # make predictions
        test_pred = [tagger.tag(x) for x in test_x]

        # compute evaluation metrics
        fold_precision, fold_recall, fold_f1 = eval_abstracts(test_y, test_pred)

        recall_scores.append(fold_recall)
        precision_scores.append(fold_precision)
        f1_scores.append(fold_f1)

        fold_recall_results = "Fold recall: {}".format(fold_recall)
        fold_precision_results = "Fold precision: {}".format(fold_precision)
        fold_f1_results = "Fold F1 Score: {}".format(fold_f1)
        print(fold_recall_results)
        print(fold_precision_results)
        print(fold_f1_results)

        # The context manager closes (and flushes) the file after every fold.
        with open(results_path, 'a') as results_file:
            results_file.write(fold_recall_results + '\n')
            results_file.write(fold_precision_results + '\n')
            results_file.write(fold_f1_results + '\n')

    recall_average = np.mean(recall_scores)
    precision_average = np.mean(precision_scores)
    f1_average = np.mean(f1_scores)

    print("Recall Average: {}".format(recall_average))
    print("Precision Average: {}".format(precision_average))
    print("F1 Average: {}".format(f1_average))

In [13]:
# Load the training split: get_all_data_train() yields a pair that is unpacked
# into the tokens and the tag sequences (tag_array is later passed to crf() as
# train_tag_array).
train_tokens, tag_array = get_all_data_train()
# Presumably the GENIA tagger output for the same split -- TODO confirm
# against features_generator.get_genia_tags.
train_genia_tags = get_genia_tags('train')


In [14]:
# Build the feature sequences for the training split with the reduced option
# string and w2v=False.
train_features = abstracts2features(
    train_tokens,
    train_genia_tags,
    w2v=False,
    options_string=fewer_options_string
)

3499: ['Pulsed', 'azithromycin', 'treatment']


In [21]:
import time
# Wall-clock timestamp taken immediately before the cross-validation run.
start_time = time.time()


# Positional arguments follow crf's signature: l2=0.1, l1=0.1, iters=100,
# grid_search=False, modelname='Base_crf', then the features and tag sequences.
crf(0.1,0.1,100,False,'Base_crf',train_features,tag_array)
# Report how long the whole run took, in seconds.
print("--- %s seconds ---" % (time.time() - start_time))

on fold 0
loading data...
training model...
done...
Top likely transitions:
P      -> P       2.156933
None   -> None    1.971323
None   -> P       -0.817592
P      -> None    -3.026915

Top unlikely transitions:
P      -> P       2.156933
None   -> None    1.971323
None   -> P       -0.817592
P      -> None    -3.026915
Top positive:
5.734137 P      word[0]:Norway
5.644972 P      word[0]:alcoholics
5.634837 P      word[0]:alpacas
5.336022 P      word[0]:China.
5.319340 P      word[0]:Vietnam.
5.209645 P      word[0]:rowers
5.202165 None   word[0]:For
5.030955 P      word[0]:insulin-dependent
4.963336 P      word[0]:elderly.
4.918807 P      word[0]:third-party
4.835463 P      word[0]:Fifty-four
4.655907 P      word[0]:60-year-old
4.597937 P      word[0]:kala-azar
4.561541 None   word[0]:PARTICIPANTS
4.502174 P      word[0]:nonsmokers.
4.412506 P      word[0]:cirrhotics
4.327752 P      word[0]:Gambia.
4.318165 P      word[0]:dysmenorrhoea
4.275152 P      word[0]:Eighty
4.222200 P      w

KeyError: 'P'