### Linear-Chain CRF

pycrfsuite version 
source: https://github.com/bwallace/Deep-PICO/blob/3152ab3690cad1b6e369be8a8aac27393811341c/crf.py

In [1]:
import sys
from collections import Counter

from preprocess_data import get_all_data_train, get_all_data_dev, get_all_data_test
from features_generator import abstracts2features, get_genia_tags, sanity_check
from evaluation import eval_abstracts, eval_abstracts_avg

from gensim.models import Word2Vec

from sklearn_crfsuite import metrics
import pycrfsuite
import sklearn_crfsuite
import scipy

from sklearn.cross_validation import KFold
from sklearn.grid_search import RandomizedSearchCV
from sklearn.metrics import make_scorer

import numpy as np



## Train CRF
_INPUT_:
- features_list: list of list of features dictionaries
- tags_list: list of list of tags
- num_iters: number of iterations
- l1, l2: regularization parameters
- file_name: file name to write model out; '.model' added automatically

_OUTPUT_:
- The trained model

In [2]:
def train_crf(features_list, tags_list, num_iters, l1, l2, file_name=''):
    """Train a CRF with pycrfsuite and write the model to disk.

    Args:
        features_list: list (one entry per sequence) of lists of feature
            dictionaries.
        tags_list: list (one entry per sequence) of lists of tags; entry ``i``
            is paired with ``features_list[i]``.
        num_iters: value passed as pycrfsuite's ``max_iterations``.
        l1: coefficient for the L1 penalty (``c1``).
        l2: coefficient for the L2 penalty (``c2``).
        file_name: model path without extension; the model is written to
            ``file_name + '.model'``.

    Returns:
        The ``pycrfsuite.Trainer`` used for training. To tag new data, open
        the written model file with a ``pycrfsuite.Tagger``.
    """
    # Set up the model parameters
    trainer = pycrfsuite.Trainer(verbose=False)
    trainer.set_params({
        'c1': l1,  # Coefficient for L1 penalty
        'c2': l2,  # Coefficient for L2 penalty
        'max_iterations': num_iters,

        # Include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('Adding data...')
    sys.stdout.flush()

    # tags_list drives the iteration, so a shorter features_list still raises
    # IndexError exactly as before.
    for i, tags in enumerate(tags_list):
        trainer.append(features_list[i], tags)

    print('Training model...')
    sys.stdout.flush()

    trainer.train(file_name + '.model')
    print('Done!')

    return trainer

## Get tagger
Return a tagger with the model file `file_name + '.model'` opened ('.model' is appended automatically).

In [3]:
def get_tagger(file_name):
    """Return a ``pycrfsuite.Tagger`` with ``file_name + '.model'`` opened."""
    crf_tagger = pycrfsuite.Tagger()
    model_path = file_name + '.model'
    crf_tagger.open(model_path)
    return crf_tagger

## Print model info
_INPUT_:
- tagger: pycrfsuite.Tagger class (need to open model with it first)
- num_items: number of top positive/negative state features

In [4]:
def print_model_info(tagger, num_items=20):
    """Print a quick peek at a trained model.

    Prints every transition (highest weight first), then the ``num_items``
    highest-weighted state features, then the ``num_items`` lowest-weighted
    state features.

    Args:
        tagger: object whose ``info()`` result exposes ``transitions`` and
            ``state_features`` mappings (e.g. a ``pycrfsuite.Tagger`` with a
            model already opened).
        num_items: how many state features to show in each of the
            "Top positive" / "Top negative" sections. Zero or a negative
            value prints the section headers only.
    """
    info = tagger.info()

    print("Top likely transitions:")
    for (label_from, label_to), weight in Counter(info.transitions).most_common():
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

    # Rank the state features once and slice both ends from the same list.
    ranked = Counter(info.state_features).most_common()
    top_n = max(num_items, 0)

    # ``ranked[-0:]`` would be the *whole* list, so the empty case must be
    # handled explicitly. The negative section keeps descending order, i.e.
    # the most negative feature is printed last.
    sections = (
        ("\nTop positive:", ranked[:top_n]),
        ("\nTop negative:", ranked[-top_n:] if top_n else []),
    )
    for title, state_features in sections:
        print(title)
        for (attr, label), weight in state_features:
            print("%0.6f %-6s %s" % (weight, label, attr))

## Predict tags
_INPUT_:
- tagger: pycrfsuite.Tagger class (need to open model with it first)
- features_list: list of list of features dictionaries

_OUTPUT_:
- List of list of predicted tags

In [5]:
def predict_tags(tagger, features_list):
    # Make predictions 
    pred_tags_list = []

    for features in features_list:
        pred_tags = tagger.tag(features)
        pred_tags_list.append(pred_tags)
    
    return pred_tags_list

## Count tags
_INPUT_:
- pred_tags_list: list of list of predicted tags
- gold_tags_list: list of list of gold tags
- tag_name: tag name to count (e.g. 'P')

_OUTPUT_:
- The number of positions tagged `tag_name` in the predicted tags, in the gold tags, and in both at the same position, respectively

In [6]:
DEBUG = False

def count_tags(pred_tags_list, gold_tags_list, tag_name):
    """Count occurrences of ``tag_name`` in predictions, gold tags, and both.

    Args:
        pred_tags_list: list of lists of predicted tags.
        gold_tags_list: list of lists of gold tags, aligned with
            ``pred_tags_list`` sequence by sequence and position by position.
        tag_name: the tag to count (e.g. 'P').

    Returns:
        A tuple ``(num_pred, num_gold, num_both)``: positions predicted as
        ``tag_name``, positions whose gold tag is ``tag_name``, and positions
        where both hold.

    Raises:
        ValueError: if the two outer lists, or any pair of aligned inner
            lists, differ in length.
    """
    pred_total = len(pred_tags_list)
    gold_total = len(gold_tags_list)
    if pred_total != gold_total:
        raise ValueError('pred_tags_list has length ' + str(pred_total) + \
                         ', while gold_tags_list has length ' + str(gold_total))

    predicted = 0
    gold = 0
    matched = 0

    for seq_idx, (pred_seq, gold_seq) in enumerate(zip(pred_tags_list, gold_tags_list)):
        if len(pred_seq) != len(gold_seq):
            raise ValueError('pred_tags_list[{}] has length {}, while gold_tags_list[{}] has length {}'\
                             .format(seq_idx, len(pred_seq), seq_idx, len(gold_seq)))

        for pred_tag, gold_tag in zip(pred_seq, gold_seq):
            gold_hit = gold_tag == tag_name
            pred_hit = pred_tag == tag_name

            if gold_hit:
                gold += 1
            if pred_hit:
                predicted += 1
            if gold_hit and pred_hit:
                matched += 1

    return predicted, gold, matched

if DEBUG:
    # Small hand-checkable example: 'P' is predicted 4 times, is the gold tag
    # 3 times, and both agree on 'P' at 2 positions, so this prints (4, 3, 2).
    gold_tags_list = [['None', 'P', 'None'], ['P', 'P', 'None', 'None']]
    pred_tags_list = [['P', 'P', 'None'], ['P', 'None', 'None', 'P']]

    # print(...) with a single argument gives the same output on Python 2 and 3.
    print(count_tags(pred_tags_list, gold_tags_list, 'P'))

## Metrics
_INPUT_:
- Number of predicted tags, number of gold tags, number of tags predicted correctly

_OUTPUT_:
- Precision, recall, f1 scores

In [7]:
DEBUG = False

def metrics(num_pred_tags, num_gold_tags, num_both_tags):
    """Compute precision, recall and F1 from tag counts.

    Args:
        num_pred_tags: number of predicted tags.
        num_gold_tags: number of gold tags.
        num_both_tags: number of tags predicted correctly.

    Returns:
        A tuple ``(precision, recall, f1)``. A score whose denominator is
        zero is returned as the integer 0, and F1 is 0 whenever precision or
        recall is 0.

    Raises:
        ValueError: if ``num_both_tags`` exceeds either of the other counts.
    """
    # NOTE: this function shares its name with the ``metrics`` module imported
    # from sklearn_crfsuite at the top of the file; once this definition runs,
    # the name refers to this function.
    if num_both_tags > num_pred_tags:
        raise ValueError('num_both_tags = {} is greater than num_pred_tags = {}'\
                         .format(num_both_tags, num_pred_tags))
    if num_both_tags > num_gold_tags:
        raise ValueError('num_both_tags = {} is greater than num_gold_tags = {}'\
                         .format(num_both_tags, num_gold_tags))

    def ratio(numerator, denominator):
        # float() forces true division on Python 2.
        return float(numerator)/denominator if denominator != 0 else 0

    precision = ratio(num_both_tags, num_pred_tags)
    recall = ratio(num_both_tags, num_gold_tags)

    if precision == 0 or recall == 0:
        return precision, recall, 0

    # Harmonic mean of precision and recall.
    return precision, recall, 2/(1/precision + 1/recall)

if DEBUG:
    # 3 predicted, 4 gold, 2 correct -> precision 2/3, recall 1/2.
    # print(...) with a single argument gives the same output on Python 2 and 3.
    print(metrics(3, 4, 2))

In [9]:
def train_crf_kfold(features_list, tags_list, num_iters, l1, l2, file_name='', save=False, n_folds=5):
    """Cross-validate a CRF and report precision, recall and F1.

    For each fold a model is trained with ``train_crf`` (written to
    ``file_name + '_fold<i>.model'``), reloaded with ``get_tagger`` and
    evaluated on the held-out sequences. Per-label scores are computed for
    every label except 'None'; the fold's overall score is computed from the
    summed per-label counts.

    Args:
        features_list: list of lists of feature dictionaries.
        tags_list: list of lists of gold tags, aligned with ``features_list``.
        num_iters: number of training iterations.
        l1, l2: regularization parameters.
        file_name: prefix for the per-fold model files and the results file.
        save: if True, pickle the per-fold results to
            ``file_name + '_results.txt'`` as a dict
            ``{fold_idx: {label: [p, r, f1], ..., 'Overall': [p, r, f1]}}``.
        n_folds: number of folds.

    Returns:
        A tuple ``(precision_average, recall_average, f1_average)``: the mean
        of the overall scores across folds.
    """
    num_abstracts = len(tags_list)

    # Uses the KFold imported from sklearn.cross_validation at the top of the
    # file (legacy signature: sample count first, ``n_folds`` keyword, and the
    # object itself is iterated to get the index splits).
    kf = KFold(num_abstracts, random_state=1234, shuffle=True, n_folds=n_folds)

    precision_scores = []
    recall_scores = []
    f1_scores = []
    all_fold_results = {}

    # Collect labels from every sequence: looking only at the first sequence
    # would silently skip any label that does not happen to occur in it.
    labels = set(tag for tags in tags_list for tag in tags)
    labels.discard('None')

    for fold_idx, (train_indices, test_indices) in enumerate(kf):
        print('On fold %s' % fold_idx)

        train_x = [features_list[i] for i in train_indices]
        train_y = [tags_list[i] for i in train_indices]

        test_x = [features_list[i] for i in test_indices]
        test_y = [tags_list[i] for i in test_indices]

        # Train the model; it is read back from disk through the tagger.
        model_name = file_name + '_fold{}'.format(fold_idx)
        train_crf(train_x, train_y, num_iters, l1, l2, model_name)
        tagger = get_tagger(model_name)

        # Make predictions
        test_pred = predict_tags(tagger, test_x)

        # Compute evaluation metrics
        num_pred_all = 0
        num_gold_all = 0
        num_both_all = 0
        fold_result = {}

        # Label-level metrics
        for label in labels:
            num_pred, num_gold, num_both = count_tags(test_pred, test_y, label)

            num_pred_all += num_pred
            num_gold_all += num_gold
            num_both_all += num_both

            fold_p, fold_r, fold_f1 = metrics(num_pred, num_gold, num_both)
            fold_result[label] = [fold_p, fold_r, fold_f1]

        # Overall metrics, from the counts summed over all labels
        fold_p_overall, fold_r_overall, fold_f1_overall = metrics(num_pred_all, num_gold_all, num_both_all)
        fold_result['Overall'] = [fold_p_overall, fold_r_overall, fold_f1_overall]
        all_fold_results[fold_idx] = fold_result

        precision_scores.append(fold_p_overall)
        recall_scores.append(fold_r_overall)
        f1_scores.append(fold_f1_overall)

        print("Fold recall: {}".format(fold_r_overall))
        print("Fold precision: {}".format(fold_p_overall))
        print("Fold F1 Score: {}".format(fold_f1_overall))

    if save:
        # pickle is not imported at the top of the file, so import it here.
        import pickle

        # Written once, after all folds, so that earlier folds are not
        # overwritten by later ones; binary mode is what pickle expects.
        with open(file_name + '_results.txt', 'wb') as results_file:
            pickle.dump(all_fold_results, results_file)

    precision_average = np.mean(precision_scores)
    recall_average = np.mean(recall_scores)
    f1_average = np.mean(f1_scores)

    print("Recall Average: {}".format(recall_average))
    print("Precision Average: {}".format(precision_average))
    print("F1 Average: {}".format(f1_average))

    return precision_average, recall_average, f1_average

In [20]:
def grid_search(features_list, tags_list, num_iters, l1_list, l2_list, file_name='',save=False,n_folds=5):
    """Run ``train_crf_kfold`` for every (l1, l2) combination.

    Returns:
        A dict mapping each ``(l1, l2)`` tuple to
        ``[precision, recall, f1]`` as returned by ``train_crf_kfold``.
    """
    scores_by_params = {}

    param_pairs = ((c1, c2) for c1 in l1_list for c2 in l2_list)
    for l1, l2 in param_pairs:
        precision, recall, f1 = train_crf_kfold(
            features_list, tags_list, num_iters, l1, l2, file_name,
            save=save, n_folds=n_folds)
        scores_by_params[(l1, l2)] = [precision, recall, f1]

    return scores_by_params

Get data

In [10]:
# Get train data
# train_tags is used below as the gold tag sequences for training;
# train_genia_tags is passed to abstracts2features together with the tokens.
train_tokens, train_tags = get_all_data_train()
train_genia_tags = get_genia_tags('train')

In [11]:
# Get dev data
# dev_tags is used below as the gold tags when scoring predictions;
# dev_genia_tags is passed to abstracts2features together with the tokens.
dev_tokens, dev_tags = get_all_data_dev()
dev_genia_tags = get_genia_tags('dev')

Compute features

In [12]:
# Space-separated feature options passed to abstracts2features via
# ``options_string``. The trailing backslashes continue the string literal, so
# big_options_string is a single line of text.
# NOTE: only small_options_string is used by the cells below;
# big_options_string is defined but not used in this notebook as shown.
big_options_string = 'left_neighbors=1 right_neighbors=0 inside_paren pos chunk iob named_entity \
inside_paren_neighbors pos_neighbors chunk_neighbors iob_neighbors named_entity_neighbors \
chunk_end chunk_end_neighbors same_chunk_neighbors \
one_hot one_hot_neighbors w2v_model=pubmed w2v w2v_neighbors w2v_size=10 cosine_simil cosine_simil_neighbors \
isupper isupper_neighbors istitle istitle_neighbors'

small_options_string = 'left_neighbors=0 right_neighbors=0 one_hot'

In [13]:
# Build the training features with the small option set; no word2vec model is passed (w2v=None).
train_features = abstracts2features(train_tokens, train_genia_tags, w2v=None, options_string=small_options_string)

3499: ['Pulsed', 'azithromycin', 'treatment']


In [14]:
# Build the dev features with the same options as the training features.
dev_features = abstracts2features(dev_tokens, dev_genia_tags, w2v=None, options_string=small_options_string)

999: ['Serum', 'bactericidal', 'activities']


In [None]:
# For debug: run the feature generator's sanity check on the training features.
sanity_check(train_features)

Train model

In [None]:
# Train on the full training set: 100 iterations, l1 = l2 = 0.1; the model is written to 'b.model'.
model = train_crf(train_features, train_tags, 100, 0.1, 0.1, 'b')

In [None]:
# Get model from file ('b.model', as written by the training cell above)
tagger = get_tagger('b')

In [None]:
# For debug: print the transitions and the top positive/negative state features.
print_model_info(tagger)

Predict tags

In [None]:
# Tag every dev abstract; pred_dev_tags is aligned with dev_features (one tag list per abstract).
pred_dev_tags = predict_tags(tagger, dev_features)

In [None]:
# Score the dev predictions for the 'P' tag.
num_pred, num_gold, num_both = count_tags(pred_dev_tags, dev_tags, 'P')
p, r, f1 = metrics(num_pred, num_gold, num_both)

# Each print(...) takes one pre-formatted string, so the output is the same
# space-separated line on Python 2 and Python 3.
print('{} {} {}'.format(num_pred, num_gold, num_both))
print('Precision: {} Recall: {} F1: {}'.format(p, r, f1))

In [19]:
# Time a 5-fold cross-validation run (n_folds defaults to 5) with
# 100 iterations and l1 = l2 = 0.1. Per-fold models are written with the
# 'Base_crf' prefix; the returned averages are not kept, only printed.
import time
start_time = time.time()


train_crf_kfold(train_features, train_tags, 100, 0.1, 0.1, 'Base_crf')
print("--- %s seconds ---" % (time.time() - start_time))

On fold 0
Adding data...
Training model...
Done!
Fold recall: 0.402249983648
Fold precision: 0.65488233415
Fold F1 Score: 0.498379254457
On fold 1
Adding data...
Training model...
Done!
Fold recall: 0.421791648783
Fold precision: 0.662844478705
Fold F1 Score: 0.515531906921
On fold 2
Adding data...
Training model...
Done!
Fold recall: 0.394653403792
Fold precision: 0.674601487779
Fold F1 Score: 0.497979996078
On fold 3
Adding data...
Training model...
Done!
Fold recall: 0.428396871945
Fold precision: 0.670043000478
Fold F1 Score: 0.522640032795
On fold 4
Adding data...
Training model...
Done!
Fold recall: 0.417293958695
Fold precision: 0.679660484124
Fold F1 Score: 0.517101171968
Recall Average: 0.412877173373
Precision Average: 0.668406357047
F1 Average: 0.510326472444
--- 87.6969969273 seconds ---


In [15]:
# Scratch: empty dict used to try out tuple keys.
a = {}

In [17]:
# Scratch: a['a','b'] stores under the tuple key ('a', 'b'), the same form grid_search uses for result[l1,l2].
a['a','b']=2


In [18]:
# Scratch: display the dict to confirm the tuple key.
a

{('a', 'b'): 2}