In [None]:
%matplotlib inline

#%load_ext line_profiler
#%load_ext memory_profiler

In [None]:
from __future__ import division

import numpy as np
import matplotlib.pyplot as plt

In [None]:
from gensim import corpora

# Vocabulary over the CoNLL file: one gensim "document" per sentence.
dic = corpora.Dictionary()
# The first document is seeded with the empty-string token.
# NOTE(review): presumably so that '' receives an id for padding -- confirm.
tokens = ['']
# `with` closes the file deterministically instead of leaking the handle.
# NOTE(review): this reads with the platform default encoding while load_data
# decodes the same file as UTF-8 -- confirm the two agree on non-ASCII input.
with open('data.conll', 'r') as conll_file:
    for line in conll_file:
        line = line.strip()
        if len(line) == 0:
            # A blank line ends the sentence: register its tokens as one document.
            dic.add_documents([tokens])
            del tokens[:]
        else:
            # Keep every whitespace-separated column except the last one
            # (presumably the label column -- load_data reads labels from it).
            tokens.extend(line.split()[:-1])

# A file that does not end with a blank line would otherwise lose its last sentence.
if tokens:
    dic.add_documents([tokens])
    del tokens[:]

In [None]:
import codecs

import numpy as np

from sqlearn.crfsuite import crfutils
from sqlearn.crfsuite import ner

def load_data(f):
    """Read a space-separated CoNLL-style file into per-sentence features and labels.

    Each non-blank line must hold at least three space-separated columns: word,
    POS tag and label.  Blank lines separate sentences.  Every sentence is passed
    through ``crfutils.readiter`` and ``ner.feature_extractor``; the ``'F'`` entry
    of each resulting token dict is kept as that token's feature list.

    Relies on the module-level ``dic`` (its ``token2id`` mapping is handed to
    ``crfutils.readiter``).

    Parameters
    ----------
    f : str
        Path of the UTF-8 encoded input file.

    Returns
    -------
    X : numpy.ndarray
        1-D object array; element ``i`` is the list of per-token feature lists
        of sentence ``i``.
    y : numpy.ndarray
        1-D object array; element ``i`` is the list of labels of sentence ``i``.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three space-separated columns.
    """
    X = []
    y = []

    def consume(sent_lines, labels):
        # readiter expects a blank line as the end-of-sentence marker.
        sent_lines.append('\n')
        for item in crfutils.readiter(sent_lines, ['w', 'pos'], ' ', dic.token2id):
            ner.feature_extractor(item)
            X.append(item)
            y.append(labels)

    def as_object_array(sequences):
        # Filling element by element always yields a 1-D array of lists.
        # np.asarray() would raise on ragged input with current NumPy and would
        # silently build an N-D array when all sentences have the same length.
        arr = np.empty(len(sequences), dtype=object)
        for pos, seq in enumerate(sequences):
            arr[pos] = seq
        return arr

    sent = []
    sent_label = []
    # The context manager closes the file even if parsing fails.
    with codecs.open(f, 'r', 'utf-8') as conll_file:
        for line in conll_file:
            # Also drop '\r' so CRLF files do not leak it into the label column.
            line = line.strip('\r\n')
            if line.strip() == '':
                consume(sent, sent_label)
                sent = []
                sent_label = []
            else:
                splited_line = line.split(' ')
                sent.append('%s %s' % (splited_line[0], splited_line[1]))
                sent_label.append(splited_line[2])

    # Flush the last sentence when the file does not end with a blank line.
    if sent:
        consume(sent, sent_label)

    X = [[feature['F'] for feature in seq] for seq in X]

    return as_object_array(X), as_object_array(y)

In [None]:
%%time
# Build the per-sentence feature and label arrays.  load_data reads the
# module-level `dic`, so the dictionary cell has to be run before this one.
X, y = load_data('data.conll')

In [None]:
#%%writefile prof.py
from __future__ import division

import numpy as np
import pycrfsuite as crf

from itertools import chain
from collections import defaultdict

from sklearn import cross_validation
from sklearn.preprocessing import LabelBinarizer

# Module-level CRF trainer shared by measure(): each fold appends its training
# sequences, trains a model file and then clears the trainer again.
trainer = crf.Trainer(verbose=False)
trainer.set_params({
#    'c1': 0.0,   # coefficient for L1 penalty
    'c2': 1.0,   # coefficient for L2 penalty
#    'max_iterations': 50,  # stop earlier
#
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

# Module-level tagger; measure() opens it on each fold's model file and closes
# it once the train and test predictions have been made.
tagger = crf.Tagger()
    
def f_score(y_true, y_pred):
    """Compute per-label scores over the flattened label sequences.

    Both arguments are iterables of label sequences (one per sentence).  They are
    flattened and turned into 0/1 indicator matrices with one column per label
    that occurs in ``y_true`` (sorted order).  A predicted label that never
    occurs in ``y_true`` has no column and therefore counts as a miss.

    The indicator matrices are built with a NumPy comparison rather than
    ``LabelBinarizer``: the latter collapses a two-label problem into a single
    column, which made the per-column metrics below fail with ``IndexError``.

    Parameters
    ----------
    y_true : iterable of sequences of labels
    y_pred : iterable of sequences of labels, same total length as ``y_true``

    Returns
    -------
    dict
        ``{label: (f_acc, f_rec, f_pre, f_f1)}`` for every label in ``y_true``.
    """
    true_labels = np.asarray(list(chain.from_iterable(y_true)))
    pred_labels = np.asarray(list(chain.from_iterable(y_pred)))

    # Sorted unique labels of the ground truth define the column order.
    classes = np.unique(true_labels)

    # Shape (n_tokens, n_classes); entry is 1 where the token carries that label.
    y_true = (true_labels[:, np.newaxis] == classes).astype(int)
    y_pred = (pred_labels[:, np.newaxis] == classes).astype(int)

    # 2 = truth and prediction agree on the label, 1 = only one of them has it.
    y_acc = y_true + y_pred

    return {cls:
            (f_acc(y_acc, idx),
             f_rec(y_acc, y_true, idx),
             f_pre(y_acc, y_pred, idx),
             f_f1(y_acc, y_true, y_pred, idx)) for idx, cls in enumerate(classes)}

def f_acc(y_acc, index):
    """Return the agreement ratio for the label in column ``index``.

    ``y_acc`` is the element-wise sum of the true and predicted 0/1 indicator
    matrices, so a cell is 2 when both assign the label and 1 when only one does.
    The result is ``count(== 2) / count(> 0)``.

    Returns NaN (without a warning or ``ZeroDivisionError``) when the label is
    assigned by neither side, including for an input with zero rows.
    """
    column = y_acc[:, index]
    # Vectorised counts; the builtin sum() would iterate the column in Python.
    hits = np.sum(column == 2)
    support = np.sum(column > 0)
    if support == 0:
        return np.float64('nan')
    return hits / np.float64(support)

def f_rec(y_acc, y_true, index):
    """Return the recall for the label in column ``index``.

    ``y_acc`` is the element-wise sum of the true and predicted 0/1 indicator
    matrices (2 = both assign the label); ``y_true`` is the true indicator
    matrix.  The result is ``count(y_acc == 2) / count(y_true == 1)``.

    Returns NaN (without a warning or ``ZeroDivisionError``) when the label never
    occurs in ``y_true``, including for an input with zero rows.
    """
    # Vectorised counts; the builtin sum() would iterate the column in Python.
    hits = np.sum(y_acc[:, index] == 2)
    actual = np.sum(y_true[:, index] == 1)
    if actual == 0:
        return np.float64('nan')
    return hits / np.float64(actual)

def f_pre(y_acc, y_pred, index):
    """Return the precision for the label in column ``index``.

    ``y_acc`` is the element-wise sum of the true and predicted 0/1 indicator
    matrices (2 = both assign the label); ``y_pred`` is the predicted indicator
    matrix.  The result is ``count(y_acc == 2) / count(y_pred == 1)``.

    Returns NaN (without a warning or ``ZeroDivisionError``) when the label is
    never predicted, including for an input with zero rows.
    """
    # Vectorised counts; the builtin sum() would iterate the column in Python.
    hits = np.sum(y_acc[:, index] == 2)
    predicted = np.sum(y_pred[:, index] == 1)
    if predicted == 0:
        return np.float64('nan')
    return hits / np.float64(predicted)

def f_f1(y_acc, y_true, y_pred, index):
    """Return the F1 score (harmonic mean of recall and precision) for a label.

    Recall and precision come from ``f_rec`` and ``f_pre`` for column ``index``.
    When both are exactly zero the score is 0.0: the label occurs and is
    predicted, but never correctly, so the plain formula would divide 0 by 0.
    If either input metric is NaN (undefined), the result is NaN as well.
    """
    rec = f_rec(y_acc, y_true, index)
    pre = f_pre(y_acc, y_pred, index)
    # NaN + x == 0 is False, so undefined inputs fall through and stay NaN.
    if rec + pre == 0:
        return np.float64(0.0)
    return 2*rec*pre/(rec+pre)

def measure(data_size, X, y, n_folds=3, random_state=None):
    """Cross-validate the CRF on the first ``data_size`` samples.

    For every fold a model is trained with the module-level ``trainer``, written
    to ``m<fold>.crfsuite`` in the working directory, loaded into the
    module-level ``tagger`` and scored with ``f_score`` on both the training and
    the test part.  The per-fold scores are averaged per label.

    Parameters
    ----------
    data_size : int
        Number of samples handed to ``KFold``; its indices address ``X``/``y``.
    X, y : numpy.ndarray
        Per-sentence features and labels, indexable with integer index arrays.
    n_folds : int, optional
        Number of cross-validation folds (default 3, the former fixed value).
    random_state : optional
        Seed forwarded to ``KFold`` for a reproducible shuffle (default None,
        the former fixed value).

    Returns
    -------
    (dict, dict)
        Train and test scores, each ``{label: (acc, rec, pre, f1)}`` holding the
        mean over the folds in which the label occurred.
    """
    metric_names = ('acc', 'rec', 'pre', 'f1')

    # tag -> metric -> [score of fold 0, score of fold 1, ...]
    train_scores = defaultdict(lambda: defaultdict(list))
    test_scores = defaultdict(lambda: defaultdict(list))

    def append_scores(store, fold_score):
        # Explicit loops: map() is lazy on Python 3 and would record nothing.
        for tag, values in fold_score.items():
            for metric, value in zip(metric_names, values):
                store[tag][metric].append(value)

    def mean_scores(store):
        return {tag: tuple(np.mean(per_metric[metric]) for metric in metric_names)
                for tag, per_metric in store.items()}

    kf = cross_validation.KFold(n=data_size, n_folds=n_folds, shuffle=True,
                                random_state=random_state)

    for fold_idx, (train_index, test_index) in enumerate(kf):
        print('Iteration #%i' % fold_idx)

        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        # train
        model_name = 'm%i.crfsuite' % fold_idx
        try:
            for xseq, yseq in zip(X_train, y_train):
                trainer.append(xseq, yseq)
            trainer.train(model_name)
        finally:
            # Never let one fold's sequences leak into the next, even on error.
            trainer.clear()

        # predict
        tagger.open(model_name)
        try:
            y_train_pred = [tagger.tag(xseq) for xseq in X_train]
            y_test_pred = [tagger.tag(xseq) for xseq in X_test]
        finally:
            tagger.close()

        # evaluate
        append_scores(train_scores, f_score(y_train, y_train_pred))
        append_scores(test_scores, f_score(y_test, y_test_pred))

        # Release the fold's data before the next fold allocates its own.
        del X_train, y_train, X_test, y_test
        del y_train_pred, y_test_pred
        del train_index, test_index

    return mean_scores(train_scores), mean_scores(test_scores)

def bias_variance(X, y, start, stop, step):
    """Run ``measure`` for a range of data-set sizes and collect learning curves.

    Parameters
    ----------
    X, y : numpy.ndarray
        Per-sentence features and labels, forwarded to ``measure``.
    start, stop, step : int
        The sizes evaluated are ``np.arange(start, stop, step)``; ``stop`` itself
        is excluded.

    Returns
    -------
    data_sizes : numpy.ndarray
        The evaluated sizes.
    train_scores, test_scores : defaultdict
        ``scores[tag][name]`` is a list with one value per entry of
        ``data_sizes``, for ``name`` in ``'Error'`` (1 - acc), ``'Recall'``,
        ``'Precision'`` and ``'F1'``.  A tag that did not occur at some size gets
        NaN there, so every list stays aligned with ``data_sizes``.
    """
    data_sizes = np.arange(start, stop, step)

    per_size = []
    for data_size in data_sizes:
        print('Size %i' % data_size)
        per_size.append(measure(data_size, X, y))

    undefined = (float('nan'),) * 4

    def collect(size_scores):
        # Explicit loops: map() is lazy on Python 3 and would record nothing.
        store = defaultdict(lambda: defaultdict(list))
        tags = set()
        for size_score in size_scores:
            tags.update(size_score)
        for size_score in size_scores:
            for tag in tags:
                acc, rec, pre, f1 = size_score.get(tag, undefined)
                store[tag]['Error'].append(1 - acc)
                store[tag]['Recall'].append(rec)
                store[tag]['Precision'].append(pre)
                store[tag]['F1'].append(f1)
        return store

    train_scores = collect([train_score for train_score, _ in per_size])
    test_scores = collect([test_score for _, test_score in per_size])

    return data_sizes, train_scores, test_scores

In [None]:
# Run Profiler
#import prof
#reload(prof)
#%lprun -T lprof -f prof.measure prof.measure(60, X, y)
#%mprun -T mprof -f prof.measure prof.measure(100, X, y)
#%mprun -T mprof -f prof.bias_variance prof.bias_variance(X, y, 100, 300, 10)

In [None]:
# Learning-curve sweep over data-set sizes 50, 60, ..., 490 (np.arange excludes
# the stop value 500); every size runs a full cross-validation in measure().
data_sizes, train_scores, test_scores = bias_variance(X, y, 50, 500, 10)

In [None]:
def plot_bias_variance(data_sizes, train_scores, test_scores, tags):
    """Plot train and test error against data-set size for the given tags.

    Parameters
    ----------
    data_sizes : sequence
        X values, one per evaluated data-set size.
    train_scores, test_scores : mapping
        ``scores[tag]['Error']`` holds the y values for each tag.
    tags : sequence of str
        Tags to draw; each contributes a train curve and a test curve.
    """
    plt.figure(num=None, figsize=(6, 5))
    plt.ylim([0.0, 1.0])
    plt.xlabel('Data set size')
    plt.ylabel('Error')
    plt.title("Bias-Variance")

    # An explicit loop is required: map() is lazy on Python 3, so the former
    # map(lambda ...) call never drew anything there.
    legend_labels = []
    for tag in tags:
        plt.plot(data_sizes, train_scores[tag]['Error'],
                 data_sizes, test_scores[tag]['Error'], lw=1)
        # Same order as the two lines added by the plot call above.
        legend_labels.extend(['%s train error' % tag, '%s test error' % tag])

    plt.legend(legend_labels, loc="upper right")
    plt.grid(True, linestyle='-', color='0.75')
    plt.show()

In [None]:
def plot_score(data_sizes, scores, score_name, tags):
    """Plot one score against data-set size for the given tags.

    Parameters
    ----------
    data_sizes : sequence
        X values, one per evaluated data-set size.
    scores : mapping
        ``scores[tag][score_name]`` holds the y values for each tag.
    score_name : str
        Key of the score to draw; also used as the figure title.
    tags : sequence of str
        Tags to draw, one curve each; also used as the legend entries.
    """
    plt.figure(num=None, figsize=(6, 5))
    plt.ylim([0.0, 1.0])
    plt.xlabel('Data set size')
    plt.ylabel('Score')
    plt.title(score_name)

    # An explicit loop is required: map() is lazy on Python 3, so the former
    # map(lambda ...) call never drew anything there.
    for tag in tags:
        plt.plot(data_sizes, scores[tag][score_name], lw=1)

    plt.legend(tags, loc="upper right")
    plt.grid(True, linestyle='-', color='0.75')
    plt.show()

In [None]:
# Train vs. test error curves for the two tags of interest.
# NOTE(review): the 'OCCATION' spelling presumably mirrors the label strings in
# data.conll -- verify against the data before "correcting" it, since the tags
# are used as dictionary keys into the score tables.
plot_bias_variance(data_sizes,
                   train_scores,
                   test_scores,
                   tags=['B-OCCATION', 'I-OCCATION'])

In [None]:
# Test-set F1 per data-set size for the same two tags; 'F1' is one of the score
# names filled in by bias_variance ('Error', 'Recall', 'Precision', 'F1').
plot_score(data_sizes,
           test_scores,
           'F1',
           tags=['B-OCCATION', 'I-OCCATION'])