In [None]:
%matplotlib inline

#%load_ext line_profiler
#%load_ext memory_profiler

In [None]:
from __future__ import division

import numpy as np
import matplotlib.pyplot as plt

In [None]:
from gensim import corpora

# Vocabulary built from every column of 'data.conll' except the last one on each line.
dic = corpora.Dictionary()
# The very first document starts with the empty token, so '' is registered in the dictionary.
tokens = ['']
# True while `tokens` holds lines that have not been handed to the dictionary yet.
pending = False
# `with` closes the file handle even if a line cannot be processed.
with open('data.conll', 'r') as conll_file:
    for line in conll_file:
        line = line.strip()
        if len(line) == 0:
            # A blank line ends the current sentence: register it and reuse the buffer.
            dic.add_documents([tokens])
            del tokens[:]
            pending = False
        else:
            # Drop the last whitespace-separated column (presumably the label -- confirm
            # against the data format) and keep the remaining columns as tokens.
            tokens.extend(line.split()[:-1])
            pending = True
# Register the final sentence when the file does not end with a blank line.
if pending:
    dic.add_documents([tokens])
    del tokens[:]
    pending = False

In [None]:
import codecs

import numpy as np

from sqlearn.crfsuite import crfutils
from sqlearn.crfsuite import ner

def _append_sentence(sent, sent_label, X, y):
    """Extract features for one buffered sentence and append the result to X and y.

    ``sent`` holds the sentence's ``'<token> <pos>'`` lines and ``sent_label`` the
    matching labels. ``sent``, ``X`` and ``y`` are modified in place.
    """
    # A newline entry is handed to readiter() after the sentence lines; presumably it
    # marks the end of the sequence -- confirm against crfutils.readiter.
    sent.append('\n')
    for item in crfutils.readiter(sent, ['w', 'pos'], ' ', dic.token2id):
        ner.feature_extractor(item)
        X.append(item)
        y.append(sent_label)


def _as_object_array(sequences):
    """Pack a list of variable-length sequences into a 1-D object array.

    ``np.asarray`` on ragged nested lists raises on recent NumPy releases and yields
    a multi-dimensional array when all lengths happen to match; filling the array
    element by element always gives one entry per sequence.
    """
    packed = np.empty(len(sequences), dtype=object)
    for position, sequence in enumerate(sequences):
        packed[position] = sequence
    return packed


def load_data(f):
    """Read a CoNLL-style file into per-sentence feature and label sequences.

    Every non-blank line must contain at least three space-separated columns:
    token, POS tag and label. Blank lines separate sentences; a final sentence
    that is not followed by a blank line is included as well.

    Parameters
    ----------
    f : str
        Path of the UTF-8 encoded input file.

    Returns
    -------
    X : numpy.ndarray, 1-D, dtype=object
        One entry per sentence: the list of ``'F'`` values of its token items,
        read after ``ner.feature_extractor`` has processed the sentence.
    y : numpy.ndarray, 1-D, dtype=object
        One list of labels per sentence, aligned with ``X``.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three columns.
    """
    X = []
    y = []
    sent = []
    sent_label = []
    # The context manager releases the file handle even when parsing fails.
    with codecs.open(f, 'r', 'utf-8') as conll_file:
        for line in conll_file:
            # Strip both LF and CRLF endings so labels never carry a stray '\r'.
            line = line.rstrip('\r\n')
            if line.strip() == '':
                _append_sentence(sent, sent_label, X, y)
                sent = []
                sent_label = []
            else:
                splited_line = line.split(' ')
                sent.append('%s %s' % (splited_line[0], splited_line[1]))
                sent_label.append(splited_line[2])
    # Flush the last sentence when the file does not end with a blank line.
    if sent:
        _append_sentence(sent, sent_label, X, y)

    X = [[token['F'] for token in sequence] for sequence in X]

    X = _as_object_array(X)
    y = _as_object_array(y)

    print(len(X))
    print(len(y))

    return X, y

In [None]:
%%time
# Parse 'data.conll' into per-sentence feature sequences (X) and label sequences (y).
X, y = load_data('data.conll')

In [None]:
# Sanity check: show the feature entry at token index 10 of the first sentence.
X[0][10]

In [None]:
#%%writefile prof.py
from __future__ import division

import numpy as np
import pycrfsuite as crf

from itertools import chain
from collections import defaultdict

from sklearn import cross_validation
from sklearn.preprocessing import LabelBinarizer

# Single tagger instance reused for every prediction pass.
tagger = crf.Tagger()

# Single trainer instance reused for every training run.
trainer = crf.Trainer(verbose=False)
trainer.set_params(dict([
    ('c1', 1.0),             # L1 penalty coefficient
    ('c2', 1e-3),            # L2 penalty coefficient
    ('max_iterations', 50),  # stop earlier
    ('num_memories', 3),
    # also include transitions that are possible but were not observed
    ('feature.possible_transitions', True),
]))
    
def error_score(y_true, y_pred):
    """Compute a per-label error rate from gold and predicted label sequences.

    Both arguments are flattened to token level. For every label that occurs in
    ``y_true`` the score is ``1 - both / either`` where ``both`` counts the
    positions carrying the label in the gold AND the predicted data, and
    ``either`` counts the positions carrying it in at least one of them.

    The counts are accumulated in one pass over the tokens, so the result does
    not depend on the number of distinct labels (one or two labels work too) and
    no dense indicator matrix is built.

    Parameters
    ----------
    y_true : iterable of label sequences
        Gold labels, one sequence per sentence.
    y_pred : iterable of label sequences
        Predicted labels; must contain the same total number of labels.

    Returns
    -------
    dict
        Maps each label found in ``y_true`` to its error rate (float in [0, 1]).
        Labels that appear only in ``y_pred`` get no entry of their own, but
        still count as misses for the gold label at that position. Empty input
        yields an empty dict.

    Raises
    ------
    ValueError
        If the flattened inputs differ in length.
    """
    true_flat = list(chain.from_iterable(y_true))
    pred_flat = list(chain.from_iterable(y_pred))
    if len(true_flat) != len(pred_flat):
        raise ValueError('y_true has %d labels but y_pred has %d'
                         % (len(true_flat), len(pred_flat)))

    both = defaultdict(int)    # label -> positions where gold and prediction agree on it
    either = defaultdict(int)  # label -> positions where gold or prediction carries it
    for gold, guess in zip(true_flat, pred_flat):
        if gold == guess:
            both[gold] += 1
            either[gold] += 1
        else:
            either[gold] += 1
            either[guess] += 1

    # Every gold label has either[label] >= 1, so the division is always defined;
    # float() keeps true division even without `from __future__ import division`.
    return {label: 1.0 - both.get(label, 0) / float(either[label])
            for label in set(true_flat)}

def _evaluate_fold(fold_idx, X, y, train_index, test_index):
    """Train on one fold, tag both partitions and return (train_error, test_error).

    Running a fold inside its own function releases the fold's slices and
    predictions as soon as it returns.
    """
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # train
    model_name = 'm%i.crfsuite' % fold_idx
    try:
        for xseq, yseq in zip(X_train, y_train):
            trainer.append(xseq, yseq)
        trainer.train(model_name)
    finally:
        # Always empty the shared trainer so a failed fold cannot leak its
        # sequences into the next training run.
        trainer.clear()

    # predict
    tagger.open(model_name)
    try:
        y_train_pred = [tagger.tag(xseq) for xseq in X_train]
        y_test_pred = [tagger.tag(xseq) for xseq in X_test]
    finally:
        tagger.close()

    # evaluate
    return error_score(y_train, y_train_pred), error_score(y_test, y_test_pred)


def measure(data_size, X, y, n_folds=3, random_state=None):
    """Cross-validate the CRF on the first ``data_size`` sequences.

    The shared module-level ``trainer`` and ``tagger`` are used; each fold's model
    is trained and opened under the name ``'m<fold>.crfsuite'``.

    Parameters
    ----------
    data_size : int
        Number of leading sequences of ``X``/``y`` to use.
    X, y : numpy.ndarray
        Feature and label sequences; must support indexing with an index array.
    n_folds : int, optional
        Number of cross-validation folds (default 3).
    random_state : optional
        Forwarded to ``KFold`` to make the shuffling reproducible (default None).

    Returns
    -------
    (dict, dict)
        Mean train error and mean test error per label, averaged over the folds
        in which ``error_score`` reported that label.

    Raises
    ------
    ValueError
        If ``data_size`` exceeds the number of available sequences.
    """
    if data_size > len(X):
        raise ValueError('data_size=%d exceeds the %d available sequences'
                         % (data_size, len(X)))

    train_errors = defaultdict(list)
    test_errors = defaultdict(list)

    kf = cross_validation.KFold(n=data_size, n_folds=n_folds, shuffle=True,
                                random_state=random_state)

    for fold_idx, (train_index, test_index) in enumerate(kf):
        print('Iteration #%i' % fold_idx)

        train_error, test_error = _evaluate_fold(fold_idx, X, y, train_index, test_index)

        # Explicit loops: map() is lazy on Python 3 and would silently record nothing.
        for tag, error in train_error.items():
            train_errors[tag].append(error)
        for tag, error in test_error.items():
            test_errors[tag].append(error)

    mean_train = {tag: np.mean(errors) for tag, errors in train_errors.items()}
    mean_test = {tag: np.mean(errors) for tag, errors in test_errors.items()}
    return mean_train, mean_test

def _record_errors(curves, errors, position):
    """Append one measurement per tag to ``curves`` at index ``position``.

    A tag that was absent at earlier data sizes is back-filled with NaN so its
    curve stays aligned with the list of data sizes.
    """
    for tag, error in errors.items():
        curve = curves[tag]
        curve.extend([np.nan] * (position - len(curve)))
        curve.append(error)


def bias_variance(X, y, start, stop, step):
    """Measure train and test error for a range of training-set sizes.

    ``measure`` is called once for every size in ``np.arange(start, stop, step)``.

    Parameters
    ----------
    X, y : numpy.ndarray
        Feature and label sequences, passed through to ``measure``.
    start, stop, step : int
        Define the evaluated data sizes (``stop`` is exclusive).

    Returns
    -------
    data_sizes : numpy.ndarray
        The evaluated sizes.
    train_errors, test_errors : defaultdict(list)
        Per tag, one error value for each entry of ``data_sizes``; NaN marks
        sizes at which ``measure`` reported no value for that tag.
    """
    data_sizes = np.arange(start, stop, step)
    train_errors = defaultdict(list)
    test_errors = defaultdict(list)

    for position, data_size in enumerate(data_sizes):
        print('Size %i' % data_size)
        train_error, test_error = measure(data_size, X, y)

        # Explicit bookkeeping: map() is lazy on Python 3 and would record nothing.
        _record_errors(train_errors, train_error, position)
        _record_errors(test_errors, test_error, position)

    # Pad tags missing at the largest sizes so every curve has len(data_sizes) points.
    for curves in (train_errors, test_errors):
        for curve in curves.values():
            curve.extend([np.nan] * (len(data_sizes) - len(curve)))

    return data_sizes, train_errors, test_errors

In [None]:
# Run Profiler
#import prof
#reload(prof)
#%lprun -T lprof -f prof.measure prof.measure(60, X, y)
#%mprun -T mprof -f prof.measure prof.measure(100, X, y)
#%mprun -T mprof -f prof.bias_variance prof.bias_variance(X, y, 100, 300, 10)

In [None]:
data_sizes, train_errors, test_errors = bias_variance(X, y, 50, 500, 10)

In [None]:
def plot_bias_variance(data_sizes, train_errors, test_errors, tags):
    """Plot train and test error against data-set size for the given tags.

    Parameters
    ----------
    data_sizes : sequence
        X values shared by all curves.
    train_errors, test_errors : mapping
        Map each tag to its error values, one per entry of ``data_sizes``.
    tags : iterable of str
        Tags to draw; each contributes a train curve and a test curve.
    """
    plt.figure(num=None, figsize=(6, 5))
    plt.ylim([0.0, 1.0])
    plt.xlabel('Data set size')
    plt.ylabel('Error')
    plt.title("Bias-Variance")
    # An explicit loop is required: map() is lazy on Python 3 and would draw nothing.
    # Each line carries its own label so the legend cannot get out of step with
    # the order in which the curves are drawn.
    for tag in tags:
        plt.plot(data_sizes, train_errors[tag], lw=1, label='%s train error' % tag)
        plt.plot(data_sizes, test_errors[tag], lw=1, label='%s test error' % tag)
    plt.legend(loc="upper right")
    plt.grid(True, linestyle='-', color='0.75')
    plt.show()

In [None]:
# Train/test error curves for the 'B-OCCATION' and 'I-OCCATION' tags.
plot_bias_variance(data_sizes, train_errors, test_errors,
                   tags=['B-OCCATION', 'I-OCCATION'])

In [None]:
# Train/test error curves for the 'B-LOCATION' and 'I-LOCATION' tags.
plot_bias_variance(data_sizes, train_errors, test_errors,
                   tags=['B-LOCATION', 'I-LOCATION'])

In [None]:
# Train/test error curves for the 'B-ADDRESS' and 'I-ADDRESS' tags.
plot_bias_variance(data_sizes, train_errors, test_errors,
                   tags=['B-ADDRESS', 'I-ADDRESS'])

In [None]:
# Train/test error curves for the 'B-DATE_PERIOD' and 'I-DATE_PERIOD' tags.
plot_bias_variance(data_sizes, train_errors, test_errors,
                   tags=['B-DATE_PERIOD', 'I-DATE_PERIOD'])