In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [3]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\Jan\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [13]:
print(nltk.corpus.conll2002.fileids())

train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

test_sents[0]

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']


[('La', 'DA', 'B-LOC'),
 ('Coruña', 'NC', 'I-LOC'),
 (',', 'Fc', 'O'),
 ('23', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFECOM', 'NP', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

In [18]:
def load_data(fname):
    idata = open(fname, 'rt', encoding='utf-8').readlines()
    sentences = []
    sentence = []
    for line in idata:
        if line == '\n':
            sentences.append(sentence)
            sentence = []
            continue
        if line.startswith('#'):
            continue
        word = line.replace('\n', '').split('\t')[1:]
        sentence.append(tuple(word))
    return sentences
        
    
train_sents = load_data('GermEval2014_complete_data/NER-de-train.tsv')
valid_sents = load_data('GermEval2014_complete_data/NER-de-dev.tsv')
test_sents = load_data('GermEval2014_complete_data/NER-de-test.tsv')

test_sents

[[('1951', 'O', 'O'),
  ('bis', 'O', 'O'),
  ('1953', 'O', 'O'),
  ('wurde', 'O', 'O'),
  ('der', 'O', 'O'),
  ('nördliche', 'O', 'O'),
  ('Teil', 'O', 'O'),
  ('als', 'O', 'O'),
  ('Jugendburg', 'O', 'O'),
  ('des', 'O', 'O'),
  ('Kolpingwerkes', 'B-OTH', 'O'),
  ('gebaut', 'O', 'O'),
  ('.', 'O', 'O')],
 [('Da', 'O', 'O'),
  ('Muck', 'B-PER', 'O'),
  ('das', 'O', 'O'),
  ('Kriegsschreiben', 'O', 'O'),
  ('nicht', 'O', 'O'),
  ('überbracht', 'O', 'O'),
  ('hat', 'O', 'O'),
  (',', 'O', 'O'),
  ('wird', 'O', 'O'),
  ('er', 'O', 'O'),
  ('als', 'O', 'O'),
  ('Retter', 'O', 'O'),
  ('des', 'O', 'O'),
  ('Landes', 'O', 'O'),
  ('ausgezeichnet', 'O', 'O'),
  ('und', 'O', 'O'),
  ('soll', 'O', 'O'),
  ('zum', 'O', 'O'),
  ('Schatzmeister', 'O', 'O'),
  ('ernannt', 'O', 'O'),
  ('werden', 'O', 'O'),
  ('.', 'O', 'O')],
 [('Mit', 'O', 'O'),
  ('1.', 'O', 'O'),
  ('Jänner', 'O', 'O'),
  ('2007', 'O', 'O'),
  ('wurde', 'O', 'O'),
  ('Robert', 'B-PER', 'O'),
  ('Schörgenhofer', 'I-PER', 'O'),
  

In [21]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [24]:
sent2features(train_sents[0])

[{'+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'sagte',
  'BOS': True,
  'bias': 1.0,
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': False,
  'word.lower()': 'schartau',
  'word[-2:]': 'au',
  'word[-3:]': 'tau'},
 {'+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'dem',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:word.lower()': 'schartau',
  'bias': 1.0,
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': 'sagte',
  'word[-2:]': 'te',
  'word[-3:]': 'gte'},
 {'+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': '"',
  '-1:word.istitle()': False,
  '-1:word.isupper()': False,
  '-1:word.lower()': 'sagte',
  'bias': 1.0,
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': 'dem',
  'word[-2:]': 'em',
  'word[-3:]': 'dem'},
 {'+1:word.istitle()': True,
  '+1:wor

In [23]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

X_train[0]

ValueError: too many values to unpack (expected 2)

In [10]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [11]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']

In [12]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.7964686316443963

In [13]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

      B-LOC      0.810     0.784     0.797      1084
      I-LOC      0.690     0.637     0.662       325
     B-MISC      0.731     0.569     0.640       339
     I-MISC      0.699     0.589     0.639       557
      B-ORG      0.807     0.832     0.820      1400
      I-ORG      0.852     0.786     0.818      1104
      B-PER      0.850     0.884     0.867       735
      I-PER      0.893     0.943     0.917       634

avg / total      0.809     0.787     0.796      6178

