# Sequence Tagging with CRF

Here is the code for the baseline system. The System achieves a score of 0.54 points. With just a few additional Features the score can be improved by 0.2 points. 

In [1]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV
import numpy as np

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics



In [2]:
def load_data(fname):
    idata = open(fname, 'rt', encoding='utf-8').readlines()
    sentences = []
    sentence = []
    for line in idata:
        if line == '\n':
            sentences.append(sentence)
            sentence = []
            continue
        if line.startswith('#'):
            continue
        word = line.replace('\n', '').split('\t')[1:4]
        if not len(word) == 3:
            print(word)
        sentence.append(tuple(word))
    return sentences
        
    
train_sents = load_data('GermEval2014_complete_data/NER-de-train.tsv')
valid_sents = load_data('GermEval2014_complete_data/NER-de-dev.tsv')
test_sents = load_data('GermEval2014_complete_data/NER-de-test.tsv')


In [3]:
#load brown clusters

paths = open('paths', 'rt', encoding='utf-8').readlines()
word2cluster_id = {}
for line in paths:
    line = line.replace('\n', '')
    sline = line.split('\t')
    cluster_id = sline[0]
    lemma = sline[1]
    freq = sline[2]
    word2cluster_id[lemma] = cluster_id
    

# Feature Extraction

The features are extracted on a word-level. At the moment, we use the lowercase surface form of the word, and some boolean fields that indicate if the word is uppercased, a digit or if it is a title. Additionaly, there are some dummy-features, which indicate how to incorporate vector-features.

In [4]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()
    }
    #if you want to use embeddigns, this is the way to incorporate them into the dictionary.
    for i in range(5):
        features['super_dummy_feature_{}'.format(i)] = 1.0

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label, _ in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [5]:
# an example of a featurized sentence.
sent2features(train_sents[0])

[{'bias': 1.0,
  'super_dummy_feature_0': 1.0,
  'super_dummy_feature_1': 1.0,
  'super_dummy_feature_2': 1.0,
  'super_dummy_feature_3': 1.0,
  'super_dummy_feature_4': 1.0,
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': False,
  'word.lower()': 'schartau'},
 {'bias': 1.0,
  'super_dummy_feature_0': 1.0,
  'super_dummy_feature_1': 1.0,
  'super_dummy_feature_2': 1.0,
  'super_dummy_feature_3': 1.0,
  'super_dummy_feature_4': 1.0,
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': 'sagte'},
 {'bias': 1.0,
  'super_dummy_feature_0': 1.0,
  'super_dummy_feature_1': 1.0,
  'super_dummy_feature_2': 1.0,
  'super_dummy_feature_3': 1.0,
  'super_dummy_feature_4': 1.0,
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': 'dem'},
 {'bias': 1.0,
  'super_dummy_feature_0': 1.0,
  'super_dummy_feature_1': 1.0,
  'super_dummy_feature_2': 1.0,
  'super_dummy_feature_3': 1.0,
  'su

In [6]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_dev = [sent2features(s) for s in valid_sents]
y_dev = [sent2labels(s) for s in valid_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [7]:
#run the training. Note that you can (and should) use the CV and GridSearchCV using the crfsuite.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [8]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-PER',
 'B-ORG',
 'I-PER',
 'B-LOC',
 'I-ORG',
 'B-LOCderiv',
 'B-ORGpart',
 'B-OTH',
 'I-OTH',
 'I-LOCderiv',
 'B-PERpart',
 'I-ORGpart',
 'B-LOCpart',
 'I-LOC',
 'B-OTHderiv',
 'B-PERderiv',
 'B-OTHpart',
 'I-OTHpart',
 'I-OTHderiv',
 'B-ORGderiv',
 'I-PERpart',
 'I-LOCpart',
 'I-PERderiv']

In [9]:
#predict the labels on the test set and print the f1-score. We expect you to report this score.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.5415890308983848

In [10]:
#There is also the possibility to print a report of the scores you achieved. 
#We expect you to report these scores for your final system.

sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

      B-LOC      0.866     0.460     0.601      1706
      I-LOC      0.578     0.307     0.401       303
 B-LOCderiv      0.833     0.611     0.705       561
 I-LOCderiv      0.000     0.000     0.000         4
  B-LOCpart      0.875     0.064     0.120       109
  I-LOCpart      0.000     0.000     0.000         0
      B-ORG      0.773     0.433     0.555      1150
      I-ORG      0.636     0.481     0.548       698
 B-ORGderiv      0.000     0.000     0.000         8
  B-ORGpart      0.800     0.023     0.045       172
  I-ORGpart      0.000     0.000     0.000         5
      B-OTH      0.784     0.339     0.473       697
      I-OTH      0.356     0.239     0.286       866
 B-OTHderiv      0.857     0.308     0.453        39
 I-OTHderiv      0.000     0.000     0.000         0
  B-OTHpart      0.750     0.071     0.130        42
  I-OTHpart      0.000     0.000     0.000         0
      B-PER      0.839     0.457     0.591   

In [11]:
#load TextBerg 
text_berg = open('TextBerg10Saetze.tsv', 'rt', encoding='utf-8').readlines()
berg_sentences = []
sentence = []
for line in text_berg:
    if line == '\n':
        berg_sentences.append(sentence)
        sentence = []
        continue
        
    sentence.append(line.replace('\n', ''))


In [12]:
X_berg = [sent2features(s) for s in berg_sentences]

y_berg_pred = crf.predict(X_berg)

In [13]:
y_berg_pred

[['B-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'B-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'I-OTH',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O