In [2]:
def word2features(sent, i):
    """Return the feature dict describing the token at position ``i`` of ``sent``.

    Every element of ``sent`` is read positionally: ``[0]`` is the word string
    and ``[1]`` is the POS-tag string; any further fields are ignored.

    The result contains a constant bias, case/digit/suffix features of the
    current word, its POS tag (full and first two characters), and a reduced
    set of the same features for the previous (``-1:`` keys) and next
    (``+1:`` keys) tokens. Where a neighbour does not exist, a ``BOS`` or
    ``EOS`` flag is set instead.

    Args:
        sent: Indexable, sized sequence of token records.
        i: Index of the token to describe.

    Returns:
        dict mapping feature names to str/bool/float values.
    """
    def neighbour_features(prefix, entry):
        # Context features of an adjacent token; ``prefix`` marks its offset.
        near_word, near_tag = entry[0], entry[1]
        return {
            prefix + 'word.lower()': near_word.lower(),
            prefix + 'word.istitle()': near_word.istitle(),
            prefix + 'word.isupper()': near_word.isupper(),
            prefix + 'postag': near_tag,
            prefix + 'postag[:2]': near_tag[:2],
        }

    token, tag = sent[i][0], sent[i][1]

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        feats.update(neighbour_features('-1:', sent[i - 1]))

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        feats.update(neighbour_features('+1:', sent[i + 1]))

    return feats


def sent2features(sent):
    """Return the ``word2features`` dict for every position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third field) of each ``(token, postag, label)`` triple in ``sent``.

    Every element must unpack into exactly three fields.
    """
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of each ``(token, postag, label)`` triple in ``sent``.

    Every element must unpack into exactly three fields.
    """
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [3]:
import reader
import scorer
import utils
import classifiers.sequence_classifier as sc

In [4]:
from itertools import chain
from sklearn.model_selection import cross_val_score

import nltk
import sklearn
import numpy as np

import sklearn_crfsuite

In [5]:
def split_by_lengths(items, lengths):
    """Cut ``items`` into consecutive slices whose sizes are given by ``lengths``.

    Zero lengths are skipped, so no empty slice is ever emitted. Items beyond
    ``sum(lengths)`` are silently left out.

    Args:
        items: Flat, sliceable sequence.
        lengths: Iterable of non-negative slice sizes, in order.

    Returns:
        list of slices of ``items``, one per non-zero length.
    """
    chunks = []
    start = 0
    for length in lengths:
        if length == 0:
            continue
        chunks.append(items[start:start + length])
        start += length
    return chunks


# Training split. x holds one (word, pos, chunk) record per token and y one
# label per token (second field of each get_ne() item) -- presumably aligned
# one-to-one; verify against reader.DataReader.
dataset = reader.DataReader('./dataset', fileids='eng.train.txt', columntypes=('words', 'pos', 'chunk', 'ne'))
y = [el[1] for el in dataset.get_ne()]
x = dataset.get_tags(tags=['words', 'pos', 'chunk'])

# Flat token streams -> one list per non-empty sentence.
sent_lengths = [len(sent) for sent in dataset.sents()]
x_sent_base = split_by_lengths(x, sent_lengths)
y_sent = split_by_lengths(y, sent_lengths)

# Only the word and POS fields are read by word2features; chunk is carried but unused.
x_sent = [sent2features(s) for s in x_sent_base]

# Per-sentence lists -> one list per non-empty document. len(doc) is used as a
# sentence count.
# NOTE(review): this assumes docs() never counts sentences that were skipped
# above as empty -- confirm in reader.DataReader.
doc_lengths = [len(doc) for doc in dataset.docs()]
x_docs = split_by_lengths(x_sent, doc_lengths)
y_docs = split_by_lengths(y_sent, doc_lengths)

In [6]:
%%time
crf = sc.SequenceClassifier(cls='CRF')
print(np.mean(cross_val_score(crf, x_docs, y_docs)))

0.831364803184
Wall time: 1min 8s


In [9]:
# Development split, read with the same column layout as the training file.
test_dataset = reader.DataReader(
    './dataset',
    fileids='eng.testa.dev.txt',
    columntypes=('words', 'pos', 'chunk', 'ne'),
)
y_test = [el[1] for el in test_dataset.get_ne()]
x_test = test_dataset.get_tags(tags=['words', 'pos', 'chunk'])

# Regroup the flat per-token lists into sentences; empty sentences are dropped.
x_test_sent_b = []
y_test_sent = []
index = 0
for sent in test_dataset.sents():
    length = len(sent)
    if length != 0:
        x_test_sent_b.append(x_test[index:index + length])
        y_test_sent.append(y_test[index:index + length])
        index += length

x_test_sent = list(map(sent2features, x_test_sent_b))

# Regroup sentences into documents; len(doc) is used as a sentence count.
# NOTE(review): assumes docs() never counts sentences dropped above -- confirm.
x_test_docs = []
y_test_docs = []
index = 0
for doc in test_dataset.docs():
    length = len(doc)
    if length != 0:
        x_test_docs.append(x_test_sent[index:index + length])
        y_test_docs.append(y_test_sent[index:index + length])
        index += length

In [10]:
# Fit on the complete training set, then report the classifier's own score on
# the held-out development documents.
# NOTE(review): the metric is whatever sc.SequenceClassifier.score computes --
# not visible in this notebook; confirm before comparing with the CV number.
crf.fit(x_docs, y_docs)
print(crf.score(x_test_docs, y_test_docs))

0.8715919085312225
