In [1]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [2]:
def add_history(docs):
    for doc in docs:
        all_tokens = []
        for sent in doc:
            all_tokens += sent
        for i in range(0, len(all_tokens), 1):
            for j in range(i - 1, max(0, i - 1000), -1):
                if all_tokens[i]['word.lower()'] == all_tokens[j]['word.lower()']:
                    all_tokens[i].update({key + '_history': value for key, value in all_tokens[j].items() if key != 'word.lower()'})
                    break

In [3]:
import reader
import scorer
import utils
import classifiers.sequence_classifier as sc

In [4]:
from itertools import chain
from sklearn.model_selection import cross_val_score

import nltk
import sklearn
import numpy as np

import sklearn_crfsuite

In [5]:
# Используем get_tags, потом разбиваем на предложения по размеру, потому что
# иначе есть пустые предложения, что ломает работу системы

dataset = reader.DataReader('./dataset', fileids='eng.train.txt', columntypes=('words', 'pos', 'chunk', 'ne'))
y_train = [el[1] for el in dataset.get_ne()]
x_train = dataset.get_tags(tags=['words', 'pos', 'chunk'])

x_train_sent_b, y_train_sent = [], []
index = 0
for sent in dataset.sents():
    length = len(sent)
    if length == 0:
        continue
    x_train_sent_b.append(x_train[index:index + length])
    y_train_sent.append(y_train[index:index + length])
    index += length
    
x_train_sent = [sent2features(s) for s in x_train_sent_b]

x_train_docs, y_train_docs = [], []
index = 0
for doc in dataset.docs():
    length = len(doc)
    if length == 0:
        continue
    x_train_docs.append(x_train_sent[index:index + length])
    y_train_docs.append(y_train_sent[index:index + length])
    index += length

In [6]:
test_dataset = reader.DataReader('./dataset', fileids='eng.testa.dev.txt', columntypes=('words', 'pos', 'chunk', 'ne'))
y_test = [el[1] for el in test_dataset.get_ne()]
x_test = test_dataset.get_tags(tags=['words', 'pos', 'chunk'])
x_test_sent_b, y_test_sent = [], []
index = 0
for sent in test_dataset.sents():
    length = len(sent)
    if length == 0:
        continue
    x_test_sent_b.append(x_test[index:index + length])
    y_test_sent.append(y_test[index:index + length])
    index += length
    
x_test_sent = [sent2features(s) for s in x_test_sent_b]

x_test_docs, y_test_docs = [], []
index = 0
for doc in test_dataset.docs():
    length = len(doc)
    if length == 0:
        continue
    x_test_docs.append(x_test_sent[index:index + length])
    y_test_docs.append(y_test_sent[index:index + length])
    index += length

In [7]:
%%time
crf = sc.SequenceClassifier(cls='CRF')
print(np.mean(cross_val_score(crf, x_train_docs, y_train_docs)))

0.831364803184
Wall time: 1min 31s


In [8]:
crf.fit(x_train_docs, y_train_docs)
print(crf.score(x_test_docs, y_test_docs))

0.8715919085312225


In [9]:
%%time
add_history(x_train_docs)
crf = sc.SequenceClassifier(cls='CRF')
print(np.mean(cross_val_score(crf, x_train_docs, y_train_docs)))

0.845314726668
Wall time: 4min 51s


In [10]:
add_history(x_test_docs)
crf.fit(x_train_docs, y_train_docs)
print(crf.score(x_test_docs, y_test_docs))

0.8835087719298246
