In [16]:
import reader
import scorer
import utils
import classifiers.sequence_classifier as sc

In [17]:
from itertools import chain
from sklearn.model_selection import cross_val_score

import nltk
import sklearn
import numpy as np

import sklearn_crfsuite

In [18]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [19]:
def add_history(docs):
    for doc in docs:
        all_tokens = []
        for sent in doc:
            all_tokens += sent
        for i in range(0, len(all_tokens), 1):
            for j in range(i - 1, max(0, i - 1000), -1):
                if all_tokens[i]['word.lower()'] == all_tokens[j]['word.lower()']:
                    all_tokens[i].update({key + '_history': value for key, value in all_tokens[j].items() if key != 'word.lower()'})
                    break

In [20]:
x_train_docs, y_train_docs = utils.docs_from_dataset('./dataset', 'eng.train.txt', 
                                         ('words', 'pos', 'chunk', 'ne'), ['words', 'pos', 'chunk'], sent2features)

In [21]:
x_testa_docs, y_testa_docs = utils.docs_from_dataset('./dataset', 'eng.testa.dev.txt', 
                                         ('words', 'pos', 'chunk', 'ne'), ['words', 'pos', 'chunk'], sent2features)

In [22]:
x_testb_docs, y_testb_docs = utils.docs_from_dataset('./dataset', 'eng.testb.test.txt', 
                                         ('words', 'pos', 'chunk', 'ne'), ['words', 'pos', 'chunk'], sent2features)

In [23]:
%%time
crf = sc.SequenceClassifier(cls='CRF')
print(np.mean(cross_val_score(crf, x_train_docs, y_train_docs)))

0.831364803184
Wall time: 1min 21s


In [24]:
%%time
crf = sc.SequenceClassifier(cls='CRF')
crf.fit(x_train_docs, y_train_docs)

Wall time: 40.4 s


In [25]:
%%time
print('Результаты на TESTA')
crf.get_full_score(x_testa_docs, y_testa_docs)

Результаты на TESTA
label    precision    recall    f1-score

PER      0.8943       0.8958    0.8951
ORG      0.8357       0.7971    0.8160
LOC      0.9154       0.8709    0.8926
MISC     0.9041       0.8258    0.8632

total    0.8889       0.8549    0.8716
Wall time: 507 ms


In [26]:
%%time
print('Результаты на TESTB')
crf.get_full_score(x_testb_docs, y_testb_docs)

Результаты на TESTB
label    precision    recall    f1-score

PER      0.8286       0.8558    0.8419
ORG      0.7588       0.7271    0.7426
LOC      0.8599       0.8001    0.8290
MISC     0.8342       0.7373    0.7827

total    0.8174       0.7863    0.8015
Wall time: 479 ms


In [27]:
%%time
add_history(x_train_docs)
crf = sc.SequenceClassifier(cls='CRF')
print(np.mean(cross_val_score(crf, x_train_docs, y_train_docs)))

0.845314726668
Wall time: 4min 10s


In [28]:
%%time
crf = sc.SequenceClassifier(cls='CRF')
crf.fit(x_train_docs, y_train_docs)

Wall time: 1min 59s


In [29]:
%%time
print('Результаты на TESTA + HISTORY')
add_history(x_testa_docs)
crf.get_full_score(x_testa_docs, y_testa_docs)

Результаты на TESTA + HISTORY
label    precision    recall    f1-score

PER      0.9284       0.9320    0.9302
ORG      0.8378       0.8092    0.8233
LOC      0.9141       0.8737    0.8935
MISC     0.8957       0.8202    0.8563

total    0.8986       0.8689    0.8835
Wall time: 4.47 s


In [30]:
%%time
print('Результаты на TESTB + HISTORY')
add_history(x_testb_docs)
crf.get_full_score(x_testb_docs, y_testb_docs)

Результаты на TESTB + HISTORY
label    precision    recall    f1-score

PER      0.8537       0.8983    0.8754
ORG      0.7746       0.7308    0.7521
LOC      0.8577       0.8158    0.8362
MISC     0.8424       0.7446    0.7904

total    0.8306       0.8049    0.8176
Wall time: 2.64 s
