## Посмотрим, какие признаки генерируются с использованием истории и без неё:

### 1) Без истории

In [1]:
import features
import reader
import os

from classifiers.token_classifier import TokenClassifier

In [2]:
# Readers for the training and test-b files under ./dataset; both are read
# with the same four columns: words, pos, chunk and ne.
dataset_train = reader.DataReader('./dataset', fileids='eng.train.txt',
                            columntypes=('words', 'pos', 'chunk', 'ne'))
dataset_testb = reader.DataReader('./dataset', fileids='eng.testb.test.txt',
                            columntypes=('words', 'pos', 'chunk', 'ne'))
# Feature generator for the "no history" run (history=False). It is given only
# the words / pos / chunk columns; the 'ne' column is not passed here.
# NOTE(review): the meaning of context_len, rare_count, min_weight and rewrite
# is defined in features.Generator, which is not visible in this notebook.
gen = features.Generator(columntypes=('words', 'pos', 'chunk'), context_len=2, language='en',
                         rare_count=5, min_weight=0.95, rewrite=True, history=False)

In [3]:
def docs_from_dataset_tokens(dataset, tags=None):
    """Group a dataset's token tags and labels by sentence and by document.

    Args:
        dataset: Object exposing ``get_ne()``, ``get_tags(tags=...)``,
            ``sents()`` and ``docs()``.
        tags: Column names forwarded to ``dataset.get_tags``. When omitted,
            ``['words', 'pos', 'chunk']`` is used.

    Returns:
        A pair ``(x_docs, y_docs)``. Each is a list of documents, each
        document a list of sentences, and each sentence a slice of the flat
        sequence returned by ``get_tags`` (for ``x_docs``) or of the labels
        taken from ``get_ne`` (for ``y_docs``).
    """
    if tags is None:
        # Build the default per call: a list literal in the signature is
        # created once and shared, so a callee mutating it would leak the
        # change into every later call.
        tags = ['words', 'pos', 'chunk']

    # The label is the second item of every entry returned by get_ne().
    y = [el[1] for el in dataset.get_ne()]
    x = dataset.get_tags(tags=tags)

    # Cut the flat token sequences into sentences; empty sentences are skipped.
    x_sent, y_sent = [], []
    index = 0
    for sent in dataset.sents():
        length = len(sent)
        if length == 0:
            continue
        x_sent.append(x[index:index + length])
        y_sent.append(y[index:index + length])
        index += length

    # Cut the sentence lists into documents; empty documents are skipped.
    x_docs, y_docs = [], []
    index = 0
    for doc in dataset.docs():
        length = len(doc)
        if length == 0:
            continue
        x_docs.append(x_sent[index:index + length])
        y_docs.append(y_sent[index:index + length])
        index += length
    return x_docs, y_docs

def xdocs_from_x_dataset(x, dataset):
    """Regroup a flat, token-aligned sequence into documents of sentences.

    ``x`` is first cut into consecutive slices whose sizes are the lengths of
    ``dataset.sents()``; those slices are then grouped into consecutive runs
    whose sizes are the lengths of ``dataset.docs()``. Zero-length sentences
    and documents produce no entry.

    Returns:
        A list of documents, each a list of per-sentence slices of ``x``.
    """
    def _cut(flat, groups):
        # Slice `flat` into consecutive pieces, one per non-empty group.
        pieces = []
        start = 0
        for size in map(len, groups):
            if not size:
                continue
            stop = start + size
            pieces.append(flat[start:stop])
            start = stop
        return pieces

    per_sentence = _cut(x, dataset.sents())
    return _cut(per_sentence, dataset.docs())

In [4]:
%%time
# Training split: group tokens and labels into documents, fit the generator and
# produce features, then regroup the generated features by sentence/document.
# NOTE(review): the .npz file names say "history" although `gen` was created
# with history=False in this section — confirm the paths are the intended ones.
x_docs_train, y_docs_train = docs_from_dataset_tokens(dataset_train)
x_feat_train = gen.fit_generate(x_docs_train, y_docs_train, "./prepared_data/conll_train_history.npz")
x_docs_feat_train = xdocs_from_x_dataset(x_feat_train, dataset_train)

# Test-b split: same steps, but features come from generate() on the generator
# fitted above; the labels are kept only for scoring.
x_docs_testb, y_docs_testb = docs_from_dataset_tokens(dataset_testb)
x_feat_testb = gen.generate(x_docs_testb, "./prepared_data/conll_testb_history.npz")
x_docs_feat_testb = xdocs_from_x_dataset(x_feat_testb, dataset_testb)

# Fit the token-level classifier on the document-grouped training features.
clf = TokenClassifier(cls='XGBClassifier', learning_rate=0.3, max_depth=14, colsample_bytree=0.5, colsample_bylevel=0.5)
clf.fit(x_docs_feat_train, y_docs_train)

Осталось [20, 321, 319, 240, 395, 9, 331, 324, 112, 326, 322, 320, 329, 66, 95, 8409, 347, 341, 276, 339, 4403, 16414, 323, 19, 193, 101, 12408, 158, 334, 343, 281, 258, 332, 346, 204, 187, 336, 338, 344, 342, 312, 55, 147, 317, 337, 14, 327, 328, 294, 16388, 335, 263, 111, 157, 10, 7512, 345, 4391, 21, 49, 22, 52, 4377, 340, 259, 96, 203, 59, 8139, 325, 316, 241, 56, 262, 13, 299, 60, 330, 65, 51, 245, 280, 348, 8381, 16144, 8383, 128, 352, 26, 298, 333, 18430, 197, 351, 295, 160, 105, 4386, 349, 206, 151, 277, 114, 98, 12391, 97, 7499, 194, 267, 148, 117, 8384, 189, 16397, 318, 8392, 2124, 12396, 198, 106, 220, 152, 143, 313, 7168, 244, 8397, 350, 16402, 99, 145, 53, 8530, 8494, 4387, 190, 144, 174, 5656, 8412, 68, 113, 219, 10341, 54, 45, 13600, 191, 19873, 132, 142, 82, 11871, 100, 1874, 212, 12384, 50, 15173, 4684, 1224, 12411, 4378, 188, 222, 192, 141, 11499, 15504, 16417, 137, 4665, 2076, 4406, 1358, 5640, 14433, 890, 303, 20141, 6341, 226, 1650, 18249, 8673, 6082, 5595, 1460, 1

In [5]:
# Score the classifier on the test-b split; the per-label precision / recall /
# f1 table recorded below is this cell's output.
clf.get_full_score(x_docs_feat_testb, y_docs_testb)
# Feature classes as exposed by the fitted generator.
features_classes = gen.features_classes_
# Per-feature scores from the underlying booster, by 'gain' and by 'weight'.
# NOTE(review): `_Booster` is a private attribute of clf.obj (presumably an
# xgboost model — confirm); prefer a public accessor if one is available.
features_importances = clf.obj._Booster.get_score(importance_type='gain')
features_counts = clf.obj._Booster.get_score(importance_type='weight')

label    precision    recall    f1-score

PER      0.7425       0.8300    0.7838
ORG      0.6690       0.6921    0.6804
LOC      0.7894       0.8152    0.8021
MISC     0.7432       0.6763    0.7082

total    0.7345       0.7652    0.7495


In [6]:
import numpy

# Write one row per generated feature to out.txt: class name, gain, weight.
#
# The file is opened once, outside the loop: reopening it in a truncating mode
# for every row would leave only the last row in the file.
#
# The two score objects are looked up by key instead of being zipped, because
# iterating a dict yields its keys (not the scores) and the score dicts hold
# fewer entries than features_classes, so positional pairing misaligns rows.
# NOTE(review): assumes the score dicts are keyed 'f<column index>' (XGBoost's
# default naming when no feature names are supplied) and that features_classes
# is ordered by feature column — confirm. Features without a score get 0.
with open('out.txt', 'w', encoding='utf-8') as file:
    for idx, name in enumerate(features_classes):
        key = f'f{idx}'
        gain = features_importances.get(key, 0)
        weight = features_counts.get(key, 0)
        file.write(f'{name:15} - {gain:10} - {weight:10}\n')

In [7]:
# Number of feature classes exposed by the generator (4344 in the recorded output).
print(len(features_classes))

4344


In [8]:
# Number of features that have a 'gain' score (2918 in the recorded output,
# i.e. fewer than the number of feature classes).
print(len(features_importances))

2918


In [9]:
# Number of features that have a 'weight' score (2918 in the recorded output).
print(len(features_counts))

2918


In [20]:
# Shape of the feature block for the first sentence of the first training
# document ((9, 4344) in the recorded output).
print(x_docs_feat_train[0][0].shape)

(9, 4344)
