## Посмотрим, какие признаки генерируются с использованием истории и без неё

### 1) Без истории

In [1]:
import features
import reader
import os

from utils import docs_from_dataset_tokens, xdocs_from_x_dataset
from classifiers.token_classifier import TokenClassifier

# The train and testb files live in ./dataset and are read with the same
# four-column layout, so both readers come from one expression
# (train is constructed first, then testb).
dataset_train, dataset_testb = (
    reader.DataReader(
        './dataset',
        fileids=split_file,
        columntypes=('words', 'pos', 'chunk', 'ne'))
    for split_file in ('eng.train.txt', 'eng.testb.test.txt')
)

# Feature generator for section 1 (history disabled).
# NOTE(review): 'ne' is not among the generator's column types, presumably
# because it is the target label rather than an input feature; confirm.
gen = features.Generator(
    history=False,
    columntypes=('words', 'pos', 'chunk'),
    language='en',
    context_len=2,
    rare_count=5,
    min_weight=0.975,
    rewrite=True)

In [2]:
%%time
# Training split: fit the feature generator on it and produce its features.
# NOTE(review): the .npz path is presumably where the generated matrix is
# stored; confirm against features.Generator.
x_docs_train, y_docs_train = docs_from_dataset_tokens(dataset_train)
x_feat_train = gen.fit_generate(
    x_docs_train,
    y_docs_train,
    "./prepared_data/conll_train.npz",
)
x_docs_feat_train = xdocs_from_x_dataset(x_feat_train, dataset_train)

# Test split: features come from the already fitted generator (no refit).
x_docs_testb, y_docs_testb = docs_from_dataset_tokens(dataset_testb)
x_feat_testb = gen.generate(
    x_docs_testb,
    "./prepared_data/conll_testb.npz",
)
x_docs_feat_testb = xdocs_from_x_dataset(x_feat_testb, dataset_testb)

# Token-level classifier trained on the features generated without history.
clf = TokenClassifier(
    cls='XGBClassifier',
    learning_rate=0.3, max_depth=14,
    colsample_bytree=0.5, colsample_bylevel=0.5,
)
clf.fit(x_docs_feat_train, y_docs_train)

# Blank line between the output printed above and the score report.
print()

# Kept as the last expression so the notebook displays the per-label scores.
clf.get_full_score(x_docs_feat_testb, y_docs_testb)

Признаков в исходном виде: 30
Обработано 203621 токенов!
Признаков осталось: 6451

label    precision    recall    f1-score

PER      0.7520       0.8435    0.7951
ORG      0.7019       0.6988    0.7003
LOC      0.7869       0.8214    0.8038
MISC     0.7393       0.6749    0.7056

total    0.7465       0.7727    0.7594
Wall time: 7min 15s


In [3]:
import numpy

# Labels of the generated features, taken from the fitted generator.
# NOTE(review): assumed to be in the same order as the model's feature
# columns, since they are zipped positionally below; confirm.
features_classes = gen.features_classes_

booster = clf.obj.get_booster()

# get_score() omits features that were never used in a split, hence the
# .get(..., 0.) lookups below.
features_counts = booster.get_score(importance_type='weight')
features_gains = booster.get_score(importance_type='gain')

# On recent XGBoost versions `feature_names` is None when the model was fit
# on unnamed data, and iterating it would raise TypeError. The scores are then
# keyed by the default names 'f0', 'f1', ..., so rebuild those keys.
booster_feature_names = booster.feature_names
if booster_feature_names is None:
    booster_feature_names = [
        f'f{index}' for index in range(len(clf.obj.feature_importances_))
    ]

# float32 is kept so the formatted values written to the file do not change.
all_features_counts = numpy.array(
    [features_counts.get(f, 0.) for f in booster_feature_names],
    dtype=numpy.float32)
all_features_gains = numpy.array(
    [features_gains.get(f, 0.) for f in booster_feature_names],
    dtype=numpy.float32)

# The file is only written, so plain 'w' is enough (no read access needed).
with open('out.txt', 'w', encoding='utf-8') as file:
    # One row per feature, most frequently used (highest split count) first.
    for name, weight, gain, importance in sorted(
            zip(features_classes, all_features_counts, all_features_gains,
                clf.obj.feature_importances_),
            key=lambda x: -x[1]):
        file.write(f'{name:40} | weight: {weight:20} | gain: {gain:20} | importance: {importance:16.10f}\n')

### 2) С историей

In [4]:
# Feature generator for section 2: the same settings as the generator used
# without history, except that history=True.
gen_hist = features.Generator(
    history=True,
    columntypes=('words', 'pos', 'chunk'),
    language='en',
    context_len=2,
    rare_count=5,
    min_weight=0.975,
    rewrite=True)

In [5]:
%%time
# Training split: fit the history-enabled generator and produce its features.
# NOTE(review): the .npz path is presumably where the generated matrix is
# stored; confirm against features.Generator.
x_docs_train_hist, y_docs_train_hist = docs_from_dataset_tokens(dataset_train)
x_feat_train_hist = gen_hist.fit_generate(
    x_docs_train_hist,
    y_docs_train_hist,
    "./prepared_data/conll_train_history.npz",
)
x_docs_feat_train_hist = xdocs_from_x_dataset(x_feat_train_hist, dataset_train)

# Test split: features come from the already fitted generator (no refit).
x_docs_testb_hist, y_docs_testb_hist = docs_from_dataset_tokens(dataset_testb)
x_feat_testb_hist = gen_hist.generate(
    x_docs_testb_hist,
    "./prepared_data/conll_testb_history.npz",
)
x_docs_feat_testb_hist = xdocs_from_x_dataset(x_feat_testb_hist, dataset_testb)

# Same classifier hyper-parameters as the no-history run, so the two score
# reports differ only in the features used.
clf_hist = TokenClassifier(
    cls='XGBClassifier',
    learning_rate=0.3, max_depth=14,
    colsample_bytree=0.5, colsample_bylevel=0.5,
)
clf_hist.fit(x_docs_feat_train_hist, y_docs_train_hist)

# Blank line between the output printed above and the score report.
print()

# Kept as the last expression so the notebook displays the per-label scores.
clf_hist.get_full_score(x_docs_feat_testb_hist, y_docs_testb_hist)

Признаков в исходном виде: 30
Обработано 203621 токенов!
9974 токенов имеют историю в рамках документа!
Признаков осталось: 6818

label    precision    recall    f1-score

PER      0.7497       0.8371    0.7910
ORG      0.7010       0.7062    0.7036
LOC      0.7976       0.8246    0.8108
MISC     0.7264       0.6821    0.7036

total    0.7469       0.7749    0.7607
Wall time: 10min 11s


In [6]:
import numpy

# Labels of the generated features, taken from the history-enabled generator.
# NOTE(review): assumed to be in the same order as the model's feature
# columns, since they are zipped positionally below; confirm.
features_classes = gen_hist.features_classes_

booster = clf_hist.obj.get_booster()

# get_score() omits features that were never used in a split, hence the
# .get(..., 0.) lookups below.
features_counts = booster.get_score(importance_type='weight')
features_gains = booster.get_score(importance_type='gain')

# On recent XGBoost versions `feature_names` is None when the model was fit
# on unnamed data, and iterating it would raise TypeError. The scores are then
# keyed by the default names 'f0', 'f1', ..., so rebuild those keys.
booster_feature_names = booster.feature_names
if booster_feature_names is None:
    booster_feature_names = [
        f'f{index}' for index in range(len(clf_hist.obj.feature_importances_))
    ]

# float32 is kept so the formatted values written to the file do not change.
all_features_counts = numpy.array(
    [features_counts.get(f, 0.) for f in booster_feature_names],
    dtype=numpy.float32)
all_features_gains = numpy.array(
    [features_gains.get(f, 0.) for f in booster_feature_names],
    dtype=numpy.float32)

# The file is only written, so plain 'w' is enough (no read access needed).
with open('out_hist.txt', 'w', encoding='utf-8') as file:
    # One row per feature, most frequently used (highest split count) first.
    for name, weight, gain, importance in sorted(
            zip(features_classes, all_features_counts, all_features_gains,
                clf_hist.obj.feature_importances_),
            key=lambda x: -x[1]):
        file.write(f'{name:40} | weight: {weight:20} | gain: {gain:20} | importance: {importance:16.10f}\n')