## Посмотрим, какие признаки генерируются в случае с использованием истории и без нее:

### 1) Без истории

In [1]:
import os

import numpy

import features
import reader
from classifiers.token_classifier import TokenClassifier

In [2]:
# Train and test ("testb") corpora are read from ./dataset with the same
# four-column layout; the trailing 'ne' column is what get_ne() is later
# used to read the target labels from.
dataset_train, dataset_testb = [
    reader.DataReader(
        './dataset',
        fileids=corpus_file,
        columntypes=('words', 'pos', 'chunk', 'ne'))
    for corpus_file in ('eng.train.txt', 'eng.testb.test.txt')
]

# Feature generator for the first experiment: history features switched off.
# NOTE(review): the exact meaning of rare_count / min_weight / rewrite is
# defined in features.Generator — confirm there before tuning.
gen = features.Generator(
    columntypes=('words', 'pos', 'chunk'),
    context_len=2, language='en',
    rare_count=5, min_weight=0.95,
    rewrite=True, history=False,
    file_out='features.txt')

In [3]:
def _split_by_lengths(items, lengths):
    """Cut ``items`` into consecutive slices of the given lengths.

    Zero lengths are skipped entirely: they neither produce an (empty) slice
    nor advance the read position.
    """
    chunks = []
    start = 0
    for length in lengths:
        if length == 0:
            continue
        chunks.append(items[start:start + length])
        start += length
    return chunks


def docs_from_dataset_tokens(dataset, tags=None):
    """Regroup a dataset's flat token columns and NE labels by document.

    Parameters
    ----------
    dataset
        Object exposing ``get_ne()``, ``get_tags(tags=...)``, ``sents()`` and
        ``docs()``. ``len()`` of each sentence is used as its token count and
        ``len()`` of each document as its sentence count.
    tags : list of str, optional
        Column names forwarded to ``dataset.get_tags``. When omitted, a fresh
        ``['words', 'pos', 'chunk']`` list is created for every call, so a
        callee that mutates the list cannot affect later calls.

    Returns
    -------
    tuple
        ``(x_docs, y_docs)``: two parallel lists of documents, each document
        being a list of sentences. A sentence in ``x_docs`` is a slice of the
        ``get_tags`` result; in ``y_docs`` it is a list holding the second
        element of every ``get_ne()`` item.
    """
    if tags is None:
        # Built per call instead of being a shared mutable default argument.
        tags = ['words', 'pos', 'chunk']

    y = [el[1] for el in dataset.get_ne()]
    x = dataset.get_tags(tags=tags)

    # Measure the corpus structure once and apply it to both x and y.
    sent_lengths = [len(sent) for sent in dataset.sents()]
    x_sent = _split_by_lengths(x, sent_lengths)
    y_sent = _split_by_lengths(y, sent_lengths)

    doc_lengths = [len(doc) for doc in dataset.docs()]
    x_docs = _split_by_lengths(x_sent, doc_lengths)
    y_docs = _split_by_lengths(y_sent, doc_lengths)
    return x_docs, y_docs


def xdocs_from_x_dataset(x, dataset):
    """Nest a flat, token-aligned sequence by sentence and then by document.

    ``x`` is first sliced into runs whose sizes are the ``len()`` of every
    item of ``dataset.sents()``; those runs are then grouped using the
    ``len()`` of every item of ``dataset.docs()``. Items of length zero are
    skipped at both levels. Returns the resulting list of documents.
    """

    def regroup(flat, containers):
        # Walk the containers in order, taking as many entries from `flat`
        # as each non-empty container holds.
        grouped = []
        start = 0
        for container in containers:
            size = len(container)
            if not size:
                continue
            stop = start + size
            grouped.append(flat[start:stop])
            start = stop
        return grouped

    per_sentence = regroup(x, dataset.sents())
    return regroup(per_sentence, dataset.docs())

In [4]:
%%time
# Group the raw training tokens and their NE labels into documents of sentences.
x_docs_train, y_docs_train = docs_from_dataset_tokens(dataset_train)
# Fit the feature generator on the training documents and get their features.
# NOTE(review): the .npz path is presumably where the generator stores or
# reloads the feature matrix — confirm in features.Generator.
x_feat_train = gen.fit_generate(x_docs_train, y_docs_train,
                                "./prepared_data/conll_train.npz")
# Regroup the flat per-token feature rows into the same document/sentence nesting.
x_docs_feat_train = xdocs_from_x_dataset(x_feat_train, dataset_train)

# Same steps for the test split, using generate() (not fit_generate()) so the
# generator fitted on the training data is reused as is.
x_docs_testb, y_docs_testb = docs_from_dataset_tokens(dataset_testb)
x_feat_testb = gen.generate(x_docs_testb, "./prepared_data/conll_testb.npz")
x_docs_feat_testb = xdocs_from_x_dataset(x_feat_testb, dataset_testb)

# Train the token-level classifier on the training features.
# NOTE(review): the keyword arguments are presumably forwarded to xgboost's
# XGBClassifier — confirm in TokenClassifier.
clf = TokenClassifier(
    cls='XGBClassifier',
    learning_rate=0.3,
    max_depth=14,
    colsample_bytree=0.5,
    colsample_bylevel=0.5)
clf.fit(x_docs_feat_train, y_docs_train)

Признаков в исходном виде: (203621, 30)
Признаков осталось: 4163
Wall time: 6min 36s


In [5]:
# Per-label and total precision / recall / F1 of the no-history model on the test split.
clf.get_full_score(x_docs_feat_testb, y_docs_testb)

label    precision    recall    f1-score

PER      0.7533       0.8377    0.7933
ORG      0.6931       0.7081    0.7005
LOC      0.7929       0.8227    0.8075
MISC     0.7384       0.6676    0.7012

total    0.7456       0.7733    0.7592


In [6]:
# Names of the features produced by the no-history generator.
# NOTE(review): assumed to be listed in the same order as the booster's input
# columns, since they are zipped positionally further down — confirm.
features_classes = gen.features_classes_

booster = clf.obj.get_booster()

# get_score() omits features that were never used in a split, hence the
# .get(..., 0.) lookups below.
features_counts = booster.get_score(importance_type='weight')
features_gains = booster.get_score(importance_type='gain')

# Booster.feature_names can be None when the model was trained on unnamed
# data (plain arrays / sparse matrices); get_score() then keys its result by
# 'f0', 'f1', ..., so rebuild those names instead of iterating over None.
booster_feature_names = booster.feature_names
if booster_feature_names is None:
    booster_feature_names = [f'f{i}' for i in range(booster.num_features())]

# One entry per model input column, in column order.
all_features_counts = numpy.array(
    [features_counts.get(f, 0.) for f in booster_feature_names],
    dtype=numpy.float32)
all_features_gains = numpy.array(
    [features_gains.get(f, 0.) for f in booster_feature_names],
    dtype=numpy.float32)

In [7]:
# Write one line per feature to out.txt, ordered by descending split count
# ("weight"), together with its gain and the classifier's feature_importances_.
with open('out.txt', 'w+', encoding='utf-8') as file:
    ranked_rows = sorted(
        zip(features_classes, all_features_counts, all_features_gains,
            clf.obj.feature_importances_),
        key=lambda row: -row[1])
    for name, weight, gain, importance in ranked_rows:
        file.write(f'{name:40} | weight: {weight:20} | gain: {gain:20} | importance: {importance:16.10f}\n')

### 2) С историей

In [8]:
# Feature generator for the second experiment: same settings as the first
# one except that history features are enabled and the feature list goes to
# a separate output file.
# NOTE(review): what "history" covers exactly is defined in features.Generator — confirm there.
gen_hist = features.Generator(
    columntypes=('words', 'pos', 'chunk'),
    context_len=2,
    language='en',
    rare_count=5,
    min_weight=0.95,
    rewrite=True,
    history=True,
    file_out='hist_features.txt')

In [9]:
%%time
# Group the raw training tokens and their NE labels into documents of sentences.
x_docs_train_hist, y_docs_train_hist = docs_from_dataset_tokens(dataset_train)
# Fit the history-enabled generator on the training documents and get their features.
# NOTE(review): the .npz path is presumably where the generator stores or
# reloads the feature matrix — confirm in features.Generator.
x_feat_train_hist = gen_hist.fit_generate(
    x_docs_train_hist, y_docs_train_hist,
    "./prepared_data/conll_train_history.npz")
# Regroup the flat per-token feature rows into the same document/sentence nesting.
x_docs_feat_train_hist = xdocs_from_x_dataset(x_feat_train_hist, dataset_train)

# Same steps for the test split, using generate() (not fit_generate()) so the
# generator fitted on the training data is reused as is.
x_docs_testb_hist, y_docs_testb_hist = docs_from_dataset_tokens(dataset_testb)
x_feat_testb_hist = gen_hist.generate(x_docs_testb_hist, "./prepared_data/conll_testb_history.npz")
x_docs_feat_testb_hist = xdocs_from_x_dataset(x_feat_testb_hist, dataset_testb)

# Train a second classifier with the same hyperparameters as the no-history
# run, so the two experiments differ only in their features.
clf_hist = TokenClassifier(
    cls='XGBClassifier',
    learning_rate=0.3,
    max_depth=14,
    colsample_bytree=0.5,
    colsample_bylevel=0.5)
clf_hist.fit(x_docs_feat_train_hist, y_docs_train_hist)

Признаков в исходном виде: (203621, 60)
Признаков осталось: 4233
Wall time: 9min 7s


In [10]:
# Per-label and total precision / recall / F1 of the history model on the test split.
clf_hist.get_full_score(x_docs_feat_testb_hist, y_docs_testb_hist)

label    precision    recall    f1-score

PER      0.7506       0.8371    0.7915
ORG      0.6891       0.7099    0.6994
LOC      0.8073       0.8271    0.8171
MISC     0.7323       0.6589    0.6937

total    0.7468       0.7738    0.7601


In [11]:
# Names of the features produced by the history-enabled generator.
# NOTE(review): assumed to be listed in the same order as the booster's input
# columns, since they are zipped positionally further down — confirm.
features_classes = gen_hist.features_classes_

booster = clf_hist.obj.get_booster()

# get_score() omits features that were never used in a split, hence the
# .get(..., 0.) lookups below.
features_counts = booster.get_score(importance_type='weight')
features_gains = booster.get_score(importance_type='gain')

# Booster.feature_names can be None when the model was trained on unnamed
# data (plain arrays / sparse matrices); get_score() then keys its result by
# 'f0', 'f1', ..., so rebuild those names instead of iterating over None.
booster_feature_names = booster.feature_names
if booster_feature_names is None:
    booster_feature_names = [f'f{i}' for i in range(booster.num_features())]

# One entry per model input column, in column order.
all_features_counts = numpy.array(
    [features_counts.get(f, 0.) for f in booster_feature_names],
    dtype=numpy.float32)
all_features_gains = numpy.array(
    [features_gains.get(f, 0.) for f in booster_feature_names],
    dtype=numpy.float32)

In [12]:
# Write one line per feature to out_hist.txt, ordered by descending split count
# ("weight"), together with its gain and the classifier's feature_importances_.
with open('out_hist.txt', 'w+', encoding='utf-8') as file:
    ranked_rows = sorted(
        zip(features_classes, all_features_counts, all_features_gains,
            clf_hist.obj.feature_importances_),
        key=lambda row: -row[1])
    for name, weight, gain, importance in ranked_rows:
        file.write(f'{name:40} | weight: {weight:20} | gain: {gain:20} | importance: {importance:16.10f}\n')