### XGBoost with optimal parameters (from GridSearchCV) and history features, evaluated on testb

In [6]:
import features
import reader
import os

from classifiers.token_classifier import TokenClassifier

In [7]:
# Column layout shared by both dataset files; the feature generator is given
# every column except the trailing 'ne' one.
conll_columns = ('words', 'pos', 'chunk', 'ne')
feature_columns = conll_columns[:-1]

dataset_train = reader.DataReader(
    './dataset',
    fileids='eng.train.txt',
    columntypes=conll_columns,
)
dataset_testb = reader.DataReader(
    './dataset',
    fileids='eng.testb.test.txt',
    columntypes=conll_columns,
)

# Feature generator configured with a context length of 2 and history enabled.
gen = features.Generator(
    columntypes=feature_columns,
    context_len=2,
    language='en',
    rare_count=5,
    min_weight=0.95,
    rewrite=False,
    history=True,
)

In [8]:
def _split_by_lengths(items, lengths):
    """Cut *items* into consecutive slices whose sizes are given by *lengths*."""
    pieces = []
    start = 0
    for length in lengths:
        pieces.append(items[start:start + length])
        start += length
    return pieces


def docs_from_dataset_tokens(dataset, tags=None):
    """Regroup a dataset's flat token tags and NE labels into documents.

    Args:
        dataset: object providing ``get_ne()``, ``get_tags(tags=...)``,
            ``sents()`` and ``docs()``.
        tags: tag names forwarded to ``dataset.get_tags``. When omitted,
            ``['words', 'pos', 'chunk']`` is used.

    Returns:
        A pair ``(x_docs, y_docs)``. Each is a list of documents, each
        document a list of sentences, and each sentence the slice of the
        flat tag sequence (for ``x_docs``) or of the flat label sequence
        (for ``y_docs``) that belongs to it.
    """
    if tags is None:
        # Build a fresh list per call: a list literal in the signature would
        # be one shared object that the dataset could mutate across calls.
        tags = ['words', 'pos', 'chunk']

    # Second element of every get_ne() item is used as the label.
    y = [el[1] for el in dataset.get_ne()]
    x = dataset.get_tags(tags=tags)

    # Zero-length sentences and documents are skipped entirely.
    # NOTE(review): documents are measured in sentences via len(doc) while
    # empty sentences are dropped here; if a document can contain empty
    # sentences the grouping would drift -- confirm against the reader.
    sent_lengths = [n for n in map(len, dataset.sents()) if n != 0]
    doc_lengths = [n for n in map(len, dataset.docs()) if n != 0]

    x_sent = _split_by_lengths(x, sent_lengths)
    y_sent = _split_by_lengths(y, sent_lengths)
    x_docs = _split_by_lengths(x_sent, doc_lengths)
    y_docs = _split_by_lengths(y_sent, doc_lengths)
    return x_docs, y_docs

def xdocs_from_x_dataset(x, dataset):
    """Regroup the flat per-token sequence *x* into documents of sentences.

    *x* is sliced consecutively by the length of every sentence yielded by
    ``dataset.sents()``; the resulting sentence slices are then sliced
    consecutively by the length of every document yielded by
    ``dataset.docs()``. Zero-length sentences and documents are skipped.

    Returns a list of documents, each a list of sentence slices of *x*.
    """
    # Pass 1: token level -> one slice of x per non-empty sentence.
    sentence_slices = []
    start = 0
    for sentence in dataset.sents():
        stop = start + len(sentence)
        if stop == start:
            continue
        sentence_slices.append(x[start:stop])
        start = stop

    # Pass 2: sentence level -> one list of sentence slices per non-empty doc.
    grouped = []
    start = 0
    for document in dataset.docs():
        stop = start + len(document)
        if stop == start:
            continue
        grouped.append(sentence_slices[start:stop])
        start = stop
    return grouped

In [9]:
# Training split: regroup tokens into documents, fit the feature generator on
# them, then regroup the generated features the same way.
# NOTE(review): the .npz argument looks like a cache path for the generated
# features -- confirm against features.Generator.
x_docs_train, y_docs_train = docs_from_dataset_tokens(dataset_train)
x_feat_train = gen.fit_generate(
    x_docs_train,
    y_docs_train,
    "./prepared_data/conll_train_history.npz",
)
x_docs_feat_train = xdocs_from_x_dataset(x_feat_train, dataset_train)

# testb split: same steps, but using the already fitted generator.
x_docs_testb, y_docs_testb = docs_from_dataset_tokens(dataset_testb)
x_feat_testb = gen.generate(
    x_docs_testb,
    "./prepared_data/conll_testb_history.npz",
)
x_docs_feat_testb = xdocs_from_x_dataset(x_feat_testb, dataset_testb)

# Hyperparameters handed to the XGBClassifier-backed token classifier.
xgb_params = {
    'learning_rate': 0.3,
    'max_depth': 14,
    'colsample_bytree': 0.5,
    'colsample_bylevel': 0.5,
}
clf = TokenClassifier(cls='XGBClassifier', **xgb_params)
clf.fit(x_docs_feat_train, y_docs_train)

TokenClassifier(base_score=0.5, booster='gbtree', cls='XGBClassifier',
        colsample_bylevel=0.5, colsample_bytree=0.5, gamma=0,
        learning_rate=0.3, max_delta_step=0, max_depth=14,
        min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
        nthread=None, objective='multi:softprob', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1)

In [10]:
# Score the fitted classifier on the testb documents. The table captured
# below this cell lists precision, recall and F1 per entity label plus totals.
clf.get_full_score(x_docs_feat_testb, y_docs_testb)

label    precision    recall    f1-score

PER      0.7542       0.6697    0.7094
ORG      0.6119       0.6570    0.6337
LOC      0.7212       0.8427    0.7772
MISC     0.7292       0.6604    0.6931

total    0.6962       0.7153    0.7056
