In [1]:
import reader
import scorer
import utils
import classifiers.sequence_classifier as sc

from itertools import chain
from sklearn.model_selection import cross_val_score

import nltk
import sklearn
import numpy as np

import sklearn_crfsuite

In [2]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [3]:
x_docs, y_docs = utils.docs_from_dataset('./dataset', 'eng.train.txt', 
                                         ('words', 'pos', 'chunk', 'ne'), ['words', 'pos', 'chunk'], sent2features)

In [4]:
crf = sc.SequenceClassifier(cls='CRF')

In [5]:
import scipy
from sklearn.model_selection import RandomizedSearchCV

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50)
rs.fit(x_docs, y_docs)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 16.4min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=SequenceClassifier(algorithm='lbfgs', all_possible_states=None,
          all_possible_transitions=True, averaging=None, c=None, c1=0.1,
          c2=0.1, calibration_candidates=None, calibration_eta=None,
          calibration_max_trials=None, calibration_rate=None,
          calibration_...None,
          pa_type=None, period=None, trainer_cls=None, variance=None,
          verbose=False),
          fit_params=None, iid=True, n_iter=50, n_jobs=-1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000142D3FDCA90>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000142D3FDCE10>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [7]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

best params: {'c1': 0.0034483310690783405, 'c2': 0.02582292025770358}
best CV score: 0.839579160236
