In [1]:
# kerastext example

In [2]:
import numpy as np
# Open the saved .npz archive; the individual arrays are pulled out of it by key
# in the next cell.
# NOTE(review): 'emabse' looks like it may be a typo for 'embase' -- confirm against
# the actual file name on disk before changing it.
dat = np.load('emabse_matrices2.npz')

In [3]:
# Pull the three arrays out of the archive: X and y are what gets passed to fit(),
# emb is handed to the classifier as its embedding weights.
X, y, emb = (dat[array_name] for array_name in ('X', 'y', 'emb'))

In [4]:
from kerastext import CNNTextClassifier

Using Theano backend.
Using gpu device 0: GRID K520 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 5005)


In [7]:
from hyperopt import tpe, hp, STATUS_OK, Trials
from kerastext import fmin_persist
import numpy as np


# Hyperopt search space. Each trial receives one dict sampled from this structure,
# and objective() forwards that dict as keyword arguments to CNNTextClassifier.
# Entries built with hp.* are searched; plain values are passed through unchanged.
space = {
         # one of ten candidate lists: a single size 1-7, or a combination of sizes
         "filter_size": hp.choice("filter_size", [[1], [2], [3], [4], [5], [6], [7], [1, 3], [1, 3, 5], [3, 5, 7]]),
         # any integer from 50 to 249
         "num_filters": hp.choice('num_filters',  np.arange(50, 250, dtype=int)),
         # 0, 1 or 2
         "num_hidden_layers": hp.choice('num_hidden_layers', np.arange(0,3, dtype=int)),
         # NOTE(review): the dict key is 'undersample_ratio' but the hyperopt label is
         # 'oversample_ratio'. The label is what gets recorded in the trial history
         # (the saved trials contain 'oversample_ratio' entries), so renaming it is
         # presumably incompatible with an existing trials pickle -- confirm which
         # keyword CNNTextClassifier expects before touching either name.
         "undersample_ratio": hp.uniform('oversample_ratio', 0.2, 1),
         # drawn uniformly between 0.1 and 0.7
         "dropout": hp.uniform('dropout', 0.1, 0.7),
         # weight for the True class is drawn uniformly between 1 and 30;
         # the False class weight is fixed at 1
         "class_weight": {True:hp.uniform('true_weight', 1, 30), False:1},
         "max_features": hp.choice("max_features", [5000, 10000, 12500, 15000, 17500, 20000, 25000, 30000]),
         # drawn uniformly between 0 and 3
         "l2": hp.uniform('l2', 0, 3),
         "dim_hidden_layers": hp.choice("dim_hidden_layers", [50, 100, 150, 200, 250, 300, 350, 400]),
         # Fixed (not searched) settings. Presumably these configure early stopping on
         # the validation metric that objective() also reads -- confirm against
         # CNNTextClassifier.
         "stopping_patience": 1,
         "stopping_target": "val_specificity_at_recall",
         "stopping_less_is_good": False,
         "validation_split":0.1
}


def objective(params):
    """Train one CNN for a sampled hyperparameter set and report its hyperopt loss.

    Args:
        params: dict of keyword arguments for ``CNNTextClassifier``, as sampled
            from ``space``. The dict is left unmodified.

    Returns:
        dict with ``'loss'`` -- the negated last ``val_specificity_at_recall``
        value from the fit history (negated because hyperopt minimises the
        loss) -- and ``'status'`` set to ``STATUS_OK``.
    """
    print(params)
    # Build a separate kwargs dict instead of writing into `params`, so the
    # embedding matrix never ends up in the caller's dict. As before, the
    # notebook-level `emb` wins if `params` already has an 'embedding_weights' key.
    clf_kwargs = dict(params, embedding_weights=emb)
    clf = CNNTextClassifier(**clf_kwargs)
    clf.fit(X, y)
    # [-1] takes the last recorded value (presumably one per epoch -- confirm);
    # NOTE(review): with early stopping this may not be the best epoch.
    final_score = clf.history.history['val_specificity_at_recall'][-1]
    return {'loss': -final_score, 'status': STATUS_OK}

# Run the search through the persisting variant of fmin: fitting a couple of hundred
# CNNs back to back will often end in a memory error and a crash, and this way
# nothing is lost when that happens.
# The evaluations completed so far are stored in a pickle, so you can later decide
# to run a batch more without starting from scratch.
trials = fmin_persist(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    verbose=4,
    trials_pickle="RCT_CNN_1.pck",
)

In [8]:
import hyperopt
# Translate the best trial's raw hyperopt assignment back into concrete settings
# from `space`, then show them.
best_point = trials.argmin
optimal_params = hyperopt.space_eval(space, best_point)
print(optimal_params)

{'l2': 0.7597291536923072, 'class_weight': {False: 1, True: 29.19848956152622}, 'stopping_less_is_good': False, 'undersample_ratio': 0.7212566713178366, 'stopping_target': 'val_specificity_at_recall', 'num_hidden_layers': 2, 'filter_size': (6,), 'stopping_patience': 1, 'validation_split': 0.1, 'dropout': 0.12879244796905195, 'num_filters': 134, 'max_features': 30000, 'dim_hidden_layers': 150}


In [None]:
# Same call as the earlier search cell, pointed at the same pickle file; presumably
# this picks up from the trials already stored there -- confirm in fmin_persist.
trials = fmin_persist(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    verbose=4,
    trials_pickle="RCT_CNN_1.pck",
)

{'l2': 2.113407059650438, 'class_weight': {False: 1, True: 29.83258153452071}, 'stopping_less_is_good': False, 'stopping_target': 'val_specificity_at_recall', 'num_hidden_layers': 0, 'filter_size': (3,), 'stopping_patience': 1, 'oversample_ratio': 0.9745347388823917, 'validation_split': 0.1, 'dropout': 0.37753196802434114, 'num_filters': 152, 'max_features': 15000, 'dim_hidden_layers': 200}
Processing data (280620 samples)
464240 sampled indices from 252558 total, which comprise 229126.76059759464 positive, 235114 negative examples
Sampled with ratio of 0.9745347388823917, increased to 464240 samples.
Train on 464240 samples, validate on 28062 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Trial 42 done!.. pickling...
{'l2': 2.321343135890656, 'class_weight': {False: 1, True: 28.115631605208314}, 'stopping_less_is_good': False, 'stopping_target': 'val_specificity_at_recall', 'num_hidden_layers': 0, 'filter_size': (3,), 'stopping_patience': 1, 'oversample_ratio': 0.9333902885880284, 'validati

In [24]:
# demo of loading the trials data
import pickle
# NOTE: unpickling can execute arbitrary code -- only load trial files that this
# notebook wrote itself.
with open("RCT_CNN_1.pck", 'rb') as trials_file:
    t = pickle.load(trials_file)

In [25]:
len(t) # number of trials recorded in the loaded pickle so far

41

In [26]:
t.trials # one record per trial: sampled values under misc['vals'], plus the result, timestamps and state

[{'book_time': datetime.datetime(2016, 10, 4, 15, 58, 27, 176000),
  'exp_key': None,
  'misc': {'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'idxs': {'dim_hidden_layers': [0],
    'dropout': [0],
    'filter_size': [0],
    'l2': [0],
    'max_features': [0],
    'num_filters': [0],
    'num_hidden_layers': [0],
    'oversample_ratio': [0],
    'true_weight': [0]},
   'tid': 0,
   'vals': {'dim_hidden_layers': [3],
    'dropout': [0.37520755829191943],
    'filter_size': [1],
    'l2': [2.0924613767363596],
    'max_features': [5],
    'num_filters': [152],
    'num_hidden_layers': [2],
    'oversample_ratio': [0.8130407613298556],
    'true_weight': [23.34346564022296]},
   'workdir': None},
  'owner': None,
  'refresh_time': datetime.datetime(2016, 10, 4, 16, 1, 11, 723000),
  'result': {'loss': -0.11537114970251153, 'status': 'ok'},
  'spec': None,
  'state': 2,
  'tid': 0,
  'version': 0},
 {'book_time': datetime.datetime(2016, 10, 4, 16, 2, 40, 933000),
  'exp_key': None,
