In [1]:
# Copyright 2019 Google Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

### Multilabel classification with TF-HUB

In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from tensorflow import keras
import os
import re

from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

# predicted logits to distribution
import numpy as np

# confusion matrix
from matplotlib import pyplot as plt
import itertools

# avoid printing e-notation, such as 9.9687493e-01 instead of 0.9968
np.set_printoptions(suppress=True,
   formatter={'float_kind':'{:0.4f}'.format})

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# modified to multilabel classification from binary example:
# https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb

# original way in example for metrics, F1, False Negatives etc does not work for multi lable. 
# They use tf.metric.recall etc. which cast labels to boolean, returning incorrect values for multi label.


In [4]:
# optional, load local environment: source activate tf-hub

In [5]:
dataset = 'hatespeech'

current = os.getcwd()
basefolder = current + '/dataset_'+ dataset+'/'
datafolder = basefolder + 'data/'  # for example /dataset_businessnews/data/
print(basefolder)

infolder =  basefolder + 'input/'
outfolder = basefolder + 'output/'

/home/max/git/newcombined/dataset_hatespeech/


In addition to the standard libraries we imported above, we'll need to install BERT's python package.

In [6]:
#!pip install bert-tensorflow

In [7]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. If you're running this code in Google's hosted Colab, the directory won't persist after the Colab session ends.

Alternatively, if you're a GCP user, you can store output in a GCP bucket. To do that, set a directory name in OUTPUT_DIR and the name of the GCP bucket in the BUCKET field.

Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist).

In [8]:
# create directory if need
OUTPUT_DIR = 'tf-hub-output'#@param {type:"string"}

#Data

First, let's download the dataset, hosted by Stanford. The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub).

In [9]:
train = pd.read_csv(basefolder+'input/train.csv',sep='\t', header = None)
test = pd.read_csv(basefolder+'input/dev.csv'  ,sep='\t', header = None)
#test.to_csv('input/test.csv',sep='\t', index = False, header = False)

train.columns = ['id','label','text']
test.columns  = ['id','label', 'text']

In [10]:
#train.head()

In [11]:
# Test smaller
train = train[0:3000]

In [12]:
#train.head()

In [13]:
#test.head()

In [14]:
print(len(train))
print(len(test))

3000
3000


### Universal 

## Model
### Input functions

[Estimator framework](https://www.tensorflow.org/get_started/premade_estimators#overview_of_programming_with_estimators) provides [input functions](https://www.tensorflow.org/api_docs/python/tf/estimator/inputs/pandas_input_fn) that wrap Pandas dataframes.

In [15]:
train_df = train
test_df = test

# Training input on the whole training set with no limit on training epochs.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    train_df, train_df["label"], num_epochs=None, shuffle=True,  batch_size=32)

# Prediction on the whole training set.
predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
    train_df, train_df["label"], shuffle=False, batch_size=32)

# Prediction on the test set.
predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
    test_df, test_df["label"], shuffle=False, batch_size=32)

### Feature columns

TF-Hub provides a feature column that applies a module on the given text feature and passes further the outputs of the module. We will be using the nnlm-en-dim128 module. 

    The module takes a batch of sentences in a 1-D tensor of strings as input.
    The module is responsible for preprocessing of sentences (e.g. removal of punctuation and splitting on spaces).
    The module works with any input (e.g. nnlm-en-dim128 hashes words not present in vocabulary into  ~20.000 buckets).


In [16]:
embedded_text_feature_column = hub.text_embedding_column(
    key="text", 
    module_spec="https://tfhub.dev/google/nnlm-en-dim128/1",
    trainable=True
)

In [110]:
embedded_text_feature_column2 = hub.text_embedding_column(
    key="text", 
    module_spec="https://tfhub.dev/google/universal-sentence-encoder/2", # DAN
    trainable=True
    )

In [129]:
embedded_text_feature_column3 = hub.text_embedding_column(
    key="text", 
    module_spec="https://tfhub.dev/google/universal-sentence-encoder-large/3", # univ-3 is Transformer
    trainable=True
    )

In [17]:
embedded_text_feature_column_elmo = hub.text_embedding_column(
    key="text", 
    module_spec="https://tfhub.dev/google/elmo/2", 
    trainable=True
    )

In [173]:
def setEmbedding(module = "https://tfhub.dev/google/universal-sentence-encoder/2"):
    
    return hub.text_embedding_column(
    key="text", 
    module_spec=module,
    trainable=True
    )
    

In [176]:
embedded_text_feature_column = setEmbedding("https://tfhub.dev/google/universal-sentence-encoder/2")

In [28]:
# embedded_text_feature_column.

### Estimator

For classification we can use a [DNN Classifier](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNClassifier) (note further remarks about different modelling of the label function at the end of the tutorial).

In [18]:
estimator = tf.estimator.DNNClassifier(
    hidden_units=[500, 100],
    #hidden_units=[1024, 512, 256],
    feature_columns=[embedded_text_feature_column],
    n_classes=3,
    optimizer=tf.train.AdagradOptimizer(learning_rate=0.003)
)

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.






INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpc48bqaz2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd0b26ea668>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpc48bqaz2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd0b26ea668>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### Training

Train the estimator for a reasonable amount of steps.

In [19]:
# Training for 1,000 steps means 128,000 training examples with the default
# batch size. This is roughly equivalent to 5 epochs since the training dataset
# contains 25,000 examples.
estimator.train(input_fn=train_input_fn, steps=3000);

Instructions for updating:
To construct input pipelines, use the `tf.data` module.


Instructions for updating:
To construct input pipelines, use the `tf.data` module.


Instructions for updating:
To construct input pipelines, use the `tf.data` module.


Instructions for updating:
To construct input pipelines, use the `tf.data` module.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


Instructions for updating:
To construct input pipelines, use the `tf.data` module.


Instructions for updating:
To construct input pipelines, use the `tf.data` module.


INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpc48bqaz2/model.ckpt.


INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpc48bqaz2/model.ckpt.


InternalError: Blas GEMM launch failed : a.shape=(32, 128), b.shape=(128, 500), m=32, n=500, k=128
	 [[node dnn/hiddenlayer_0/MatMul (defined at <ipython-input-19-6db09b378193>:4)  = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](dnn/input_from_feature_columns/input_layer/text_hub_module_embedding/Reshape_1, dnn/hiddenlayer_0/kernel)]]
	 [[{{node dnn/hiddenlayer_1/Relu/_65}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_115_dnn/hiddenlayer_1/Relu", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'dnn/hiddenlayer_0/MatMul', defined at:
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 563, in start
    self.io_loop.start()
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/asyncio/base_events.py", line 438, in run_forever
    self._run_once()
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/asyncio/base_events.py", line 1451, in _run_once
    handle._run()
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tornado/ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tornado/gen.py", line 787, in inner
    self.run()
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 361, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 541, in execute_request
    user_expressions, allow_stdin,
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 300, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2854, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2880, in _run_cell
    return runner(coro)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3057, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3248, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3325, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-6db09b378193>", line 4, in <module>
    estimator.train(input_fn=train_input_fn, steps=3000);
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 354, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 1207, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 1237, in _train_model_default
    features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 1195, in _call_model_fn
    model_fn_results = self._model_fn(features=features, **kwargs)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/estimator/canned/dnn.py", line 486, in _model_fn
    shared_state_manager=shared_state_manager)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/estimator/canned/dnn.py", line 300, in _dnn_model_fn
    logits = logit_fn(features=features, mode=mode)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/estimator/canned/dnn.py", line 109, in dnn_logit_fn
    return dnn_model(features, mode)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 757, in __call__
    outputs = self.call(inputs, *args, **kwargs)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/estimator/canned/dnn.py", line 206, in call
    net = self._hidden_layers[i](net)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/layers/base.py", line 374, in __call__
    outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 757, in __call__
    outputs = self.call(inputs, *args, **kwargs)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/keras/layers/core.py", line 970, in call
    outputs = gen_math_ops.mat_mul(inputs, self.kernel)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 4560, in mat_mul
    name=name)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/home/max/anaconda3/envs/tf-hub/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

InternalError (see above for traceback): Blas GEMM launch failed : a.shape=(32, 128), b.shape=(128, 500), m=32, n=500, k=128
	 [[node dnn/hiddenlayer_0/MatMul (defined at <ipython-input-19-6db09b378193>:4)  = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](dnn/input_from_feature_columns/input_layer/text_hub_module_embedding/Reshape_1, dnn/hiddenlayer_0/kernel)]]
	 [[{{node dnn/hiddenlayer_1/Relu/_65}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_115_dnn/hiddenlayer_1/Relu", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


# Prediction

Run predictions for both training and test set.

In [90]:
test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
print("Test set accuracy: {accuracy}".format(**test_eval_result))

INFO:tensorflow:Calling model_fn.


I0411 13:47:55.890263 140411733964608 tf_logging.py:115] Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0411 13:47:56.753264 140411733964608 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0411 13:47:57.985651 140411733964608 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


I0411 13:47:58.216483 140411733964608 tf_logging.py:115] Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2019-04-11-10:47:58


I0411 13:47:58.244279 140411733964608 tf_logging.py:115] Starting evaluation at 2019-04-11-10:47:58


INFO:tensorflow:Graph was finalized.


I0411 13:47:58.490012 140411733964608 tf_logging.py:115] Graph was finalized.


INFO:tensorflow:Restoring parameters from /tmp/tmpvyderz7p/model.ckpt-3000


I0411 13:47:58.494348 140411733964608 tf_logging.py:115] Restoring parameters from /tmp/tmpvyderz7p/model.ckpt-3000


INFO:tensorflow:Running local_init_op.


I0411 13:47:59.190602 140411733964608 tf_logging.py:115] Running local_init_op.


INFO:tensorflow:Done running local_init_op.


I0411 13:47:59.775360 140411733964608 tf_logging.py:115] Done running local_init_op.


INFO:tensorflow:Finished evaluation at 2019-04-11-10:48:04


I0411 13:48:04.495506 140411733964608 tf_logging.py:115] Finished evaluation at 2019-04-11-10:48:04


INFO:tensorflow:Saving dict for global step 3000: accuracy = 0.9076667, average_loss = 0.50751317, global_step = 3000, loss = 8.098614


I0411 13:48:04.497148 140411733964608 tf_logging.py:115] Saving dict for global step 3000: accuracy = 0.9076667, average_loss = 0.50751317, global_step = 3000, loss = 8.098614


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 3000: /tmp/tmpvyderz7p/model.ckpt-3000


I0411 13:48:05.644914 140411733964608 tf_logging.py:115] Saving 'checkpoint_path' summary for global step 3000: /tmp/tmpvyderz7p/model.ckpt-3000


Test set accuracy: 0.9076666831970215


### Final test data

In [113]:
test = pd.read_csv(basefolder+'input/test.csv'  ,sep='\t', header = None)
test.columns  = ['id','label', 'text']

final_test_df = test

# Prediction on the final set.
predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
    final_test_df, final_test_df["label"], shuffle=False, batch_size=16)

In [101]:
#predictions = estimator.predict(input_fn=predict_test_input_fn)
#[prediction for prediction in  predictions]

In [118]:
def get_predictions(estimator, predict_test_input_fn):
  #return [x for x in estimator.predict(input_fn=predict_test_input_fn)]
  labels = [0,1,2]
  predictions = estimator.predict(input_fn=predict_test_input_fn)
  return [ (prediction['probabilities'], prediction['class_ids'][0])  for prediction in  predictions]
    # class_ids is a list, such [1], use [0] to return just integer without surrounding list

In [132]:
res = get_predictions(estimator, predict_test_input_fn)

INFO:tensorflow:Calling model_fn.


I0411 14:57:52.471598 140411733964608 tf_logging.py:115] Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0411 14:57:53.334416 140411733964608 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0411 14:57:54.542077 140411733964608 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


I0411 14:57:54.717478 140411733964608 tf_logging.py:115] Done calling model_fn.


INFO:tensorflow:Graph was finalized.


I0411 14:57:54.974824 140411733964608 tf_logging.py:115] Graph was finalized.


INFO:tensorflow:Restoring parameters from /tmp/tmp5p51_vu0/model.ckpt-3000


I0411 14:57:54.977990 140411733964608 tf_logging.py:115] Restoring parameters from /tmp/tmp5p51_vu0/model.ckpt-3000


INFO:tensorflow:Running local_init_op.


I0411 14:57:55.706377 140411733964608 tf_logging.py:115] Running local_init_op.


INFO:tensorflow:Done running local_init_op.


I0411 14:57:56.329119 140411733964608 tf_logging.py:115] Done running local_init_op.


In [142]:
results = pd.DataFrame(res)
results.columns=['probs','label']
# add id's
results = pd.concat([test['id'], results], axis=1)
results.head(5)

Unnamed: 0,id,probs,label
0,14206,"[0.00014044867, 0.0005699134, 0.99928963]",2
1,16354,"[0.0005789527, 0.0064157858, 0.99300534]",2
2,21621,"[0.00025487362, 0.9996896, 5.5497745e-05]",1
3,5899,"[0.00021313019, 0.99969983, 8.711357e-05]",1
4,19499,"[0.00020131422, 0.00072864885, 0.9990701]",2


### Save predictions

In [134]:
def saveResults(df, name="defaultname"):
    df.to_json(basefolder+'output/'+name+'.json', orient='records')
    df.to_csv(basefolder+'output/'+name+'.csv', header=True, sep='\t', index=False)

In [135]:
modelname = 'univ3_18k'
saveResults(results, name='predictions_'+modelname)

In [143]:
results

Unnamed: 0,id,probs,label
0,14206,"[0.00014044867, 0.0005699134, 0.99928963]",2
1,16354,"[0.0005789527, 0.0064157858, 0.99300534]",2
2,21621,"[0.00025487362, 0.9996896, 5.5497745e-05]",1
3,5899,"[0.00021313019, 0.99969983, 8.711357e-05]",1
4,19499,"[0.00020131422, 0.00072864885, 0.9990701]",2
5,6659,"[0.00014100528, 0.99941206, 0.00044693804]",1
6,24999,"[0.00014207677, 0.9997265, 0.00013143675]",1
7,9068,"[0.00013567579, 0.9997669, 9.746551e-05]",1
8,18454,"[0.00019105988, 0.9991104, 0.00069855223]",1
9,14633,"[0.00022559425, 0.0007601781, 0.99901426]",2


### ALL



In [None]:
choose embed
create model
train
produce predictions
addPredictions

repeat for n models and train size

In [None]:
embedded_text_feature_column = setEmbedding("https://tfhub.dev/google/universal-sentence-encoder/2")

In [144]:
all = []

model_name = modelname

all.append({'model': model_name,
                'labels': results['label'],
                'probs': results['probs']
               }
              )

In [160]:
all.to_pickle('all.pkl')

AttributeError: 'list' object has no attribute 'to_pickle'

In [147]:
# save
import shelve
# file to be used
shelf = shelve.open("all.shlf")
# serializing
shelf["all"] = all
shelf.close() # you must close the shelve file!!!

In [148]:
# read
import shelve
shelf = shelve.open("all.shlf") # the same filename that you used before, please
new = shelf["all"]
shelf.close()

In [151]:
all

[{'model': 'univ3_18k', 'labels': 0       2
  1       2
  2       1
  3       1
  4       2
  5       1
  6       1
  7       1
  8       1
  9       2
  10      1
  11      1
  12      1
  13      1
  14      1
  15      1
  16      1
  17      1
  18      1
  19      1
  20      1
  21      1
  22      1
  23      1
  24      1
  25      1
  26      1
  27      1
  28      1
  29      1
         ..
  2970    1
  2971    1
  2972    1
  2973    1
  2974    1
  2975    1
  2976    1
  2977    1
  2978    1
  2979    1
  2980    1
  2981    1
  2982    2
  2983    1
  2984    1
  2985    1
  2986    1
  2987    1
  2988    1
  2989    1
  2990    1
  2991    1
  2992    0
  2993    2
  2994    1
  2995    1
  2996    2
  2997    1
  2998    1
  2999    1
  Name: label, Length: 3000, dtype: int64, 'probs': 0        [0.00014044867, 0.0005699134, 0.99928963]
  1         [0.0005789527, 0.0064157858, 0.99300534]
  2        [0.00025487362, 0.9996896, 5.5497745e-05]
  3        [0.00021313019, 

In [159]:
all[0]['model']

'univ3_18k'

In [157]:
all[0]['labels']

0       2
1       2
2       1
3       1
4       2
5       1
6       1
7       1
8       1
9       2
10      1
11      1
12      1
13      1
14      1
15      1
16      1
17      1
18      1
19      1
20      1
21      1
22      1
23      1
24      1
25      1
26      1
27      1
28      1
29      1
       ..
2970    1
2971    1
2972    1
2973    1
2974    1
2975    1
2976    1
2977    1
2978    1
2979    1
2980    1
2981    1
2982    2
2983    1
2984    1
2985    1
2986    1
2987    1
2988    1
2989    1
2990    1
2991    1
2992    0
2993    2
2994    1
2995    1
2996    2
2997    1
2998    1
2999    1
Name: label, Length: 3000, dtype: int64