In [20]:
import json
import os
import random

import tensorflow as tf
import numpy as np
import pickle
from bert_serving.client import BertClient, ConcurrentBertClient
from tensorflow.estimator import BaselineClassifier
from tensorflow.python.estimator.canned.dnn import DNNClassifier
from tensorflow.python.estimator.run_config import RunConfig
from tensorflow.python.estimator.training import TrainSpec, EvalSpec, train_and_evaluate

# Raise TensorFlow's log level to INFO so the estimator's progress messages
# (the `INFO:tensorflow:...` lines) appear in the cell outputs.
tf.logging.set_verbosity(tf.logging.INFO)

In [2]:
# Notebook-level batch size. This global is not read by any of the visible
# cells -- TODO confirm whether a later cell still depends on it.
batch_size = 128
# Also not referenced by the visible cells -- presumably a leftover input
# pipeline setting; confirm before removing.
num_parallel_calls = 1
# Shared BERT client used by get_encodes(), constructed with the library defaults.
# NOTE(review): presumably this needs a reachable bert-serving server -- confirm.
bc = BertClient()

In [78]:
def get_encodes(data):
    """Encode the first element of every item in ``data`` with the shared client.

    Args:
        data: a re-iterable collection (it is traversed twice) of indexable
            items; element 0 of each item is sent to ``bc.encode`` and
            element 1 is returned as that item's label.

    Returns:
        A ``(features, labels)`` tuple where ``features`` is whatever
        ``bc.encode`` returns for the list of first elements and ``labels``
        is the list of second elements, in input order.
    """
    texts = [item[0] for item in data]
    features = bc.encode(texts)
    # Labels are taken straight from the input; nothing is sampled here.
    labels = [item[1] for item in data]
    return features, labels

def get_input_fn(data_dir, num_examples=None, num_epochs=10, batch_size=128):
    """Build train and dev input functions from pre-embedded pickle files.

    Every entry of ``data_dir`` is unpickled as a ``(features, labels)`` pair.
    The pairs are concatenated in sorted file-name order, the first 80% of
    the rows become the training set and the remaining rows the dev set.

    Args:
        data_dir: directory whose entries are all pickle files, each holding
            a ``(features, labels)`` tuple. ``features`` must be accepted by
            ``np.concatenate``; ``labels`` must be iterable.
        num_examples: if not None, keep only the first ``num_examples``
            training rows (the dev set is never truncated).
        num_epochs: number of epochs for the training input function; the
            dev input function always runs a single epoch.
        batch_size: batch size used by both input functions.

    Returns:
        ``(train_fn, dev_fn)`` as produced by
        ``tf.estimator.inputs.numpy_input_fn``; features are passed under
        the key ``'feature'`` and labels are cast to ``int32``.

    Raises:
        ValueError: if ``data_dir`` is empty, or if the total number of
            labels differs from the total number of feature rows.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # concatenation order -- and therefore the train/dev split -- reproducible.
    data_files = sorted(os.listdir(data_dir))
    if not data_files:
        raise ValueError('no data files found in {!r}'.format(data_dir))

    # open pre-embedded data
    feature_list = []
    label_list = []
    for data_file in data_files:
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # cache files you produced yourself.
        with open(os.path.join(data_dir, data_file), 'rb') as f:
            file_features, file_labels = pickle.load(f)
        feature_list.append(file_features)
        label_list.extend(file_labels)
    features = np.concatenate(feature_list)
    labels = np.array(label_list).astype('int32')

    # A silent mismatch would pair features with the wrong labels after slicing.
    if len(labels) != len(features):
        raise ValueError('got {} labels for {} feature rows in {!r}'.format(
            len(labels), len(features), data_dir))

    # split into train and dev set (first 80% / last 20%)
    split_index = int(0.8 * len(features))
    train_features, dev_features = features[:split_index], features[split_index:]
    train_labels, dev_labels = labels[:split_index], labels[split_index:]

    if num_examples is not None:
        train_features = train_features[:num_examples]
        train_labels = train_labels[:num_examples]

    print('{} train data points'.format(len(train_features)))
    print('{} dev data points'.format(len(dev_features)))

    train_fn = tf.estimator.inputs.numpy_input_fn(
        x={'feature': train_features},
        y=train_labels,
        num_epochs=num_epochs,
        batch_size=batch_size,
        shuffle=True
    )
    # Dev data is read once, in order, so evaluation covers each row exactly once.
    dev_fn = tf.estimator.inputs.numpy_input_fn(
        x={'feature': dev_features},
        y=dev_labels,
        num_epochs=1,
        batch_size=batch_size,
        shuffle=False
    )
    return (train_fn, dev_fn)

## Effect of number of steps in evaluation

In [63]:
# Experiment: train one small DNN head, then evaluate it repeatedly while
# varying the `steps` cap handed to estimator.evaluate.

# Values of the evaluation step cap to sweep over.
evaluate_steps = [1, 10, 100, 200, 500, 1000, 2000, 5000]

# Network and optimizer hyperparameters.
hidden_units = [10]
dropout_rate = 0.1
learning_rate = 0.003

# Training budget; input_size is forwarded to get_input_fn as num_examples.
train_max_steps = 2000
input_size = None # all

# Tags that only feed the model directory name below.
bert_model = 'bert_uncased_small'
classifier = 'DNN'

# One checkpoint directory per hyperparameter combination. The recorded log
# shows training being skipped when that directory already holds a
# checkpoint at max_steps.
model_dir = '/home/eugenet/final_project/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model,
    classifier,
    input_size,
    train_max_steps,
    '_'.join(map(str, hidden_units)),
    learning_rate,
    dropout_rate,
)

config = tf.ConfigProto()
run_config = RunConfig(
    model_dir=model_dir,
    session_config=config,
    save_checkpoints_steps=100,
)

estimator = DNNClassifier(
    hidden_units=hidden_units,
    # The 'feature' key must match the dict key used by get_input_fn.
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2,
    config=run_config,
    optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
    dropout=dropout_rate,
)

# NOTE(review): this reads cached_data/train/ while the model directory is
# tagged 'bert_uncased_small' -- confirm the cache holds that model's embeddings.
train_input_fn, dev_input_fn = get_input_fn(
    '/home/eugenet/final_project/cached_data/train/', input_size)
estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)

# One metrics dict per step cap, in sweep order.
# NOTE(review): the recorded accuracies are identical from steps=100 upward,
# which is consistent with the single-epoch dev input being exhausted before
# the cap is reached -- confirm.
res = []
for steps in evaluate_steps:
    metrics = estimator.evaluate(dev_input_fn, steps=steps)
    res.append(metrics)

INFO:tensorflow:Using config: {'_model_dir': '/home/eugenet/final_project/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.003_dropout0.1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3f5cad5748>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
20000 train data points
5000 dev data points
INFO:tensorflow:Skipping training since max_steps has already saved.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model

INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: /home/eugenet/final_project/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.003_dropout0.1/model.ckpt-2000
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-06T14:22:43Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /home/eugenet/final_project/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.003_dropout0.1/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Evaluation [200/1000]
INFO:tensorflow:Evaluation [300/1000]
INFO:tensorflow:Evaluation [400/1000]
INFO:tensorflow:Evaluation [500/1000]
INFO:tensorflow:Evaluation [600/1000]
INFO:tensorflow:Evaluation [700/1000]
INFO:tensorflow:Evaluation [800/1000]
INFO:tensorflow:Evaluation [900/1000]
INFO:tensorflow:Evaluati

In [64]:
# Accuracy of each evaluation stored in `res`, in the order they were appended.
[metrics['accuracy'] for metrics in res]

[0.74, 0.826, 0.7936, 0.7936, 0.7936, 0.7936, 0.7936, 0.7936]

## Effect of Number of Hidden Units

In [77]:
# Experiment: train a deeper DNN head (four hidden layers) on the cached
# embeddings and evaluate it once on the full dev input.

# Network and optimizer hyperparameters.
hidden_units = [1000, 500, 200, 100]
dropout_rate = 0.2
learning_rate = 0.003

# Training budget; input_size is forwarded to get_input_fn as num_examples.
train_max_steps = 2000
input_size = None # all

# Tags that only feed the model directory name below.
bert_model = 'bert_uncased_small'
classifier = 'DNN'

# One checkpoint directory per hyperparameter combination. The recorded log
# shows training being skipped when that directory already holds a
# checkpoint at max_steps.
model_dir = '/home/eugenet/final_project/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model,
    classifier,
    input_size,
    train_max_steps,
    '_'.join(map(str, hidden_units)),
    learning_rate,
    dropout_rate,
)

config = tf.ConfigProto()
run_config = RunConfig(
    model_dir=model_dir,
    session_config=config,
    save_checkpoints_steps=100,
)

estimator = DNNClassifier(
    hidden_units=hidden_units,
    # The 'feature' key must match the dict key used by get_input_fn.
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2,
    config=run_config,
    optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
    dropout=dropout_rate,
)

train_input_fn, dev_input_fn = get_input_fn(
    '/home/eugenet/final_project/cached_data/train_uncased_small/', input_size)
estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)

# Final expression: the metrics dict is rendered as the cell's output.
estimator.evaluate(dev_input_fn)

INFO:tensorflow:Using config: {'_model_dir': '/home/eugenet/final_project/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu1000_500_200_100_lr0.003_dropout0.2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3f5db4d6d8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
20000 train data points
5000 dev data points
INFO:tensorflow:Skipping training since max_steps has already saved.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done

{'accuracy': 0.8118,
 'accuracy_baseline': 0.5,
 'auc': 0.9000076,
 'auc_precision_recall': 0.899112,
 'average_loss': 0.40113974,
 'label/mean': 0.5,
 'loss': 20.056988,
 'precision': 0.792057,
 'prediction/mean': 0.52676105,
 'recall': 0.8456,
 'global_step': 2000}