ensure jupyter notebook and bert-serving-start are running inside the virtualenv

---   
  
# BERT experiments with Imdb dataset  

---  

## Requirements & Config

In [2]:
import json
import os
import random
import sys
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
from bert_serving.client import BertClient, ConcurrentBertClient
from tensorflow.estimator import BaselineClassifier
from tensorflow.python.estimator.canned.dnn import DNNClassifier
from tensorflow.python.estimator.run_config import RunConfig
from tensorflow.python.estimator.training import TrainSpec, EvalSpec, train_and_evaluate

  from ._conv import register_converters as _register_converters


In [3]:
tf.logging.set_verbosity(tf.logging.INFO)
batch_size = 128
num_parallel_calls = 1
bc = BertClient()

---   
  
## Cache data

---  

In order to avoid having to encode embeddings for the dataset every experiment, we create separate files for BERT encodings. Creating this 'cache' is performed in chunks for practical considerations (in case of errors). 

In [4]:
# pipeline
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

encode_count = 0
def encode(chunk):
    global encode_count
    print('Chunk {}'.format(encode_count))
    encode_count += 1
    return bc.encode(chunk)
        
def get_encodes(data):
    # x is `batch_size` of lines, each of which is a json object
    features = np.array([])
    text = [x[0] for x in data]
    features = np.concatenate([encode(chunk) for chunk in chunks(text, 256)])
    # randomly choose a label
    labels = [x[1] for x in data]
    return features, labels

def cache_data(data_dir, dest_dir, start_chunk, end_chunk):
    pos_files = os.listdir(os.path.join(data_dir, 'pos'))
    neg_files = os.listdir(os.path.join(data_dir, 'neg'))
        
    data = []
    for pos_file, neg_file in zip(pos_files, neg_files):
        with open(os.path.join(data_dir, 'pos', pos_file)) as f:
            review = f.readlines()[0].strip()
            data.append((review, 1))
        with open(os.path.join(data_dir, 'neg', neg_file)) as f:
            review = f.readlines()[0].strip()
            data.append((review, 0))
    chunk_num = -1
    chunk_size = 2048
    for chunk in chunks(data, chunk_size):
        chunk_num += 1
        if chunk_num < start_chunk:
            continue
        if chunk_num > end_chunk:
            break
        features, output = get_encodes(chunk)
        print('Wrote data_{:03d}.p'.format(chunk_num))
        with open(os.path.join(dest_dir, 'data_{:03d}.p'.format(chunk_num)), 'wb') as f:
            pickle.dump((features, output), f)

### 	BERT-Base, Uncased
12-layer, 768-hidden, 12-heads, 110M parameters 

In [None]:
# # !bert-serving-start -model_dir ./uncased_L-12_H-768_A-12 -num_worker=4

In [11]:
%%time

# set directories
my_dir_train = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/BERT_Imdb/aclImdb/train'
my_dir_test  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/test'
my_dir_train_output = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/train'
my_dir_test_output  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/test'

# build cache
input_fn_train = cache_data(my_dir_train, my_dir_train_output, 0, 14)
input_fn_eval = cache_data(my_dir_test, my_dir_test_output, 0, 14)

Chunk 0


here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


Chunk 1
Chunk 2
Chunk 3
Chunk 4
Chunk 5
Chunk 6
Chunk 7
Wrote data_000.p
Chunk 8
Chunk 9
Chunk 10
Chunk 11
Chunk 12
Chunk 13
Chunk 14
Chunk 15
Wrote data_001.p
Chunk 16
Chunk 17
Chunk 18
Chunk 19
Chunk 20
Chunk 21
Chunk 22
Chunk 23
Wrote data_002.p
Chunk 24
Chunk 25
Chunk 26
Chunk 27
Chunk 28
Chunk 29
Chunk 30
Chunk 31
Wrote data_003.p
Chunk 32
Chunk 33
Chunk 34
Chunk 35
Chunk 36
Chunk 37
Chunk 38
Chunk 39
Wrote data_004.p
Chunk 40
Chunk 41
Chunk 42
Chunk 43
Chunk 44
Chunk 45
Chunk 46
Chunk 47
Wrote data_005.p
Chunk 48
Chunk 49
Chunk 50
Chunk 51
Chunk 52
Chunk 53
Chunk 54
Chunk 55
Wrote data_006.p
Chunk 56
Chunk 57
Chunk 58
Chunk 59
Chunk 60
Chunk 61
Chunk 62
Chunk 63
Wrote data_007.p
Chunk 64
Chunk 65
Chunk 66
Chunk 67
Chunk 68
Chunk 69
Chunk 70
Chunk 71
Wrote data_008.p
Chunk 72
Chunk 73
Chunk 74
Chunk 75
Chunk 76
Chunk 77
Chunk 78
Chunk 79
Wrote data_009.p
Chunk 80
Chunk 81
Chunk 82
Chunk 83
Chunk 84
Chunk 85
Chunk 86
Chunk 87
Wrote data_010.p
Chunk 88
Chunk 89
Chunk 90
Chunk 91
Chu

### 	BERT-Base, Cased
12-layer, 768-hidden, 12-heads , 110M parameters

In [20]:
%%time

# set directories
my_dir_train = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/train'
my_dir_test  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/test'
my_dir_train_output = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/cased_L-12_H-768_A-12/cache/train'
my_dir_test_output  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/cased_L-12_H-768_A-12/cache/test'

# build cache
input_fn_train = cache_data(my_dir_train, my_dir_train_output, 0, 14)
input_fn_eval = cache_data(my_dir_test, my_dir_test_output, 0, 14)

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


Wrote data_000.p
Wrote data_001.p
Wrote data_002.p
Wrote data_003.p
Wrote data_004.p
Wrote data_005.p
Wrote data_006.p
Wrote data_007.p
Wrote data_008.p
Wrote data_009.p
Wrote data_010.p
Wrote data_011.p
Wrote data_012.p
Wrote data_000.p
Wrote data_001.p
Wrote data_002.p
Wrote data_003.p
Wrote data_004.p
Wrote data_005.p
Wrote data_006.p
Wrote data_007.p
Wrote data_008.p
Wrote data_009.p
Wrote data_010.p
Wrote data_011.p
Wrote data_012.p
CPU times: user 3.87 s, sys: 4.66 s, total: 8.53 s
Wall time: 29min 13s


### 	BERT-Large, Uncased
24-layer, 1024-hidden, 16-heads, 340M parameters

In [None]:
%%time

# set directories
my_dir_train = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/train'
my_dir_test  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/test'
my_dir_train_output = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-24_H-1024_A-16/cache/train'
my_dir_test_output  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-24_H-1024_A-16/cache/test'

# build cache
input_fn_train = cache_data(my_dir_train, my_dir_train_output, 0, 14)
input_fn_eval = cache_data(my_dir_test, my_dir_test_output, 0, 14)

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


### BERT-Large, Cased
24-layer, 1024-hidden, 16-heads, 340M parameters

In [14]:
%%time

# set directories
my_dir_train = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/train'
my_dir_test  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/test'
my_dir_train_output = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/cased_L-24_H-1024_A-16/cache/train'
my_dir_test_output  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/cased_L-24_H-1024_A-16/cache/test'

# build cache
input_fn_train = cache_data(my_dir_train, my_dir_train_output, 0, 14)
input_fn_eval = cache_data(my_dir_test, my_dir_test_output, 0, 14)

Chunk 588


here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


Chunk 589
Chunk 590
Chunk 591
Chunk 592
Chunk 593
Chunk 594
Chunk 595
Wrote data_000.p
Chunk 596
Chunk 597
Chunk 598
Chunk 599
Chunk 600
Chunk 601
Chunk 602
Chunk 603
Wrote data_001.p
Chunk 604
Chunk 605
Chunk 606
Chunk 607
Chunk 608
Chunk 609
Chunk 610
Chunk 611
Wrote data_002.p
Chunk 612
Chunk 613
Chunk 614
Chunk 615
Chunk 616
Chunk 617
Chunk 618
Chunk 619
Wrote data_003.p
Chunk 620
Chunk 621
Chunk 622
Chunk 623
Chunk 624
Chunk 625
Chunk 626
Chunk 627
Wrote data_004.p
Chunk 628
Chunk 629
Chunk 630
Chunk 631
Chunk 632
Chunk 633
Chunk 634
Chunk 635
Wrote data_005.p
Chunk 636
Chunk 637
Chunk 638
Chunk 639
Chunk 640
Chunk 641
Chunk 642
Chunk 643
Wrote data_006.p
Chunk 644
Chunk 645
Chunk 646
Chunk 647
Chunk 648
Chunk 649
Chunk 650
Chunk 651
Wrote data_007.p
Chunk 652
Chunk 653
Chunk 654
Chunk 655
Chunk 656
Chunk 657
Chunk 658
Chunk 659
Wrote data_008.p
Chunk 660
Chunk 661
Chunk 662
Chunk 663
Chunk 664
Chunk 665
Chunk 666
Chunk 667
Wrote data_009.p
Chunk 668
Chunk 669
Chunk 670
Chunk 671


### preparation for testing

In [79]:
# BERT-small, uncased
dir_BSU_train = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/train'
dir_BSU_test  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/test'
# BERT-large, uncased
dir_BLU_train = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-24_H-1024_A-16/cache/train'
dir_BLU_test  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-24_H-1024_A-16/cache/test'
# BERT-small, cased
dir_BSC_train = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/cased_L-12_H-768_A-12/cache/train'
dir_BSC_test  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/cased_L-12_H-768_A-12/cache/test'
# BERT-large, cased
dir_BLC_train = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/cased_L-24_H-1024_A-16/cache/train'
dir_BLC_test  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/cased_L-24_H-1024_A-16/cache/test'

# Model output directory
bert_model, classifier, input_size, train_max_steps, hidden_units, learning_rate, dropout_rate = '','','','','','',''
dir_models = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model, classifier, input_size, train_max_steps, '_'.join([str(x) for x in hidden_units]), learning_rate, dropout_rate)

In [12]:
def get_encodes(data):
    # x is `batch_size` of lines, each of which is a json object
    features = bc.encode([x[0] for x in data])
    # randomly choose a label
    labels = [x[1] for x in data]
    return features, labels

def get_input_fn(data_dir, num_examples=None, num_epochs=10):
    data_files = os.listdir(data_dir)
    
    # open pre-embedded data
    feature_list = []
    label_list = []
    for data_file in data_files:
        with open(os.path.join(data_dir, data_file), 'rb') as f:
            features, labels = pickle.load(f)
            feature_list.append(features)
            label_list.append(labels)
    features = np.concatenate(feature_list)
    labels = [label for labels in label_list for label in labels]
    
    # split into train and dev set
    train_features = features[0:int(0.8*len(features))]
    train_labels = labels[0:int(0.8*len(features))]
    dev_features = features[int(0.8*len(features)):len(features)]
    dev_labels = labels[int(0.8*len(features)):len(features)]
    
    train_labels = np.array(train_labels).astype('int32')
    dev_labels = np.array(dev_labels).astype('int32')
    
    if num_examples is not None:
        train_features = train_features[0:num_examples]
        train_labels = train_labels[0:num_examples]
    
    print('{} train data points'.format(len(train_features)))
    print('{} dev data points'.format(len(dev_features)))
    
    train_fn = tf.estimator.inputs.numpy_input_fn(
        x={'feature': train_features},
        y=train_labels,
        num_epochs=num_epochs,
        batch_size=128,
        shuffle=True
    )
    dev_fn = tf.estimator.inputs.numpy_input_fn(
        x={'feature': dev_features},
        y=dev_labels,
        num_epochs=1,
        batch_size=128,
        shuffle=False
    )
    return (train_fn, dev_fn)

---   
  
## Expiriment: Learning Rate

---  
When the learning rate is too large, gradient descent can inadvertently increase rather than decrease the training error. When the learning rate is too small, training is not only slower, but may become permanently stuck with a high training error.

It is not possible to calculate the best learning rate a priori. One approach to tuning this hyperparameter is to run a grid search (e.g., run with learning rates in a range like [.1, .01, .001, .0001 , .0001], and plot to see how key performance indicators change). 

### @ 0.1

In [None]:
# !bert-serving-start -model_dir ./uncased_L-12_H-768_A-12 -num_worker=4

In [104]:
%%time

hidden_units = [10]
learning_rate = 0.1
bert_model = 'bert_uncased_small'
classifier = 'DNN'
dropout_rate = 0.2
train_max_steps = 2000
input_size = None # all

dir_models = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model, classifier, input_size, train_max_steps, '_'.join([str(x) for x in hidden_units]), learning_rate, dropout_rate)
print ('\n\nsaving to:\n\n',dir_models, '\n\n')

config = tf.ConfigProto()
run_config = RunConfig(model_dir = dir_models, session_config=config,save_checkpoints_steps=100)
estimator = DNNClassifier(
    hidden_units=hidden_units,
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2,
    config=run_config,
    optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
    dropout=dropout_rate)
train_input_fn, dev_input_fn = get_input_fn(dir_BSU_train, input_size) # BERT-small, uncased
estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)



saving to:

 /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.1_dropout0.2 


INFO:tensorflow:Using config: {'_model_dir': '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.1_dropout0.2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x13487acc0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
20000 tra

In [105]:
# result
df_LR1 = pd.DataFrame.from_dict(estimator.evaluate(dev_input_fn), orient='index', columns=['LR_1'])
df_LR1

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-07-06:30:54
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.1_dropout0.2/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-07-06:30:55
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.7138, accuracy_baseline = 0.5, auc = 0.7891901, auc_precision_recall = 0.7955693, average_loss = 0.5596144, global_step = 2000, label/mean = 0.5, loss = 69.9518, precision = 0.74507105, prediction/mean = 0.51458234, recall = 0.65
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.1_dr

Unnamed: 0,LR_1
accuracy,0.7138
accuracy_baseline,0.5
auc,0.78919
auc_precision_recall,0.795569
average_loss,0.559614
label/mean,0.5
loss,69.951797
precision,0.745071
prediction/mean,0.514582
recall,0.65


### @ 0.05

In [106]:
%%time

hidden_units = [10]
learning_rate = 0.05
bert_model = 'bert_uncased_small'
classifier = 'DNN'
dropout_rate = 0.2
train_max_steps = 2000
input_size = None # all

dir_models = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model, classifier, input_size, train_max_steps, '_'.join([str(x) for x in hidden_units]), learning_rate, dropout_rate)
print ('\n\nsaving to:\n\n',dir_models, '\n\n')

config = tf.ConfigProto()
run_config = RunConfig(model_dir=dir_models, session_config=config,save_checkpoints_steps=100)
estimator = DNNClassifier(
    hidden_units=hidden_units,
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2,
    config=run_config,
    optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
    dropout=dropout_rate)
train_input_fn, dev_input_fn = get_input_fn(dir_BSU_train, input_size) # BERT-small, uncased
estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)



saving to:

 /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.05_dropout0.2 


INFO:tensorflow:Using config: {'_model_dir': '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.05_dropout0.2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x133b4c860>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
20000 t

In [107]:
# result
df_LR05 = pd.DataFrame.from_dict(estimator.evaluate(dev_input_fn), orient='index', columns=['LR_05'])
df_LR05

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-07-06:30:59
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.05_dropout0.2/model.ckpt-1563
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-07-06:31:00
INFO:tensorflow:Saving dict for global step 1563: accuracy = 0.7038, accuracy_baseline = 0.5, auc = 0.7782934, auc_precision_recall = 0.7903569, average_loss = 0.5676637, global_step = 1563, label/mean = 0.5, loss = 70.95796, precision = 0.7448342, prediction/mean = 0.5072773, recall = 0.62
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1563: /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.05_d

Unnamed: 0,LR_05
accuracy,0.7038
accuracy_baseline,0.5
auc,0.778293
auc_precision_recall,0.790357
average_loss,0.567664
label/mean,0.5
loss,70.957962
precision,0.744834
prediction/mean,0.507277
recall,0.62


### @ 0.01

In [108]:
%%time

hidden_units = [10]
learning_rate = 0.01
bert_model = 'bert_uncased_small'
classifier = 'DNN'
dropout_rate = 0.2
train_max_steps = 2000
input_size = None # all

dir_models = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model, classifier, input_size, train_max_steps, '_'.join([str(x) for x in hidden_units]), learning_rate, dropout_rate)
print ('\n\nsaving to:\n\n',dir_models, '\n\n')

config = tf.ConfigProto()
run_config = RunConfig(model_dir=dir_models, session_config=config,save_checkpoints_steps=100)
estimator = DNNClassifier(
    hidden_units=hidden_units,
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2,
    config=run_config,
    optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
    dropout=dropout_rate)
train_input_fn, dev_input_fn = get_input_fn(dir_BSU_train, input_size) # BERT-small, uncased
estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)



saving to:

 /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.01_dropout0.2 


INFO:tensorflow:Using config: {'_model_dir': '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.01_dropout0.2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1339f8828>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
20000 t

In [109]:
# result
df_LR01 = pd.DataFrame.from_dict(estimator.evaluate(dev_input_fn), orient='index', columns=['LR_01'])
df_LR01

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-07-06:31:01
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.01_dropout0.2/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-07-06:31:02
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.7126, accuracy_baseline = 0.5, auc = 0.79420185, auc_precision_recall = 0.7977605, average_loss = 0.544821, global_step = 2000, label/mean = 0.5, loss = 68.10263, precision = 0.7276231, prediction/mean = 0.5082678, recall = 0.6796
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.01

Unnamed: 0,LR_01
accuracy,0.7126
accuracy_baseline,0.5
auc,0.794202
auc_precision_recall,0.79776
average_loss,0.544821
label/mean,0.5
loss,68.102631
precision,0.727623
prediction/mean,0.508268
recall,0.6796


### @ 0.001

In [110]:
hidden_units = [10]
learning_rate = 0.001
bert_model = 'bert_uncased_small'
classifier = 'DNN'
dropout_rate = 0.2
train_max_steps = 2000
input_size = None # all

dir_models = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model, classifier, input_size, train_max_steps, '_'.join([str(x) for x in hidden_units]), learning_rate, dropout_rate)
print ('\n\nsaving to:\n\n',dir_models, '\n\n')

config = tf.ConfigProto()
run_config = RunConfig(model_dir=dir_models, session_config=config,save_checkpoints_steps=100)
estimator = DNNClassifier(
    hidden_units=hidden_units,
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2,
    config=run_config,
    optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
    dropout=dropout_rate)
train_input_fn, dev_input_fn = get_input_fn(dir_BSU_train, input_size) # BERT-small, uncased
estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)



saving to:

 /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.001_dropout0.2 


INFO:tensorflow:Using config: {'_model_dir': '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.001_dropout0.2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x132d850f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
20000

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x132d85390>

In [111]:
# result
df_LR001 = pd.DataFrame.from_dict(estimator.evaluate(dev_input_fn), orient='index', columns=['LR_001'])
df_LR001

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-07-06:31:05
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.001_dropout0.2/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-07-06:31:05
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.6904, accuracy_baseline = 0.5, auc = 0.7625588, auc_precision_recall = 0.7616379, average_loss = 0.59062976, global_step = 2000, label/mean = 0.5, loss = 73.82872, precision = 0.69224554, prediction/mean = 0.49986613, recall = 0.6856
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr

Unnamed: 0,LR_001
accuracy,0.6904
accuracy_baseline,0.5
auc,0.762559
auc_precision_recall,0.761638
average_loss,0.59063
label/mean,0.5
loss,73.82872
precision,0.692246
prediction/mean,0.499866
recall,0.6856


### @ 0.0001

In [112]:
%%time

hidden_units = [10]
learning_rate = 0.0001
bert_model = 'bert_uncased_small'
classifier = 'DNN'
dropout_rate = 0.2
train_max_steps = 2000
input_size = None # all

dir_models = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model, classifier, input_size, train_max_steps, '_'.join([str(x) for x in hidden_units]), learning_rate, dropout_rate)
print ('\n\nsaving to:\n\n',dir_models, '\n\n')

config = tf.ConfigProto()
run_config = RunConfig(model_dir=dir_models, session_config=config,save_checkpoints_steps=100)
estimator = DNNClassifier(
    hidden_units=hidden_units,
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2,
    config=run_config,
    optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
    dropout=dropout_rate)
train_input_fn, dev_input_fn = get_input_fn(dir_BSU_train, input_size) # BERT-small, uncased
estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)



saving to:

 /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.0001_dropout0.2 


INFO:tensorflow:Using config: {'_model_dir': '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.0001_dropout0.2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x134562978>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
200

In [113]:
# result
df_LR0001 = pd.DataFrame.from_dict(estimator.evaluate(dev_input_fn), orient='index', columns=['LR_0001'])
df_LR0001

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-07-06:31:08
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.0001_dropout0.2/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-07-06:31:09
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.5888, accuracy_baseline = 0.5, auc = 0.6162612, auc_precision_recall = 0.602811, average_loss = 0.6750571, global_step = 2000, label/mean = 0.5, loss = 84.38214, precision = 0.58090377, prediction/mean = 0.5056232, recall = 0.6376
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr0.

Unnamed: 0,LR_0001
accuracy,0.5888
accuracy_baseline,0.5
auc,0.616261
auc_precision_recall,0.602811
average_loss,0.675057
label/mean,0.5
loss,84.382141
precision,0.580904
prediction/mean,0.505623
recall,0.6376


### @ 0.00001

In [114]:
%%time

hidden_units = [10]
learning_rate = 0.00001
bert_model = 'bert_uncased_small'
classifier = 'DNN'
dropout_rate = 0.2
train_max_steps = 2000
input_size = None # all

dir_models = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model, classifier, input_size, train_max_steps, '_'.join([str(x) for x in hidden_units]), learning_rate, dropout_rate)
print ('\n\nsaving to:\n\n',dir_models, '\n\n')

config = tf.ConfigProto()
run_config = RunConfig(model_dir=dir_models, session_config=config,save_checkpoints_steps=100)
estimator = DNNClassifier(
    hidden_units=hidden_units,
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2,
    config=run_config,
    optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
    dropout=dropout_rate)
train_input_fn, dev_input_fn = get_input_fn(dir_BSU_train, input_size) # BERT-small, uncased
estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)
estimator.evaluate(dev_input_fn)



saving to:

 /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr1e-05_dropout0.2 


INFO:tensorflow:Using config: {'_model_dir': '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr1e-05_dropout0.2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1339b00b8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
20000

In [115]:
# result
df_LR00001 = pd.DataFrame.from_dict(estimator.evaluate(dev_input_fn), orient='index', columns=['LR_00001'])
df_LR00001

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-07-06:31:13
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr1e-05_dropout0.2/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-07-06:31:13
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.5112, accuracy_baseline = 0.5, auc = 0.5534019, auc_precision_recall = 0.5399217, average_loss = 0.71737605, global_step = 2000, label/mean = 0.5, loss = 89.672005, precision = 0.50577796, prediction/mean = 0.6174993, recall = 0.9804
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_DNN_inputNone_maxsteps2000_hu10_lr

Unnamed: 0,LR_00001
accuracy,0.5112
accuracy_baseline,0.5
auc,0.553402
auc_precision_recall,0.539922
average_loss,0.717376
label/mean,0.5
loss,89.672005
precision,0.505778
prediction/mean,0.617499
recall,0.9804


### Comparison

In [116]:
pd.concat([df_LR1, df_LR05, df_LR01, df_LR001, df_LR0001, df_LR00001], axis=1)

Unnamed: 0,LR_1,LR_05,LR_01,LR_001,LR_0001,LR_00001
accuracy,0.7138,0.7038,0.7126,0.6904,0.5888,0.5112
accuracy_baseline,0.5,0.5,0.5,0.5,0.5,0.5
auc,0.78919,0.778293,0.794202,0.762559,0.616261,0.553402
auc_precision_recall,0.795569,0.790357,0.79776,0.761638,0.602811,0.539922
average_loss,0.559614,0.567664,0.544821,0.59063,0.675057,0.717376
label/mean,0.5,0.5,0.5,0.5,0.5,0.5
loss,69.951797,70.957962,68.102631,73.82872,84.382141,89.672005
precision,0.745071,0.744834,0.727623,0.692246,0.580904,0.505778
prediction/mean,0.514582,0.507277,0.508268,0.499866,0.505623,0.617499
recall,0.65,0.62,0.6796,0.6856,0.6376,0.9804


---   
  
## Expiriment: Max Token Length

---  

By default, BERT is configured to cut sequences off at 25 tokens. While this ensures that computations don't become too large, it is also introduces an important artifact: only the first part of the reviews are considered for classification.

Based on our EDA, `*****`

In order to change this parameter, we
* re-initialize `bert_serving.client` with
    * `-max_seq_len=50`
    * `-max_seq_len=100`
    * `-max_seq_len=200`
* re-run with `learning_rate = 0.01` (based on previous experiment) in order to compare directly

### Increase to 50 tokens

In [None]:
# !bert-serving-start -model_dir ./uncased_L-12_H-768_A-12 -num_worker=4 -max_seq_len=50

In [7]:
%%time

# set directories
my_dir_train = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/train'
my_dir_test  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/test'
my_dir_train_output = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/train_tokens50'
my_dir_test_output  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/test_tokens50'

# build cache
input_fn_train = cache_data(my_dir_train, my_dir_train_output, 0, 14)
input_fn_eval = cache_data(my_dir_test, my_dir_test_output, 0, 14)

# Preparation for testing
# BERT-small, uncased [more tokens]
dir_BSU_train_tokens50 = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/train_tokens50'
dir_BSU_test_tokens50  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/test_tokens50'

Chunk 8


here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


Chunk 9
Chunk 10
Chunk 11
Chunk 12
Chunk 13
Chunk 14
Chunk 15
Wrote data_000.p
Chunk 16
Chunk 17
Chunk 18
Chunk 19
Chunk 20
Chunk 21
Chunk 22
Chunk 23
Wrote data_001.p
Chunk 24
Chunk 25
Chunk 26
Chunk 27
Chunk 28
Chunk 29
Chunk 30
Chunk 31
Wrote data_002.p
Chunk 32
Chunk 33
Chunk 34
Chunk 35
Chunk 36
Chunk 37
Chunk 38
Chunk 39
Wrote data_003.p
Chunk 40
Chunk 41
Chunk 42
Chunk 43
Chunk 44
Chunk 45
Chunk 46
Chunk 47
Wrote data_004.p
Chunk 48
Chunk 49
Chunk 50
Chunk 51
Chunk 52
Chunk 53
Chunk 54
Chunk 55
Wrote data_005.p
Chunk 56
Chunk 57
Chunk 58
Chunk 59
Chunk 60
Chunk 61
Chunk 62
Chunk 63
Wrote data_006.p
Chunk 64
Chunk 65
Chunk 66
Chunk 67
Chunk 68
Chunk 69
Chunk 70
Chunk 71
Wrote data_007.p
Chunk 72
Chunk 73
Chunk 74
Chunk 75
Chunk 76
Chunk 77
Chunk 78
Chunk 79
Wrote data_008.p
Chunk 80
Chunk 81
Chunk 82
Chunk 83
Chunk 84
Chunk 85
Chunk 86
Chunk 87
Wrote data_009.p
Chunk 88
Chunk 89
Chunk 90
Chunk 91
Chunk 92
Chunk 93
Chunk 94
Chunk 95
Wrote data_010.p
Chunk 96
Chunk 97
Chunk 98
Chun

In [14]:
%%time

hidden_units = [10]
learning_rate = 0.01
bert_model = 'bert_uncased_small_tokens50'
classifier = 'DNN'
dropout_rate = 0.2
train_max_steps = 2000
input_size = None # all

dir_models = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model, classifier, input_size, train_max_steps, '_'.join([str(x) for x in hidden_units]), learning_rate, dropout_rate)
print ('\n\nsaving to:\n\n',dir_models, '\n\n')

config = tf.ConfigProto()
run_config = RunConfig(model_dir=dir_models, session_config=config,save_checkpoints_steps=100)
estimator = DNNClassifier(
    hidden_units=hidden_units,
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2,
    config=run_config,
    optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
    dropout=dropout_rate)
train_input_fn, dev_input_fn = get_input_fn(dir_BSU_train_tokens50, input_size) # BERT-small, uncased [tokens50]
estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)



saving to:

 /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_tokens50_DNN_inputNone_maxsteps2000_hu10_lr0.01_dropout0.2 


INFO:tensorflow:Using config: {'_model_dir': '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_tokens50_DNN_inputNone_maxsteps2000_hu10_lr0.01_dropout0.2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x131af5f60>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_rep

In [16]:
# result
dfTK_50 = pd.DataFrame.from_dict(estimator.evaluate(dev_input_fn), orient='index', columns=['TK_50'])
dfTK_50

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-07-09:24:54
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_tokens50_DNN_inputNone_maxsteps2000_hu10_lr0.01_dropout0.2/model.ckpt-1563
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-07-09:24:55
INFO:tensorflow:Saving dict for global step 1563: accuracy = 0.7546, accuracy_baseline = 0.5, auc = 0.83904177, auc_precision_recall = 0.84167016, average_loss = 0.49245283, global_step = 1563, label/mean = 0.5, loss = 61.556602, precision = 0.7524792, prediction/mean = 0.4926438, recall = 0.7588
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1563: /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_tokens50_DNN_inputNone_ma

Unnamed: 0,TK_50
accuracy,0.7546
accuracy_baseline,0.5
auc,0.839042
auc_precision_recall,0.84167
average_loss,0.492453
label/mean,0.5
loss,61.556602
precision,0.752479
prediction/mean,0.492644
recall,0.7588


### Increase to 100 tokens

In [18]:
# !bert-serving-start -model_dir ./uncased_L-12_H-768_A-12 -num_worker=4 -max_seq_len=100

In [19]:
%%time

# set directories
my_dir_train = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/train'
my_dir_test  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/test'
my_dir_train_output = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/train_tokens100'
my_dir_test_output  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/test_tokens100'

# build cache
input_fn_train = cache_data(my_dir_train, my_dir_train_output, 0, 14)
input_fn_eval = cache_data(my_dir_test, my_dir_test_output, 0, 14)

# Preparation for testing
# BERT-small, uncased [more tokens]
dir_BSU_train_tokens100 = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/train_tokens100'
dir_BSU_test_tokens100  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/test_tokens100'

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


Wrote data_000.p
Wrote data_001.p


KeyboardInterrupt: 

In [None]:
%%time

hidden_units = [10]
learning_rate = 0.01
bert_model = 'bert_uncased_small_tokens100'
classifier = 'DNN'
dropout_rate = 0.2
train_max_steps = 2000
input_size = None # all

dir_models = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model, classifier, input_size, train_max_steps, '_'.join([str(x) for x in hidden_units]), learning_rate, dropout_rate)
print ('\n\nsaving to:\n\n',dir_models, '\n\n')

config = tf.ConfigProto()
run_config = RunConfig(model_dir=dir_models, session_config=config,save_checkpoints_steps=100)
estimator = DNNClassifier(
    hidden_units=hidden_units,
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2,
    config=run_config,
    optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
    dropout=dropout_rate)
train_input_fn, dev_input_fn = get_input_fn(dir_BSU_train_tokens100, input_size) # BERT-small, uncased [tokens50]
estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)

In [15]:
# result
dfTK_100 = pd.DataFrame.from_dict(estimator.evaluate(dev_input_fn), orient='index', columns=['TK_100'])
dfTK_100

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-07-09:24:40
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_tokens50_DNN_inputNone_maxsteps2000_hu10_lr0.01_dropout0.2/model.ckpt-1563
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-07-09:24:41
INFO:tensorflow:Saving dict for global step 1563: accuracy = 0.7546, accuracy_baseline = 0.5, auc = 0.83904177, auc_precision_recall = 0.84167016, average_loss = 0.49245283, global_step = 1563, label/mean = 0.5, loss = 61.556602, precision = 0.7524792, prediction/mean = 0.4926438, recall = 0.7588
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1563: /Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_bert_uncased_small_tokens50_DNN_inputNone_ma

Unnamed: 0,TK_100
accuracy,0.7546
accuracy_baseline,0.5
auc,0.839042
auc_precision_recall,0.84167
average_loss,0.492453
label/mean,0.5
loss,61.556602
precision,0.752479
prediction/mean,0.492644
recall,0.7588


### Increase to 200 tokens

In [None]:
%%time

# set directories
my_dir_train = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/train'
my_dir_test  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/aclImdb/test'
my_dir_train_output = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/train_tokens200'
my_dir_test_output  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/test_tokens200'

# build cache
input_fn_train = cache_data(my_dir_train, my_dir_train_output, 0, 14)
input_fn_eval = cache_data(my_dir_test, my_dir_test_output, 0, 14)

# Preparation for testing
# BERT-small, uncased [more tokens]
dir_BSU_train_tokens200 = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/train_tokens200'
dir_BSU_test_tokens200  = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/uncased_L-12_H-768_A-12/cache/test_tokens200'

In [None]:
%%time

hidden_units = [10]
learning_rate = 0.01
bert_model = 'bert_uncased_small_tokens200'
classifier = 'DNN'
dropout_rate = 0.2
train_max_steps = 2000
input_size = None # all

dir_models = '/Users/jlc/Google Drive/_code/MIDS_W266/BERT_Imdb/trained_models/imdb_{}_{}_input{}_maxsteps{}_hu{}_lr{}_dropout{}'.format(
    bert_model, classifier, input_size, train_max_steps, '_'.join([str(x) for x in hidden_units]), learning_rate, dropout_rate)
print ('\n\nsaving to:\n\n',dir_models, '\n\n')

config = tf.ConfigProto()
run_config = RunConfig(model_dir=dir_models, session_config=config,save_checkpoints_steps=100)
estimator = DNNClassifier(
    hidden_units=hidden_units,
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2,
    config=run_config,
    optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
    dropout=dropout_rate)
train_input_fn, dev_input_fn = get_input_fn(dir_BSU_train_tokens200, input_size) # BERT-small, uncased [tokens50]
estimator.train(input_fn=train_input_fn, max_steps=train_max_steps)

In [None]:
# result
dfTK_200 = pd.DataFrame.from_dict(estimator.evaluate(dev_input_fn), orient='index', columns=['TK_200'])
dfTK_200

### Compare

In [None]:
vpd.concat([df_LR01, dfTK_50, dfTK_100, dfTK_200], axis=1)

---   
  
# Expiriment:

---  

---   
  
# Expiriment:

---  

---   
  
# Expiriment:

---  

---   
  
# Expiriment:

---  

---   
  
# Expiriment:

---  