## Why does changing to SGD from Adam with a lower learning rate allow it to train?

### Tensorboard helper functions

In [1]:
import tensorflow as tf
from datetime import datetime

def tensorboard_scalar(writer:tf.summary.SummaryWriter, 
                       name:str, data:float, step:int):
    with writer.as_default():
        tf.summary.scalar(name, data, step)
        
def tensorboard_histogram(writer:tf.summary.SummaryWriter, 
                       name:str, data:tf.Tensor, step:int):
    with writer.as_default():
        tf.summary.histogram(name, data, step)
        
def additional_notes(log_dir:str):
    notes = input("Additional training notes (Press enter to skip):")
    if notes:
        with open(log_dir + '/additional-notes.txt', 'w') as fh:
            fh.write(notes)

def model_summary_log(model: tf.keras.Model):
    with open(log_dir + '/model_summary.txt','w') as fh:
        # Pass the file handle in as a lambda function to make it callable
        model.summary(print_fn=lambda x: fh.write(x + '\n'))

def datasets_log(train_fname:str, dev_fname:str):
    with open(log_dir + '/datasets.txt', 'w') as fh:
        fh.write("Datasets used:\n{}\n{}".format(suffix_train, suffix_dev))

### Metrics to log helper functions

**Weights**

In [2]:
### Scalars to log ###
def get_avg_min_max_neuron(neuron_params):
    """ returns a tuple of constants containing the weights of a neuron. """
    assert len(neuron_params.shape) == 1, "must only be for a single neuron."
    
    # get average of neuron weights
    avg = tf.math.reduce_mean(neuron_params)
    min_ = tf.math.reduce_min(neuron_params)
    max_ = tf.math.reduce_max(neuron_params)
    
    return (avg, min_, max_)


def get_avg_min_max_layer(layer_params_matrix):
    """ returns a tuple of tensors containing the params (i.e. weights or grads) of a layer.
    
    avg -- avg[0] = average param value for each input element for neuron 0.
    max_ -- max_[0] = max param value of every input element for neuron 0.
    min_ -- min_[0] = min param value of every input element for neuron 0.
    """
    lpm = layer_params_matrix
    num_neurons = layer_params_matrix.shape[1]  #[0] are input elements (i.e. prev layer neurons)
    
    layer_info = {
        'avg': list(),
        'min': list(),
        'max': list()
    }
    
    # curate avg, min and max neuron weight tensors for the layer
    for i in range(num_neurons):        
        (n_avg, n_min, n_max) = get_avg_min_max_neuron(lpm[:, i])
        layer_info['avg'].append(n_avg)
        layer_info['min'].append(n_min)
        layer_info['max'].append(n_max)

    avg = tf.convert_to_tensor(layer_info['avg'])
    min_ = tf.convert_to_tensor(layer_info['min'])
    max_ = tf.convert_to_tensor(layer_info['max'])
    
    assert avg.shape[0] == num_neurons,\
            "Avg neuron param tensor should be equivalent to number of neurons in the layer."
    
    return (avg, min_, max_)

In [3]:
def tensorboard_hist_avg_min_max_for_layer(tensors_tuple:tuple, name_suffix:str, step:int):
    assert len(tensors_tuple) == 3, "There should be average, max and min tensors."
    (avg, min_, max_) = tensors_tuple
    tensorboard_histogram(writer, '{}_avg'.format(name_suffix), avg, step)
    tensorboard_histogram(writer, '{}_min'.format(name_suffix), min_, step)
    tensorboard_histogram(writer, '{}_max'.format(name_suffix), max_, step)

**Gradients**

In [4]:
##

### Data Pipeline

In [5]:
max_seq_length =  64
def _parse_and_transform(serialized)-> dict:
    feature_description = {
        'input_word_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'target': tf.io.FixedLenFeature([], tf.int64)
    }
    example = tf.io.parse_single_example(serialized, feature_description)
    # transform
    target = example.pop('target')
    target = tf.reshape(target, ())
#     target = tf.cast(target, tf.float32)
    
    embeddings_dict = example
    for k, v in embeddings_dict.items():
        embeddings_dict[k] = tf.cast(v, tf.int32)

    target_dict = {'target': target}

    return (embeddings_dict, target_dict)

In [6]:
import os
os.path.exists("../dataset/tfrecords/train_64_balanced_1000_samples.tfrecord")

True

In [9]:
import tensorflow as tf
from fact_verification_system.classifier.scripts.train import _extract
import multiprocessing

suffix_train = "train_64_balanced_10000_samples.tfrecord"
file_pattern = "../dataset/tfrecords/" + suffix_train

ds_train = _extract(file_pattern)

num_cpus = multiprocessing.cpu_count()
ds_train = ds_train.map(_parse_and_transform, num_parallel_calls=num_cpus)
ds_train = ds_train.cache()

Reading from file pattern: ../dataset/tfrecords/train_64_balanced_10000_samples.tfrecord


Confirm? (y/n):  y


Extracted all TFRecords with pattern ../dataset/tfrecords/train_64_balanced_10000_samples.tfrecord


### Model

In [10]:
from fact_verification_system.classifier.models.textual_entailment import create_bert_model
model = create_bert_model(max_seq_length=64)

layer_indices = {l.name: i for i, l in enumerate(model.layers)}  ## to be used layer

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 64)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 64)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

### Training (Compute Grads)

In [11]:
## for debugging training
import tensorflow as tf
def zero_grads_percentage(layer_grads:tf.Tensor):
    assert len(layer_grads.shape) == 2, "There should be gradients for each input element weight for each neuron."
    
    (avg, min_, max_) = get_avg_min_max_layer(layer_grads)
    zero_mask = tf.math.equal(avg, 0)
    layer_zero_grads_percent = (len(list(filter(lambda x: x, zero_mask))))/layer_grads.shape[1]
    return layer_zero_grads_percent

In [12]:
### Training Step Function ### 
import tensorflow as tf
from tensorflow.keras.losses import Loss
from typing import Dict, List, Tuple

@tf.function
def compute_grads(train_batch: Dict[str,tf.Tensor], target_batch: tf.Tensor, 
                 loss_fn: Loss, model: tf.keras.Model):
    with tf.GradientTape(persistent=False) as tape:
        # forward pass
        outputs = model(train_batch)
        # calculate loss
        loss = loss_fn(y_true=target_batch, y_pred=outputs)
    
    # calculate gradients for each param
    grads = tape.gradient(loss, model.trainable_variables)
    return grads, loss

### ignore this -- for debugging

In [11]:
model.layers[3].trainable

True

In [12]:
tf.math.log(0.4554)

<tf.Tensor: id=24624, shape=(), dtype=float32, numpy=-0.78657913>

In [52]:
def bce_custom(target_batch, train_batch):
    assert len(train_batch) == len(target_batch), "num train and target must match."
    total_loss = 0.0
    i = 0
    for y_pred in train_batch:
        y_true = tf.cast(target_batch[i], dtype=tf.float32)
        total_loss += (y_true*tf.math.log(y_pred)) + ((1-y_true)*tf.math.log(1-y_pred))
        i += 1
    return total_loss/len(train_batch)

In [93]:
### Testing gradient function ###
from tensorflow.keras.losses import BinaryCrossentropy
x_batch, target_dict = next(iter(ds_train.batch(1)))
bce = BinaryCrossentropy()
with tf.GradientTape(persistent=False) as tape:
    outputs = model(x_batch)
    print("outputs: {}".format(outputs))
    loss = bce(y_true=target_dict['target'], y_pred=outputs)
    print("loss: {}".format(loss))
    loss_custom = bce_custom(target_dict['target'], outputs)
    print("loss_custom: {}".format(loss_custom))
    print("Loss is fine.")
    print(len(tape.watched_variables()))
    # loss is fine.

# check gradient
grads = tape.gradient(loss, model.trainable_variables)
print("target: {}".format(target_dict['target']))
print("dense_1_bias: {}".format(model.layers[-1].bias.numpy()))
print("dense_1_bias_grad: {}".format(grads[-1]))
# print(grads[-2])

outputs: [[0.4544241]]
loss: 0.7887241840362549
loss_custom: [-0.78872436]
Loss is fine.
205
target: [1]
dense_1_bias: [0.]
dense_1_bias_grad: [-0.54557574]


In [66]:
y_true = tf.cast(target_dict['target'], dtype=tf.float32)
y_pred = outputs[0]
-((y_true/y_pred) - ((1-y_true)/(1-y_pred)))

<tf.Tensor: id=92905, shape=(1,), dtype=float32, numpy=array([-2.2005875], dtype=float32)>

In [68]:
tf.math.reduce_sum(grads[-2])

<tf.Tensor: id=92907, shape=(), dtype=float32, numpy=-29.265625>

In [89]:
x = tf.constant([[0.4544241]])
y_true = tf.constant([1.0])
bce = BinaryCrossentropy()
with tf.GradientTape(persistent=False) as tape:
    tape.watch(x)
    loss = bce(y_true, x)
    print(dir(tape))
#     print(loss)
grad = tape.gradient(loss, x)
print(grad)

['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_created_eagerly', '_persistent', '_pop_tape', '_push_tape', '_recording', '_tape', '_tf_api_names', '_tf_api_names_v1', '_watch_accessed_variables', 'batch_jacobian', 'gradient', 'jacobian', 'reset', 'stop_recording', 'watch', 'watched_variables']
tf.Tensor([[-2.200587]], shape=(1, 1), dtype=float32)


In [79]:
x = tf.constant(3.0)
with tf.GradientTape() as g:
  g.watch(x)
  y = x * x
dy_dx = g.gradient(y, x)
dy_dx

<tf.Tensor: id=93206, shape=(), dtype=float32, numpy=6.0>

### Training

In [13]:
### Training ###
import code
from IPython.core.debugger import set_trace
import tensorflow as tf
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam, SGD
from tqdm.notebook import tqdm

assert tf.executing_eagerly(), "Tensorflow not in eager execution mode."

# MANUAL
DATASET_SIZE = 10000
BATCH_SIZE = 8
EPOCHS = 15

bce = BinaryCrossentropy()
optimizer = SGD(learning_rate=0.01)
# optimizer = Adam()

# tf.random.set_seed(1)  # reproducibility -- weights initialization

# tensorboard init
timestamp = datetime.now().strftime("%d.%m.%Y-%H.%M.%S")
train_log_dir = './train_logs_debug/{}'.format(timestamp)
writer = tf.summary.create_file_writer(train_log_dir)


# log initial weights
dense_0_w = model.layers[-3].weights[0]
dense_0_bias = model.layers[-3].bias
avg_min_max = get_avg_min_max_layer(dense_0_w)
tensorboard_hist_avg_min_max_for_layer(avg_min_max, 'd_0_weights', step=0)
tensorboard_histogram(writer, 'd_0_bias', data=dense_0_bias, step=0)

dense_1_w = model.layers[-2].weights[0]
dense_1_bias = model.layers[-2].bias
avg_min_max = get_avg_min_max_layer(dense_1_w)
tensorboard_hist_avg_min_max_for_layer(avg_min_max, 'd_1_weights', step=0)
tensorboard_histogram(writer, 'd_1_bias', data=dense_1_bias, step=0)


for epoch in tqdm(range(EPOCHS), desc='epoch'):
    # - accumulators
    epoch_loss = 0.0
    
    # - debug accumulators
    d_0_zero_grads = 0.0
    d_1_zero_grads = 0.0
    
    for (i, (train_batch, target_dict)) in tqdm(enumerate(ds_train.shuffle(1024).batch(BATCH_SIZE)), desc='step'):

        (grads, loss) = compute_grads(train_batch, target_dict['target'], bce, model)
#         print("outputs:", model.layers[-2].output)
#         tensorboard_histogram(writer, 'd_1_activations', data=model.layers[-2].output, step=(epoch+1))
#         set_trace()
        # debug -- track percentage of neurons receiving zero gradients
        d_0_zero_grads += zero_grads_percentage(grads[-4])
        d_1_zero_grads += zero_grads_percentage(grads[-2])
        
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        
        epoch_loss += loss
        if (i+1) % 250 == 0:
            print("average loss: {}".format(epoch_loss/(i+1)))
            
    avg_epoch_loss = epoch_loss/(i+1)
    tensorboard_scalar(writer, name='epoch_loss', data=avg_epoch_loss, step=epoch)
    print("Epoch {}: epoch_loss = {}".format(epoch, avg_epoch_loss))
    print(epoch)

    # % of zero gradients
    tensorboard_scalar(writer, name='d_0_zero_grads', data=d_0_zero_grads/(i+1), step=epoch+1)
    tensorboard_scalar(writer, name='d_1_zero_grads', data=d_1_zero_grads/(i+1), step=epoch+1)
    print("average dense_0 zero grad percentage: {}".format(d_0_zero_grads/(i+1)))
    print("average dense_1 zero grad percentage: {}".format(d_1_zero_grads/(i+1)))

    dense_0_w = model.layers[-3].weights[0]
    dense_0_bias = model.layers[-3].bias
    avg_min_max = get_avg_min_max_layer(dense_0_w)
    tensorboard_hist_avg_min_max_for_layer(avg_min_max, 'd_0_weights', step=(epoch+1))
    tensorboard_histogram(writer, 'd_0_bias', data=dense_0_bias, step=(epoch+1))

    dense_1_w = model.layers[-2].weights[0]
    dense_1_bias = model.layers[-2].bias
    avg_min_max = get_avg_min_max_layer(dense_1_w)
    tensorboard_hist_avg_min_max_for_layer(avg_min_max, 'd_1_weights', step=(epoch+1))
    tensorboard_histogram(writer, 'd_1_bias', data=dense_1_bias, step=(epoch+1))

HBox(children=(FloatProgress(value=0.0, description='epoch', max=15.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.7130096554756165
average loss: 0.7061865925788879
average loss: 0.7041476368904114
average loss: 0.7020866870880127
average loss: 0.700707733631134
average loss: 0.6933651566505432
average loss: 0.6903945207595825
average loss: 0.6871565580368042
average loss: 0.6844391822814941

Epoch 0: epoch_loss = 0.6837077140808105
0
average dense_0 zero grad percentage: 0.6512334290468011
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6938466429710388
average loss: 0.6944166421890259
average loss: 0.6945149898529053
average loss: 0.6944000720977783
average loss: 0.6943538784980774
average loss: 0.6888379454612732
average loss: 0.6859286427497864
average loss: 0.6840676069259644
average loss: 0.6806698441505432

Epoch 1: epoch_loss = 0.6803997755050659
1
average dense_0 zero grad percentage: 0.7132000724559897
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6967105865478516
average loss: 0.6948843598365784
average loss: 0.6948139071464539
average loss: 0.6946535706520081
average loss: 0.694497287273407
average loss: 0.6897436380386353
average loss: 0.6854881644248962
average loss: 0.6835901737213135
average loss: 0.6806273460388184

Epoch 2: epoch_loss = 0.6805641055107117
2
average dense_0 zero grad percentage: 0.7366224774581366
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6936800479888916
average loss: 0.6937586069107056
average loss: 0.6938440203666687
average loss: 0.6938292980194092
average loss: 0.6936783790588379
average loss: 0.6897039413452148
average loss: 0.6857832074165344
average loss: 0.6844136714935303
average loss: 0.6807544827461243

Epoch 3: epoch_loss = 0.6802563667297363
3
average dense_0 zero grad percentage: 0.7792356563975955
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.695762038230896
average loss: 0.6945033669471741
average loss: 0.6944289803504944
average loss: 0.6942062973976135
average loss: 0.6939288377761841
average loss: 0.6888313889503479
average loss: 0.6854494214057922
average loss: 0.68318772315979
average loss: 0.6801217198371887

Epoch 4: epoch_loss = 0.679723858833313
4
average dense_0 zero grad percentage: 0.6953745572133964
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6959211230278015
average loss: 0.6945236325263977
average loss: 0.6944687366485596
average loss: 0.6942733526229858
average loss: 0.6942042708396912
average loss: 0.6887772083282471
average loss: 0.6853187084197998
average loss: 0.683218240737915
average loss: 0.6798447370529175

Epoch 5: epoch_loss = 0.6794826984405518
5
average dense_0 zero grad percentage: 0.7459914394589953
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6956217885017395
average loss: 0.6945255994796753
average loss: 0.6941826343536377
average loss: 0.694109320640564
average loss: 0.693950355052948
average loss: 0.6894787549972534
average loss: 0.6856407523155212
average loss: 0.6832512021064758
average loss: 0.6803297996520996

Epoch 6: epoch_loss = 0.6800738573074341
6
average dense_0 zero grad percentage: 0.7510180737440961
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6952090859413147
average loss: 0.6938889622688293
average loss: 0.6936584711074829
average loss: 0.6936056017875671
average loss: 0.6936435103416443
average loss: 0.6888396143913269
average loss: 0.6852920055389404
average loss: 0.6827411651611328
average loss: 0.6793138980865479

Epoch 7: epoch_loss = 0.6791651248931885
7
average dense_0 zero grad percentage: 0.7401396119579219
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6949936747550964
average loss: 0.6945156455039978
average loss: 0.694232165813446
average loss: 0.6941329836845398
average loss: 0.6940593123435974
average loss: 0.6881676316261292
average loss: 0.6856600046157837
average loss: 0.6838046312332153
average loss: 0.6809674501419067

Epoch 8: epoch_loss = 0.6805286407470703
8
average dense_0 zero grad percentage: 0.7134415924216402
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6948533654212952
average loss: 0.6943889856338501
average loss: 0.6943912506103516
average loss: 0.6943027377128601
average loss: 0.6943728923797607
average loss: 0.689002275466919
average loss: 0.684881329536438
average loss: 0.6827519536018372
average loss: 0.6791368126869202

Epoch 9: epoch_loss = 0.6790810227394104
9
average dense_0 zero grad percentage: 0.6489188627093173
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6930645704269409
average loss: 0.6931971907615662
average loss: 0.6935930252075195
average loss: 0.693546712398529
average loss: 0.6936454772949219
average loss: 0.6874518394470215
average loss: 0.6833240389823914
average loss: 0.6809234023094177
average loss: 0.677990198135376

Epoch 10: epoch_loss = 0.6779751777648926
10
average dense_0 zero grad percentage: 0.685982114104766
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6953779458999634
average loss: 0.6943340301513672
average loss: 0.6943866610527039
average loss: 0.6942924857139587
average loss: 0.694065272808075
average loss: 0.6885058879852295
average loss: 0.6844488978385925
average loss: 0.6820282936096191
average loss: 0.6787698864936829

Epoch 11: epoch_loss = 0.6787858009338379
11
average dense_0 zero grad percentage: 0.7007466992271361
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6931320428848267
average loss: 0.6924148797988892
average loss: 0.6930054426193237
average loss: 0.6931935548782349
average loss: 0.6932249665260315
average loss: 0.6882827281951904
average loss: 0.6853450536727905
average loss: 0.6835426092147827
average loss: 0.6798926591873169

Epoch 12: epoch_loss = 0.6797335147857666
12
average dense_0 zero grad percentage: 0.7722886029411765
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6939787268638611
average loss: 0.6934921741485596
average loss: 0.6937398314476013
average loss: 0.6938366293907166
average loss: 0.6935799717903137
average loss: 0.6888647675514221
average loss: 0.6849583387374878
average loss: 0.6833866834640503
average loss: 0.6806986331939697

Epoch 13: epoch_loss = 0.68007493019104
13
average dense_0 zero grad percentage: 0.6990678000214684
average dense_1 zero grad percentage: 0.0


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6945400834083557
average loss: 0.6945468187332153
average loss: 0.6945108771324158
average loss: 0.6943650245666504
average loss: 0.69443678855896
average loss: 0.6887607574462891
average loss: 0.6851092576980591
average loss: 0.6827844977378845
average loss: 0.6796362996101379

Epoch 14: epoch_loss = 0.6794284582138062
14
average dense_0 zero grad percentage: 0.6469179368827823
average dense_1 zero grad percentage: 0.0



In [12]:
tf.test.is_gpu_available()

True

In [54]:
w = model.layers[-2].weights[0]
b = model.layers[-2].weights[0]
z = w
model.layers[-2].activation

<function tensorflow.python.keras.activations.relu(x, alpha=0.0, max_value=None, threshold=0)>

In [56]:
model.layers[-2].get_output_at(0)

<tf.Tensor 'dense_1/Identity:0' shape=(None, 256) dtype=float32>

In [62]:
tf.print(model.layers[-2].output)

AttributeError: 'Tensor' object has no attribute '_datatype_enum'

In [61]:
from tensorflow.keras import backend as K

inp = model.input                                           # input placeholder
outputs = [layer.output for layer in model.layers]          # all layer outputs
functor = K.function([inp, K.learning_phase()], outputs )   # evaluation function

AttributeError: 'int' object has no attribute 'op'

In [51]:
dir(model.layers[-2])

['_TF_MODULE_IGNORED_PROPERTIES',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_activity_regularizer',
 '_add_inbound_node',
 '_add_variable_with_custom_getter',
 '_autocast',
 '_call_accepts_kwargs',
 '_call_arg_was_passed',
 '_call_fn_args',
 '_callable_losses',
 '_checkpoint_dependencies',
 '_clear_losses',
 '_collect_input_masks',
 '_compute_dtype',
 '_dedup_weights',
 '_deferred_dependencies',
 '_dtype',
 '_dtype_defaulted_to_floatx',
 '_dtype_policy',
 '_dynamic',
 '_eager_add_metric',
 '_eager_losses',
 '_expects_mask_arg',
 '_expects_training_arg',
 '_flatten',
 '_gather_children_attribute',
 '_gather_saveables_for_checkpoint',
 '_get_call_arg_

In [None]:
train_log_dir

In [18]:
from tensorflow.keras.losses import BinaryCrossentropy
from tqdm.notebook import tqdm

bce = BinaryCrossentropy()

for (i, (embs_dict, target_dict)) in tqdm(enumerate(ds_train.shuffle(1024).batch(8)), desc='step'):
    outputs = model(embs_dict)
    loss = bce(target_dict.get('target'), outputs)
#     (grads, loss) = compute_grads(embs_dict, target_dict, bce, model)
#     optimizer.apply_gradients(zip(grads, model.trainable_variables))
    break

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

In [13]:
layer_indices

{'input_word_ids': 0,
 'input_mask': 1,
 'segment_ids': 2,
 'keras_layer': 3,
 'dense_0': 4,
 'dense_1': 5,
 'target': 6}

In [19]:
loss

<tf.Tensor: id=25490, shape=(), dtype=float32, numpy=0.6917711>