In [1]:
import tensorflow as tf
from tensorflow.keras.losses import BinaryCrossentropy

In [2]:
max_seq_length =  64
def _parse_and_transform(serialized)-> dict:
    feature_description = {
        'input_word_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'target': tf.io.FixedLenFeature([], tf.int64)
    }
    example = tf.io.parse_single_example(serialized, feature_description)
    # transform
    target = example.pop('target')
    target = tf.reshape(target, ())
#     target = tf.cast(target, tf.float32)
    
    embeddings_dict = example
    for k, v in embeddings_dict.items():
        embeddings_dict[k] = tf.cast(v, tf.int32)

    target_dict = {'target': target}

    return (embeddings_dict, target_dict)

In [3]:
from fact_verification_system.classifier.scripts.train import _extract
import multiprocessing

suffix_train = "train_64_balanced_1000_samples.tfrecord"
file_pattern = "../dataset/tfrecords/" + suffix_train

ds_train = _extract(file_pattern)

num_cpus = multiprocessing.cpu_count()
ds_train = ds_train.map(_parse_and_transform, num_parallel_calls=num_cpus)

Reading from file pattern: ../dataset/tfrecords/train_64_balanced_1000_samples.tfrecord


Confirm? (y/n):  y


Extracted all TFRecords with pattern ../dataset/tfrecords/train_64_balanced_1000_samples.tfrecord


In [5]:
x = next(iter(ds_train.batch(1)))
train_batch = ((x[0].get('input_mask'), x[0].get('input_word_ids'), x[0].get('segment_ids')))
target_batch = (x[1].get('target'))

### Trying `tf.GradientTape`

In [6]:
from fact_verification_system.classifier.models.textual_entailment import create_bert_model

tf.random.set_seed(42)

model = create_bert_model(max_seq_length=64)
bce = BinaryCrossentropy()

with tf.GradientTape() as tape:
    # get model outputs
    outputs = model(train_batch)
    print("Output: {}\tLabel: {}".format(outputs, target_batch))
    # calculate loss between targets and outputs
    loss = bce(y_true=target_batch, y_pred=outputs)
    print(loss)
    
# get gradients for each weight variable w.r.t the loss function
grads = tape.gradient(loss, model.trainable_variables)

Output: [[0.55091435]]	Label: [1]
tf.Tensor(0.59617573, shape=(), dtype=float32)


In [7]:
# binary crossentropy sanity check
y_pred = tf.constant([[0.55091435]])
y_true = tf.constant([[1.0]])
loss_check = bce(y_true, y_pred)
print(loss_check)
assert loss_check == loss, "Their loss should be equivalent."

tf.Tensor(0.59617573, shape=(), dtype=float32)


In [7]:
import pandas as pd
dense_0 = model.trainable_variables[199]
dense_0_bias = model.trainable_variables[200]
dense_1 = model.trainable_variables[201]
dense_1_bias = model.trainable_variables[202]
target = model.trainable_variables[203]
target_bias = model.trainable_variables[204]

### Neuron Inspection
Each neuron of the layer has an **individual weight variable for each individual input element**.

i.e.
**For the one neuron, multiply each weight variable with their respective input element and sum them.**

hence, for a 512 neuron layer with the previous input embedding of 768 elements, there should be 512 X 768 weight variables in total.

In [8]:
dense_0.shape

TensorShape([768, 512])

In [9]:
# layer's first neuron's weights for their respective input element.
dense_0_w_0 = dense_0[:, 0]
dense_0_w_0.shape

TensorShape([768])

In [10]:
print(dense_0_w_0[0])
tf.math.reduce_mean(dense_0_w_0)

tf.Tensor(0.022533596, shape=(), dtype=float32)


<tf.Tensor: id=42792, shape=(), dtype=float32, numpy=0.00069414685>

### Applying gradient to the trainable variables

In [11]:
target_bias_grad = grads[-1]
target_grad = grads[-2]

In [12]:
print(target_bias)
print(target_bias_grad)
print(target[0])
print(target_grad[0])
print(target.shape)

<tf.Variable 'target/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>
tf.Tensor([-0.44908556], shape=(1,), dtype=float32)
tf.Tensor([0.07375957], shape=(1,), dtype=float32)
tf.Tensor([-0.23807847], shape=(1,), dtype=float32)
(256, 1)


In [13]:
from tensorflow.keras.optimizers import SGD
lr = 0.01
optimizer = SGD(lr=lr)

In [14]:
optimizer.apply_gradients(zip(grads, model.trainable_variables))
# result for target bias should become 0 - 0.01*-0.44908556
# result for target layer's 1st weight variable should become 0.0737 - 0.01*-0.23807847 = 
print(model.trainable_variables[204][0])  # new target_bias
print(model.trainable_variables[203][0])  # new target's 1st weight variable for the single neuron. Should have 256 weights.

tf.Tensor(0.0044908556, shape=(), dtype=float32)
tf.Tensor([0.07614036], shape=(1,), dtype=float32)


In [15]:
new_target_bias = model.trainable_variables[204][0]
assert new_target_bias.numpy() == (0 - lr*grads[-1]).numpy()[0], "This should be equal to applying the gradient."

In [15]:
grads[-6].shape

TensorShape([768, 512])

### Tensorboard Logging Template

In [17]:
# Logging Template
train_log_dir = './train_logs'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
with train_summary_writer.as_default():
    tf.summary.scalar('grad_target_bias', grads[-1][0], step=2)

In [18]:
from datetime import datetime
datetime.now().strftime("%d.%m.%Y-%H.%M.%S")

'05.03.2020-03.37.38'

### Tensorboard Histogram Logging

In [24]:
with train_summary_writer.as_default():
    tf.summary.histogram(name='histogram_0', data=grads[-1], step=5)

In [27]:
grads[-1]

<tf.Tensor: id=42526, shape=(1,), dtype=float32, numpy=array([-0.44908556], dtype=float32)>

In [26]:
grads[-2].shape

TensorShape([256, 1])

In [28]:
for i in range(5):
    with train_summary_writer.as_default():
        tf.summary.histogram(name='histogram_1', data=grads[-2], step=i)

In [29]:
# find how many between -0.0289 and -0.00962
accumulator = 0
for g in grads[-2]:
    if tf.math.greater(g, -0.0289) and tf.math.less(g, -0.00962): 
        accumulator += 1


TensorShape([256, 1])

In [34]:
grads[-2]
tf.math.reduce_max(grads[-2])

<tf.Tensor: id=43389, shape=(), dtype=float32, numpy=0.0>

In [39]:
mask = tf.math.not_equal(grads[-2], 0)
tf.math.reduce_max(tf.boolean_mask(grads[-2], mask))

<tf.Tensor: id=43482, shape=(), dtype=float32, numpy=-0.0054449686>

In [45]:
greater_mask = tf.math.greater(grads[-2], -0.0289)
greater_mask
lesser_mask = tf.math.less(grads[-2], -0.00962)
lesser_mask
mask = tf.math.logical_and(greater_mask, lesser_mask)
tf.boolean_mask(grads[-2], mask)

<tf.Tensor: id=43533, shape=(14,), dtype=float32, numpy=
array([-0.01714745, -0.01984281, -0.01187051, -0.01308818, -0.02760158,
       -0.02324713, -0.01013472, -0.01994801, -0.01852799, -0.02048325,
       -0.02513314, -0.01785563, -0.01581509, -0.02214511], dtype=float32)>

In [46]:
tf.executing_eagerly()

True

In [57]:
model.layers[-2].weights[0].shape

TensorShape([512, 256])

In [58]:
l = [1, 2, 3]
tf.convert_to_tensor(l)

<tf.Tensor: id=43550, shape=(3,), dtype=int32, numpy=array([1, 2, 3], dtype=int32)>