## Why does changing to SGD from Adam with a lower learning rate allow it to train?

### Tensorboard helper functions

In [1]:
import tensorflow as tf
from datetime import datetime

def tensorboard_scalar(writer:tf.summary.SummaryWriter, 
                       name:str, data:float, step:int):
    with writer.as_default():
        tf.summary.scalar(name, data, step)
        
def tensorboard_histogram(writer:tf.summary.SummaryWriter, 
                       name:str, data:tf.Tensor, step:int):
    with writer.as_default():
        tf.summary.histogram(name, data, step)
        
def additional_notes(log_dir:str):
    notes = input("Additional training notes (Press enter to skip):")
    if notes:
        with open(log_dir + '/additional-notes.txt', 'w') as fh:
            fh.write(notes)

def model_summary_log(model: tf.keras.Model):
    with open(log_dir + '/model_summary.txt','w') as fh:
        # Pass the file handle in as a lambda function to make it callable
        model.summary(print_fn=lambda x: fh.write(x + '\n'))

def datasets_log(train_fname:str, dev_fname:str):
    with open(log_dir + '/datasets.txt', 'w') as fh:
        fh.write("Datasets used:\n{}\n{}".format(suffix_train, suffix_dev))

### Metrics to log helper functions

**Weights**

In [2]:
### Scalars to log ###
def get_avg_min_max_neuron(neuron_params):
    """ returns a tuple of constants containing the weights of a neuron. """
    assert len(neuron_params.shape) == 1, "must only be for a single neuron."
    
    # get average of neuron weights
    avg = tf.math.reduce_mean(neuron_params)
    min_ = tf.math.reduce_min(neuron_params)
    max_ = tf.math.reduce_max(neuron_params)
    
    return (avg, min_, max_)


def get_avg_min_max_layer(layer_params_matrix):
    """ returns a tuple of tensors containing the params (i.e. weights or grads) of a layer.
    
    avg -- avg[0] = average param value for each input element for neuron 0.
    max_ -- max_[0] = max param value of every input element for neuron 0.
    min_ -- min_[0] = min param value of every input element for neuron 0.
    """
    lpm = layer_params_matrix
    num_neurons = layer_params_matrix.shape[1]  #[0] are input elements (i.e. prev layer neurons)
    
    layer_info = {
        'avg': list(),
        'min': list(),
        'max': list()
    }
    
    # curate avg, min and max neuron weight tensors for the layer
    for i in range(num_neurons):        
        (n_avg, n_min, n_max) = get_avg_min_max_neuron(lpm[:, i])
        layer_info['avg'].append(n_avg)
        layer_info['min'].append(n_min)
        layer_info['max'].append(n_max)

    avg = tf.convert_to_tensor(layer_info['avg'])
    min_ = tf.convert_to_tensor(layer_info['min'])
    max_ = tf.convert_to_tensor(layer_info['max'])
    
    assert avg.shape[0] == num_neurons,\
            "Avg neuron param tensor should be equivalent to number of neurons in the layer."
    
    return (avg, min_, max_)

In [3]:
def tensorboard_hist_avg_min_max_for_layer(tensors_tuple:tuple, name_suffix:str, epoch:int):
    assert len(tensors_tuple) == 3, "There should be average, max and min tensors."
    (avg, min_, max_) = tensors_tuple
    tensorboard_histogram(writer, '{}_avg'.format(name_suffix), avg, epoch)
    tensorboard_histogram(writer, '{}_min'.format(name_suffix), min_, epoch)
    tensorboard_histogram(writer, '{}_max'.format(name_suffix), max_, epoch)

**Gradients**

In [4]:
##

### Data Pipeline

In [5]:
max_seq_length =  64
def _parse_and_transform(serialized)-> dict:
    feature_description = {
        'input_word_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'target': tf.io.FixedLenFeature([], tf.int64)
    }
    example = tf.io.parse_single_example(serialized, feature_description)
    # transform
    target = example.pop('target')
    target = tf.reshape(target, ())
#     target = tf.cast(target, tf.float32)
    
    embeddings_dict = example
    for k, v in embeddings_dict.items():
        embeddings_dict[k] = tf.cast(v, tf.int32)

    target_dict = {'target': target}

    return (embeddings_dict, target_dict)

In [6]:
import os
os.path.exists("../dataset/tfrecords/train_64_balanced_1000_samples.tfrecord")

True

In [7]:
import tensorflow as tf
from fact_verification_system.classifier.scripts.train import _extract
import multiprocessing

suffix_train = "train_64_balanced_10000_samples.tfrecord"
file_pattern = "../dataset/tfrecords/" + suffix_train

ds_train = _extract(file_pattern)

num_cpus = multiprocessing.cpu_count()
ds_train = ds_train.map(_parse_and_transform, num_parallel_calls=num_cpus)
ds_train = ds_train.cache()

Reading from file pattern: ../dataset/tfrecords/train_64_balanced_10000_samples.tfrecord


Confirm? (y/n):  y


Extracted all TFRecords with pattern ../dataset/tfrecords/train_64_balanced_10000_samples.tfrecord


### Model

In [8]:
from fact_verification_system.classifier.models.textual_entailment import create_bert_model
model = create_bert_model(max_seq_length=64)

layer_indices = {l.name: i for i, l in enumerate(model.layers)}  ## to be used layer

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 64)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 64)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

### Training

In [9]:
### Training Step Function ### 
import tensorflow as tf
from tensorflow.keras.losses import Loss
from typing import Dict, List, Tuple

@tf.function
def compute_grads(train_batch: Dict[str,tf.Tensor], target_batch: tf.Tensor, 
                 loss_fn: Loss, model: tf.keras.Model):
    with tf.GradientTape(persistent=False) as tape:
        # forward pass
        outputs = model(train_batch)
        # calculate loss
        loss = loss_fn(target_batch, outputs)
    
    # calculate gradients for each param
    grads = tape.gradient(loss, model.trainable_variables)
    return grads, loss

In [10]:
### Training ###
import code
from IPython.core.debugger import set_trace
import tensorflow as tf
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam, SGD
from tqdm.notebook import tqdm

assert tf.executing_eagerly(), "Tensorflow not in eager execution mode."

# MANUAL
DATASET_SIZE = 10000
BATCH_SIZE = 8
EPOCHS = 15

bce = BinaryCrossentropy()
optimizer = SGD(learning_rate=0.001)

tf.random.set_seed(1)  # reproducibility -- weights initialization
log_grads = False
log_weights = False

# tensorboard init
timestamp = datetime.now().strftime("%d.%m.%Y-%H.%M.%S")
train_log_dir = './train_logs/{}'.format(timestamp)
writer = tf.summary.create_file_writer(train_log_dir)

for epoch in tqdm(range(EPOCHS), desc='epoch'):
    # log weights
    if log_weights: 
        dense_0_w = model.layers[-3].weights[0]
        avg_min_max = get_avg_min_max_layer(dense_0_w)
        tensorboard_hist_avg_min_max_for_layer(avg_min_max, 'd_0_w', epoch)

        dense_1_w = model.layers[-2].weights[0]
        avg_min_max = get_avg_min_max_layer(dense_1_w)
        tensorboard_hist_avg_min_max_for_layer(avg_min_max, 'd_1_w', epoch)

    # accumulators
    accu_loss = 0.0
    accu_d_0_avg_grads = tf.zeros(model.layers[layer_indices.get('dense_0')].units)
    accu_d_1_avg_grads = tf.zeros(model.layers[layer_indices.get('dense_1')].units)
    accu_target_grads = tf.zeros(model.layers[layer_indices.get('target')].units)
    hist_grad_step = 0  # for marking steps on histogram
    
    for (i, (train_batch, target_dict)) in tqdm(enumerate(ds_train.shuffle(1024).batch(BATCH_SIZE)), desc='step'):

        (grads, loss) = compute_grads(train_batch, target_dict['target'], bce, model)
#         set_trace()
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
#         set_trace()
        
        accu_loss += loss
        if (i+1) % 250 == 0:
            print("average loss: {}".format(accu_loss/(i+1)))
        
        # accumulate average grads
        if log_grads:
            steps_to_accumulate = 625
            
            d_0_grads = grads[-6]
            (d_0_avg_grads, _, _) = get_avg_min_max_layer(d_0_grads)
            accu_d_0_avg_grads += d_0_avg_grads
            
            d_1_grads = grads[-4]
            (d_1_avg_grads, _, _) = get_avg_min_max_layer(d_1_grads)
            accu_d_1_avg_grads += d_1_avg_grads
            
            target_grads = grads[-2]
            accu_target_grads += target_grads
            
            if (i+1) % steps_to_accumulate == 0:
                tensorboard_histogram(writer, 'd_0_grads_avg', 
                                      accu_d_0_avg_grads/steps_to_accumulate, 
                                      hist_grad_step)
                tensorboard_histogram(writer, 'd_1_grads_avg', 
                                      accu_d_1_avg_grads/steps_to_accumulate, 
                                      hist_grad_step)
                tensorboard_histogram(writer, 'target_grads', 
                                      accu_target_grads/steps_to_accumulate, 
                                      hist_grad_step)
                
                # reset grads
                accu_d_0_avg_grads = tf.zeros(model.layers[layer_indices.get('dense_0')].units)
                accu_d_1_avg_grads = tf.zeros(model.layers[layer_indices.get('dense_1')].units)
                accu_target_grads = tf.zeros(model.layers[layer_indices.get('target')].units)
                
                hist_grad_step += 1

            
    avg_epoch_loss = accu_loss/(i+1)
    tensorboard_scalar(writer, name='epoch_loss', data=avg_epoch_loss, step=epoch)
    
    print("Epoch {}: epoch_loss = {}".format(epoch, avg_epoch_loss))

HBox(children=(FloatProgress(value=0.0, description='epoch', max=15.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6969003081321716
average loss: 0.6949845552444458
average loss: 0.6955817341804504
average loss: 0.6957056522369385
average loss: 0.6957424283027649
average loss: 0.6902496814727783
average loss: 0.6848475337028503
average loss: 0.6825650930404663
average loss: 0.6790742874145508

Epoch 0: epoch_loss = 0.6789683103561401


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6943986415863037
average loss: 0.6932730674743652
average loss: 0.6933279037475586
average loss: 0.6935257315635681
average loss: 0.693976879119873
average loss: 0.6884613633155823
average loss: 0.6854684948921204
average loss: 0.6834751963615417
average loss: 0.6803459525108337

Epoch 1: epoch_loss = 0.6789056062698364


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

average loss: 0.6968092322349548
average loss: 0.6956266164779663
average loss: 0.6957470774650574
average loss: 0.6956632137298584
average loss: 0.6946893334388733
average loss: 0.6884742975234985
average loss: 0.6860034465789795
average loss: 0.6831181645393372
average loss: 0.6804528832435608

Epoch 2: epoch_loss = 0.6799877882003784


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…




KeyboardInterrupt: 

In [None]:
train_log_dir

In [18]:
from tensorflow.keras.losses import BinaryCrossentropy
from tqdm.notebook import tqdm

bce = BinaryCrossentropy()

for (i, (embs_dict, target_dict)) in tqdm(enumerate(ds_train.shuffle(1024).batch(8)), desc='step'):
    outputs = model(embs_dict)
    loss = bce(target_dict.get('target'), outputs)
#     (grads, loss) = compute_grads(embs_dict, target_dict, bce, model)
#     optimizer.apply_gradients(zip(grads, model.trainable_variables))
    break

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='step', max=1.0, style=ProgressStyle(des…

In [13]:
layer_indices

{'input_word_ids': 0,
 'input_mask': 1,
 'segment_ids': 2,
 'keras_layer': 3,
 'dense_0': 4,
 'dense_1': 5,
 'target': 6}

In [19]:
loss

<tf.Tensor: id=25490, shape=(), dtype=float32, numpy=0.6917711>