In [4]:
import tensorflow as tf
from fil

# Attention-based Transformer Network


In [5]:
def get_layer_clones(layer, num_layers):
    """Return ``num_layers`` independent deep copies of ``layer``.

    Repeating the same object (``[layer] * n``) makes every entry share one
    set of weights, which is not what a stack of transformer sub-layers
    needs. Each entry is therefore a separate deep copy of the prototype.

    Args:
        layer: Prototype object to copy (normally a not-yet-built Keras layer).
        num_layers: Number of copies to produce.

    Returns:
        A list with ``num_layers`` distinct copies; empty when
        ``num_layers`` is 0.
    """
    # Local import: the file's import header is outside this block.
    import copy

    # NOTE(review): copies keep the prototype's layer name; confirm this is
    # acceptable for the checkpoint format in use.
    return [copy.deepcopy(layer) for _ in range(num_layers)]


class SubLayerConnection(tf.keras.layers.Layer):
    """Residual connection followed by layer normalisation.

    Computes ``LayerNorm(sublayer_output + residual)``.
    """

    def __init__(self):
        super().__init__()
        self.norm_layer = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, _input, _residual_input):
        """Add the residual to the sub-layer output and normalise the sum.

        Args:
            _input: Output of the wrapped sub-layer, i.e. ``sublayer(x)``.
            _residual_input: The tensor ``x`` that was fed to the sub-layer.

        Returns:
            The layer-normalised sum of both tensors.
        """
        summed = tf.keras.layers.Add()([_input, _residual_input])
        return self.norm_layer(summed)

class FeedForwardLayer(tf.keras.layers.Layer):
    """Position-wise feed-forward block: Dense(ReLU) -> Dense -> Dropout -> LayerNorm.

    The previous version built ``tf.keras.layers.ReLU(ffn_units)``, whose first
    positional argument is ``max_value`` (an activation cap), and never called
    the ``ffn`` projection. The block now expands to ``ffn_units`` hidden units
    with a ReLU activation and projects back to ``_d_model``.

    Args:
        ffn_units: Width of the hidden (expansion) projection.
        _d_model: Width of the output projection.
        dropout_rate: Dropout rate applied after the output projection.
    """

    def __init__(self, ffn_units, _d_model, dropout_rate=0.3):
        super().__init__()
        # Hidden projection with ReLU non-linearity.
        self.relu = tf.keras.layers.Dense(units=ffn_units, activation='relu')
        # Output projection back to the model width.
        self.ffn = tf.keras.layers.Dense(units=_d_model)
        self.drop = tf.keras.layers.Dropout(rate=dropout_rate)
        self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, _inputs):
        """Apply the feed-forward transformation to ``_inputs``.

        Args:
            _inputs: Input tensor; the projections act on its last axis.

        Returns:
            Tensor whose last axis has size ``_d_model``.
        """
        _x = self.relu(_inputs)
        _x = self.ffn(_x)
        _x = self.drop(_x)
        return self.norm(_x)

class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: multi-head self-attention followed by feed-forward.

    Each of the two sub-layers is wrapped in its own residual +
    layer-normalisation connection.

    Args:
        num_heads: Number of attention heads.
        key_dim: Size of each attention head for query and key.
        _d_model: Output width of the feed-forward block.
        _ffn_units: Hidden width of the feed-forward block.
    """

    def __init__(self, num_heads, key_dim, _d_model, _ffn_units):
        super().__init__()
        self.d_model = _d_model
        self.multi_head_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)
        self.feed_forward_layer = FeedForwardLayer(_d_model=self.d_model, ffn_units=_ffn_units)

        # Two separate instances so the residual connections do not share
        # their LayerNormalization weights.
        self.subconn_layers = [SubLayerConnection() for _ in range(2)]

    def call(self, _inputs, _mask, training=None):
        """Run self-attention and feed-forward over ``_inputs``.

        Args:
            _inputs: Input tensor used as query, key and value.
            _mask: Attention mask forwarded to the attention layer.
            training: Optional Keras training flag; forwarded to the attention
                layer instead of the previously hard-coded ``False``.

        Returns:
            Output tensor of the second residual connection.
        """
        print(f"encoderLayer mha query shape: {_inputs.shape}")
        self_attn = self.multi_head_attention(query=_inputs,
                                              value=_inputs,
                                              key=_inputs,
                                              return_attention_scores=False,
                                              training=training,
                                              attention_mask=_mask)
        print(f"encoder self attn shape: {self_attn.shape}")
        l1 = self.subconn_layers[0](_input=self_attn,
                                    _residual_input=_inputs)

        print(f"encoder l1 shape: {l1.shape}")

        output = self.subconn_layers[1](_input=self.feed_forward_layer(l1),
                                        _residual_input=l1)

        return output

class PositionalEncodingLayer(tf.keras.layers.Layer):
    """Sinusoidal positional encoding.

    Code adapted from
    https://towardsdatascience.com/attention-is-all-you-need-discovering-the-transformer-paper-73e5ff5e0634
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def get_angles(pos, i, _d_model, _n=10000.):
        """Return the angle ``pos / _n ** (2 * (i // 2) / _d_model)``.

        Args:
            pos: Position indices, shaped (seq_length, 1) by ``call``.
            i: Feature indices, shaped (1, d_model) by ``call``.
            _d_model: Model width used to scale the exponent.
            _n: Base of the geometric progression of wavelengths.

        Returns:
            Array of angles; (seq_length, d_model) for the shapes above.
        """
        exponent = (2 * (i // 2)) / np.float32(_d_model)
        rates = 1 / np.power(_n, exponent)
        return pos * rates

    def call(self, _inputs, _d_model, _seq_len, _type):
        """Build the encoding table and apply it according to ``_type``.

        Args:
            _inputs: Tensor the encoding is added to when ``_type`` is
                ``'encoder'``.
            _d_model: Number of encoding features.
            _seq_len: Number of positions.
            _type: ``'encoder'`` returns ``_inputs`` plus the encoding with a
                leading broadcast axis; ``'decoder'`` returns the bare table.

        Returns:
            A float32 tensor as described under ``_type``.

        Raises:
            TypeError: If ``_type`` is neither ``'encoder'`` nor ``'decoder'``.
        """
        print(f"_inputs shape in pos_enc: {_inputs.shape}")

        positions = np.arange(_seq_len)[:, np.newaxis]
        features = np.arange(_d_model)[np.newaxis, :]
        encoding = self.get_angles(positions, features, _d_model)

        # Even feature columns (2i) take the sine, odd ones (2i+1) the cosine.
        encoding[:, 0::2] = np.sin(encoding[:, 0::2])
        encoding[:, 1::2] = np.cos(encoding[:, 1::2])

        if _type == 'encoder':
            # Leading axis lets the table broadcast over the batch.
            return _inputs + tf.cast(encoding[np.newaxis, ...], tf.float32)
        if _type == 'decoder':
            return tf.cast(encoding, tf.float32)
        raise TypeError('Wrong type specified for PositionalEncodingLayer. Type must be one of "encoder" or "decoder".')


class Encoder(tf.keras.layers.Layer):
    """Stack of encoder layers on top of word embeddings + positional encoding.

    Args:
        _num_heads: Number of attention heads per encoder layer.
        _key_dim: Size of each attention head.
        _embedding_size: Accepted for interface compatibility; not used here.
        _d_model: Model width (embedding output size and scaling factor).
        _seq_len: Sequence length used for the positional encoding.
        _ffn_units: Hidden width of each feed-forward block.
        num_layers: Number of stacked encoder layers.
        _embedding_layer: Optional replacement for the default embedding.
            A callable (e.g. a Keras layer) is applied to the inputs; anything
            else is used as-is as already-embedded values.
    """

    def __init__(self, _num_heads, _key_dim, _embedding_size, _d_model, _seq_len, _ffn_units, num_layers=6, _embedding_layer=None):
        super().__init__()

        self.d_model = _d_model
        self.seq_len = _seq_len

        print(f"_embedding_layer: {_embedding_layer}")

        self._embedding_layer = _embedding_layer

        if _embedding_layer is None:
            # The default embedding relies on the module-level `tokenizer`,
            # `word2embedding_map` and `get_embedding_vectors`; they are only
            # touched on this branch so a supplied embedding does not need them.
            print(f"size of vocab: {len(tokenizer.word_index)+1}")
            self.word_embedding_layer = tf.keras.layers.Embedding(
                input_dim=len(tokenizer.word_index)+1,  # size of vocab
                output_dim=_d_model,  # size of vector space to be embedded in
                weights=[get_embedding_vectors(tokenizer, word2embedding_map)],
                trainable=False,
                input_length=self.seq_len,  # length of input seqs
            )
        else:
            self.word_embedding_layer = _embedding_layer

        self.positional_encoding_layer = PositionalEncodingLayer()

        # One fresh EncoderLayer per level so the levels do not share weights.
        self.layers = [
            EncoderLayer(num_heads=_num_heads,
                         key_dim=_key_dim,
                         _d_model=_d_model,
                         _ffn_units=_ffn_units)
            for _ in range(num_layers)
        ]

    def call(self, _inputs, _mask):
        """Embed ``_inputs``, add positional encodings and run the layer stack.

        Args:
            _inputs: Input passed to the embedding layer.
            _mask: Attention mask forwarded to every encoder layer.

        Returns:
            Output of the last encoder layer (the positionally encoded
            embeddings when the stack is empty).
        """
        if callable(self.word_embedding_layer):
            embedded = self.word_embedding_layer(_inputs)
        else:
            # Not callable: treat the supplied object as precomputed embeddings.
            embedded = self.word_embedding_layer

        print(f"word embedding shape: {embedded.shape}")

        # Scale the embeddings by sqrt of d_model (not in place, so a supplied
        # array is never mutated between calls).
        embedded = embedded * tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        # Positional Encodings
        pos_encoded_inputs = self.positional_encoding_layer(_inputs=embedded,
                                                            _d_model=self.d_model,
                                                            _seq_len=self.seq_len,
                                                            _type='encoder')

        print(f"pos_encoded_inputs shape: {pos_encoded_inputs.shape}")

        _x = pos_encoded_inputs
        for _layer in self.layers:
            _x = _layer(_inputs=_x, _mask=_mask)
            print(f"encoderLayer shape: {_x.shape}")

        return _x

class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, encoder attention, feed-forward.

    Each of the three sub-layers is wrapped in its own residual +
    layer-normalisation connection.

    Args:
        num_heads: Number of attention heads.
        key_dim: Size of each attention head for query and key.
        _d_model: Output width of the feed-forward block.
        _ffn_units: Hidden width of the feed-forward block.
    """

    def __init__(self, num_heads, key_dim, _d_model, _ffn_units):
        super().__init__()

        self.masked_multi_head_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)
        self.multi_head_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)
        self.feed_forward_layer = FeedForwardLayer(_d_model=_d_model, ffn_units=_ffn_units)

        # Three separate instances so the residual connections do not share
        # their LayerNormalization weights.
        self.subconn_layers = [SubLayerConnection() for _ in range(3)]

    def call(self, _inputs, _encoder_output, _mask, training=None):
        """Run the three decoder sub-layers.

        Args:
            _inputs: Decoder-side input; query, key and value of the masked
                self-attention.
            _encoder_output: Encoder output; key and value of the second
                attention.
            _mask: Attention mask for the masked self-attention.
            training: Optional Keras training flag. Forwarded to both attention
                layers; previously one was hard-coded to ``True`` and the other
                to ``False``.

        Returns:
            Output tensor of the third residual connection.
        """
        print(f"query1: _inputs shape: {_inputs.shape}")
        masked_multihead_attention = self.masked_multi_head_attention(query=_inputs,
                                                                      key=_inputs,
                                                                      value=_inputs,
                                                                      attention_mask=_mask,
                                                                      return_attention_scores=False,
                                                                      training=training)
        l1 = self.subconn_layers[0](_input=masked_multihead_attention,
                                    _residual_input=_inputs)
        print(f"masked_multihead_attention shape: {masked_multihead_attention.shape}")
        print(f"_inputs shape: {_inputs.shape}")
        print(f"query2: decoder l1 shape: {l1.shape}")
        print(f"key2, val2: decoder enc_out shape: {_encoder_output.shape}")
        multi_head_attention = self.multi_head_attention(key=_encoder_output,
                                                         value=_encoder_output,
                                                         query=l1,
                                                         return_attention_scores=False,
                                                         training=training)
        l2 = self.subconn_layers[1](_input=multi_head_attention,
                                    _residual_input=l1)
        feed_forward = self.feed_forward_layer(_inputs=l2)

        l3 = self.subconn_layers[2](_input=feed_forward,
                                    _residual_input=l2)

        return l3


class Decoder(tf.keras.layers.Layer):
    """Stack of decoder layers applied to scaled, positionally encoded inputs.

    Args:
        _num_heads: Number of attention heads per decoder layer.
        _key_dim: Size of each attention head.
        _seq_len: Sequence length used for the positional encoding.
        _d_model: Model width (scaling factor and feed-forward output size).
        _ffn_units: Hidden width of each feed-forward block.
        num_layers: Number of stacked decoder layers (defaults to the
            previously hard-coded 6).
    """

    def __init__(self, _num_heads, _key_dim, _seq_len, _d_model, _ffn_units, num_layers=6):
        super().__init__()

        self.seq_len = _seq_len
        self.d_model = _d_model

        self.positional_encoding_layer = PositionalEncodingLayer()

        # One fresh DecoderLayer per level so the levels do not share weights.
        self.decoder_layers = [
            DecoderLayer(num_heads=_num_heads,
                         key_dim=_key_dim,
                         _d_model=self.d_model,
                         _ffn_units=_ffn_units)
            for _ in range(num_layers)
        ]

    def call(self, _inputs, _encoder_output, _mask):
        """Scale and encode ``_inputs``, then run every decoder layer in order.

        Args:
            _inputs: Decoder-side input embeddings.
            _encoder_output: Encoder output attended to by every layer.
            _mask: Attention mask for the masked self-attention.

        Returns:
            Output of the last decoder layer (the positionally encoded inputs
            when the stack is empty).
        """
        # Scale embeddings by sqrt of d_model
        scaled_inputs = _inputs * tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        # Positional encoding; the 'encoder' mode adds the table to the inputs.
        pos_encoded_inputs = self.positional_encoding_layer(_inputs=scaled_inputs,
                                                            _d_model=self.d_model,
                                                            _seq_len=self.seq_len,
                                                            _type='encoder')
        print(f"decoder pos_encoded_inputs shape: {pos_encoded_inputs.shape}")

        _x = pos_encoded_inputs
        for _layer in self.decoder_layers:
            _x = _layer(_inputs=_x,
                        _encoder_output=_encoder_output,
                        _mask=_mask)

        return _x

class AttentionBasedTransformer(tf.keras.Model):
    """Encoder/decoder transformer ending in a two-unit sigmoid head.

    The decoder is fed rows of a trainable "memory" embedding rather than a
    target sequence; its flattened output goes through a one-unit linear layer
    and a two-unit sigmoid layer.

    Args:
        _embedding_size: Forwarded to the encoder.
        _key_dim: Size of each attention head.
        _num_heads: Number of attention heads.
        _mask: Stored on the instance as ``self.mask``; not used by ``call``.
        _batch_size: Batch size given to the input layer.
        _input_shape: Shape given to the input layer; its first entry is the
            ``input_dim`` of the memory embedding.
        _mem_matrix: Initial weights of the memory embedding.
        _seq_len: Sequence length.
        _d_model: Model width.
        _embedding_layer: Optional embedding forwarded to the encoder.
    """

    def __init__(self, _embedding_size, _key_dim, _num_heads, _mask, _batch_size, _input_shape, _mem_matrix, _seq_len, _d_model, _embedding_layer=None):
        super().__init__()

        self.count = 0  # incremented each time the Python body of call() runs

        self.inputs_shape = _input_shape
        self.seq_len = _seq_len
        self.d_model = _d_model
        self.ffn_units = 64

        self.inputs = tf.keras.layers.InputLayer(input_shape=_input_shape, batch_size=_batch_size)

        self.encoder = Encoder(_embedding_size=_embedding_size,
                               _key_dim=_key_dim,
                               _num_heads=_num_heads,
                               _seq_len=self.seq_len,
                               _d_model=self.d_model,
                               _ffn_units=self.ffn_units,
                               _embedding_layer=_embedding_layer)
        self.mask = _mask

        print(f"_input_shape[0]: {_input_shape[0]}")

        self.memory_layer = tf.keras.layers.Embedding(input_dim=_input_shape[0],
                                                      output_dim=self.d_model,
                                                      trainable=True,
                                                      weights=[_mem_matrix],
                                                      input_length=self.seq_len)

        # NOTE(review): every index fed to memory_layer is 1, so only row 1 of
        # the memory matrix is ever looked up, and the leading dimension is the
        # memory matrix row count rather than the batch size. Confirm intent.
        self.ones_matrix = tf.cast(tf.ones((_mem_matrix.shape[0], self.seq_len)), tf.float32)

        self.decoder = Decoder(_key_dim=_key_dim,
                               _num_heads=_num_heads,
                               _seq_len=self.seq_len,
                               _d_model=self.d_model,
                               _ffn_units=self.ffn_units)

        self.flatten_layer = tf.keras.layers.Flatten()

        # Collapses the flattened decoder output to a single unit.
        self.linear = tf.keras.layers.Dense(units=1, activation='linear')
        # Two sigmoid units; presumably one per column of the two-column
        # labels used by the training code (confirm against the data).
        self.sigmoid = tf.keras.layers.Dense(units=2, activation='sigmoid')

    def create_padding_mask(self, seq):  # seq: (batch_size, seq_length)
        """Return an attention mask that hides padding (id 0) positions.

        Keras ``MultiHeadAttention`` treats 1 as "attend" and 0 as "ignore",
        so real tokens map to 1 and padding to 0.

        Args:
            seq: Token-id tensor of shape (batch_size, seq_length).

        Returns:
            Float32 tensor of shape (batch_size, 1, 1, seq_length).
        """
        mask = tf.cast(tf.math.not_equal(seq, 0), tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self):
        """Return the causal mask for decoder self-attention.

        Lower-triangular ones: position ``i`` may attend to positions
        ``<= i`` (1 = attend, 0 = ignore, as Keras expects).

        Returns:
            Float tensor of shape (seq_len, seq_len).
        """
        return tf.linalg.band_part(tf.ones((self.seq_len, self.seq_len)), -1, 0)

    def call(self, _inputs):
        """Run the full forward pass.

        Args:
            _inputs: Batch of encoder inputs.

        Returns:
            Output of the final two-unit sigmoid layer.
        """
        tf.print(f"Pass: {self.count}")
        _input = self.inputs(_inputs)
        print(f"encoder input shape: {_input.shape}")

        _enc_mask = self.create_padding_mask(seq=_input)

        _encoder_output = self.encoder(_inputs=_input, _mask=_enc_mask)

        print(f"_encoder_output shape: {_encoder_output.shape}")

        _dec_mask = self.create_look_ahead_mask()

        _mem_matrix = self.memory_layer(self.ones_matrix)

        _decoder_output = self.decoder(_inputs=_mem_matrix,
                                       _encoder_output=_encoder_output,
                                       _mask=_dec_mask)

        print(f"_decoder_output shape: {_decoder_output.shape}")

        flattened = self.flatten_layer(_decoder_output)

        print(f"flattened shape: {flattened.shape}")

        _linear = self.linear(flattened)

        output = self.sigmoid(_linear)

        tf.print(f"End of Pass: {self.count}")

        print(f"model output shape: {output.shape}")

        self.count += 1

        return output


In [6]:
@tf.function
def apply_gradients(optimizer, loss, model, labels, _inputs):
    """Run one optimisation step and return the predictions and the loss.

    Args:
        optimizer: Optimizer exposing ``apply_gradients``.
        loss: Callable invoked as ``loss(y_true=..., y_pred=...)``.
        model: Model invoked as ``model(_inputs=...)``; must expose
            ``trainable_weights``.
        labels: Target values for the batch.
        _inputs: Input batch.

    Returns:
        Tuple ``(predictions, loss_value)`` for the batch.
    """
    with tf.GradientTape() as grad_tape:
        predictions = model(_inputs=_inputs)
        batch_loss = loss(y_true=labels, y_pred=predictions)

    grads = grad_tape.gradient(batch_loss, model.trainable_weights)
    grads_and_weights = zip(grads, model.trainable_weights)
    optimizer.apply_gradients(grads_and_weights)

    return predictions, batch_loss

def train_one_epoch(train, train_acc_metric, optimizer, loss, model):
    """Train ``model`` for one pass over ``train`` and return per-step losses.

    Args:
        train: Iterable of ``(x_batch, y_batch)`` pairs.
        train_acc_metric: Callable metric updated with ``(y_batch, predictions)``.
        optimizer: Optimizer forwarded to ``apply_gradients``.
        loss: Loss callable forwarded to ``apply_gradients``.
        model: Model forwarded to ``apply_gradients``.

    Returns:
        List with the loss value of every step, in order.
    """
    # Use the reported length when there is one; only count by iterating when
    # the iterable has no usable len(). This avoids materialising every batch
    # in memory just to size the progress bar.
    try:
        total_steps = len(train)
    except TypeError:
        total_steps = sum(1 for _ in train)

    losses = []
    # Context manager guarantees the bar is closed even if a step raises.
    with tqdm.tqdm(total=total_steps, position=0, leave=True,
                   bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}') as pbar:
        for step, (x_batch_train, y_batch_train) in enumerate(train):
            print(f"step: {step}")
            logits, loss_value = apply_gradients(optimizer=optimizer,
                                                 loss=loss,
                                                 labels=y_batch_train,
                                                 model=model,
                                                 _inputs=x_batch_train)
            losses.append(loss_value)
            train_acc_metric(y_batch_train, logits)
            pbar.set_description("Training loss: %.4f" % (float(loss_value)))
            pbar.update()
    return losses


def perform_validation(test, model, loss, val_acc_metric):
    """Evaluate ``model`` on ``test`` and return the per-batch losses.

    Args:
        test: Iterable of ``(x_val, y_val)`` pairs.
        model: Callable mapping an input batch to predictions.
        loss: Callable invoked as ``loss(y_true=..., y_pred=...)``.
        val_acc_metric: Callable metric updated with ``(y_val, predictions)``.

    Returns:
        List with one loss value per batch, in iteration order.
    """
    losses = []
    for x_val, y_val in test:
        # Only the inputs go to the model; the labels were previously passed
        # as a second positional argument by mistake.
        val_logits = model(x_val)
        losses.append(loss(y_true=y_val, y_pred=val_logits))
        val_acc_metric(y_val, val_logits)

    return losses

def train_n_epochs(train, test, loss, optimizer, _model, epochs, train_acc_metric, val_acc_metric):
    """Train and validate ``_model`` for ``epochs`` epochs with the custom loop.

    After every epoch the mean losses and both metric results are printed and
    the metrics are reset.

    Args:
        train: Training batches, forwarded to ``train_one_epoch``.
        test: Validation batches, forwarded to ``perform_validation``.
        loss: Loss callable.
        optimizer: Optimizer used for the training steps.
        _model: Model to train.
        epochs: Number of epochs to run.
        train_acc_metric: Metric tracking training accuracy.
        val_acc_metric: Metric tracking validation accuracy.

    Returns:
        The trained ``_model``.
    """
    for epoch in range(1, epochs + 1):
        print(f'Epoch: {epoch}')
        losses_train = train_one_epoch(train=train,
                                       train_acc_metric=train_acc_metric,
                                       optimizer=optimizer,
                                       loss=loss,
                                       model=_model)
        losses_val = perform_validation(test=test,
                                        model=_model,
                                        loss=loss,
                                        val_acc_metric=val_acc_metric)
        mean_train_loss = np.mean(losses_train)
        mean_val_loss = np.mean(losses_val)
        # The summary uses the same 1-based epoch number as the header above.
        print('\n Epoch %s: Train loss: %.4f  Validation Loss: %.4f, Train Accuracy: %.4f, Validation Accuracy %.4f' % (epoch, float(mean_train_loss), float(mean_val_loss), float(train_acc_metric.result()), float(val_acc_metric.result())))
        # Reset so the next epoch's accuracy is not accumulated with this one.
        train_acc_metric.reset_states()
        val_acc_metric.reset_states()
    return _model

def compile_run_transformer(hyper_param_dict=None):
    """Build, compile, fit and save an ``AttentionBasedTransformer``.

    Reads the module-level ``_x_train``, ``_y_train``, ``_x_test``, ``_y_test``
    and ``results_folder``.

    Args:
        hyper_param_dict: Optional overrides. Recognised keys:
            ``sequence_length``, ``input_shape``, ``embedding_size``,
            ``accuracy``, ``optimizer``, ``loss``, ``epochs``, ``batch_size``.
            A ``tokenizer`` key is tolerated but has no effect.

    Returns:
        The fitted model.
    """
    params = {} if hyper_param_dict is None else hyper_param_dict

    print(f'_x_train.shape: {_x_train.shape}')

    # Hyper-parameters: supplied value if present, otherwise the default.
    _embedding_size = params.get('embedding_size', 300)
    _optimizer = params.get('optimizer', 'adam')
    _loss = params.get('loss', 'binary_crossentropy')
    _accuracy = params.get('accuracy', 'binary_accuracy')
    _epochs = params.get('epochs', 1)
    _batch_size = params.get('batch_size', 64)
    # 'sequence_length' used to be read and then ignored in favour of a
    # hard-coded 39; it now drives the model's sequence length.
    _seq_len = params.get('sequence_length', 39)
    input_shape = params.get('input_shape', _x_train.shape)
    _num_heads = 10

    # Row-wise boolean mask: a row is all True when the 2nd label column is
    # truthy (marks spam), otherwise all False.
    row_width = len(_x_train[0])
    _y_mask = [[bool(_label[1])] * row_width for _label in _y_train]
    _mask = tf.boolean_mask(_x_train, mask=np.array(_y_mask), axis=None, name='boolean_mask')

    # Random "memory" fed to the decoder instead of a target sequence, since
    # the model predicts binary classes rather than sequences.
    memory_matrix = np.random.randn(input_shape[0], _embedding_size)

    # Labels repeated along a sequence axis; only needed by the custom
    # training loop kept below for reference.
    y_train_input = np.zeros((input_shape[0], _seq_len, 2))
    y_test_input = np.zeros((_y_test.shape[0], _seq_len, 2))
    y_train_input[:] = _y_train[:, np.newaxis, :]
    y_test_input[:] = _y_test[:, np.newaxis, :]

    print(f"y_train_input shape: {y_train_input.shape}")

    # Reset Keras global state before the model is built.
    tf.keras.backend.clear_session()

    trfmer = AttentionBasedTransformer(_embedding_size=_embedding_size,
                                       _num_heads=_num_heads,
                                       _mask=_mask,
                                       _key_dim=int(_embedding_size/_num_heads),
                                       _batch_size=_batch_size,
                                       _input_shape=input_shape,
                                       _mem_matrix=memory_matrix,
                                       _d_model=_embedding_size,
                                       _seq_len=_seq_len)
    print(f"_y_train shape: {_y_train.shape}")
    _train = tf.data.Dataset.from_tensor_slices((_x_train[np.newaxis, :, :], y_train_input[np.newaxis, :, :]))
    _test = tf.data.Dataset.from_tensor_slices((_x_test[np.newaxis, :, :], y_test_input[np.newaxis, :, :]))

    # Alternative custom training loop (disabled):
    # train_n_epochs(_model=trfmer,
    #                epochs=1,
    #                loss=tf.keras.losses.BinaryCrossentropy(),
    #                optimizer=tf.keras.optimizers.Adam(),
    #                train=_train,
    #                test=_test,
    #                train_acc_metric=tf.keras.metrics.BinaryAccuracy(
    #                    name='binary_accuracy', dtype=None, threshold=0.5
    #                ),
    #                val_acc_metric=tf.keras.metrics.BinaryAccuracy(
    #                    name='binary_accuracy', dtype=None, threshold=0.5
    #                ))

    # initialize our TensorBoard callback for better visualization
    tensorboard = tf.keras.callbacks.TensorBoard(f"logs/spam_classifier_{time.time()}")

    trfmer.compile(
        optimizer=_optimizer,
        loss=_loss,
        metrics=[_accuracy, tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]  # maybe define custom f1 score
    )

    # train the model; batch_size is passed so the configured value (also
    # embedded in the saved model's name) is the one actually used
    trfmer.fit(_x_train,
               _y_train,
               validation_data=(_x_test, _y_test),
               epochs=_epochs,
               batch_size=_batch_size,
               callbacks=[tensorboard],
               verbose=1)

    trfmer.summary()

    trfmer.save(results_folder / f'trfmer_model_{_optimizer}_opt_{_loss}_loss_{_epochs}_epochs_{_embedding_size}_embedSize_{_batch_size}_batchSize')

    return trfmer


In [None]:
# Run the full build/train/save pipeline with the default hyper-parameters.
compile_run_transformer()
