# Original Transformer Testing

Author: Kara Ponder (SLAC)

This notebook was used to test and build the transformer model. Its contents have been put in a Python script, `transformer.py`, but I'm including this notebook in the repo in case further model development is needed.

This code was originally based on a Tensorflow Transformer example: https://www.tensorflow.org/text/tutorials/transformer


In [None]:
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import time
%matplotlib inline

In [None]:
# Set parameters
# Model dimensions
d_model = 128  # input vector must have length d_model; width used by every encoder/decoder layer
target_vocab_size = 6  # possible results to choose from; also the output size of the final Dense layer

lc_length = 100 +1 # light curve length; passed to the encoder/decoder as the maximum positional-encoding length
input_vocab_size = lc_length  # not referenced in the cells shown here

## hyperparameters:
num_layers = 8  # number of stacked encoder layers and of stacked decoder layers
dropout_rate = 0.0  # 0.0 means the Dropout layers drop nothing
dff = 64 # hidden layer size of the feed forward network, needs to be larger than 24
num_heads = 8 # d_model % num_heads == 0

# LC stuff
N = 10000 # number of objects
N_days = 100 + 1  # bounds the autoregressive prediction loops further down
Nf = 6 # number of filters
num_classes = 4  # not referenced in the cells shown here


# Training settings
batch_size = 64
EPOCHS = 5

The model was tested using the simplified data in the repository.

In [None]:
# Load the three example arrays from .npy files in the working directory.
lc_data = np.load('lc_data.npy')
wgt_map = np.load('weightmap.npy')
real_lc_data = np.load('real_lc.npy')

# Slice the arrays along their first axis into aligned
# (lc_data, real_lc_data, wgt_map) examples, then group them into batches.
dataset = tf.data.Dataset.from_tensor_slices((lc_data, real_lc_data, wgt_map))
batch_ds = dataset.batch(batch_size)

These functions below are based on the Transformer tutorial but have been modified to work with light curve data. 

In [None]:
# Positional Encoding
def get_angles(pos, i, d_model):
    """Return the positional-encoding angles for positions and dimensions.

    Each angle is ``pos / 10000 ** (2 * (i // 2) / d_model)``, so every
    even/odd pair of dimension indices shares one frequency.

    Args:
        pos: position indices (broadcastable against ``i``).
        i: dimension indices (broadcastable against ``pos``).
        d_model: model width used to normalise the exponent.

    Returns:
        Array of angles with the broadcast shape of ``pos`` and ``i``.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    rates = 1 / np.power(10000, exponent)
    return pos * rates

def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to encode.
        d_model: number of encoding dimensions (columns).

    Returns:
        float32 tensor of shape (1, position, d_model); even columns hold
        sines and odd columns hold cosines of the angles from ``get_angles``.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    column_dims = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, column_dims, d_model)

    # Even columns (2i) take the sine, odd columns (2i+1) the cosine.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Leading axis of size 1 lets the table broadcast over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [None]:
# MultiHeaded Attention
def scaled_dot_product_attention(q, k, v, mask=None):
    """Calculate the attention weights and the attended values.

    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type (padding or look
    ahead) but it must be broadcastable for addition.

    Args:
        q: query shape == (..., seq_len_q, depth)
        k: key shape == (..., seq_len_k, depth)
        v: value shape == (..., seq_len_v, depth_v)
        mask: Float tensor with shape broadcastable
              to (..., seq_len_q, seq_len_k). Entries equal to 1 are pushed
              to a very negative logit so they receive ~0 attention.
              Defaults to None (no masking).

    Returns:
        output, attention_weights
    """
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # Scale by sqrt(depth) of the keys.
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # Add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    ``d_model`` is split evenly across ``num_heads`` heads, each of width
    ``depth = d_model // num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        # Every head must receive the same number of features.
        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        # Linear projections for queries, keys and values.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Output projection applied after the heads are merged again.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch_size, seq_len, d_model) into per-head slices.

        Returns a tensor of shape (batch_size, num_heads, seq_len, depth).
        """
        per_head = tf.reshape(
            x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Project v, k, q, attend per head, then merge the heads.

        Returns:
            (output, attention_weights) where output has shape
            (batch_size, seq_len_q, d_model) and attention_weights has shape
            (batch_size, num_heads, seq_len_q, seq_len_k).
        """
        n_batch = tf.shape(q)[0]

        # Project, then split into heads: (batch_size, num_heads, seq_len, depth)
        q_heads = self.split_heads(self.wq(q), n_batch)
        k_heads = self.split_heads(self.wk(k), n_batch)
        v_heads = self.split_heads(self.wv(v), n_batch)

        attended, attention_weights = scaled_dot_product_attention(
            q_heads, k_heads, v_heads, mask)

        # Back to (batch_size, seq_len_q, num_heads, depth) ...
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        # ... and merge the heads: (batch_size, seq_len_q, d_model)
        merged = tf.reshape(attended, (n_batch, -1, self.d_model))

        return self.dense(merged), attention_weights


In [None]:
def point_wise_feed_forward_network(d_model, dff):
    """Return the two-layer feed-forward sub-network of a transformer block.

    Args:
        d_model: output width of the second (linear) Dense layer.
        dff: width of the first Dense layer, which uses a ReLU activation.

    Returns:
        A ``tf.keras.Sequential`` of Dense(dff, relu) followed by Dense(d_model).
    """
    hidden = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
    projection = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([hidden, projection])

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention then feed-forward.

    Each sub-block is followed by dropout, a residual connection and layer
    normalization.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Apply the block to ``x``; output is (batch_size, input_seq_len, d_model)."""
        # Self-attention sub-block (attention weights are discarded).
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        attn_residual = self.layernorm1(x + attended)

        # Feed-forward sub-block.
        transformed = self.ffn(attn_residual)
        transformed = self.dropout2(transformed, training=training)
        return self.layernorm2(attn_residual + transformed)


In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: self-attention, encoder attention, feed-forward.

    Each of the three sub-blocks is followed by dropout, a residual
    connection and layer normalization.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Apply the block.

        Args:
            x: decoder input, (batch_size, target_seq_len, d_model).
            enc_output: encoder output, (batch_size, input_seq_len, d_model).
            training: forwarded to the dropout layers.
            look_ahead_mask: mask for the decoder self-attention.
            padding_mask: mask for the attention over ``enc_output``.

        Returns:
            (output, self_attention_weights, encoder_attention_weights).
        """
        # 1) Masked self-attention over the decoder input.
        self_attn, self_attn_weights = self.mha1(x, x, x, look_ahead_mask)
        self_attn = self.dropout1(self_attn, training=training)
        after_self = self.layernorm1(self_attn + x)

        # 2) Attention over the encoder output: values and keys come from the
        #    encoder, queries from the decoder state.
        cross_attn, cross_attn_weights = self.mha2(
            enc_output, enc_output, after_self, padding_mask)
        cross_attn = self.dropout2(cross_attn, training=training)
        after_cross = self.layernorm2(cross_attn + after_self)

        # 3) Position-wise feed-forward network.
        transformed = self.ffn(after_cross)
        transformed = self.dropout3(transformed, training=training)
        result = self.layernorm3(transformed + after_cross)

        return result, self_attn_weights, cross_attn_weights

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Transformer encoder: optional linear embedding, positional encoding
    and a stack of ``EncoderLayer`` blocks.

    Args:
        num_layers: number of stacked ``EncoderLayer`` blocks.
        d_model: model width.
        num_heads: attention heads per block (must divide ``d_model``).
        dff: hidden width of each block's feed-forward network.
        maximum_position_encoding: number of positions in the precomputed
            positional-encoding table; inputs must not be longer than this.
        rate: dropout rate.
        embed: if True, inputs are first projected to ``d_model`` features
            by a Dense layer; otherwise they are used as given.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``,
            ``trainable``, ``dtype``). Needed so that
            ``Encoder.from_config(encoder.get_config())`` works, because
            ``get_config`` includes those base-layer entries.
    """

    def __init__(self, num_layers, d_model, num_heads, dff,
                 maximum_position_encoding, rate=0.1, embed=False, **kwargs):
        super().__init__(**kwargs)

        self.num_layers = num_layers
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.maximum_position_encoding = maximum_position_encoding
        self.rate = rate
        self.embed = embed

        if self.embed:
            self.embedding = tf.keras.layers.Dense(self.d_model)  # linear embedding

        self.pos_encoding = positional_encoding(self.maximum_position_encoding,
                                                self.d_model)

        self.enc_layers = [EncoderLayer(self.d_model, self.num_heads, self.dff, self.rate)
                           for _ in range(self.num_layers)]

        self.dropout = tf.keras.layers.Dropout(self.rate)

    def call(self, x, training):
        """Encode ``x``.

        Args:
            x: input of shape (batch_size, input_seq_len, features).
            training: forwarded to the dropout layers and encoder blocks.

        Returns:
            Tensor of shape (batch_size, input_seq_len, d_model).
        """
        # The padding mask is derived from the raw input: a time step whose
        # first feature equals 0 is treated as padding.
        mask = create_padding_mask(x[:, :, 0])

        if self.embed:
            x = self.embedding(x)
        seq_len = tf.shape(x)[1]

        # Cast before the arithmetic so non-float32 inputs do not hit a
        # dtype mismatch against the float32 scale and positional table.
        x = tf.cast(x, dtype=tf.float32)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)

        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config().copy()
        config.update({'num_layers': self.num_layers,
                       'd_model': self.d_model,
                       'num_heads': self.num_heads,
                       'dff': self.dff,
                       'maximum_position_encoding': self.maximum_position_encoding,
                       'rate': self.rate,
                       'embed': self.embed,
                      })
        return config

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Transformer decoder: optional linear embedding, positional encoding
    and a stack of ``DecoderLayer`` blocks.

    Args:
        num_layers: number of stacked ``DecoderLayer`` blocks.
        d_model: model width.
        num_heads: attention heads per block (must divide ``d_model``).
        dff: hidden width of each block's feed-forward network.
        target_vocab_size: stored and reported by ``get_config``; not used
            by the decoder itself.
        maximum_position_encoding: number of positions in the precomputed
            positional-encoding table; inputs must not be longer than this.
        rate: dropout rate.
        embed: if True, inputs are first projected to ``d_model`` features
            by a Dense layer; otherwise they are used as given.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``,
            ``trainable``, ``dtype``). Needed so that
            ``Decoder.from_config(decoder.get_config())`` works, because
            ``get_config`` includes those base-layer entries.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, rate=0.1, embed=False, **kwargs):
        super().__init__(**kwargs)

        self.num_layers = num_layers
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.target_vocab_size = target_vocab_size
        self.maximum_position_encoding = maximum_position_encoding
        self.rate = rate
        self.embed = embed

        if self.embed:
            self.embedding = tf.keras.layers.Dense(self.d_model)  # linear embedding

        self.pos_encoding = positional_encoding(self.maximum_position_encoding, self.d_model)

        self.dec_layers = [DecoderLayer(self.d_model, self.num_heads, self.dff, self.rate)
                           for _ in range(self.num_layers)]

        self.dropout = tf.keras.layers.Dropout(self.rate)

    def call(self, x, enc_output, training, mask):
        """Decode ``x`` while attending to ``enc_output``.

        Args:
            x: decoder input of shape (batch_size, target_seq_len, features).
            enc_output: encoder output, (batch_size, input_seq_len, d_model).
            training: forwarded to the dropout layers and decoder blocks.
            mask: pair ``(look_ahead_mask, padding_mask)``; element 0 masks
                the decoder self-attention and element 1 masks the attention
                over ``enc_output``.

        Returns:
            Tensor of shape (batch_size, target_seq_len, d_model).
        """
        seq_len = tf.shape(x)[1]
        look_ahead_mask, padding_mask = mask[0], mask[1]

        # Collected per layer for inspection; not part of the return value.
        attention_weights = {}

        if self.embed:
            x = self.embedding(x)

        # Cast before the arithmetic so non-float32 inputs do not hit a
        # dtype mismatch against the float32 scale and positional table.
        x = tf.cast(x, dtype=tf.float32)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)
        for layer_number, dec_layer in enumerate(self.dec_layers, start=1):
            x, block1, block2 = dec_layer(x, enc_output, training,
                                          look_ahead_mask, padding_mask)
            # Record inside the loop so every layer is kept (and nothing is
            # referenced when there are no layers at all).
            attention_weights['decoder_layer{}_block1'.format(layer_number)] = block1
            attention_weights['decoder_layer{}_block2'.format(layer_number)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config().copy()
        config.update({'num_layers': self.num_layers,
                       'd_model': self.d_model,
                       'num_heads': self.num_heads,
                       'dff': self.dff,
                       'target_vocab_size': self.target_vocab_size,
                       'maximum_position_encoding': self.maximum_position_encoding,
                       'rate': self.rate,
                       'embed': self.embed,
                      })
        return config

In [None]:
# Masking
def create_look_ahead_mask(size):
    """Return a (size, size) mask with 1 strictly above the diagonal, else 0.

    A 1 at [row, col] marks a position ``col`` that lies after ``row``.
    """
    on_or_below_diagonal = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - on_or_below_diagonal  # (seq_len, seq_len)

def create_padding_mask(seq):
    """Flag entries of ``seq`` that equal 0.0 as padding.

    Args:
        seq: tensor of shape (batch_size, seq_len).

    Returns:
        float32 tensor of shape (batch_size, 1, 1, seq_len) holding 1.0
        where ``seq`` is 0.0 and 0.0 elsewhere.
    """
    is_padding = tf.math.equal(seq, 0.0)
    padding_flags = tf.cast(is_padding, tf.float32)

    # The two singleton axes let the mask broadcast against the attention
    # logits when it is added to them.
    return padding_flags[:, tf.newaxis, tf.newaxis, :]

def create_masks(inp, tar):
    """Build the encoder, decoder self-attention and decoder padding masks.

    Args:
        inp: encoder input, shape (batch_size, input_seq_len).
        tar: decoder input, shape (batch_size, target_seq_len).

    Returns:
        (enc_padding_mask, combined_mask, dec_padding_mask)
    """
    # Padding mask for the encoder's own attention.
    enc_padding_mask = create_padding_mask(inp)

    # Padding mask for the decoder's second attention block, which attends
    # over the encoder outputs (hence it is also built from ``inp``).
    dec_padding_mask = create_padding_mask(inp)

    # Mask for the decoder's first attention block: a position is hidden if
    # it is padding in ``tar`` or lies in the future.
    future_mask = create_look_ahead_mask(tf.shape(tar)[1])
    target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(target_padding_mask, future_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

def create_decoder_masks(inp, tar):
    """Build the two masks the decoder needs from 3-D inputs.

    Only feature 0 along the last axis of each input is inspected when
    looking for padding.

    Args:
        inp: encoder input, indexed as [batch, time, feature].
        tar: decoder input, indexed as [batch, time, feature].

    Returns:
        (combined_mask, dec_padding_mask)
    """
    inp_first_feature = inp[:, :, 0]
    tar_first_feature = tar[:, :, 0]

    # Padding mask for the decoder's second attention block, which attends
    # over the encoder outputs.
    dec_padding_mask = create_padding_mask(inp_first_feature)

    # Mask for the decoder's first attention block: a position is hidden if
    # it is padding in the target or lies in the future.
    future_mask = create_look_ahead_mask(tf.shape(tar_first_feature)[1])
    target_padding_mask = create_padding_mask(tar_first_feature)
    combined_mask = tf.maximum(target_padding_mask, future_mask)

    return combined_mask, dec_padding_mask

## All functions are now defined, so we can build and compile the model.

In [None]:
# Adam optimizer with a learning rate of 1e-5; passed to model.compile below.
optimizer = tf.keras.optimizers.Adam(0.00001)
# NOTE: loss_object is not used in the cells shown here; the model is
# compiled with the custom RMSE loss instead.
loss_object = tf.keras.losses.MeanSquaredError() #tf.keras.losses.MeanAbsoluteError() #

class RMSE(tf.keras.losses.Loss):
    """Root-mean-square error taken over every element of the batch.

    Args:
        name: name given to the loss instance.
    """

    def __init__(self, name="rmse"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        """Return ``sqrt(mean((y_true - y_pred) ** 2))`` as a scalar tensor.

        Args:
            y_true: target values, same shape as ``y_pred``.
            y_pred: predicted values.
        """
        # Align dtypes so the subtraction cannot fail on mixed float types
        # (e.g. float64 targets loaded by NumPy vs. float32 predictions).
        y_pred = tf.convert_to_tensor(y_pred)
        y_true = tf.cast(y_true, y_pred.dtype)

        mse = tf.math.reduce_mean(tf.square(y_true - y_pred))
        return tf.math.sqrt(mse)


In [None]:
# Instantiate the encoder and decoder stacks with the hyperparameters set
# above. lc_length is the maximum positional-encoding length, and embed=True
# makes each stack project its inputs to d_model features with a Dense layer.
encoder = Encoder(num_layers, d_model, num_heads, dff,
                       lc_length, dropout_rate, embed=True)

decoder = Decoder(num_layers, d_model, num_heads, dff,
                        target_vocab_size, lc_length, dropout_rate, embed=True)

# Maps the decoder output (d_model features) to target_vocab_size values.
final_layer = tf.keras.layers.Dense(target_vocab_size)

In [None]:
# Symbolic inputs: variable-length sequences with 6 features per step.
inp = tf.keras.layers.Input(shape=(None,6))#shape=(None,None))# # None if d_model=6 with no embedding, 6 if using embedding
target = tf.keras.layers.Input(shape=(None,6))#shape=(None,None))#
wgts = tf.keras.layers.Input(shape=(None,6))

# Encoder -> decoder -> final projection. The decoder receives its
# (look-ahead + padding, padding) mask pair through the `mask` argument.
x = encoder(inp)
x = decoder(target, x, mask=create_decoder_masks(inp, target))
x = final_layer(x)

# Element-wise product of the prediction with the weights input; wgts must
# therefore have the same sequence length as `target`.
mx = tf.keras.layers.Multiply()([x, wgts])

# The model takes three inputs, in this order: encoder input, decoder input, weights.
model = tf.keras.models.Model(inputs=[inp, target, wgts], outputs=mx) #[inp, target, wgts]

model.summary()

In [None]:
# Compile with the Adam optimizer and the custom RMSE loss defined above.
model.compile(optimizer=optimizer, loss=RMSE(),
             )

Either load previously saved weights below, or fit the model again.

In [None]:
#model.load_weights('some/saved/weights/transformer.h5')   


In [None]:
# Count the batches in one pass over the dataset. (Keeping the last
# `enumerate` index instead would be one short, so every epoch would skip
# a batch when this value is used as steps_per_epoch.)
num_batches = sum(1 for _ in batch_ds)
    
#num_batches_VALID = 0
#for (batch, (_,_)) in enumerate(batch_ds_VALID):
#    num_batches_VALID = batch


def generator(data_set):
    """Yield ``(inputs, target)`` training pairs from ``data_set`` forever.

    Each element of ``data_set`` is unpacked as (input, target, weight)
    batches; the second element is not used. For every element the
    generator yields:

        inputs = [full input batch,
                  input batch without its last time step,
                  weight batch without its first time step]
        target = input batch without its first time step

    so the decoder input and the target are the same sequence shifted by
    one step. Once ``data_set`` is exhausted it is iterated again.
    """
    while True:
        for encoder_in, _unused_target, weights in data_set:
            decoder_in = encoder_in[:, :-1, :]
            shifted_weights = weights[:, 1:, :]
            shifted_target = encoder_in[:, 1:, :]
            yield ([encoder_in, decoder_in, shifted_weights], shifted_target)

# Train from the endless generator. Because the generator never stops,
# steps_per_epoch is what ends each epoch.
history = model.fit(x = generator(batch_ds),
                    #validation_data = generator(batch_ds_VALID),
                    epochs=EPOCHS,
                    steps_per_epoch = num_batches,
                    #validation_steps = num_batches_VALID,
                    )

In [None]:
#model.save('/sdf/home/k/kap146/desc/transformer/testing_tfmodel.h5')
#model.save_weights('/sdf/home/k/kap146/desc/transformer/testing_tfmodel_weights.h5')

In [None]:
# Plot the training loss recorded in the History object returned by model.fit.
plt.plot(history.history['loss'])

Evaluate the model. 

In [None]:
## NOTE(review): this helper was previously reported not to work (it crashed
## the kernel). The model call below now supplies all three inputs the model
## was built with; re-test before relying on it.
#@tf.function
def evaluate(lc_data):
    """Autoregressively predict a light curve for a single object.

    Decoding starts from a single step filled with -1.0 and appends the
    model's newest predicted step ``N_days`` times.

    Args:
        lc_data: one light curve, indexed as [time, filter]; a batch axis
            is added internally.

    Returns:
        Tensor of shape (N_days + 1, Nf): the start step followed by the
        ``N_days`` predicted steps.
    """
    inp_lc = tf.expand_dims(lc_data, 0)
    decoder_input = tf.constant([[-1.0] * Nf])  # tf.multiply(-1., tf.ones(Nf, dtype=tf.float64)) #
    output = tf.expand_dims(decoder_input, 0)  # (1, 1, Nf)

    for _ in range(N_days):
        # The model was built with inputs [inp, target, wgts]; all-ones
        # weights leave the predictions unscaled.
        predictions = model([inp_lc, output, tf.ones_like(output)])  ## if batching may need predict..

        # Keep only the newest predicted step and append it to the sequence.
        predictions = predictions[:, -1:, :]
        output = tf.concat([output, predictions], axis=1)

    return tf.squeeze(output, axis=0)


# Take the light curve at index 1 as the example to predict, and add a
# leading batch axis of size 1.
check_lc = tf.constant(lc_data[1])
inp_lc = tf.expand_dims(check_lc, 0)

#start = time.time()
#pred2=evaluate(check_lc)

#print(time.time() - start)

In [None]:
start = time.time()

# Start token: one time step with every filter set to -1.0.
decoder_input = [[-1.0] * Nf]  ## or 0
output = tf.expand_dims(decoder_input, 0)  # (1, 1, Nf)

# The encoder input does not change between steps, so build it once.
encoder_input = lc_data[1][tf.newaxis, :, :]

for _ in range(N_days - 1):
    # The model was built with inputs [inp, target, wgts]; all-ones weights
    # leave the predictions unscaled.
    predictions = model([encoder_input, output, tf.ones_like(output)])  #.predict

    # Keep only the newest predicted step and append it to the sequence.
    predictions = predictions[:, -1:, :]  ## CHECKKK

    output = tf.concat([output, predictions], axis=1)

print(time.time() - start)  # 400s first time, # 6s after ## new:25s, 18s

In [None]:
i = 1

# Input light curve (first time step dropped), all filters.
plt.plot(lc_data[i][1:], 'ro', ls = '-.', lw = 2, alpha=0.4)

# Predicted light curve, one line per filter, skipping the start token.
# Only the first line carries a label so the legend has a single entry.
predicted = output[0][1:]
for band in range(predicted.shape[-1]):
    plt.plot(predicted[:, band], lw=2,
             label='predicted lc' if band == 0 else None)

# Reference ("model") light curve, all filters.
plt.plot(real_lc_data[i][:], 'k', ls = '--', lw = 2, alpha=0.4)
# Re-draw the first filter with a label to get a single legend entry.
# Assumes real_lc_data[i] is indexed [time, filter] like lc_data[i] - TODO confirm.
plt.plot(real_lc_data[i][:, 0], 'k', ls = '--', lw = 2, label='model lc') #, alpha=0.4)

plt.legend()

plt.xlabel('time')
plt.ylabel('brightness')