In [1]:
%run ./data.ipynb

                 0             1         2         3             4         5   \
0     -3.014600e-07  8.260300e-06 -0.000012 -0.000002 -1.438600e-06 -0.000021   
1      2.913200e-06 -5.247700e-06  0.000003 -0.000006  2.778900e-06 -0.000004   
2     -2.951700e-06 -3.184000e-06 -0.000016 -0.000001 -1.575300e-06  0.000017   
3     -1.322600e-06  8.820100e-06 -0.000016 -0.000005 -7.282900e-07  0.000004   
4     -6.836600e-08  5.666300e-07 -0.000026 -0.000006 -7.940600e-07  0.000013   
...             ...           ...       ...       ...           ...       ...   
58504 -9.982500e-06  1.854900e-06 -0.000021  0.000021 -5.910200e-06 -0.000098   
58505 -1.055400e-05  1.983500e-05  0.000031 -0.000018 -1.061600e-04  0.000292   
58506 -5.857700e-06  1.859400e-05 -0.000102 -0.000003  3.827500e-06  0.000117   
58507 -4.441100e-06  3.396900e-05 -0.000442  0.000005  6.500800e-06  0.000087   
58508 -8.853300e-06  5.225900e-05  0.000072  0.000010  3.795600e-06 -0.000032   

             6         7   

x_ar shape: (58509, 48)
x_val shape: (11702, 48)
x_test shape: (11702, 48)
x_train shape: (35105, 48)
y_ar shape: (58509, 11)
y_val shape: (11702, 11)
y_test shape: (11702, 11)
y_train shape: (35105, 11)


In [7]:
"""Create batch data sets from raw data."""

import tensorflow as tf


batch_size = 100

# Training pipeline: slice the arrays row-wise, shuffle, then batch.
# The shuffle buffer covers the whole training set.  A buffer smaller than
# the dataset (previously 1024 for ~35k rows) only mixes neighbouring rows,
# so batches would still follow the original row order of x_train.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(buffer_size=x_train.shape[0])
    .batch(batch_size)
)

# Validation pipeline: same batching, no shuffling.
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(batch_size)

print("train_dataset.element_spec: {}".format(train_dataset.element_spec))
print("val_dataset.element_spec: {}".format(val_dataset.element_spec))

train_dataset.element_spec: (TensorSpec(shape=(None, 48), dtype=tf.float64, name=None), TensorSpec(shape=(None, 11), dtype=tf.uint8, name=None))
val_dataset.element_spec: (TensorSpec(shape=(None, 48), dtype=tf.float64, name=None), TensorSpec(shape=(None, 11), dtype=tf.uint8, name=None))


In [None]:
"""Define loss and optimizer for training."""

# from_logits=False: the loss is computed from probabilities (softmax
# output), not from raw logits.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)

# Adam with a fixed learning rate of 1e-3.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [None]:
"""Define metrics for training and validation."""

# Running means used to aggregate the loss values reported during an epoch.
train_loss_metric = tf.keras.metrics.Mean(name="training_loss")
val_loss_metric = tf.keras.metrics.Mean(name="validation_loss")

# Accuracy trackers for the training and validation passes.
train_acc_metric = tf.keras.metrics.CategoricalAccuracy(name="training_accuracy")
val_acc_metric = tf.keras.metrics.CategoricalAccuracy(name="validation_accuracy")

In [None]:
"""Define MultiHeadAttention layer."""

import tensorflow as tf


def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute scaled dot-product attention.

    Two layouts are supported, selected by the rank of the inputs:

    * rank 4, ``(batch_size, num_heads, seq_len, depth)``: attention is taken
      over the sequence axis (``q @ k^T``).
    * rank 3, ``(batch_size, num_heads, depth)``: there is no sequence axis,
      so the logits are the per-head outer product of ``q`` and ``k`` and the
      softmax runs across the ``depth`` entries.

    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e. seq_len_k == seq_len_v.

    Args:
      q: query, shape == (batch_size, num_heads, (seq_len_q,) depth)
      k: key,   shape == (batch_size, num_heads, (seq_len_k,) depth)
      v: value, shape == (batch_size, num_heads, (seq_len_k,) depth)
      mask: optional tensor broadcastable to the logits. ``mask * -1e9`` is
            added to the scaled logits before the softmax (default: None).

    Returns:
      Tuple ``(output, attention_weights)``.
      rank 4: output (batch_size, num_heads, seq_len_q, depth),
              attention_weights (batch_size, num_heads, seq_len_q, seq_len_k).
      rank 3: output (batch_size, num_heads, depth, 1),
              attention_weights (batch_size, num_heads, depth, depth).

    Raises:
      NotImplementedError: if ``q`` or ``v`` is neither rank 3 nor rank 4.
    """
    q_rank = len(q.shape)

    if q_rank == 3:
        # Emulate an outer product per head:
        # (b, h, depth, 1) @ (b, h, 1, depth) -> (b, h, depth, depth)
        q_shape = tf.shape(q)
        k_shape = tf.shape(k)
        attention_logits = tf.matmul(
            tf.reshape(q, (q_shape[0], q_shape[1], q_shape[2], 1)),
            tf.reshape(k, (k_shape[0], k_shape[1], 1, k_shape[2])),
        )
    elif q_rank == 4:
        # The original tested an undefined name here, so this branch always
        # raised NameError; the rank of q is what selects the layout.
        # (b, h, seq_len_q, depth) @ (b, h, depth, seq_len_k)
        attention_logits = tf.matmul(q, k, transpose_b=True)
    else:
        raise NotImplementedError(
            "q must have rank 3 or 4, got rank {}".format(q_rank)
        )

    # Scale by sqrt(depth).  The divisor is cast to the dtype of the logits so
    # the division also works when the inputs are not float32.
    depth = tf.cast(tf.shape(k)[-1], attention_logits.dtype)
    scaled_attention_logits = attention_logits / tf.math.sqrt(depth)

    # Masked positions receive a large negative logit.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # Normalise over the last axis.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    v_rank = len(v.shape)
    if v_rank == 3:
        # Treat v as a column vector per head:
        # (b, h, depth, depth) @ (b, h, depth, 1) -> (b, h, depth, 1)
        v_shape = tf.shape(v)
        output = tf.matmul(
            attention_weights,
            tf.reshape(v, (v_shape[0], v_shape[1], v_shape[2], 1)),
        )
    elif v_rank == 4:
        # (b, h, seq_len_q, seq_len_k) @ (b, h, seq_len_k, depth)
        output = tf.matmul(attention_weights, v)
    else:
        raise NotImplementedError(
            "v must have rank 3 or 4, got rank {}".format(v_rank)
        )

    return output, attention_weights


class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head self-attention layer.

    The single input tensor is used as query, key and value.  It is projected
    by three dense layers, split into ``num_heads`` heads of size
    ``d_model // num_heads``, passed through
    ``scaled_dot_product_attention`` and projected once more.

    Args:
      d_model: width of the projections and of the output.
      num_heads: number of attention heads; must divide ``d_model``.
      **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, d_model, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0, (
            "d_model must be divisible by num_heads"
        )

        # Width of a single head.
        self.depth = d_model // self.num_heads

        # Projections for query, key and value.
        self.dense_q = tf.keras.layers.Dense(d_model)
        self.dense_k = tf.keras.layers.Dense(d_model)
        self.dense_v = tf.keras.layers.Dense(d_model)

        # Output projection applied to the concatenated heads.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size, do_transpose=False):
        """Split the last dimension into ``num_heads`` times ``depth``.

        Args:
          x: tensor of shape (batch_size, d_model), or
             (batch_size, seq_len, d_model) when ``do_transpose`` is True.
          batch_size: size of the leading dimension of ``x``.
          do_transpose: set for inputs with a sequence axis; the heads axis is
             then moved in front of the sequence axis.

        Returns:
          Tensor of shape (batch_size, num_heads, depth), or
          (batch_size, num_heads, seq_len, depth) when ``do_transpose`` is True.
        """
        if do_transpose:
            # (batch_size, seq_len, d_model)
            # -> (batch_size, seq_len, num_heads, depth)
            # -> (batch_size, num_heads, seq_len, depth)
            with_heads = tf.reshape(
                x,
                (batch_size, -1, self.num_heads, self.depth),
            )
            return tf.transpose(with_heads, perm=[0, 2, 1, 3])

        # (batch_size, d_model) -> (batch_size, num_heads, depth)
        return tf.reshape(x, (batch_size, self.num_heads, self.depth))

    def call(self, x):
        """Apply self-attention to ``x``.

        Args:
          x: tensor of shape (batch_size, x_dim) or (batch_size, seq_len, x_dim).

        Returns:
          Tuple ``(output, attention_weights)``.  ``output`` has shape
          (batch_size, d_model) for rank-2 input and
          (batch_size, seq_len, d_model) for rank-3 input;
          ``attention_weights`` is returned unchanged from
          ``scaled_dot_product_attention``.
        """
        batch_size = tf.shape(x)[0]

        # Self-attention: query, key and value all come from the same input.
        dense_q = self.dense_q(x)  # (batch_size, (seq_len,) d_model)
        dense_k = self.dense_k(x)  # (batch_size, (seq_len,) d_model)
        dense_v = self.dense_v(x)  # (batch_size, (seq_len,) d_model)

        # Rank 3 means there is a sequence axis, which needs the transposing
        # variant of the head split.
        do_transpose = len(dense_q.shape) == 3

        # Each: (batch_size, num_heads, (seq_len,) depth)
        split_q = self.split_heads(dense_q, batch_size, do_transpose)
        split_k = self.split_heads(dense_k, batch_size, do_transpose)
        split_v = self.split_heads(dense_v, batch_size, do_transpose)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            split_q,
            split_k,
            split_v,
            mask=None,
        )

        if do_transpose:
            # Undo the transposition from the head split:
            # (batch_size, num_heads, seq_len, depth)
            # -> (batch_size, seq_len, num_heads, depth)
            scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
            # Keep the sequence axis when merging the heads.  Reshaping to
            # (-1, d_model) would fold seq_len into the batch dimension.
            concat_shape = (batch_size, -1, self.d_model)
        else:
            concat_shape = (batch_size, self.d_model)

        # Merge the heads back into one d_model-wide vector.
        concat_attention = tf.reshape(scaled_attention, concat_shape)

        output = self.dense(concat_attention)  # (batch_size, (seq_len,) d_model)

        return output, attention_weights


In [None]:
"""Define model."""

import keras
import numpy as np

seed = 10
# Seed both libraries: np.random.seed only affects NumPy, while TensorFlow
# random ops (e.g. weight initialisation) draw from TensorFlow's own
# global seed.
np.random.seed(seed)
tf.random.set_seed(seed)


class OverlyComplicatedModel(tf.keras.Model):
    """Attention block and dense block, each with a residual connection.

    Data flow in ``call``:
    inputs -> multi-head self-attention -> add(inputs) -> batch norm
           -> dense(relu) -> add -> batch norm -> softmax output.

    Args:
      input_dim: number of input features.  The attention and dense blocks
        use this width so that the residual additions line up.  It must be
        divisible by the number of heads (4).
      output_dim: number of output classes.
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()

        # Symbolic input; not used by call().
        self.input_layer = keras.Input(
            shape=(input_dim,),
        )

        # Width follows input_dim instead of a hard-coded 48: the residual
        # add in call() needs attention output and inputs to match.
        self.multi_head_attention = MultiHeadAttention(
            input_dim,
            num_heads=4,
        )

        self.normalization_1 = keras.layers.BatchNormalization(
            epsilon=1e-6,
        )

        self.dense = keras.layers.Dense(
            input_dim,
            activation="relu",
            name="dense",
        )

        # Defined but currently not applied in call().
        self.dropout = keras.layers.Dropout(
            0.5,
        )

        self.normalization_2 = keras.layers.BatchNormalization(
            epsilon=1e-6,
        )

        self.outputs = keras.layers.Dense(
            output_dim,
            activation="softmax",
        )

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
          inputs: tensor of shape (batch_size, input_dim).
          training: forwarded to the batch-normalization layers.  ``None``
            (the default) leaves the decision to Keras.

        Returns:
          Tuple ``(outputs, attention_weights)``: the softmax output of shape
          (batch_size, output_dim) and the attention weights returned by the
          attention layer.
        """
        x_attn, attention_weights = self.multi_head_attention(inputs)

        # Residual connection around the attention block.
        x_attn_res = keras.layers.add([inputs, x_attn])
        x_attn_norm = self.normalization_1(x_attn_res, training=training)

        x_dense = self.dense(x_attn_norm)

        # Residual connection around the dense block.
        x_dense_res = keras.layers.add([x_attn_norm, x_dense])
        x = self.normalization_2(x_dense_res, training=training)

        outputs = self.outputs(x)

        return outputs, attention_weights

In [None]:
"""Create model."""

# Size the network from the data: the second dimension of x_ar gives the
# number of inputs, the second dimension of y_ar the number of outputs.
model = OverlyComplicatedModel(input_dim=x_ar.shape[1], output_dim=y_ar.shape[1])

"""The model has NOT yet been built."""

In [None]:
"""Define the step function as a callable TensorFlow graph."""

@tf.function()
def train_step(x_batch, y_batch):
    """Run one optimisation step on a single batch.

    Uses the notebook-level ``model``, ``loss_fn``, ``optimizer``,
    ``train_loss_metric`` and ``train_acc_metric``.

    Args:
      x_batch: batch of input features.
      y_batch: batch of target vectors.

    Returns:
      The loss of this batch as returned by ``loss_fn``.
    """
    # Record the forward pass for automatic differentiation.
    with tf.GradientTape() as gradient_tape:
        predictions, _ = model(x_batch, training=True)
        loss = loss_fn(y_batch, predictions)

    # Compute the gradients and apply the update.
    trainable_weights = model.trainable_weights
    gradients = gradient_tape.gradient(loss, trainable_weights)
    optimizer.apply_gradients(zip(gradients, trainable_weights))

    # Weight the batch loss by the batch size so the epoch mean is a
    # per-sample mean.  Without the weight a short final batch counts as
    # much as a full one.
    num_samples = tf.cast(tf.shape(x_batch)[0], loss.dtype)
    train_loss_metric(loss, sample_weight=num_samples)
    train_acc_metric(y_batch, predictions)

    return loss

In [None]:
"""Train the model."""

import time
from collections import defaultdict


epochs = 3

# Per-epoch values, keyed by metric name.
history = defaultdict(list)

for epoch in range(epochs):
    start_time = time.time()

    # Metrics accumulate across calls, so clear them for every epoch.
    train_loss_metric.reset_states()
    train_acc_metric.reset_states()
    val_loss_metric.reset_states()
    val_acc_metric.reset_states()

    # Training
    for step, (x_batch, y_batch) in enumerate(train_dataset):
        loss = train_step(x_batch, y_batch)

        if step % 200 == 0:
            # %06d: the sample count is an int.  The previous %06s padded
            # with spaces because the zero flag is ignored for %s.
            print(
                "Epoch %02d | step %04d | %06d samples | loss: %.4f" % (
                    epoch,
                    step,
                    step * batch_size,
                    float(loss)
                )
            )

    # Store plain floats rather than tensors so the history is easy to plot
    # and serialise.
    history["loss"].append(float(train_loss_metric.result()))
    history["accuracy"].append(float(train_acc_metric.result()))

    print(
        "Epoch %02d | training accuracy: %.4f" % (
            epoch,
            float(train_acc_metric.result())
        )
    )

    # Validation
    # Reset so that an empty validation set cannot leave the name undefined
    # or carry over the weights of a previous epoch.
    attention_weights = None
    for x_batch, y_batch in val_dataset:
        predictions, attention_weights = model(x_batch, training=False)

        loss = loss_fn(y_batch, predictions)

        # Weight by batch size so a short final batch does not count as much
        # as a full one in the epoch mean.
        val_loss_metric.update_state(
            loss,
            sample_weight=tf.cast(tf.shape(x_batch)[0], loss.dtype),
        )
        val_acc_metric.update_state(y_batch, predictions)

    history["val_loss"].append(float(val_loss_metric.result()))
    history["val_accuracy"].append(float(val_acc_metric.result()))
    # Only the attention weights of the last validation batch are kept.
    history["attention_weights"].append(attention_weights)

    print(
        "Epoch %02d | validation accuracy: %.4f" % (
            epoch,
            float(val_acc_metric.result())
        )
    )

    print(
        "Epoch %02d | time: %.2fs\n" % (
            epoch,
            time.time() - start_time
        )
    )

In [None]:
"""Plot accuracies."""

import matplotlib.pyplot as plt


def plot_accuracies(history):
    """Plot training and validation accuracy per epoch and save the figure.

    Args:
      history: mapping with the keys 'accuracy' and 'val_accuracy', each
        holding one value per epoch.

    The figure is written with ``plt.savefig('accuracy')``.
    """
    # Draw the training curve first, then the validation curve, so the
    # legend entries below line up with the curves.
    for key in ('accuracy', 'val_accuracy'):
        plt.plot(history[key])

    plt.title('model accuracy')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend(['train', 'validation'])

    plt.savefig('accuracy')


plot_accuracies(history)

In [None]:
"""Plot losses."""

import matplotlib.pyplot as plt


def plot_losses(history):
    """Plot training and validation loss per epoch and save the figure.

    Args:
      history: mapping with the keys 'loss' and 'val_loss', each holding one
        value per epoch.

    The figure is written with ``plt.savefig('loss')``.
    """
    # Draw the training curve first, then the validation curve, so the
    # legend entries below line up with the curves.
    for key in ('loss', 'val_loss'):
        plt.plot(history[key])

    plt.title('model loss')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['train', 'validation'])

    plt.savefig('loss')


plot_losses(history)