In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [2]:
# penalize model in proportion to previous layers activation
# encouraging activations to be sparse
class ActivityRegularizationLayer(layers.Layer):
    def call(self, inputs):
        self.add_loss(1e-2 * tf.reduce_sum(inputs))
        return inputs

In [3]:
inputs = keras.Input(shape=(784,), name="digits")
x = layers.Dense(64, activation="relu")(inputs)
x = ActivityRegularizationLayer()(x)
x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(10, name="predictions")(x)
model = keras.Model(inputs=inputs, outputs=outputs)

In [4]:
train_acc_metric = keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = keras.metrics.SparseCategoricalAccuracy()

In [5]:
optimizer = keras.optimizers.SGD(learning_rate=1e-3)

In [6]:
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [7]:
batch_size = 64
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

In [8]:
x_train.shape

(60000, 28, 28)

In [9]:
x_train = np.reshape (x_train, (-1, 784))

In [10]:
x_train.shape

(60000, 784)

In [11]:
x_test = np.reshape(x_test, (-1, 784))

In [12]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))

In [13]:
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

In [14]:
# prepare validation dataset
x_val = x_train[-10000:]
y_val = y_train[-10000:]
x_train = x_train[:-10000]
y_train = y_train[:-10000]

In [15]:
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(64)

In [16]:
x_train.shape

(50000, 784)

In [17]:
x_val.shape

(10000, 784)

Any function that takes tensors as input can be compiled into a graph.

In [18]:
@tf.function
def train_step(x, y):
    # with statement does not create a scope
    with tf.GradientTape() as tape:
        # forward pass
        logits = model(x, training=True)
        # loss
        loss_value = loss_fn(y, logits)
        # add any extra losses created during forward pass
        loss_value += sum(model.losses)

    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))

    # update train metric
    train_acc_metric.update_state(y, logits)
    
    return loss_value

In [19]:
@tf.function
def test_step(x,y):
    val_logits = model(x, training=False)
    # update val metrics
    val_acc_metric.update_state(y, val_logits)

In [20]:
import time

In [21]:
epochs = 2
for epoch in range(epochs):
    print("start of epoch %d" % epoch)
    start_time = time.time()
    
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        loss_value = train_step(x_batch_train, y_batch_train)
        
        if step % 200 == 0:
            print("train loss: %.4f at step: %d" % (float(loss_value), step))
        
    # display train metric at end of each epoch
    train_acc = train_acc_metric.result()
    train_acc_metric.reset_states()
    print("train acc over epoch: %.2f" % float(train_acc))
        
    # run a validation loop at the end of each epoch
    for x_batch_val, y_batch_val in val_dataset:
        test_step(x_batch_val, y_batch_val)

    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print("val acc: %.4f" % float(val_acc))
    print("time taken: %.2fs" % (time.time() - start_time))

start of epoch 0
train loss: 1933.7369 at step: 0
train loss: 2.3028 at step: 200
train loss: 2.3025 at step: 400
train loss: 2.3032 at step: 600
train loss: 2.3022 at step: 800
train acc over epoch: 0.11
val acc: 0.1064
time taken: 9.90s
start of epoch 1
train loss: 2.3030 at step: 0
train loss: 2.3029 at step: 200
train loss: 2.3024 at step: 400
train loss: 2.3050 at step: 600
train loss: 2.3004 at step: 800
train acc over epoch: 0.11
val acc: 0.1064
time taken: 5.48s


In [22]:
s = 0
for w in model.trainable_weights:
    s += tf.reduce_sum(tf.abs(w))

s.numpy()

2729.7004