In [1]:
'''
使用裝飾器@tf.function、keras api，皆是使用靜態圖，運算效能高
純粹使用 tf.GradientTape() 則是使用動態圖，效能較低
tf.GradientTape 是自動微分 (Record operations for automatic differentiation.)
tf.gradients(ys, xs, ...) 是符號微分 (Constructs symbolic derivatives of sum of ys w.r.t. x in xs.)
'''
'''靜態/動態圖、符號/自動微分'''

'使用裝飾器@tf.function、keras api，皆是使用靜態圖，運算效能高\n使用 tf.GradientTape() 則是使用動態圖，效能較低\n'

In [2]:
'''
名詞翻譯: 
dimension reduction: 維度縮減 https://terms.naer.edu.tw/detail/3648035/?index=1
Hadamard product: 哈德瑪得乘積 (定義兩同維度矩陣，相應元素計算乘積)，
又稱 element-wise product(逐元乘積)、entrywise product(逐項乘積)
---
reduce(縮減) 
entrywise product(逐項乘積)
---
tf.matmul() 兩張量(矩陣)相乘
tf.multiply() 兩張量(矩陣)逐項乘積
tf.reduce_sum() 
對張量(矩陣)指定維度的元素進行相加(omputes the sum of elements across dimensions of a tensor.)
numpy.ufunc.reduce() 
對同一維度的元素套用相同的操作，將陣列的維度縮減(較少翻譯為:歸約)至1維 (Reduces array’s dimension by one, by applying ufunc along one axis.)
tf.gradients(ys, xs, ...)
Constructs symbolic derivatives (符號導數) of sum of ys w.r.t. x in xs.
symbolic differentiation 符號微分法
'''
'名詞翻譯'

'名詞翻譯'

In [3]:
'''
https://stackoverflow.com/questions/43455320/difference-between-symbolic-differentiation-and-automatic-differentiation

There are 3 popular methods to calculate the derivative:
1. Numerical differentiation: 數值方法，定義合理的方程式，透過多次迭代來減少誤差項，逼近理論解析解
2. Symbolic differentiation: 透過連鎖律獲得導函數表達式，計算微分值
3. Automatic differentiation:  Automatic differentiation is the same as 
   Symbolic differentiation (in one place they operate on math expression, 
   in another on computer programs). And yes, they are sometimes very similar. 
   But for control flow statements (`if, while, loops) the results can be very different:
   symbolic differentiation leads to inefficient code (unless carefully done) and faces the difficulty of converting a computer program into a single expression
'''
'''Derivative'''

'Derivative'

In [4]:
'''
https://ithelp.ithome.com.tw/articles/10217112
https://ithelp.ithome.com.tw/articles/10216085
https://pytorch.org/tutorials/beginner/examples_autograd/tf_two_layer_net.html
最初 Tensorflow 是以靜態計算圖（static computational graph）的方式進行 gradient 計算
Tensorflow 制定的 tensor 結構可以放置 CPU 或 GPU，而 numpy ndarray 指能以 CPU 計算
故 Tensorflow 可將 ndarray 轉換至 tensor、tf.graph 也可放置於 GPU

相較於 Pytorch 的動態計算圖（dynamic computational graph）會在執行期間
自動微分(Runtime Automatic Differentation)；
TF 也是自動微分

大約從 TF 1.5 版開始推出 Eager Execution，Eager_tensor 就是可動態圖的方法
'''
'''TF Eager Execution'''

'TF Eager Execution'

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
import numpy as np
import time

In [6]:
current_time_str = time.strftime("%H:%M:%S", time.localtime())
print(current_time_str)
#print(type(current_time_str))

22:06:23


In [7]:
# Prepare the training dataset.
batch_size = 64
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
x_train = np.reshape(x_train, (-1, 784))
x_test = np.reshape(x_test, (-1, 784))

# Reserve 10,000 samples for validation.
x_val = x_train[-10000:]
y_val = y_train[-10000:]
x_train = x_train[:-10000]
y_train = y_train[:-10000]

# Prepare the training dataset.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

# Prepare the validation dataset.
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size)

In [8]:
# Get model
inputs = keras.Input(shape=(784,), name="digits")
x = layers.Dense(64, activation="relu", name="dense_1")(inputs)
x = layers.Dense(64, activation="relu", name="dense_2")(x)
outputs = layers.Dense(10, name="predictions")(x)
model = keras.Model(inputs=inputs, outputs=outputs)

# Instantiate an optimizer to train the model.
optimizer = keras.optimizers.SGD(learning_rate=1e-3)
# Instantiate a loss function.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Prepare the metrics.
train_acc_metric = keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = keras.metrics.SparseCategoricalAccuracy()

# Eager mode

In [9]:
# *** 第一步: 設定迭代次數 *** 
epochs = 2
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()

    # *** 第二步: 取出 batches 的資料，
    #            向前傳播紀錄權重運算結果、得到預測結果，(預設)計算平均loss值，
    #            此時可再加入 loss 的正規化
    #            再計算Loss_fn對權重的導數(自動微分: 先前紀錄所有運算過程，再反向傳播運算得微分值)
    #            最後用最佳化器更新權重
    # *** 
    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            # Forwarding
            pred_logits = model(x_batch_train, training=True)
            # loss_object = loos_fn(y_true, y_pred)
            # `loss_object` can get loss value and regards as a function can compute gradient 
            loss_object = loss_fn(y_batch_train, pred_logits)
        grads = tape.gradient(loss_object, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # Update training metric.
        train_acc_metric.update_state(y_batch_train, pred_logits)

        # Log every 200 batches.
        if step % 200 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_object))
            )
            print("Seen so far: %d samples" % ((step + 1) * batch_size))

    # Display metrics at the end of each epoch.
    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % (float(train_acc),))

    # Reset training metrics at the end of each epoch
    train_acc_metric.reset_states()

    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_dataset:
        val_logits = model(x_batch_val, training=False)
        # Update val metrics
        val_acc_metric.update_state(y_batch_val, val_logits)
    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print("Validation acc: %.4f" % (float(val_acc),))
    print("Time taken: %.2fs" % (time.time() - start_time))


Start of epoch 0
Training loss (for one batch) at step 0: 102.7714
Seen so far: 64 samples
Training loss (for one batch) at step 200: 1.4990
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 1.2216
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.9557
Seen so far: 38464 samples
Training acc over epoch: 0.6387
Validation acc: 0.7874
Time taken: 3.95s

Start of epoch 1
Training loss (for one batch) at step 0: 1.1476
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.9257
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.7298
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.7865
Seen so far: 38464 samples
Training acc over epoch: 0.8113
Validation acc: 0.8509
Time taken: 3.75s


# Static mode?

## Appetizer

In [10]:
@tf.function
def MyStaticGradientor(func, var):
    return func(var)

In [11]:
A = tf.constant([[2,2]])

In [12]:
def square_fn(x):
    res = x**3
    print(res)
    return res

In [13]:
square_fn(A)

tf.Tensor([[8 8]], shape=(1, 2), dtype=int32)


<tf.Tensor: shape=(1, 2), dtype=int32, numpy=array([[8, 8]])>

In [14]:
MyStaticGradientor(square_fn, A)

Tensor("pow:0", shape=(1, 2), dtype=int32)


<tf.Tensor: shape=(1, 2), dtype=int32, numpy=array([[8, 8]])>

## Main course

In [15]:
model_efficient = keras.Model(inputs=inputs, outputs=outputs)

In [16]:
@tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        pred_logits = model_efficient(x, training=True)
        loss_value = loss_fn(y, pred_logits)
    grads = tape.gradient(loss_value, model_efficient.trainable_weights)
    optimizer.apply_gradients(zip(grads, model_efficient.trainable_weights))
    train_acc_metric.update_state(y, pred_logits)
    return loss_value

In [17]:
@tf.function
def test_step(x, y):
    pred_logits = model_efficient(x, training=False)
    val_acc_metric.update_state(y, pred_logits)

In [18]:
epochs = 2
for epoch in range(epochs):
    print(f"Start of epoch {epoch}")
    start_time = time.time()
    
    # ---- Training ----
    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        loss_value = train_step(x_batch_train, y_batch_train)

        # Log every 200 batches.
        if step % 200 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % ((step + 1) * batch_size))

    # Display metrics at the end of each epoch.
    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % (float(train_acc),))
    # Reset training metrics at the end of each epoch
    train_acc_metric.reset_states()

    # ---- Testing (Validating) ----
    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_dataset:
        test_step(x_batch_val, y_batch_val)

    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print("Validation acc: %.4f" % (float(val_acc),))
    
    print("Time taken: %.2fs" % (time.time() - start_time))

Start of epoch 0
Training loss (for one batch) at step 0: 0.9626
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.3760
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.5383
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.4413
Seen so far: 38464 samples
Training acc over epoch: 0.8542
Validation acc: 0.8769
Time taken: 1.11s
Start of epoch 1
Training loss (for one batch) at step 0: 0.8899
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.4732
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.2711
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.6874
Seen so far: 38464 samples
Training acc over epoch: 0.8796
Validation acc: 0.8410
Time taken: 0.70s
