In [17]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
# Each flag is True when the platform-specific module is already loaded
# in this interpreter.
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# On a hosted platform, install TensorFlow Serving: register its apt
# repository, add the repository's signing key, install the
# tensorflow-model-server package, then install/upgrade the
# tensorflow-serving-api Python package in the kernel's environment.
if IS_COLAB or IS_KAGGLE:
    !echo "deb http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" > /etc/apt/sources.list.d/tensorflow-serving.list
    !curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
    !apt update && apt-get install -y tensorflow-model-server
    %pip install -q -U tensorflow-serving-api

import re


def _major_minor(version_string):
    """Return the leading ``(major, minor)`` of a version string as ints.

    Versions must be compared numerically. Compared as plain strings,
    "0.9" sorts after "0.20" and "10.0" sorts before "2.0", so a string
    comparison can accept a too-old release or reject a newer one.

    Raises:
        ValueError: if the string does not start with ``<digits>.<digits>``.
    """
    match = re.match(r"(\d+)\.(\d+)", version_string)
    if match is None:
        raise ValueError("Unrecognized version string: {!r}".format(version_string))
    return int(match.group(1)), int(match.group(2))


# Scikit-Learn ≥0.20 is required
import sklearn
assert _major_minor(sklearn.__version__) >= (0, 20)

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert _major_minor(tf.__version__) >= (2, 0)

# Warn when TensorFlow reports no GPU device, and point to the setting
# that enables one on the hosted platform currently in use.
gpu_devices = tf.config.list_physical_devices('GPU')
if len(gpu_devices) == 0:
    print("No GPU was detected. CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "deploy"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension) for the figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed as the output format.
        resolution: Value passed to ``plt.savefig`` as ``dpi``.
    """
    target = os.path.join(IMAGES_PATH, ".".join((fig_id, fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2943  100  2943    0     0   5877      0 --:--:-- --:--:-- --:--:--  5886
OK
Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://storage.googleapis.com/tensorflow-serving-apt stable InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis

In [18]:
# Load MNIST, append a trailing channel axis and scale pixels to [0, 1] as float32.
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()
X_train_full = np.expand_dims(X_train_full, axis=-1).astype(np.float32) / 255.
X_test = np.expand_dims(X_test, axis=-1).astype(np.float32) / 255.

# The first 5000 training samples form the validation set; the rest are used for training.
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]
X_train, y_train = X_train_full[5000:], y_train_full[5000:]

# Three test images reused below as sample inputs.
X_new = X_test[:3]

In [19]:
# Reset both random seeds right before the model is created.
np.random.seed(42)
tf.random.set_seed(42)

# Flatten -> Dense(100, relu) -> Dense(10, softmax), added one layer at a time.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28, 1]))
model.add(keras.layers.Dense(100, activation="relu"))
model.add(keras.layers.Dense(10, activation="softmax"))

sgd = keras.optimizers.SGD(learning_rate=1e-2)
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=sgd,
              metrics=["accuracy"])

# Train for 10 epochs, evaluating on the validation split after each one.
model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

  super().__init__(**kwargs)


Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.7071 - loss: 1.0970 - val_accuracy: 0.9006 - val_loss: 0.3746
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.8956 - loss: 0.3743 - val_accuracy: 0.9174 - val_loss: 0.3004
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.9123 - loss: 0.3121 - val_accuracy: 0.9260 - val_loss: 0.2655
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9218 - loss: 0.2778 - val_accuracy: 0.9336 - val_loss: 0.2417
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9290 - loss: 0.2529 - val_accuracy: 0.9382 - val_loss: 0.2230
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9350 - loss: 0.2328 - val_accuracy: 0.9438 - val_loss: 0.2077
Epoch 7/10
[1

<keras.src.callbacks.history.History at 0x78d34f277b10>

In [20]:
# Predictions for the sample images, rounded to two decimals.
model.predict(X_new).round(2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step


array([[0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.99, 0.  , 0.  ],
       [0.  , 0.  , 0.99, 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.98, 0.01, 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  ]],
      dtype=float32)

In [21]:
# The export directory is <model_name>/<model_version>.
model_name, model_version = "my_mnist_model", "0001"
model_path = os.path.join(model_name, model_version)
model_path

'my_mnist_model/0001'

In [25]:
# Export the trained model as a SavedModel into model_path.
tf.saved_model.save(model, model_path)

In [26]:
# Print the exported directory tree, indenting four spaces per path level.
for dirpath, _subdirs, filenames in os.walk(model_name):
    depth = dirpath.count(os.sep)
    print('    ' * depth + os.path.basename(dirpath) + '/')
    for name in filenames:
        print('    ' * (depth + 1) + name)

my_mnist_model/
    0001/
        fingerprint.pb
        saved_model.pb
        variables/
            variables.data-00000-of-00001
            variables.index
        assets/


In [27]:
# Inspect the SavedModel stored at model_path with the saved_model_cli tool.
!saved_model_cli show --dir {model_path}

2025-06-20 14:31:37.486600: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750429897.570757    7958 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750429897.583383    7958 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750429897.665831    7958 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750429897.665967    7958 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750429897.665978    7958 computation_placer.cc:177] computation placer alr

In [28]:
# Inspect the SavedModel at model_path, restricted to the "serve" tag set.
!saved_model_cli show --dir {model_path} --tag_set serve

2025-06-20 14:31:48.860247: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750429908.881952    8018 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750429908.888553    8018 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750429908.904754    8018 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750429908.904800    8018 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750429908.904805    8018 computation_placer.cc:177] computation placer alr

In [29]:
# Inspect the "serving_default" signature within the "serve" tag set.
!saved_model_cli show --dir {model_path} --tag_set serve \
                      --signature_def serving_default

2025-06-20 14:31:57.311216: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750429917.349892    8058 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750429917.360487    8058 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750429917.386032    8058 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750429917.386093    8058 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750429917.386099    8058 computation_placer.cc:177] computation placer alr

In [30]:
# Ask saved_model_cli for its complete report on the SavedModel (--all).
!saved_model_cli show --dir {model_path} --all

2025-06-20 14:32:05.958290: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750429925.996336    8098 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750429926.007643    8098 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750429926.031555    8098 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750429926.031609    8098 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750429926.031617    8098 computation_placer.cc:177] computation placer alr

In [31]:
# Write the sample images (X_new) to a .npy file; the saved_model_cli run
# cell below reads this file as its input.
np.save("my_mnist_tests.npy", X_new)

In [32]:
# Name of the model's first layer, used below as the input key for the CLI run.
# NOTE(review): assumes the serving signature's input key equals this layer
# name — confirm against the `saved_model_cli show` output.
input_name = model.layers[0].name
input_name

'flatten_1'

In [33]:
# Run the serving_default signature from the command line, feeding the saved
# .npy file as the input named by input_name.
!saved_model_cli run --dir {model_path} --tag_set serve \
                     --signature_def serving_default    \
                     --inputs {input_name}=my_mnist_tests.npy

2025-06-20 14:32:14.555854: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750429934.578216    8147 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750429934.585462    8147 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750429934.602575    8147 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750429934.602636    8147 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750429934.602641    8147 computation_placer.cc:177] computation placer alr

In [34]:
# Hard-coded probability rows, rounded to two decimals for display.
# NOTE(review): presumably pasted from the saved_model_cli run output — confirm.
np.array([
    [1.1347984e-04, 1.5187356e-07, 9.7032893e-04, 2.7640699e-03, 3.7826971e-06,
     7.6876910e-05, 3.9140293e-08, 9.9559116e-01, 5.3502394e-05, 4.2665208e-04],
    [8.2443521e-04, 3.5493889e-05, 9.8826385e-01, 7.0466995e-03, 1.2957400e-07,
     2.3389691e-04, 2.5639210e-03, 9.5886099e-10, 1.0314899e-03, 8.7952529e-08],
    [4.4693781e-05, 9.7028232e-01, 9.0526715e-03, 2.2641101e-03, 4.8766597e-04,
     2.8800720e-03, 2.2714981e-03, 8.3753867e-03, 4.0439744e-03, 2.9759688e-04],
]).round(2)

array([[0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  ],
       [0.  , 0.  , 0.99, 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.97, 0.01, 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  ]])

In [35]:
# Expose the absolute path of model_path's parent directory to the server
# process through the MODEL_DIR environment variable.
os.environ["MODEL_DIR"] = os.path.dirname(os.path.abspath(model_path))

In [36]:
%%bash --bg
# Start TensorFlow Serving in the background, serving my_mnist_model from
# ${MODEL_DIR} over REST on port 8601; stdout and stderr go to server.log.
# NOTE(review): --rest_api_host was removed because it does not appear to be
# a tensorflow_model_server flag; with it, server.log began with the
# server's usage text. Confirm against `tensorflow_model_server --help`.
nohup tensorflow_model_server \
     --rest_api_port=8601 \
     --model_name=my_mnist_model \
     --model_base_path="${MODEL_DIR}" >server.log 2>&1

In [37]:
# Show the last lines of the server log to check that the model was loaded.
!tail server.log

	--num_tflite_interpreters_per_pool=1	int32	EXPERIMENTAL; CAN BE REMOVED ANYTIME! Number of TFLite interpreters in an interpreter pool of TfLiteSession. Typically there is one TfLiteSession for each TF Lite model that is loaded. If not set, will be 1.
	--enable_signature_method_name_check=false	bool	Enable method_name check for SignatureDef. Disable this if serving native TF2 regression/classification models.
	--xla_cpu_compilation_enabled=false	bool	EXPERIMENTAL; CAN BE REMOVED ANYTIME! Enable XLA:CPU JIT (default is disabled). With XLA:CPU JIT disabled, models utilizing this feature will return bad Status on first compilation request.
I0000 00:00:1750429953.425457    8243 loader_harness.cc:71] Approving load for servable version {name: my_mnist_model version: 1}
I0000 00:00:1750429953.425621    8243 loader_harness.cc:79] Loading servable version {name: my_mnist_model version: 1}
I0000 00:00:1750429953.508848    8243 mlir_graph_optimization_pass.cc:425] MLIR V1 optimization pass is no

In [38]:
import json

# Request body for the REST predict call: the signature to invoke plus the
# sample images converted to nested Python lists.
request_payload = {
    "signature_name": "serving_default",
    "instances": X_new.tolist(),
}
input_data_json = json.dumps(request_payload)

In [39]:
# Preview only the first 1500 characters of the payload's repr.
repr(input_data_json)[:1500] + "..."

'\'{"signature_name": "serving_default", "instances": [[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]], [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]], [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]], [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]], [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0

# **Exercises**
1. What does a SavedModel contain? How do you inspect its content?
2. When should you use TF Serving? What are its main features? What are some
tools you can use to deploy it?
3. How do you deploy a model across multiple TF Serving instances?
4. When should you use the gRPC API rather than the REST API to query a model
served by TF Serving?
5. What are the different ways TFLite reduces a model’s size to make it run on a
mobile or embedded device?
6. What is quantization-aware training, and why would you need it?
7. What are model parallelism and data parallelism? Why is the latter generally
recommended?
8. When training a model across multiple servers, what distribution strategies can
you use? How do you choose which one to use?
9. Train a model (any model you like) and deploy it to TF Serving or Google Cloud
AI Platform. Write the client code to query it using the REST API or the gRPC API. Update the model and deploy the new version. Your client code will now
query the new version. Roll back to the first version.
10. Train any model across multiple GPUs on the same machine using the
MirroredStrategy (if you do not have access to GPUs, you can use Colaboratory with a
GPU Runtime and create two virtual GPUs). Train the model again using the
CentralStorageStrategy and compare the training time.
11. Train a small model on Google Cloud AI Platform, using black box
hyperparameter tuning.

# **Jawaban**
1. SavedModel adalah format penyimpanan model TensorFlow yang berisi arsitektur (graf komputasi) dan bobot model. Disimpan dalam folder yang berisi file saved_model.pb (graf komputasi) dan folder variables (nilai variabel/bobot). Jika bobotnya banyak, bisa terpecah dalam beberapa file. Ada juga folder assets untuk data tambahan (seperti vocab atau contoh data). SavedModel bisa memuat satu atau lebih metagraph (graf + definisi input/output). SavedModel bisa dilihat pakai saved_model_cli atau di-load dengan tf.saved_model.load().

2. TF Serving memudahkan kita untuk deploy banyak model TensorFlow (atau versi berbeda dari satu model) lewat REST API atau gRPC API. Jadi, nggak perlu repot update model di tiap aplikasi. TF Serving bisa otomatis memantau folder, memuat model baru tanpa restart aplikasi, performa cepat, mendukung A/B testing, canary release, dan batching request ke GPU. Instalasi paling mudah pakai Docker, bisa di-orchestrate dengan Kubernetes atau pakai layanan cloud seperti Google Cloud AI Platform.

3. Untuk deploy model ke banyak instance TF Serving, cukup atur semua instance agar memantau folder model yang sama, lalu ekspor model baru sebagai SavedModel ke subfolder di dalamnya.

4. gRPC API lebih efisien dari REST API, tapi library-nya tidak seluas REST. Jika REST API pakai kompresi, performanya hampir setara. Jadi, gRPC cocok dipakai jika butuh performa maksimal dan kliennya bisa pakai gRPC.

5. TFLite mengecilkan ukuran model agar bisa jalan di perangkat mobile/embedded dengan:

* Menggunakan converter untuk optimasi SavedModel (memangkas operasi tak perlu, menggabungkan operasi).

* Melakukan post-training quantization agar ukuran model jauh lebih kecil dan cepat diunduh.

* Menyimpan model dalam format FlatBuffer yang langsung bisa dimuat ke RAM, sehingga loading lebih cepat dan hemat memori.

6. Quantization-aware training menambahkan operasi kuantisasi palsu saat training, supaya model belajar mengabaikan noise kuantisasi. Hasilnya, bobot model jadi lebih tahan terhadap kuantisasi.

7. Model parallelism membagi model ke beberapa bagian dan menjalankannya di banyak perangkat secara paralel.
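Data parallelism menggandakan model jadi beberapa replika di berbagai perangkat, lalu tiap replika melatih batch data berbeda. Data parallelism umumnya lebih disarankan karena lebih mudah diterapkan, bisa dipakai untuk hampir semua model, dan tidak butuh komunikasi antarperangkat sebanyak model parallelism.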

* Pada synchronous data parallelism, semua gradien digabung lalu di-update bersama.

* Pada asynchronous data parallelism, tiap replika update parameter pusat secara mandiri tanpa menunggu lainnya.

8. Saat melatih model di banyak server, ada dua strategi distribusi utama:

* MultiWorkerMirroredStrategy: pakai data parallelism sinkron. Model direplikasi di semua server & device, tiap replika dapat batch data berbeda. Gradien dihitung, dirata-ratakan (AllReduce), lalu semua replika update bareng. Ini paling simpel & umumnya disarankan. Syaratnya: model harus muat di RAM tiap replika.

* ParameterServerStrategy: pakai data parallelism asinkron. Model direplikasi di worker, parameter dibagi di server parameter. Tiap worker jalan mandiri, ambil param terbaru, hitung gradien, kirim ke server. Server update parameter. Biasanya lebih lambat dari strategi pertama.