## Serving models with TFServing

In [26]:
import tensorflow as tf
import tensorflow.keras as K
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os
import json

In [None]:
tf.config.list_physical_devices('GPU')

## Simple FashionMNIST model example

In [3]:
fashion_mnist=tf.keras.datasets.fashion_mnist.load_data()

In [4]:
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist

In [5]:
X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]
X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]

In [6]:
# Data normalization
X_train, X_valid, X_test = X_train/255., X_valid/255., X_test/255. 

In [None]:
model = K.Sequential()

In [8]:
model.add(K.layers.Input(shape=[28, 28]))
model.add(K.layers.Flatten())
model.add(K.layers.Dense(300, activation='relu'))
model.add(K.layers.Dense(100, activation='relu'))
model.add(K.layers.Dense(10, activation='softmax'))

In [9]:
model.compile(loss=K.losses.sparse_categorical_crossentropy,
             optimizer=K.optimizers.SGD(),
             metrics=[K.metrics.sparse_categorical_accuracy])

In [11]:
history = model.fit(X_train, y_train, 
         epochs=3,
         validation_data=[X_valid, y_valid])

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Exporting SavedModel

In [14]:
model_name = 'my_mnist_model'
model_version = '0001'
model_path = Path(model_name) / model_version

In [15]:
model.save(model_path, save_format='tf')

INFO:tensorflow:Assets written to: my_mnist_model/0001/assets


INFO:tensorflow:Assets written to: my_mnist_model/0001/assets


## One should also embedd preprocessing in model and save it together. 

## However ***SavedModel saves the coputational graph*** so this limits preprocessing operations to TF Ops EXCLUDING tf.py_function(). 

In [16]:
!ls

Ch19-TFServing.ipynb  my_mnist_model


## Small CLI from TF to inspect models

In [18]:
!saved_model_cli show --dir my_mnist_model/0001

2024-02-14 12:45:38.608750: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-14 12:45:38.608779: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-14 12:45:38.609418: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
The given SavedModel contains the following tag-sets:
'serve'


## SavedModel contains *metagraphs* which are computational graphs with signatures specifying input names, types and shapes.

## Metagraphs are identified by tags, which allows to specify what operations are bundled together.

## E.g. metagraphs with full computational graph including training operations can be called 'train'

## *Prunned graph* with only prediction operations with some GPU specific operations may be called 'serve'

## Keras .save() method saves only the default 'serve' tag

In [21]:
!saved_model_cli show --dir  my_mnist_model/0001 --tag_set serve

2024-02-14 12:53:17.743594: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-14 12:53:17.743628: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-14 12:53:17.744284: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
The given SavedModel MetaGraphDef contains SignatureDefs with the following keys:
SignatureDef key: "__saved_model_init_op"
SignatureDef key: "serving_default"


## Default serving function for a saved Keras model is its .call() method

## It can be inspected in more detail

In [23]:
!saved_model_cli show --dir  my_mnist_model/0001 --tag_set serve --signature_def serving_default

2024-02-14 12:55:48.038717: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-14 12:55:48.038747: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-14 12:55:48.039395: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
The given SavedModel SignatureDef contains the following input(s):
  inputs['input_1'] tensor_info:
      dtype: DT_FLOAT
      shape: (-1, 28, 28)
      name: serving_default_input_1:0
The given SavedModel SignatureDef contains the following output(s):
  outputs['dense_2'] tensor_info:
      dtype: DT_FLOAT
      shape: (-1, 10)
      name: StatefulPartitionedCa

## Installing TensorFlow Serving - Ubuntu

In [None]:
url = "https://storage.googleapis.com/tensorflow-serving-apt"
src = "stable tensorflow-model-server tensorflow-model-server-universal"
!echo 'deb {url} {src}' > /etc/apt/sources.list.d/tensorflow-serving.list
!curl '{url}/tensorflow-serving.release.pub.gpg' | apt-key add -
!apt update -q && apt-get install -y tensorflow-model-server

# API needed to communicate with the server    
%pip install -q -U tensorflow-serving-api

## Starting the server

In [None]:
# Absolute path to the model directory (with versions as subdirectories)
os.environ["MODEL_DIR"] = str(model_path.parent.absolute())

In [None]:
%%bash --bg
tensorflow_model_server \
--port=8500 \ # gRPC
--rest_api_port=8501 \ # REST
--model_name=my_mnist_model \
--model_base_path="${MODEL_DIR}" > my_server.log 2>&1

## Docker version

In [None]:
!docker run -it --rm -v "/path/to/my_mnist_model:/models/my_mnist_model" \
-p 8500:8500 -p 8501:8501 -e MODEL_NAME=my_mnist_model tensorflow/serving

## Querying TF Serving with REST

### Query must be a JSON containing:
* ### Name of a function signature to call
* ### Input data

In [27]:
X_new = X_test[:3]

In [29]:
request_json = json.dumps({
    "signature_name": "serving_default",
    "instances": X_new.tolist(), # Data serialization
})

## Dumped data is pure text

In [30]:
request_json

'{"signature_name": "serving_default", "instances": [[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

## Requests to TF Serving are sent with HTTP POST using requests lib

In [None]:
import requests

In [None]:
server_url = 'http://localhost:8501/v1/models/my_mnist_model:predict'
resposne = requests.post(server_url, data=request_json)
reposnse.raise_for_status() # Raise an exception on error
resposne = resposne.json()

## The response is a dict with a single 'predictions' key

## The value to that key is a list of predictions so it must be converted do np array

In [None]:
y_proba = np.array(response['predictions'])
y_proba.round(2)

### REST with JSON are very inefficient as e.g. all floats are transformed do strings

### This turns 32 bit variable into >120 bit encoding of the corresponfing 15 chars representing the float

### Also serialization conversions float-str-float (both ways) are time consuming

### For larger data payloads gRPC should be used

## Preparing gRPC query

In [None]:
# Importing ProtoBuffer module
from tensorflow_serving.apis.predict_pb2 import PredictRequest

In [None]:
request = Predictrequest()
request.model_spec.name = model_name
request.model_spec.signature_name = 'serving_default'
input_name = model.input_names[0] ## Name of the model entry point, should be adjusted accordingly

# tf.make_tensor_proto() creates a protobuf from a tensor or numpy array
request.inputs[input_name].CopyFrom(tf.make_tensor_proto(X_New))

## Making the call

In [None]:
import grpc
from tensorflow_serving.apis import prediction_service_pb2_grpc

In [None]:
channel = grpc.insecure_channel('localhost:8500')
predict_service = prediction_service_pb2_grpc.PredictionServiceStub(channel)

# Request is TCP syncronous, will block until response or timeout
response = predict_service.Predict(request, timeout=10.0)

In [None]:
output_name = model.output_names[0] # Appropriate model output name for response readout
outputs_proto = response.outputs[output_name]
y_proba = tf.make_ndarray(outputs_proto)

## New model deployment

In [None]:
model = [...]
model_version = "0002"
model_path = Path(model_name) / model_version
model.save(model_path, save_format="tf")

## TF Serving server priodically checks for new model versions

## After finding a new version it handles new queries with the new model and waits for all pending queries to complete with the old model

## After old queries are completed the old model is unloaded

## To unroll the deployment it suffices to remove the directory with the stored model

## One can also test and warmup model with example instances in the *assets/extra*

## TF Serving has also requests batching (--enable_batching) and multiple servers deployment with load balancing