This notebook shows how to deploy a vision model in TensorFlow from 🤗 Transformers with TensorFlow Serving. It uses [this blog post](https://huggingface.co/blog/tf-serving) as a reference. 

## Setup

In [None]:
!pip install -q transformers
!pip install -q tensorflow_serving_api

[K     |████████████████████████████████| 4.4 MB 4.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 23.1 MB/s 
[K     |████████████████████████████████| 596 kB 39.9 MB/s 
[K     |████████████████████████████████| 101 kB 9.9 MB/s 
[K     |████████████████████████████████| 511.7 MB 5.5 kB/s 
[K     |████████████████████████████████| 438 kB 46.6 MB/s 
[K     |████████████████████████████████| 5.8 MB 33.5 MB/s 
[K     |████████████████████████████████| 1.6 MB 41.1 MB/s 
[?25h

## Imports

In [None]:
from transformers import ViTFeatureExtractor, TFViTForImageClassification
import tensorflow as tf
import tempfile
import requests
import base64
import json
import os

In [None]:
# Print the installed Transformers version so readers can reproduce the environment.
import transformers

print(transformers.__version__)

4.20.1


## Save the ViT model and investigate its inputs

In [None]:
# Load the pre-trained ViT image-classification checkpoint and write it to
# `temp_model_dir`. Passing `saved_model=True` is a flag that additionally creates
# a SavedModel version of the model (inspected in the next cell under
# `{temp_model_dir}/saved_model/1`).
temp_model_dir = "vit"
model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
model.save_pretrained(temp_model_dir, saved_model=True)

Downloading:   0%|          | 0.00/68.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/330M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFViTForImageClassification.

All the layers of TFViTForImageClassification were initialized from the model checkpoint at google/vit-base-patch16-224.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFViTForImageClassification for predictions without further training.


INFO:tensorflow:Assets written to: resnet/saved_model/1/assets


INFO:tensorflow:Assets written to: resnet/saved_model/1/assets


In [None]:
# Print the inputs and outputs of the `serving_default` signature of the SavedModel written above.
!saved_model_cli show --dir {temp_model_dir}/saved_model/1 --tag_set serve --signature_def serving_default

The given SavedModel SignatureDef contains the following input(s):
  inputs['pixel_values'] tensor_info:
      dtype: DT_FLOAT
      shape: (-1, -1, -1, -1)
      name: serving_default_pixel_values:0
The given SavedModel SignatureDef contains the following output(s):
  outputs['logits'] tensor_info:
      dtype: DT_FLOAT
      shape: (-1, 1000)
      name: StatefulPartitionedCall:0
Method name is: tensorflow/serving/predict


## Save the model with embedded pre-processing and post-processing ops

In [None]:
# Load the feature-extractor configuration that ships with the checkpoint so the
# resize / normalization constants used below always match the model being
# served, instead of relying on the class defaults happening to agree with it.
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
feature_extractor

ViTFeatureExtractor {
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "ViTFeatureExtractor",
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "size": 224
}

In [None]:
# Name of the model's input tensor; used as the keyword when calling the model
# and as the key of the dict returned by the pre-processing function.
CONCRETE_INPUT = "pixel_values"
# Target image side length taken from the feature extractor (224 in the config shown above).
SIZE = feature_extractor.size
# Channel-last (height, width, channels) image shape; not referenced by the
# other cells visible in this notebook.
INPUT_SHAPE = (SIZE, SIZE, 3)

In [None]:
def normalize_img(
    img, mean=feature_extractor.image_mean, std=feature_extractor.image_std
):
    """Rescale pixel values by 1/255 and standardize them with `mean` and `std`.

    Args:
        img: Image tensor to normalize.
        mean: Values subtracted after rescaling; defaults to the feature
            extractor's `image_mean`.
        std: Values the centered image is divided by; defaults to the feature
            extractor's `image_std`.

    Returns:
        The tensor `(img / 255 - mean) / std`.
    """
    rescaled = img / 255  # Bring the values into the [0, 1] range first.
    centered = rescaled - tf.constant(mean)
    return centered / tf.constant(std)


def preprocess(string_input):
    """Turn one base64-encoded JPEG string into a normalized channel-first image.

    Args:
        string_input: Scalar string tensor holding a base64-encoded JPEG image.

    Returns:
        The decoded image, resized to (SIZE, SIZE), normalized with
        `normalize_img` and transposed with the permutation (2, 0, 1).
    """
    jpeg_bytes = tf.io.decode_base64(string_input)
    image = tf.io.decode_jpeg(jpeg_bytes, channels=3)
    image = tf.image.resize(image, size=(SIZE, SIZE))
    image = normalize_img(image)
    # HF models are channel-first, so move the channel axis to the front.
    return tf.transpose(image, (2, 0, 1))


@tf.function(input_signature=[tf.TensorSpec([None], tf.string)])
def preprocess_fn(string_input):
    """Pre-process a batch of base64-encoded JPEG strings.

    Args:
        string_input: 1-D `tf.string` tensor with one base64-encoded JPEG per
            element.

    Returns:
        A dict mapping `CONCRETE_INPUT` to the float32 tensor obtained by
        applying `preprocess` to every element and stacking the results.
    """
    # `fn_output_signature` replaces the deprecated `dtype` argument of `tf.map_fn`.
    decoded_images = tf.map_fn(
        preprocess, string_input, fn_output_signature=tf.float32
    )
    # `tf.stop_gradient` replaces the deprecated `back_prop=False` argument.
    decoded_images = tf.stop_gradient(decoded_images)
    return {CONCRETE_INPUT: decoded_images}


def model_exporter(model: tf.keras.Model):
    """Build a serving function that maps base64 JPEG strings to predictions.

    The returned function embeds pre-processing (`preprocess_fn`), the model's
    forward pass and post-processing (arg-max label lookup and softmax
    confidence), so clients only need to send encoded images.

    Args:
        model: Model whose `call` accepts `pixel_values` and whose output
            exposes `logits`; `model.config.id2label` supplies the class names.

    Returns:
        A `tf.function` taking a 1-D string tensor and returning a dict with
        the keys "label" (predicted class name) and "confidence" (its softmax
        probability), one entry per input image.
    """
    m_call = tf.function(model.call).get_concrete_function(
        tf.TensorSpec(
            shape=[None, 3, SIZE, SIZE], dtype=tf.float32, name=CONCRETE_INPUT
        )
    )

    # Order the label names by class index explicitly so that position i of the
    # lookup table is guaranteed to be the name of logit i, rather than relying
    # on the insertion order of the `id2label` dict.
    id2label = model.config.id2label
    label_names = [id2label[idx] for idx in sorted(id2label)]

    @tf.function(input_signature=[tf.TensorSpec([None], tf.string)])
    def serving_fn(string_input):
        labels = tf.constant(label_names, dtype=tf.string)
        images = preprocess_fn(string_input)

        predictions = m_call(**images)
        logits = predictions.logits
        # Index of the highest-scoring class for each image.
        indices = tf.argmax(logits, axis=1)
        pred_source = tf.gather(params=labels, indices=indices)
        # Confidence is the softmax probability of the predicted class.
        probs = tf.nn.softmax(logits, axis=1)
        pred_confidence = tf.reduce_max(probs, axis=1)
        return {"label": pred_source, "confidence": pred_confidence}

    return serving_fn

**Notes on making the model accept string inputs**:

When dealing with images via REST or gRPC requests, the size of the request payload can easily spiral up depending on the resolution of the images being passed. This is why it is good practice to compress them reliably and then prepare the request payload.

In [None]:
# Base directory handed to the model server; the export itself goes into the
# sub-directory named after VERSION (MODEL_DIR/VERSION).
MODEL_DIR = tempfile.gettempdir()
VERSION = 1

# Export the model with the custom serving function registered as the
# "serving_default" signature, so the SavedModel takes string inputs and
# returns labels and confidences.
tf.saved_model.save(
    model,
    os.path.join(MODEL_DIR, str(VERSION)),
    signatures={"serving_default": model_exporter(model)},
)
# Expose the base path as an environment variable for the shell cell that
# launches the model server with `--model_base_path=$MODEL_DIR`.
os.environ["MODEL_DIR"] = MODEL_DIR

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.map_fn(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.map_fn(fn, elems))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.map_fn(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.map_fn(fn, elems))


Instructions for updating:
Use fn_output_signature instead


Instructions for updating:
Use fn_output_signature instead


INFO:tensorflow:Assets written to: /tmp/1/assets


INFO:tensorflow:Assets written to: /tmp/1/assets


Investigate the `SavedModel` once again. 

In [None]:
!saved_model_cli show --dir {MODEL_DIR}/1 --tag_set serve --signature_def serving_default

The given SavedModel SignatureDef contains the following input(s):
  inputs['string_input'] tensor_info:
      dtype: DT_STRING
      shape: (-1)
      name: serving_default_string_input:0
The given SavedModel SignatureDef contains the following output(s):
  outputs['confidence'] tensor_info:
      dtype: DT_FLOAT
      shape: (-1)
      name: StatefulPartitionedCall:0
  outputs['label'] tensor_info:
      dtype: DT_STRING
      shape: (-1)
      name: StatefulPartitionedCall:1
Method name is: tensorflow/serving/predict


## Install TensorFlow Model Server

In [None]:
# Deviates from the original installation instructions.
# https://issuemode.com/issues/tensorflow/serving/92945160
!wget 'http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-universal-2.8.0/t/tensorflow-model-server-universal/tensorflow-model-server-universal_2.8.0_all.deb'
!dpkg -i tensorflow-model-server-universal_2.8.0_all.deb

--2022-07-15 04:11:11--  http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-universal-2.8.0/t/tensorflow-model-server-universal/tensorflow-model-server-universal_2.8.0_all.deb
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.8.128, 74.125.23.128, 74.125.203.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.8.128|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 335421916 (320M) [application/x-debian-package]
Saving to: ‘tensorflow-model-server-universal_2.8.0_all.deb’


2022-07-15 04:11:16 (68.7 MB/s) - ‘tensorflow-model-server-universal_2.8.0_all.deb’ saved [335421916/335421916]

Selecting previously unselected package tensorflow-model-server-universal.
(Reading database ... 155653 files and directories currently installed.)
Preparing to unpack tensorflow-model-server-universal_2.8.0_all.deb ...
Unpacking tensorflow-model-server-universal (2.8.0) ...
Setting up tensorflow-model-serve

## Deploy the model 

By default, TF Serving exposes two APIs: REST and gRPC. We will see how to run inference with both. Each has its own pros and cons.

In [None]:
%%bash --bg
# Start TF Serving in the background and send all of its output to server.log.
# The gRPC port is set explicitly because the gRPC client below connects to
# localhost:8500; the REST client uses port 8501.
nohup tensorflow_model_server \
  --port=8500 \
  --rest_api_port=8501 \
  --model_name=vit \
  --model_base_path="$MODEL_DIR" >server.log 2>&1


Starting job # 0 in a separate thread.


In [None]:
# Show what the model server has written to its log file so far.
!cat server.log

[warn] getaddrinfo: address family for nodename not supported
[evhttp_server.cc : 245] NET_LOG: Entering the event loop ...


In [None]:
# List the processes that are listening on network ports, to check that the model server is up.
!sudo lsof -i -P -n | grep LISTEN

node        8 root   21u  IPv6  26436      0t0  TCP *:8080 (LISTEN)
colab-fil  30 root    5u  IPv6  26409      0t0  TCP *:3453 (LISTEN)
colab-fil  30 root    6u  IPv4  26410      0t0  TCP *:3453 (LISTEN)
jupyter-n  43 root    6u  IPv4  27130      0t0  TCP 172.28.0.2:9000 (LISTEN)
python3    60 root   15u  IPv4  30327      0t0  TCP 127.0.0.1:46129 (LISTEN)
python3    60 root   18u  IPv4  30331      0t0  TCP 127.0.0.1:58207 (LISTEN)
python3    60 root   21u  IPv4  30335      0t0  TCP 127.0.0.1:44103 (LISTEN)
python3    60 root   24u  IPv4  30339      0t0  TCP 127.0.0.1:53393 (LISTEN)
python3    60 root   30u  IPv4  30345      0t0  TCP 127.0.0.1:46873 (LISTEN)
python3    60 root   43u  IPv4  31046      0t0  TCP 127.0.0.1:59625 (LISTEN)
python3    80 root    3u  IPv4  31602      0t0  TCP 127.0.0.1:20352 (LISTEN)
python3    80 root    4u  IPv4  31603      0t0  TCP 127.0.0.1:34417 (LISTEN)
python3    80 root    9u  IPv4  32828      0t0  TCP 127.0.0.1:36819 (LISTEN)
tensorflo 259 root    5u  

## REST API

In [None]:
# Download a sample image and read its raw (still JPEG-encoded) bytes.
image_path = tf.keras.utils.get_file(
    "image.jpg", "http://images.cocodataset.org/val2017/000000039769.jpg"
)
bytes_inputs = tf.io.read_file(image_path)
# Encode with the URL-safe base64 alphabet: the serving function decodes its
# input with `tf.io.decode_base64` (assumed to expect web-safe base64 — confirm
# against the TF docs).
b64str = base64.urlsafe_b64encode(bytes_inputs.numpy()).decode("utf-8")

# Request body: the signature to run plus a list of instances (one encoded image here).
data = json.dumps({"signature_name": "serving_default", "instances": [b64str]})
# Print only the first 50 and last 52 characters of the payload.
print("Data: {} ... {}".format(data[:50], data[len(data) - 52 :]))

Downloading data from http://images.cocodataset.org/val2017/000000039769.jpg
Data: {"signature_name": "serving_default", "instances": ... TRmYgEHbbrYWv0A6b4o2n1HZgYLq91nP-o7O2pcNa6r__2Q=="]}


In [None]:
headers = {"content-type": "application/json"}
# POST the payload to the `vit` model's predict endpoint on the REST port.
# The wait is bounded to 10 seconds, like the gRPC call, so that an unreachable
# or stalled server raises an error instead of hanging the cell indefinitely.
json_response = requests.post(
    "http://localhost:8501/v1/models/vit:predict",
    data=data,
    headers=headers,
    timeout=10,
)
print(json.loads(json_response.text))

{'predictions': [{'label': 'Egyptian cat', 'confidence': 0.896659195}]}


## gRPC 

In [None]:
import grpc
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2_grpc

In [None]:
# Open an insecure (unencrypted) gRPC channel to localhost:8500 and create the
# prediction-service stub used to send requests over it.
channel = grpc.insecure_channel("localhost:8500")
stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

In [None]:
# Reload the exported SavedModel to look up the name of the serving signature's
# input instead of hard-coding it in the gRPC request.
loaded = tf.saved_model.load(f"{MODEL_DIR}/{VERSION}")
# `structured_input_signature[1]` is indexed for its keys (the input names);
# the first key is taken as the input name.
serving_input = list(
    loaded.signatures["serving_default"].structured_input_signature[1].keys()
)[0]
print("Serving function input:", serving_input)

Serving function input: string_input


In [None]:
# Build the gRPC request: target the `vit` model and its default signature,
# and attach the same base64 string used for the REST call as a one-element batch.
request = predict_pb2.PredictRequest()
request.model_spec.name = "vit"
request.model_spec.signature_name = "serving_default"
request.inputs[serving_input].CopyFrom(tf.make_tensor_proto([b64str]))

In [None]:
# Send the request through the stub and print the raw response message.
grpc_predictions = stub.Predict(request, 10.0)  # 10 secs timeout
print(grpc_predictions)

outputs {
  key: "confidence"
  value {
    dtype: DT_FLOAT
    tensor_shape {
      dim {
        size: 1
      }
    }
    float_val: 0.8966591954231262
  }
}
outputs {
  key: "label"
  value {
    dtype: DT_STRING
    tensor_shape {
      dim {
        size: 1
      }
    }
    string_val: "Egyptian cat"
  }
}
model_spec {
  name: "resnet"
  version {
    value: 1
  }
  signature_name: "serving_default"
}

In [None]:
# Pull the predicted label(s) and confidence value(s) out of the response's output tensors.
grpc_predictions.outputs["label"].string_val, grpc_predictions.outputs[
    "confidence"
].float_val

([b'Egyptian cat'], [0.8966591954231262])

## Next steps

* Deploy the SavedModel to Vertex AI 
* Deploy with TF Serving + Kubernetes (via GKE)