feat(instrumentation): add OpenTelemetry tracing and metrics with basic configurations #5175

Merged
merged 90 commits into master from feat-instrumentation-5155 on Oct 11, 2022
Changes from 84 commits
Commits
90 commits
a5a7f42
feat(instrumentation): create basic tracer and meter with console exp…
Sep 15, 2022
9e1b2d0
style: fix overload and cli autocomplete
jina-bot Sep 15, 2022
514792a
feat(instrumentation): move the instrumentation package to the serve …
Sep 16, 2022
c3b0c37
feat(instrumentation): provide options to enable tracing and metrics …
Sep 16, 2022
2269b57
feat(instrumentation): add the correct grpc opentelmetery insturmenta…
Sep 19, 2022
14cb744
feat(serve): instrument grpc server and channel with interceptors
Sep 19, 2022
f53be22
style: fix overload and cli autocomplete
jina-bot Sep 19, 2022
a4a4621
feat(instrumentation): provide opentelemety context from the grpc cli…
Sep 20, 2022
78efb44
feat(instrumentation): check for opentelemetry environment variables …
Sep 20, 2022
7116e9f
feat(instrumentation): create InstrumentationMixin for server and cli…
Sep 20, 2022
92d3679
chore(instrumentation): use absolute module import
Sep 21, 2022
eb0ccd3
feat(instrumentation): trace http and websocket server and clients
Sep 21, 2022
38cae61
chore(instrumentation): update/add new opentelemetry arguments
Sep 21, 2022
45d1794
feat(instrumentation): globally disable tracing health check requests
Sep 21, 2022
b107f80
feat(instrumentation): add InstrumentationMixIn for Head and Worker r…
Sep 22, 2022
cd17588
feat(instrumentation): disable tracing of ServerReflection and endpoi…
Sep 26, 2022
2e44270
test(instrumentation): add basic tracing and metrics tests for HTTP G…
Sep 26, 2022
a083146
test(instrumentation): move test common code for tracing and metrics …
Sep 26, 2022
30ee9e3
feat(instrumentation): enable tracing of flow internal and start up r…
Sep 26, 2022
3998e2f
test(instrumentation): move test common code to new base class
Sep 26, 2022
30409c2
test(instrumentation): test grpc gateway opentelemety instrumentation
Sep 26, 2022
e2ee862
feat(instrumentation): add Jaeger export agent and required configura…
Sep 27, 2022
a0bfaf8
chore(instrumentation): remove print statement
Sep 27, 2022
60be044
test(instrumentation): document spans in the grpc and http gateway in…
Sep 27, 2022
9da9eaf
Merge branch 'master' into feat-instrumentation-5155
Sep 27, 2022
adb96ba
style: fix overload and cli autocomplete
jina-bot Sep 27, 2022
0af8ffb
chore: remove print statement
Sep 27, 2022
a241f62
test(instrumentation): add instrumentaiton tests for websocket gateway
Sep 27, 2022
528e38b
fix: import openetelmetry api globally and the other dependencies onl…
Sep 27, 2022
47ed0a8
fix: use class name as default name when creating Executor instrument…
Sep 27, 2022
3f436da
fix: provide argparse arguments to AlternativeGateway
Sep 27, 2022
578e882
style: fix overload and cli autocomplete
jina-bot Sep 27, 2022
aa5a34a
style: fix overload and cli autocomplete
Sep 28, 2022
87c15f5
Merge branch 'master' into feat-instrumentation-5155
Sep 28, 2022
f7b4af4
style: fix overload and cli autocomplete
jina-bot Sep 28, 2022
3a2e1de
style: fix overload and cli autocomplete
Sep 28, 2022
82dad9c
style: fix overload and cli autocomplete
jina-bot Sep 28, 2022
42d00e6
fix: revert changes for Gateway implementation
Sep 29, 2022
9ade3b6
Merge branch 'master' into feat-instrumentation-5155
Sep 29, 2022
4132396
feat(instrumentation): remove init method from InstrumentationMixin
Sep 29, 2022
4efbbd7
feat(instrumentation): create vendor neutral opentelemetry export arg…
Sep 29, 2022
8e9abcb
style: fix overload and cli autocomplete
Sep 29, 2022
8eed211
feat(instrumentation): inject tracing variables from AsyncLoopRuntime…
Sep 30, 2022
175a399
style: fix overload and cli autocomplete
jina-bot Sep 30, 2022
030b980
feat(instrumentation): configure a OTLP collector for exporting trace…
Sep 30, 2022
c686498
style: fix overload and cli autocomplete
jina-bot Sep 30, 2022
6d21a3a
feat(instrumentation): return None for aio server interceptors if tra…
Oct 4, 2022
00c6c12
test: fix handling of optional args
Oct 5, 2022
92c0e1f
Merge branch 'master' into feat-instrumentation-5155
Oct 5, 2022
6e27829
fix: remove print debug statement
Oct 5, 2022
366a20e
fix: fix gateway class loading
alaeddine-13 Oct 5, 2022
822b541
Merge branch 'feat-instrumentation-5155' of github.com:jina-ai/jina i…
alaeddine-13 Oct 5, 2022
963b82d
feat(instrumentation): fix BaseGateway telemetry dependency injection
Oct 5, 2022
6433930
fix: fix WebsocketGateway loading
alaeddine-13 Oct 5, 2022
ffadb73
fix(instrumentation): correctly handle default executor runtime_args
Oct 5, 2022
3f6eeff
test(instrumentation): add integration tests for grpc, http and webso…
Oct 5, 2022
6b35909
test(instrumentation): parameterize instrumentation tests
Oct 5, 2022
2906369
test(instrumentation): remove outdated tests replaced by parametrized…
Oct 6, 2022
f1ad7a2
fix(instrumentation): fix executor instrumentation setup
Oct 6, 2022
d7bb8d9
fix(instrumentation): force spawn process when running flows in param…
Oct 6, 2022
5e31dca
feat(instrumentation): omit opentelemetry from cli args
Oct 6, 2022
c23f30a
style: fix overload and cli autocomplete
jina-bot Oct 6, 2022
bcc39a8
test: small test refactoring
JoanFM Oct 6, 2022
c540628
Merge branch 'master' into feat-instrumentation-5155
Oct 6, 2022
2ce9c67
style: fix overload and cli autocomplete
Oct 6, 2022
adcb457
style: fix overload and cli autocomplete
jina-bot Oct 6, 2022
0ae5f99
Merge branch 'master' into feat-instrumentation-5155
Oct 6, 2022
b45de43
test: dont set multiprocessing start method to spawn
Oct 6, 2022
bbd2fb8
fix: hide opentelemetry imports
Oct 6, 2022
222cfb9
Merge branch 'master' into feat-instrumentation-5155
JoanFM Oct 6, 2022
dcf7296
fix(runtimes): shutdown instrumentation exporters during teardown
Oct 7, 2022
57be55e
test: spawn processes by default in tests
Oct 7, 2022
7266abc
Merge branch 'feat-instrumentation-5155' of github.com:jina-ai/jina i…
Oct 7, 2022
e9e78ae
fix: provide client and server interceptors only when tracing is ena…
Oct 7, 2022
3656afc
Merge branch 'master' into feat-instrumentation-5155
Oct 7, 2022
4f83c47
fix(serve): correctly handle default instrumentation runtime_args
Oct 7, 2022
a9d5b1b
chore: hide opentelemetry imports under TYPE_CHECKING
Oct 7, 2022
a706480
test: avoid using spawn
JoanFM Oct 7, 2022
ef4a232
fix: add explicit type info and hide imports
Oct 7, 2022
1c0aedd
fix(executors): handle optional runtime_args correctly
Oct 7, 2022
c292234
chore: rename otel_context to tracing_context
Oct 7, 2022
01d543b
feat: use None instead of NoOp tracer and meter implementations
Oct 10, 2022
4afc51b
fix: remove unused import
Oct 10, 2022
70146e4
feat: add default tracing span for DataRequestHandler handle invocation
Oct 10, 2022
7f20c06
test: add test case to verify exception recording in a span
Oct 10, 2022
550a975
fix: use continue_on_error instead of try-except-pass
Oct 10, 2022
b644004
Merge branch 'master' into feat-instrumentation-5155
girishc13 Oct 10, 2022
d55d86c
chore: rename method name to match returning a list
Oct 11, 2022
132a932
fix: rename span_exporter args to traces_exporter
Oct 11, 2022
bb0b003
style: fix overload and cli autocomplete
jina-bot Oct 11, 2022
6 changes: 6 additions & 0 deletions docs/fundamentals/flow/executor-args.md
@@ -35,6 +35,12 @@
| `port_monitoring` | The port on which the prometheus server is exposed, default is a random port between [49152, 65535] | `string` | `random in [49152, 65535]` |
| `retries` | Number of retries per gRPC call. If <0 it defaults to max(3, num_replicas) | `number` | `-1` |
| `floating` | If set, the current Pod/Deployment can not be further chained, and the next `.add()` will chain after the last Pod/Deployment not this current one. | `boolean` | `False` |
| `tracing` | If set, the sdk implementation of the OpenTelemetry tracer will be available and will be enabled for automatic tracing of requests and custom span creation. Otherwise a no-op implementation will be provided. | `boolean` | `False` |
| `span_exporter_host` | If tracing is enabled, this hostname will be used to configure the trace exporter agent. | `string` | `None` |
| `span_exporter_port` | If tracing is enabled, this port will be used to configure the trace exporter agent. | `number` | `None` |
Contributor

I know this is a small thing that I mentioned already, so sorry to be a PITA about this, but I really think we should switch these around to "host/port_span_exporter" to align them with the nomenclature of the Prometheus feature. It's the small things that make a good user experience imo.

Contributor

@JohannesMessner what name would you suggest?

Contributor Author

The `port_monitoring` argument won't exist in the near future; only the `span_exporter` attributes will remain. I'm generally used to seeing and using `_host` as the suffix rather than as a prefix.

Contributor

But then we might introduce a breaking change, right? We need to be careful.

Contributor

We can deprecate an argument if needed, but this should be thought through ahead of time.

@girishc13 could you show here what the relevant arguments would be in this near future where `port_monitoring` does not exist?

Contributor Author

Also, you can think of the naming in terms of the YAML configuration for the OpenTelemetry Collector. The hierarchy that I'm implicitly used to is: dependency -> service -> host, port, .... So this naturally follows the convention of service.host and service.port.

version: "3"
services:
  # Jaeger
  jaeger:
    image: jaegertracing/all-in-one:latest
    ports:
      - "16686:16686"
      - "14250"

  otel-collector:
    image: otel/opentelemetry-collector:0.61.0
    command: [ "--config=/etc/otel-collector-config.yml" ]
    volumes:
      - ${PWD}/otel-collector-config.yml:/etc/otel-collector-config.yml
    ports:
      - "1888:1888" # pprof extension
      - "8888:8888" # Prometheus metrics exposed by the collector
      - "8889:8889" # Prometheus exporter metrics
      - "13133:13133" # health_check extension
      - "55679:55679" # zpages extension
      - "4317:4317" # OTLP gRPC receiver
      - "4318:4318" # OTLP http receiver
    depends_on:
      - jaeger

Contributor

I'm not a big fan of deprecating our current port_monitoring so soon after it was introduced, but if it leads to a nicer and more unified experience moving forward then we'll have to do it.

But apart from the argument naming, am I understanding correctly that, according to this plan, the user won't be able to use prometheus to collect metrics anymore? Or will the setup on the user side remain the same, and we only change the way we expose these metrics from our internals?
Because on the otel collector site I still see some prometheus logos but some of them are not connected to the system, so I am a bit lost.

If this is the case, then I don't think we should remove the current way users set up their metrics pipeline. This would be a huge breaking change.

Contributor Author

But apart from the argument naming, am I understanding correctly that, according to this plan, the user won't be able to use prometheus to collect metrics anymore? Or will the setup on the user side remain the same, and we only change the way we expose these metrics from our internals?

The main concern, from my understanding, is introducing a breaking change for the metrics data which requires a new setup. Do we have data on how many users are using the Prometheus client for monitoring, apart from JCloud users? Also, the lack of interoperability between OpenTelemetry monitoring and Prometheus monitoring makes it a bit hard to just remove the current monitoring setup.

I can think of the following ways to tackle this:

  1. We can also choose to release only the tracing instrumentation and work on the metrics later if we get feedback from users. I also believe that OpenTelemetry metrics do not provide features as rich as Prometheus, but it's still the direction to take early, to keep users from investing too much into the Prometheus-only solution.
  2. We deprecate Prometheus monitoring and continue supporting OpenTelemetry tracing and monitoring for users that want to work with OpenTelemetry. The decision is up to the user, and we might have some more work to maintain both.

Member

I would declare the old metric system deprecated (TO BE REMOVED in a couple of minor releases) and go with the full OpenTelemetry approach.

Contributor Author

The official Prometheus library already supports the OpenTelemetry APIs and SDKs. The OpenTelemetry Collector also supports scraping data from the existing Prometheus client. We might need some elaborate configuration for metrics and the OpenTelemetry Collector to support the existing mechanism, but OpenTelemetry is the way to go.
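
For illustration only (this snippet is not part of the PR's diff), the opentelemetry-exporter-prometheus package added to extra-requirements.txt below can bridge the two: metrics recorded through the OpenTelemetry SDK are served on a regular Prometheus scrape endpoint, so an existing Prometheus setup keeps working. A minimal sketch, with the service name, port and metric name chosen arbitrarily:

# Sketch: serve OpenTelemetry-recorded metrics on a Prometheus scrape endpoint.
# Requires opentelemetry-sdk, opentelemetry-exporter-prometheus and prometheus_client.
from prometheus_client import start_http_server

from opentelemetry import metrics
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import SERVICE_NAME, Resource

start_http_server(port=8000)  # Prometheus scrapes http://localhost:8000/metrics

reader = PrometheusMetricReader()
provider = MeterProvider(
    resource=Resource.create({SERVICE_NAME: 'my-executor'}),  # hypothetical service name
    metric_readers=[reader],
)
metrics.set_meter_provider(provider)

meter = metrics.get_meter(__name__)
request_counter = meter.create_counter('request_counter')  # hypothetical metric
request_counter.add(1, {'endpoint': '/search'})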

| `metrics` | If set, the sdk implementation of the OpenTelemetry metrics will be available for default monitoring and custom measurements. Otherwise a no-op implementation will be provided. | `boolean` | `False` |
Contributor

I don't understand the sentence. Isn't it going to overlap with the monitoring?

Contributor Author

Yes, my intention is to use the same terms as OpenTelemetry. If people read the OpenTelemetry documentation then the terms are aligned.

Contributor Author

Will be renamed to traces_exporter_host?

| `metrics_exporter_host` | If tracing is enabled, this hostname will be used to configure the metrics exporter agent. | `string` | `None` |
| `metrics_exporter_port` | If tracing is enabled, this port will be used to configure the metrics exporter agent. | `number` | `None` |
| `install_requirements` | If set, install `requirements.txt` in the Hub Executor bundle to local | `boolean` | `False` |
| `force_update` | If set, always pull the latest Hub Executor bundle even it exists on local | `boolean` | `False` |
| `compression` | The compression mechanism used when sending requests from the Head to the WorkerRuntimes. For more details, check https://grpc.github.io/grpc/python/grpc.html#compression. | `string` | `None` |
8 changes: 7 additions & 1 deletion docs/fundamentals/flow/gateway-args.md
@@ -49,4 +49,10 @@
| `monitoring` | If set, spawn an http server with a prometheus endpoint to expose metrics | `boolean` | `False` |
| `port_monitoring` | The port on which the prometheus server is exposed, default is a random port between [49152, 65535] | `string` | `random in [49152, 65535]` |
| `retries` | Number of retries per gRPC call. If <0 it defaults to max(3, num_replicas) | `number` | `-1` |
| `floating` | If set, the current Pod/Deployment can not be further chained, and the next `.add()` will chain after the last Pod/Deployment not this current one. | `boolean` | `False` |
| `floating` | If set, the current Pod/Deployment can not be further chained, and the next `.add()` will chain after the last Pod/Deployment not this current one. | `boolean` | `False` |
| `tracing` | If set, the sdk implementation of the OpenTelemetry tracer will be available and will be enabled for automatic tracing of requests and custom span creation. Otherwise a no-op implementation will be provided. | `boolean` | `False` |
| `span_exporter_host` | If tracing is enabled, this hostname will be used to configure the trace exporter agent. | `string` | `None` |
| `span_exporter_port` | If tracing is enabled, this port will be used to configure the trace exporter agent. | `number` | `None` |
| `metrics` | If set, the sdk implementation of the OpenTelemetry metrics will be available for default monitoring and custom measurements. Otherwise a no-op implementation will be provided. | `boolean` | `False` |
| `metrics_exporter_host` | If tracing is enabled, this hostname will be used to configure the metrics exporter agent. | `string` | `None` |
| `metrics_exporter_port` | If tracing is enabled, this port will be used to configure the metrics exporter agent. | `number` | `None` |
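
To make the new rows concrete, here is a sketch of how these arguments might be combined on a Flow. It is not taken from the PR; the endpoint localhost:4317 is an assumption matching the OTLP gRPC receiver in the docker-compose example above, and a later commit in this PR renames the span_exporter args to traces_exporter:

# Sketch: enabling tracing and metrics on a Flow with the arguments documented above.
from jina import Flow

f = Flow(
    tracing=True,
    span_exporter_host='localhost',
    span_exporter_port=4317,   # assumed OTLP gRPC receiver of a local collector
    metrics=True,
    metrics_exporter_host='localhost',
    metrics_exporter_port=4317,
).add(name='my_executor')  # hypothetical Executor

with f:
    f.block()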
10 changes: 10 additions & 0 deletions extra-requirements.txt
@@ -35,8 +35,17 @@ packaging>=20.0: core
docarray>=0.16.4: core
jina-hubble-sdk>=0.19.0: core
jcloud>=0.0.35: core
opentelemetry-api>=1.12.0: core
opentelemetry-instrumentation-grpc>=0.33b0: core
uvloop: perf,standard,devel
prometheus_client: perf,standard,devel
opentelemetry-sdk>=1.12.0: perf,standard,devel
opentelemetry-exporter-otlp>=1.12.0: perf,standard,devel
opentelemetry-exporter-prometheus>=1.12.0rc1: perf,standard,devel
opentelemetry-semantic-conventions>=0.33b0: perf,standard,devel
opentelemetry-instrumentation-aiohttp-client>=0.33b0: perf,standard,devel
opentelemetry-instrumentation-fastapi>=0.33b0: perf,standard,devel
opentelemetry-exporter-otlp-proto-grpc>=1.13.0: perf,standard,devel
fastapi>=0.76.0: standard,devel
uvicorn[standard]: standard,devel
docarray[common]>=0.16.3: standard,devel
@@ -77,3 +86,4 @@ bs4: cicd
jsonschema: cicd
portforward>=0.2.4: cicd
tensorflow>=2.0: cicd
opentelemetry-test-utils>=0.33b0: test
18 changes: 18 additions & 0 deletions jina/clients/__init__.py
@@ -20,10 +20,16 @@ def Client(
*,
asyncio: Optional[bool] = False,
host: Optional[str] = '0.0.0.0',
metrics: Optional[bool] = False,
metrics_exporter_host: Optional[str] = None,
metrics_exporter_port: Optional[int] = None,
port: Optional[int] = None,
protocol: Optional[str] = 'GRPC',
proxy: Optional[bool] = False,
span_exporter_host: Optional[str] = None,
span_exporter_port: Optional[int] = None,
tls: Optional[bool] = False,
tracing: Optional[bool] = False,
**kwargs
) -> Union[
'AsyncWebSocketClient',
@@ -37,10 +43,16 @@ def Client(

:param asyncio: If set, then the input and output of this Client work in an asynchronous manner.
:param host: The host address of the runtime, by default it is 0.0.0.0. In the case of an external Executor (`--external` or `external=True`) this can be a list of hosts, separated by commas. Then, every resulting address will be considered as one replica of the Executor.
:param metrics: If set, the sdk implementation of the OpenTelemetry metrics will be available for default monitoring and custom measurements. Otherwise a no-op implementation will be provided.
:param metrics_exporter_host: If tracing is enabled, this hostname will be used to configure the metrics exporter agent.
:param metrics_exporter_port: If tracing is enabled, this port will be used to configure the metrics exporter agent.
:param port: The port of the Gateway, which the client should connect to.
:param protocol: Communication protocol between server and client.
:param proxy: If set, respect the http_proxy and https_proxy environment variables. otherwise, it will unset these proxy variables before start. gRPC seems to prefer no proxy
:param span_exporter_host: If tracing is enabled, this hostname will be used to configure the trace exporter agent.
:param span_exporter_port: If tracing is enabled, this port will be used to configure the trace exporter agent.
:param tls: If set, connect to gateway using tls encryption
:param tracing: If set, the sdk implementation of the OpenTelemetry tracer will be available and will be enabled for automatic tracing of requests and custom span creation. Otherwise a no-op implementation will be provided.
:return: the new Client object

.. # noqa: DAR202
@@ -81,10 +93,16 @@ def Client(

:param asyncio: If set, then the input and output of this Client work in an asynchronous manner.
:param host: The host address of the runtime, by default it is 0.0.0.0. In the case of an external Executor (`--external` or `external=True`) this can be a list of hosts, separated by commas. Then, every resulting address will be considered as one replica of the Executor.
:param metrics: If set, the sdk implementation of the OpenTelemetry metrics will be available for default monitoring and custom measurements. Otherwise a no-op implementation will be provided.
:param metrics_exporter_host: If tracing is enabled, this hostname will be used to configure the metrics exporter agent.
:param metrics_exporter_port: If tracing is enabled, this port will be used to configure the metrics exporter agent.
:param port: The port of the Gateway, which the client should connect to.
:param protocol: Communication protocol between server and client.
:param proxy: If set, respect the http_proxy and https_proxy environment variables. otherwise, it will unset these proxy variables before start. gRPC seems to prefer no proxy
:param span_exporter_host: If tracing is enabled, this hostname will be used to configure the trace exporter agent.
:param span_exporter_port: If tracing is enabled, this port will be used to configure the trace exporter agent.
:param tls: If set, connect to gateway using tls encryption
:param tracing: If set, the sdk implementation of the OpenTelemetry tracer will be available and will be enabled for automatic tracing of requests and custom span creation. Otherwise a no-op implementation will be provided.
:return: the new Client object

.. # noqa: DAR102
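
As a usage note (not part of the diff), the same options would be passed when constructing a Client against a Gateway that has tracing enabled; the host and port values below are placeholders:

# Sketch: a Client with the OpenTelemetry options shown in the signature above.
from jina import Client

client = Client(
    host='localhost',
    port=54321,               # hypothetical Gateway port
    protocol='GRPC',
    tracing=True,
    span_exporter_host='localhost',
    span_exporter_port=4317,  # assumed local collector / agent endpoint
)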
14 changes: 13 additions & 1 deletion jina/clients/base/__init__.py
@@ -17,9 +17,10 @@

InputType = Union[GeneratorSourceType, Callable[..., GeneratorSourceType]]
CallbackFnType = Optional[Callable[[Response], None]]
from jina.serve.instrumentation import InstrumentationMixin


class BaseClient(ABC):
class BaseClient(InstrumentationMixin, ABC):
"""A base client for connecting to the Flow Gateway.

:param args: the Namespace from argparse
@@ -46,6 +47,17 @@ def __init__(
os.unsetenv('http_proxy')
os.unsetenv('https_proxy')
self._inputs = None
self._setup_instrumentation(
name=self.args.name
if hasattr(self.args, 'name')
else self.__class__.__name__,
tracing=self.args.tracing,
span_exporter_host=self.args.span_exporter_host,
span_exporter_port=self.args.span_exporter_port,
metrics=self.args.metrics,
metrics_exporter_host=self.args.metrics_exporter_host,
metrics_exporter_port=self.args.metrics_exporter_port,
)
send_telemetry_event(event='start', obj=self)

@staticmethod
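
For orientation only: the PR's InstrumentationMixin is not shown in this excerpt, but a setup helper along these lines could back the _setup_instrumentation() call above, using the OTLP exporter packages added to extra-requirements.txt. This is a sketch of the general OpenTelemetry wiring, not the PR's actual implementation:

# Sketch: build a TracerProvider that exports spans to an OTLP agent.
from typing import Optional

from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor


def setup_tracing(name: str, host: Optional[str], port: Optional[int]):
    """Return a TracerProvider exporting to host:port, or None when disabled."""
    if not host or not port:
        return None
    provider = TracerProvider(resource=Resource.create({SERVICE_NAME: name}))
    exporter = OTLPSpanExporter(endpoint=f'{host}:{port}', insecure=True)
    provider.add_span_processor(BatchSpanProcessor(exporter))
    trace.set_tracer_provider(provider)
    return provider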
45 changes: 28 additions & 17 deletions jina/clients/base/grpc.py
@@ -1,8 +1,8 @@
import asyncio
import json
from typing import TYPE_CHECKING, Optional

import grpc
import json

from jina.clients.base import BaseClient
from jina.clients.helper import callback_exec
@@ -82,19 +82,27 @@ async def _get_results(
# while loop with retries, check in which state the `iterator` remains after failure
options = GrpcConnectionPool.get_default_grpc_options()
if max_attempts > 1:
service_config_json = json.dumps({
"methodConfig": [{
# To apply retry to all methods, put [{}] in the "name" field
"name": [{}],
"retryPolicy": {
"maxAttempts": max_attempts,
"initialBackoff": f"{initial_backoff}s",
"maxBackoff": f"{max_backoff}s",
"backoffMultiplier": {backoff_multiplier},
"retryableStatusCodes": ["UNAVAILABLE", "DEADLINE_EXCEEDED", "INTERNAL"],
},
}]
})
service_config_json = json.dumps(
{
"methodConfig": [
{
# To apply retry to all methods, put [{}] in the "name" field
"name": [{}],
"retryPolicy": {
"maxAttempts": max_attempts,
"initialBackoff": f"{initial_backoff}s",
"maxBackoff": f"{max_backoff}s",
"backoffMultiplier": {backoff_multiplier},
"retryableStatusCodes": [
"UNAVAILABLE",
"DEADLINE_EXCEEDED",
"INTERNAL",
],
},
}
]
}
)
# NOTE: the retry feature will be enabled by default >=v1.40.0
options.append(("grpc.enable_retries", 1))
options.append(("grpc.service_config", service_config_json))
@@ -104,6 +112,7 @@ async def _get_results(
options=options,
asyncio=True,
tls=self.args.tls,
aio_tracing_client_interceptors=self.aio_tracing_client_interceptors(),
) as channel:
stub = jina_pb2_grpc.JinaRPCStub(channel)
self.logger.debug(f'connected to {self.args.host}:{self.args.port}')
@@ -146,11 +155,13 @@ async def _get_results(
)
raise ConnectionError(my_details)
elif my_code == grpc.StatusCode.INTERNAL:
self.logger.error(f'{msg}\ninternal error on the server side')
self.logger.error(
f'{msg}\ninternal error on the server side'
)
raise err
elif (
my_code == grpc.StatusCode.UNKNOWN
and 'asyncio.exceptions.TimeoutError' in my_details
my_code == grpc.StatusCode.UNKNOWN
and 'asyncio.exceptions.TimeoutError' in my_details
):
raise BadClientInput(
f'{msg}\n'
51 changes: 40 additions & 11 deletions jina/clients/base/helper.py
@@ -1,7 +1,7 @@
import asyncio
import random
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional

from aiohttp import WSMsgType

@@ -12,19 +12,25 @@
from jina.types.request.status import StatusMessage

if TYPE_CHECKING:
from opentelemetry import trace

from jina.logging.logger import JinaLogger


class AioHttpClientlet(ABC):
"""aiohttp session manager"""

def __init__(self, url: str,
logger: 'JinaLogger',
max_attempts: int = 1,
initial_backoff: float = 0.5,
max_backoff: float = 0.1,
backoff_multiplier: float = 1.5,
**kwargs) -> None:
def __init__(
self,
url: str,
logger: 'JinaLogger',
max_attempts: int = 1,
initial_backoff: float = 0.5,
max_backoff: float = 0.1,
backoff_multiplier: float = 1.5,
tracer_provider: Optional['trace.TracerProvider'] = None,
**kwargs,
) -> None:
"""HTTP Client to be used with the streamer

:param url: url to send http/websocket request to
@@ -33,12 +39,19 @@ def __init__(self, url: str,
:param initial_backoff: The first retry will happen with a delay of random(0, initial_backoff)
:param max_backoff: The maximum accepted backoff after the exponential incremental delay
:param backoff_multiplier: The n-th attempt will occur at random(0, min(initialBackoff*backoffMultiplier**(n-1), maxBackoff))
:param tracer_provider: Optional tracer_provider that will be used to configure aiohttp tracing.
:param kwargs: kwargs which will be forwarded to the `aiohttp.Session` instance. Used to pass headers to requests
"""
self.url = url
self.logger = logger
self.msg_recv = 0
self.msg_sent = 0
if tracer_provider:
from opentelemetry.instrumentation.aiohttp_client import create_trace_config

self._trace_config = [create_trace_config(tracer_provider=tracer_provider)]
else:
self._trace_config = None
self.session = None
self._session_kwargs = {}
if kwargs.get('headers', None):
@@ -90,7 +103,9 @@ async def start(self):
with ImportExtensions(required=True):
import aiohttp

self.session = aiohttp.ClientSession(**self._session_kwargs)
self.session = aiohttp.ClientSession(
**self._session_kwargs, trace_configs=self._trace_config
)
await self.session.__aenter__()
return self

@@ -125,7 +140,14 @@ async def send_message(self, request: 'Request'):
if retry == self.max_attempts:
raise
else:
wait_time = random.uniform(0, min(self.initial_backoff*self.backoff_multiplier**(retry-1), self.max_backoff))
wait_time = random.uniform(
0,
min(
self.initial_backoff
* self.backoff_multiplier ** (retry - 1),
self.max_backoff,
),
)
await asyncio.sleep(wait_time)

async def send_dry_run(self, **kwargs):
@@ -194,7 +216,14 @@ async def send_message(self, request: 'Request'):
self.logger.critical(f'server connection closed already!')
raise
else:
wait_time = random.uniform(0, min(self.initial_backoff*self.backoff_multiplier**(retry-1), self.max_backoff))
wait_time = random.uniform(
0,
min(
self.initial_backoff
* self.backoff_multiplier ** (retry - 1),
self.max_backoff,
),
)
await asyncio.sleep(wait_time)

async def send_dry_run(self, **kwargs):
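
As a closing illustration (not part of the diff), this is how the same create_trace_config helper instruments a plain aiohttp session outside of Jina; the URL is a placeholder:

# Sketch: aiohttp client tracing via opentelemetry-instrumentation-aiohttp-client.
import asyncio

import aiohttp
from opentelemetry.instrumentation.aiohttp_client import create_trace_config


async def fetch_status(url: str) -> int:
    # create_trace_config() falls back to the globally configured tracer provider
    async with aiohttp.ClientSession(trace_configs=[create_trace_config()]) as session:
        async with session.get(url) as resp:
            return resp.status


# asyncio.run(fetch_status('http://localhost:8080'))  # hypothetical Gateway address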