From 4bc785fad3034d10376462bf3cea551e5138fed6 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 10 Jul 2024 11:47:34 +0200 Subject: [PATCH 1/6] True OpenAI drop-in replacement by InferenceClient --- src/huggingface_hub/inference/_client.py | 85 ++++++--- .../inference/_generated/_async_client.py | 86 ++++++--- ...mpatibility.test_base_url_and_api_key.yaml | 75 ++++++++ ...AICompatibility.test_with_stream_true.yaml | 173 ++++++++++++++++++ ...AICompatibility.test_without_base_url.yaml | 72 ++++++++ ...ai_compatibility_base_url_and_api_key.yaml | 39 ++++ ...openai_compatibility_with_stream_true.yaml | 148 +++++++++++++++ ...openai_compatibility_without_base_url.yaml | 37 ++++ tests/test_inference_async_client.py | 57 ++++++ tests/test_inference_client.py | 49 +++++ utils/generate_async_inference_client.py | 10 + 11 files changed, 787 insertions(+), 44 deletions(-) create mode 100644 tests/cassettes/TestOpenAICompatibility.test_base_url_and_api_key.yaml create mode 100644 tests/cassettes/TestOpenAICompatibility.test_with_stream_true.yaml create mode 100644 tests/cassettes/TestOpenAICompatibility.test_without_base_url.yaml create mode 100644 tests/cassettes/test_openai_compatibility_base_url_and_api_key.yaml create mode 100644 tests/cassettes/test_openai_compatibility_with_stream_true.yaml create mode 100644 tests/cassettes/test_openai_compatibility_without_base_url.yaml diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py index fbba38bc5c..48262b5929 100644 --- a/src/huggingface_hub/inference/_client.py +++ b/src/huggingface_hub/inference/_client.py @@ -113,6 +113,7 @@ get_session, hf_raise_for_status, ) +from huggingface_hub.utils._deprecation import _deprecate_positional_args if TYPE_CHECKING: @@ -148,26 +149,40 @@ class InferenceClient: Values in this dictionary will override the default values. cookies (`Dict[str, str]`, `optional`): Additional cookies to send to the server. + base_url (`str`, `optional`): + Base URL to run inference. This is a duplicated argument from `model` to make [`InferenceClient`] + follow the same pattern as `openai.OpenAI` client. Cannot be used if `model` is set. Defaults to None. + api_key (`str`, `optional`): + Token to use for authentication. This is a duplicated argument from `token` to make [`InferenceClient`] + follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None. 
""" + @_deprecate_positional_args(version="0.26") def __init__( self, model: Optional[str] = None, + *, token: Union[str, bool, None] = None, timeout: Optional[float] = None, headers: Optional[Dict[str, str]] = None, cookies: Optional[Dict[str, str]] = None, proxies: Optional[Any] = None, + # OpenAI compatibility + base_url: Optional[str] = None, + api_key: Optional[str] = None, ) -> None: self.model: Optional[str] = model - self.token: Union[str, bool, None] = token - self.headers = CaseInsensitiveDict(build_hf_headers(token=token)) # contains 'authorization' + 'user-agent' + self.token: Union[str, bool, None] = token or api_key + self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent' if headers is not None: self.headers.update(headers) self.cookies = cookies self.timeout = timeout self.proxies = proxies + # OpenAI compatibility + self.base_url = base_url + def __repr__(self): return f"" @@ -441,7 +456,6 @@ def chat_completion( # type: ignore tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> ChatCompletionOutput: ... @overload @@ -465,7 +479,6 @@ def chat_completion( # type: ignore tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> Iterable[ChatCompletionStreamOutput]: ... @overload @@ -489,7 +502,6 @@ def chat_completion( tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]: ... def chat_completion( @@ -513,7 +525,6 @@ def chat_completion( tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]: """ A method for completing conversations using a specified language model. @@ -525,6 +536,9 @@ def chat_completion( The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used. See https://huggingface.co/tasks/text-generation for more details. + + If `model` is a model ID, it is passed to the server as the `model` parameter. If you want to define a + custom URL while setting `model` in the request payload, you must set `base_url` when initializing [`InferenceClient`]. frequency_penalty (`float`, *optional*): Penalizes new tokens based on their existing frequency in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0. @@ -568,10 +582,6 @@ def chat_completion( tools (List of [`ChatCompletionInputTool`], *optional*): A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. - model_id (`str`, *optional*): - The model ID to use for chat-completion. Only used when `model` is a URL to a deployed Text Generation Inference server. - It is passed to the server as the `model` parameter. This parameter has no impact on the URL that will be used to - send the request. 
Returns: [`ChatCompletionOutput`] or Iterable of [`ChatCompletionStreamOutput`]: @@ -625,8 +635,10 @@ def chat_completion( ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504) (...) ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504) + ``` - # Chat example with tools + Example using tools: + ```py >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct") >>> messages = [ ... { @@ -708,8 +720,11 @@ def chat_completion( ) ``` """ - # determine model - model = model or self.model or self.get_recommended_model("text-generation") + # Determine model + # `self.xxx` takes precedence over the method argument only in `chat_completion` + # since `chat_completion(..., model=xxx)` is also a payload parameter for the + # server, we need to handle it differently + model = self.base_url or self.model or model or self.get_recommended_model("text-generation") if _is_chat_completion_server(model): # First, let's consider the server has a `/v1/chat/completions` endpoint. @@ -718,14 +733,13 @@ def chat_completion( if not model_url.endswith("/chat/completions"): model_url += "/v1/chat/completions" - # `model_id` sent in the payload. Not used by the server but can be useful for debugging/routing. - if model_id is None: - if not model.startswith("http") and model.count("/") == 1: - # If it's a ID on the Hub => use it - model_id = model - else: - # Otherwise, we use a random string - model_id = "tgi" + # `model` is sent in the payload. Not used by the server but can be useful for debugging/routing. 
+ if not model.startswith("http") and model.count("/") == 1: + # If it's a ID on the Hub => use it + model_id = model + else: + # Otherwise, we use a random string + model_id = "tgi" try: data = self.post( @@ -2562,7 +2576,7 @@ def zero_shot_image_classification( return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response) def _resolve_url(self, model: Optional[str] = None, task: Optional[str] = None) -> str: - model = model or self.model + model = model or self.model or self.base_url # If model is already a URL, ignore `task` and return directly if model is not None and (model.startswith("http://") or model.startswith("https://")): @@ -2754,3 +2768,30 @@ def get_model_status(self, model: Optional[str] = None) -> ModelStatus: compute_type=response_data["compute_type"], framework=response_data["framework"], ) + + @property + def chat(self) -> "ProxyClientChat": + return ProxyClientChat(self) + + +class _ProxyClient: + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + def __init__(self, client: InferenceClient): + self._client = client + + +class ProxyClientChat(_ProxyClient): + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + @property + def completions(self) -> "ProxyClientChatCompletions": + return ProxyClientChatCompletions(self._client) + + +class ProxyClientChatCompletions(_ProxyClient): + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + @property + def create(self): + return self._client.chat_completion diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py index 502e79155b..c47b7ec2b7 100644 --- a/src/huggingface_hub/inference/_generated/_async_client.py +++ b/src/huggingface_hub/inference/_generated/_async_client.py @@ -96,6 +96,7 @@ from huggingface_hub.utils import ( build_hf_headers, ) +from huggingface_hub.utils._deprecation import _deprecate_positional_args from .._common import _async_yield_from, _import_aiohttp @@ -133,26 +134,40 @@ class AsyncInferenceClient: Values in this dictionary will override the default values. cookies (`Dict[str, str]`, `optional`): Additional cookies to send to the server. + base_url (`str`, `optional`): + Base URL to run inference. This is a duplicated argument from `model` to make [`InferenceClient`] + follow the same pattern as `openai.OpenAI` client. Cannot be used if `model` is set. Defaults to None. + api_key (`str`, `optional`): + Token to use for authentication. This is a duplicated argument from `token` to make [`InferenceClient`] + follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None. 
""" + @_deprecate_positional_args(version="0.26") def __init__( self, model: Optional[str] = None, + *, token: Union[str, bool, None] = None, timeout: Optional[float] = None, headers: Optional[Dict[str, str]] = None, cookies: Optional[Dict[str, str]] = None, proxies: Optional[Any] = None, + # OpenAI compatibility + base_url: Optional[str] = None, + api_key: Optional[str] = None, ) -> None: self.model: Optional[str] = model - self.token: Union[str, bool, None] = token - self.headers = CaseInsensitiveDict(build_hf_headers(token=token)) # contains 'authorization' + 'user-agent' + self.token: Union[str, bool, None] = token or api_key + self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent' if headers is not None: self.headers.update(headers) self.cookies = cookies self.timeout = timeout self.proxies = proxies + # OpenAI compatibility + self.base_url = base_url + def __repr__(self): return f"" @@ -442,7 +457,6 @@ async def chat_completion( # type: ignore tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> ChatCompletionOutput: ... @overload @@ -466,7 +480,6 @@ async def chat_completion( # type: ignore tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> AsyncIterable[ChatCompletionStreamOutput]: ... @overload @@ -490,7 +503,6 @@ async def chat_completion( tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]: ... async def chat_completion( @@ -514,7 +526,6 @@ async def chat_completion( tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]: """ A method for completing conversations using a specified language model. @@ -526,6 +537,9 @@ async def chat_completion( The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used. See https://huggingface.co/tasks/text-generation for more details. + + If `model` is a model ID, it is passed to the server as the `model` parameter. If you want to define a + custom URL while setting `model` in the request payload, you must set `base_url` when initializing [`InferenceClient`]. frequency_penalty (`float`, *optional*): Penalizes new tokens based on their existing frequency in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0. @@ -569,10 +583,6 @@ async def chat_completion( tools (List of [`ChatCompletionInputTool`], *optional*): A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. - model_id (`str`, *optional*): - The model ID to use for chat-completion. Only used when `model` is a URL to a deployed Text Generation Inference server. - It is passed to the server as the `model` parameter. This parameter has no impact on the URL that will be used to - send the request. 
Returns: [`ChatCompletionOutput`] or Iterable of [`ChatCompletionStreamOutput`]: @@ -627,8 +637,11 @@ async def chat_completion( ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504) (...) ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504) + ``` - # Chat example with tools + Example using tools: + ```py + # Must be run in an async context >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-70B-Instruct") >>> messages = [ ... { @@ -710,8 +723,11 @@ async def chat_completion( ) ``` """ - # determine model - model = model or self.model or self.get_recommended_model("text-generation") + # Determine model + # `self.xxx` takes precedence over the method argument only in `chat_completion` + # since `chat_completion(..., model=xxx)` is also a payload parameter for the + # server, we need to handle it differently + model = self.base_url or self.model or model or self.get_recommended_model("text-generation") if _is_chat_completion_server(model): # First, let's consider the server has a `/v1/chat/completions` endpoint. @@ -720,14 +736,13 @@ async def chat_completion( if not model_url.endswith("/chat/completions"): model_url += "/v1/chat/completions" - # `model_id` sent in the payload. Not used by the server but can be useful for debugging/routing. - if model_id is None: - if not model.startswith("http") and model.count("/") == 1: - # If it's a ID on the Hub => use it - model_id = model - else: - # Otherwise, we use a random string - model_id = "tgi" + # `model` is sent in the payload. Not used by the server but can be useful for debugging/routing. 
+ if not model.startswith("http") and model.count("/") == 1: + # If it's a ID on the Hub => use it + model_id = model + else: + # Otherwise, we use a random string + model_id = "tgi" try: data = await self.post( @@ -2597,7 +2612,7 @@ async def zero_shot_image_classification( return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response) def _resolve_url(self, model: Optional[str] = None, task: Optional[str] = None) -> str: - model = model or self.model + model = model or self.model or self.base_url # If model is already a URL, ignore `task` and return directly if model is not None and (model.startswith("http://") or model.startswith("https://")): @@ -2795,3 +2810,30 @@ async def get_model_status(self, model: Optional[str] = None) -> ModelStatus: compute_type=response_data["compute_type"], framework=response_data["framework"], ) + + @property + def chat(self) -> "ProxyClientChat": + return ProxyClientChat(self) + + +class _ProxyClient: + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + def __init__(self, client: AsyncInferenceClient): + self._client = client + + +class ProxyClientChat(_ProxyClient): + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + @property + def completions(self) -> "ProxyClientChatCompletions": + return ProxyClientChatCompletions(self._client) + + +class ProxyClientChatCompletions(_ProxyClient): + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + @property + def create(self): + return self._client.chat_completion diff --git a/tests/cassettes/TestOpenAICompatibility.test_base_url_and_api_key.yaml b/tests/cassettes/TestOpenAICompatibility.test_base_url_and_api_key.yaml new file mode 100644 index 0000000000..4a4fd8456c --- /dev/null +++ b/tests/cassettes/TestOpenAICompatibility.test_base_url_and_api_key.yaml @@ -0,0 +1,75 @@ +interactions: +- request: + body: '{"model": "tgi", "messages": [{"role": "system", "content": "You are a + helpful assistant."}, {"role": "user", "content": "Count to 10"}], "frequency_penalty": + null, "logit_bias": null, "logprobs": null, "max_tokens": 1024, "n": null, "presence_penalty": + null, "seed": null, "stop": null, "temperature": null, "tool_choice": null, + "tool_prompt": null, "tools": null, "top_logprobs": null, "top_p": null, "stream": + false}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Connection: + - keep-alive + Content-Length: + - '419' + Content-Type: + - application/json + X-Amzn-Trace-Id: + - 0edbb1d5-e548-491b-8781-3c413bff44e9 + authorization: + - Bearer my-api-key + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: '{"id":"","object":"text_completion","created":1720603621,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"message":{"role":"assistant","content":"1, + 2, 3, 4, 5, 6, 7, 8, 9, 10!"},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":25,"completion_tokens":30,"total_tokens":55}}' + headers: + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Wed, 10 Jul 2024 09:27:01 GMT + Transfer-Encoding: + - chunked + access-control-allow-credentials: + - 'true' + access-control-allow-origin: + - '*' + vary: + - origin, Origin, 
Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-characters: + - '209' + x-compute-time: + - '0.669188581' + x-compute-type: + - 2-a10-g + x-generated-tokens: + - '30' + x-inference-time: + - '655' + x-prompt-tokens: + - '26' + x-queue-time: + - '13' + x-request-id: + - N9kl_-NDF8oojD5QSbqPN + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + x-time-per-token: + - '21' + x-total-time: + - '669' + x-validation-time: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestOpenAICompatibility.test_with_stream_true.yaml b/tests/cassettes/TestOpenAICompatibility.test_with_stream_true.yaml new file mode 100644 index 0000000000..0ab91101cf --- /dev/null +++ b/tests/cassettes/TestOpenAICompatibility.test_with_stream_true.yaml @@ -0,0 +1,173 @@ +interactions: +- request: + body: '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": + "system", "content": "You are a helpful assistant."}, {"role": "user", "content": + "Count to 10"}], "frequency_penalty": null, "logit_bias": null, "logprobs": + null, "max_tokens": 1024, "n": null, "presence_penalty": null, "seed": null, + "stop": null, "temperature": null, "tool_choice": null, "tool_prompt": null, + "tools": null, "top_logprobs": null, "top_p": null, "stream": true}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Connection: + - keep-alive + Content-Length: + - '450' + Content-Type: + - application/json + X-Amzn-Trace-Id: + - 70c5f6b7-a74b-411a-94cd-c632b0d856e7 + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: 'data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"Here"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + it"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + goes"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":":\n\n"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"1"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"2"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"3"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"4"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"5"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"6"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"7"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"8"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"9"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"10"},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":"eos_token"}]} + + + ' + headers: + Connection: + - keep-alive + Content-Type: + - text/event-stream + Date: + - Wed, 10 Jul 2024 09:39:59 GMT + Transfer-Encoding: + - chunked + access-control-allow-credentials: + - 'true' + access-control-allow-origin: + - '*' + cache-control: + - no-cache + vary: + - origin, Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-accel-buffering: + - 'no' + x-compute-characters: + - '209' + x-compute-type: + - 2-a10-g + x-request-id: + - sZyDhIrhOAL-aoenNyHaw + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestOpenAICompatibility.test_without_base_url.yaml b/tests/cassettes/TestOpenAICompatibility.test_without_base_url.yaml new file mode 100644 index 0000000000..b16bd835fb --- /dev/null +++ b/tests/cassettes/TestOpenAICompatibility.test_without_base_url.yaml @@ -0,0 +1,72 @@ +interactions: +- request: + body: '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": + "system", "content": "You are a helpful assistant."}, {"role": "user", "content": + "Count to 10"}], "frequency_penalty": null, "logit_bias": null, "logprobs": + null, "max_tokens": 1024, "n": null, "presence_penalty": null, "seed": null, + "stop": null, "temperature": null, "tool_choice": null, "tool_prompt": null, + "tools": null, "top_logprobs": null, "top_p": null, "stream": false}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Connection: + - keep-alive + Content-Length: + - '451' + Content-Type: + - application/json + X-Amzn-Trace-Id: + - c1242095-c567-48d6-9438-f311c11b7c4b + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: '{"id":"","object":"text_completion","created":1720603859,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"message":{"role":"assistant","content":"1, 2, 3, 4, 5, 6, 7, 8, 9, 10!"},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":25,"completion_tokens":34,"total_tokens":59}}' + headers: + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Wed, 10 Jul 2024 09:30:59 GMT + Transfer-Encoding: + - chunked + access-control-allow-credentials: + - 'true' + access-control-allow-origin: + - '*' + vary: + - origin, Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-characters: + - '209' + x-compute-time: + - '0.79170697' + x-compute-type: + - 2-a10-g + x-generated-tokens: + - '34' + x-inference-time: + - '791' + x-prompt-tokens: + - '26' + x-queue-time: + - '0' + x-request-id: + - LlOpMjxIPx4kEhMdUYLfC + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + x-time-per-token: + - '23' + x-total-time: + - '791' + x-validation-time: + - '0' + status: + code: 200 + message: OK +version: 1 diff 
--git a/tests/cassettes/test_openai_compatibility_base_url_and_api_key.yaml b/tests/cassettes/test_openai_compatibility_base_url_and_api_key.yaml new file mode 100644 index 0000000000..5bd733cd6d --- /dev/null +++ b/tests/cassettes/test_openai_compatibility_base_url_and_api_key.yaml @@ -0,0 +1,39 @@ +interactions: +- request: + body: null + headers: + authorization: + - Bearer my-api-key + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: '{"id":"","object":"text_completion","created":1720603621,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"message":{"role":"assistant","content":"1, 2, 3, 4, 5, 6, 7, 8, 9, 10!"},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":25,"completion_tokens":30,"total_tokens":55}}' + headers: + Access-Control-Allow-Credentials: + - 'true' + Connection: + - keep-alive + Content-Length: + - '356' + Content-Type: + - application/json + Date: + - Wed, 10 Jul 2024 09:36:14 GMT + Vary: + - Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-time: + - '0.669188581' + x-compute-type: + - cache + x-request-id: + - XoBs6zJdqFuEeJmIE7apH + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_openai_compatibility_with_stream_true.yaml b/tests/cassettes/test_openai_compatibility_with_stream_true.yaml new file mode 100644 index 0000000000..e7b049ec78 --- /dev/null +++ b/tests/cassettes/test_openai_compatibility_with_stream_true.yaml @@ -0,0 +1,148 @@ +interactions: +- request: + body: null + headers: + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: 'data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"Here"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + it"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + goes"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":":\n\n"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"1"},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"2"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"3"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"4"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"5"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"6"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"7"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"8"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"9"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"10"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":"eos_token"}]} + + + ' + headers: + Access-Control-Allow-Credentials: + - 'true' + Connection: + - keep-alive + Content-Length: + - '8690' + Content-Type: + - text/event-stream + Date: + - Wed, 10 Jul 2024 09:43:22 GMT + Vary: + - Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-type: + - cache + x-request-id: + - 5ZjP6frO9z95MsZ37xFRk + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_openai_compatibility_without_base_url.yaml b/tests/cassettes/test_openai_compatibility_without_base_url.yaml new file mode 100644 index 0000000000..0ed0c7cf48 --- /dev/null +++ b/tests/cassettes/test_openai_compatibility_without_base_url.yaml @@ -0,0 +1,37 @@ +interactions: +- request: + body: null + headers: + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: '{"id":"","object":"text_completion","created":1720603859,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"message":{"role":"assistant","content":"1, 2, 3, 4, 5, 6, 7, 8, 9, 10!"},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":25,"completion_tokens":34,"total_tokens":59}}' + headers: + Access-Control-Allow-Credentials: + - 'true' + Connection: + - keep-alive + Content-Length: + - '371' + Content-Type: + - application/json + Date: + - Wed, 10 Jul 2024 09:37:03 GMT + Vary: + - Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-time: + - '0.79170697' + x-compute-type: + - cache + x-request-id: + - 5i2r4xvmzRAE5J2qdwQOi + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_inference_async_client.py b/tests/test_inference_async_client.py index 047c708da7..61d67c99ba 100644 --- a/tests/test_inference_async_client.py +++ b/tests/test_inference_async_client.py @@ -367,3 +367,60 @@ async def test_close_connection_on_post_error(mock_close: Mock, mock_post: Mock) await async_client.post(model="http://127.0.0.1/api", json={}) mock_close.assert_called_once() + + +@pytest.mark.vcr +@pytest.mark.asyncio +@with_production_testing +async def test_openai_compatibility_base_url_and_api_key(): + client = AsyncInferenceClient( + base_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct", + api_key="my-api-key", + ) + output = await client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful 
assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=False, + max_tokens=1024, + ) + assert output.choices[0].message.content == "1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" + + +@pytest.mark.vcr +@pytest.mark.asyncio +@with_production_testing +async def test_openai_compatibility_without_base_url(): + client = AsyncInferenceClient() + output = await client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=False, + max_tokens=1024, + ) + assert output.choices[0].message.content == "1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" + + +@pytest.mark.vcr +@pytest.mark.asyncio +@with_production_testing +async def test_openai_compatibility_with_stream_true(): + client = AsyncInferenceClient(token="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") + output = await client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=True, + max_tokens=1024, + ) + + chunked_text = [chunk.choices[0].delta.content async for chunk in output] + assert len(chunked_text) == 34 + assert "".join(chunked_text) == "Here it goes:\n\n1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" diff --git a/tests/test_inference_client.py b/tests/test_inference_client.py index 06fdc705c2..1f17b06527 100644 --- a/tests/test_inference_client.py +++ b/tests/test_inference_client.py @@ -854,3 +854,52 @@ def test_list_deployed_models_single_frameworks(self) -> None: self.assertIn("text-generation", models_by_task) self.assertIn("bigscience/bloom", models_by_task["text-generation"]) + + +@pytest.mark.vcr +@with_production_testing +class TestOpenAICompatibility(unittest.TestCase): + def test_base_url_and_api_key(self): + client = InferenceClient( + base_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct", + api_key="my-api-key", + ) + output = client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=False, + max_tokens=1024, + ) + assert output.choices[0].message.content == "1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" + + def test_without_base_url(self): + client = InferenceClient() + output = client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=False, + max_tokens=1024, + ) + assert output.choices[0].message.content == "1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" + + def test_with_stream_true(self): + client = InferenceClient() + output = client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=True, + max_tokens=1024, + ) + + chunked_text = [chunk.choices[0].delta.content for chunk in output] + assert len(chunked_text) == 34 + assert "".join(chunked_text) == "Here it goes:\n\n1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" 
diff --git a/utils/generate_async_inference_client.py b/utils/generate_async_inference_client.py index d9b40c427c..cb8136f951 100644 --- a/utils/generate_async_inference_client.py +++ b/utils/generate_async_inference_client.py @@ -68,6 +68,9 @@ def generate_async_client_code(code: str) -> str: # Adapt /info and /health endpoints code = _adapt_info_and_health_endpoints(code) + # Adapt the proxy client (for client.chat.completions.create) + code = _adapt_proxy_client(code) + return code @@ -482,6 +485,13 @@ def _adapt_info_and_health_endpoints(code: str) -> str: return code.replace(health_sync_snippet, health_async_snippet) +def _adapt_proxy_client(code: str) -> str: + return code.replace( + "def __init__(self, client: InferenceClient):", + "def __init__(self, client: AsyncInferenceClient):", + ) + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( From 7215d61ed90e62fdcd1d0c897b8e9126ffcbcf67 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 10 Jul 2024 12:09:45 +0200 Subject: [PATCH 2/6] boulet --- tests/test_inference_async_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_inference_async_client.py b/tests/test_inference_async_client.py index 61d67c99ba..c9ad204ec3 100644 --- a/tests/test_inference_async_client.py +++ b/tests/test_inference_async_client.py @@ -410,7 +410,7 @@ async def test_openai_compatibility_without_base_url(): @pytest.mark.asyncio @with_production_testing async def test_openai_compatibility_with_stream_true(): - client = AsyncInferenceClient(token="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") + client = AsyncInferenceClient() output = await client.chat.completions.create( model="meta-llama/Meta-Llama-3-8B-Instruct", messages=[ From 0316248823cd4d2303f9f0b49295e79a77d7c6a3 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 10 Jul 2024 15:03:24 +0200 Subject: [PATCH 3/6] document openai compatibility --- docs/source/en/guides/inference.md | 61 ++++++++++++++++++- src/huggingface_hub/inference/_client.py | 56 ++++++++++++++++- .../inference/_generated/_async_client.py | 57 ++++++++++++++++- tests/test_inference_client.py | 8 +++ 4 files changed, 177 insertions(+), 5 deletions(-) diff --git a/docs/source/en/guides/inference.md b/docs/source/en/guides/inference.md index c2e2ffbd5b..4f20e4213e 100644 --- a/docs/source/en/guides/inference.md +++ b/docs/source/en/guides/inference.md @@ -43,7 +43,7 @@ Let's get started with a text-to-image task: In the example above, we initialized an [`InferenceClient`] with the default parameters. The only thing you need to know is the [task](#supported-tasks) you want to perform. By default, the client will connect to the Inference API and select a model to complete the task. In our example, we generated an image from a text prompt. The returned value is a `PIL.Image` object that can be saved to a file. For more details, check out the [`~InferenceClient.text_to_image`] documentation. -Let's now see an example using the `chat_completion` API. This task uses an LLM to generate a response from a list of messages: +Let's now see an example using the [~`InferenceClient.chat_completion`] API. This task uses an LLM to generate a response from a list of messages: ```python >>> from huggingface_hub import InferenceClient @@ -147,6 +147,65 @@ endpoints. +## OpenAI compatibility + +The `chat_completion` task follows [OpenAI's Python client](https://github.com/openai/openai-python) syntax. What does it mean for you? 
It means that if you are used to working with `OpenAI`'s APIs, you will be able to switch to `huggingface_hub.InferenceClient` to work with open-source models by updating just 2 lines of code!
+
+```py
+# instead of `from openai import OpenAI`
+from huggingface_hub import InferenceClient
+
+# instead of `client = OpenAI(...)`
+client = InferenceClient(
+    base_url=...,
+    api_key=...,
+)
+
+
+output = client.chat.completions.create(
+    model="meta-llama/Meta-Llama-3-8B-Instruct",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Count to 10"},
+    ],
+    stream=True,
+    max_tokens=1024,
+)
+
+for chunk in output:
+    print(chunk.choices[0].delta.content)
+```
+
+And that's it! The only required changes are to replace `from openai import OpenAI` with `from huggingface_hub import InferenceClient` and `client = OpenAI(...)` with `client = InferenceClient(...)`. You can choose any LLM from the Hugging Face Hub by passing its model id as the `model` parameter. [Here is a list](https://huggingface.co/models?pipeline_tag=text-generation&other=conversational,text-generation-inference&sort=trending) of supported models. For authentication, you should pass a valid [User Access Token](https://huggingface.co/settings/tokens) as `api_key` or authenticate using `huggingface_hub` (see the [authentication guide](https://huggingface.co/docs/huggingface_hub/quick-start#authentication)).
+
+All input parameters and the output format are strictly the same. In particular, you can pass `stream=True` to receive tokens as they are generated. You can also use the [`AsyncInferenceClient`] to run inference using `asyncio`:
+
+```py
+import asyncio
+# instead of `from openai import AsyncOpenAI`
+from huggingface_hub import AsyncInferenceClient
+
+# instead of `client = AsyncOpenAI()`
+client = AsyncInferenceClient()
+
+async def main():
+    stream = await client.chat.completions.create(
+        model="meta-llama/Meta-Llama-3-8B-Instruct",
+        messages=[{"role": "user", "content": "Say this is a test"}],
+        stream=True,
+    )
+    async for chunk in stream:
+        print(chunk.choices[0].delta.content or "", end="")
+
+asyncio.run(main())
+```
+
+<Tip>
+
+`InferenceClient.chat.completions.create` is simply an alias for `InferenceClient.chat_completion`. Check out the package reference of [`~InferenceClient.chat_completion`] for more details. The `base_url` and `api_key` parameters used when instantiating the client are also aliases for `model` and `token`. These aliases have been defined to reduce friction when switching from `OpenAI` to `InferenceClient`.
+
+</Tip>
+
 ## Supported tasks
 
 [`InferenceClient`]'s goal is to provide the easiest interface to run inference on Hugging Face models. It
diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
index 48262b5929..b7e431e26e 100644
--- a/src/huggingface_hub/inference/_client.py
+++ b/src/huggingface_hub/inference/_client.py
@@ -135,12 +135,16 @@ class InferenceClient:
 
     Args:
         model (`str`, `optional`):
-            The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `bigcode/starcoder`
+            The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `meta-llama/Meta-Llama-3-8B-Instruct`
             or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
             automatically selected for the task.
+            Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2
+            arguments are mutually exclusive and have the exact same behavior.
token (`str` or `bool`, *optional*): Hugging Face token. Will default to the locally saved token if not provided. Pass `token=False` if you don't want to send your token to the server. + Note: for better compatibility with OpenAI's client, `token` has been aliased as `api_key`. Those 2 + arguments are mutually exclusive and have the exact same behavior. timeout (`float`, `optional`): The maximum number of seconds to wait for a response from the server. Loading a new model in Inference API can take up to several minutes. Defaults to None, meaning it will loop until the server is available. @@ -171,6 +175,19 @@ def __init__( base_url: Optional[str] = None, api_key: Optional[str] = None, ) -> None: + if model is not None and base_url is not None: + raise ValueError( + "Received both `model` and `base_url` arguments. Please provide only one of them." + " `base_url` is an alias for `model` to make the API compatible with OpenAI's client." + " It has the exact same behavior as `model`." + ) + if token is not None and api_key is not None: + raise ValueError( + "Received both `token` and `api_key` arguments. Please provide only one of them." + " `api_key` is an alias for `token` to make the API compatible with OpenAI's client." + " It has the exact same behavior as `token`." + ) + self.model: Optional[str] = model self.token: Union[str, bool, None] = token or api_key self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent' @@ -529,6 +546,15 @@ def chat_completion( """ A method for completing conversations using a specified language model. + + + The `client.chat_completion` method is aliased as `client.chat.completions.create` for compatibility with OpenAI's client. + Inputs and outputs are strictly the same and using either syntax will yield the same results. + Check out the [Inference guide](https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility) + for more details about OpenAI's compatibility. + + + Args: messages (List[Union[`SystemMessage`, `UserMessage`, `AssistantMessage`]]): Conversation history consisting of roles and content pairs. 
@@ -637,6 +663,32 @@ def chat_completion( ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504) ``` + Example using OpenAI's syntax: + ```py + # instead of `from openai import OpenAI` + from huggingface_hub import InferenceClient + + # instead of `client = OpenAI(...)` + client = InferenceClient( + base_url=..., + api_key=..., + ) + + + output = client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=True, + max_tokens=1024, + ) + + for chunk in output: + print(chunk.choices[0].delta.content) + ``` + Example using tools: ```py >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct") @@ -2744,7 +2796,7 @@ def get_model_status(self, model: Optional[str] = None) -> ModelStatus: ```py >>> from huggingface_hub import InferenceClient >>> client = InferenceClient() - >>> client.get_model_status("bigcode/starcoder") + >>> client.get_model_status("meta-llama/Meta-Llama-3-8B-Instruct") ModelStatus(loaded=True, state='Loaded', compute_type='gpu', framework='text-generation-inference') ``` """ diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py index c47b7ec2b7..7193076c08 100644 --- a/src/huggingface_hub/inference/_generated/_async_client.py +++ b/src/huggingface_hub/inference/_generated/_async_client.py @@ -120,12 +120,16 @@ class AsyncInferenceClient: Args: model (`str`, `optional`): - The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `bigcode/starcoder` + The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `meta-llama/Meta-Llama-3-8B-Instruct` or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is automatically selected for the task. + Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2 + arguments are mutually exclusive and have the exact same behavior. token (`str` or `bool`, *optional*): Hugging Face token. Will default to the locally saved token if not provided. Pass `token=False` if you don't want to send your token to the server. + Note: for better compatibility with OpenAI's client, `token` has been aliased as `api_key`. Those 2 + arguments are mutually exclusive and have the exact same behavior. timeout (`float`, `optional`): The maximum number of seconds to wait for a response from the server. Loading a new model in Inference API can take up to several minutes. Defaults to None, meaning it will loop until the server is available. @@ -156,6 +160,19 @@ def __init__( base_url: Optional[str] = None, api_key: Optional[str] = None, ) -> None: + if model is not None and base_url is not None: + raise ValueError( + "Received both `model` and `base_url` arguments. Please provide only one of them." + " `base_url` is an alias for `model` to make the API compatible with OpenAI's client." + " It has the exact same behavior as `model`." + ) + if token is not None and api_key is not None: + raise ValueError( + "Received both `token` and `api_key` arguments. Please provide only one of them." + " `api_key` is an alias for `token` to make the API compatible with OpenAI's client." + " It has the exact same behavior as `token`." 
+ ) + self.model: Optional[str] = model self.token: Union[str, bool, None] = token or api_key self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent' @@ -530,6 +547,15 @@ async def chat_completion( """ A method for completing conversations using a specified language model. + + + The `client.chat_completion` method is aliased as `client.chat.completions.create` for compatibility with OpenAI's client. + Inputs and outputs are strictly the same and using either syntax will yield the same results. + Check out the [Inference guide](https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility) + for more details about OpenAI's compatibility. + + + Args: messages (List[Union[`SystemMessage`, `UserMessage`, `AssistantMessage`]]): Conversation history consisting of roles and content pairs. @@ -639,6 +665,33 @@ async def chat_completion( ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504) ``` + Example using OpenAI's syntax: + ```py + # Must be run in an async context + # instead of `from openai import OpenAI` + from huggingface_hub import AsyncInferenceClient + + # instead of `client = OpenAI(...)` + client = AsyncInferenceClient( + base_url=..., + api_key=..., + ) + + + output = await client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=True, + max_tokens=1024, + ) + + for chunk in output: + print(chunk.choices[0].delta.content) + ``` + Example using tools: ```py # Must be run in an async context @@ -2785,7 +2838,7 @@ async def get_model_status(self, model: Optional[str] = None) -> ModelStatus: # Must be run in an async context >>> from huggingface_hub import AsyncInferenceClient >>> client = AsyncInferenceClient() - >>> await client.get_model_status("bigcode/starcoder") + >>> await client.get_model_status("meta-llama/Meta-Llama-3-8B-Instruct") ModelStatus(loaded=True, state='Loaded', compute_type='gpu', framework='text-generation-inference') ``` """ diff --git a/tests/test_inference_client.py b/tests/test_inference_client.py index 1f17b06527..43c91ff3d2 100644 --- a/tests/test_inference_client.py +++ b/tests/test_inference_client.py @@ -903,3 +903,11 @@ def test_with_stream_true(self): chunked_text = [chunk.choices[0].delta.content for chunk in output] assert len(chunked_text) == 34 assert "".join(chunked_text) == "Here it goes:\n\n1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" 
+
+    def test_token_and_api_key_mutually_exclusive(self):
+        with self.assertRaises(ValueError):
+            InferenceClient(token="my-token", api_key="my-api-key")
+
+    def test_model_and_base_url_mutually_exclusive(self):
+        with self.assertRaises(ValueError):
+            InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct", base_url="http://127.0.0.1:8000")

From 5f71531305f3925c3281fee313548ddfba3ea2ee Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Wed, 10 Jul 2024 15:18:06 +0200
Subject: [PATCH 4/6] typo

---
 src/huggingface_hub/inference/_client.py                  | 1 -
 src/huggingface_hub/inference/_generated/_async_client.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
index b7e431e26e..5f3cca1b1b 100644
--- a/src/huggingface_hub/inference/_client.py
+++ b/src/huggingface_hub/inference/_client.py
@@ -674,7 +674,6 @@ def chat_completion(
         api_key=...,
     )
 
-
     output = client.chat.completions.create(
         model="meta-llama/Meta-Llama-3-8B-Instruct",
         messages=[
diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py
index 7193076c08..982adf618e 100644
--- a/src/huggingface_hub/inference/_generated/_async_client.py
+++ b/src/huggingface_hub/inference/_generated/_async_client.py
@@ -677,7 +677,6 @@ async def chat_completion(
         api_key=...,
     )
 
-
     output = await client.chat.completions.create(
         model="meta-llama/Meta-Llama-3-8B-Instruct",
         messages=[

From 3e30e6f390fbe09812cb5ce43811cbefd50ec38c Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Wed, 10 Jul 2024 16:41:10 +0200
Subject: [PATCH 5/6] use diff in code snippets

---
 docs/source/en/guides/inference.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/source/en/guides/inference.md b/docs/source/en/guides/inference.md
index 4f20e4213e..7eb48ec4c5 100644
--- a/docs/source/en/guides/inference.md
+++ b/docs/source/en/guides/inference.md
@@ -151,12 +151,12 @@ endpoints.
 The `chat_completion` task follows [OpenAI's Python client](https://github.com/openai/openai-python) syntax. What does it mean for you? It means that if you are used to playing with `OpenAI`'s APIs you will be able to switch to `huggingface_hub.InferenceClient` to work with open-source models by updating just 2 lines of code!
 
-```py
-# instead of `from openai import OpenAI`
-from huggingface_hub import InferenceClient
+```diff
+- from openai import OpenAI
++ from huggingface_hub import InferenceClient
 
-# instead of `client = OpenAI(...)`
-client = InferenceClient(
+- client = OpenAI(
++ client = InferenceClient(
     base_url=...,
     api_key=...,
 )
@@ -180,13 +180,13 @@ And that's it! The only required changes are to replace `from openai import Open
 All input parameters and output format are strictly the same. In particular, you can pass `stream=True` to receive tokens as they are generated.
You can also use the [`AsyncInferenceClient`] to run inference using `asyncio`:
 
-```py
+```diff
 import asyncio
-# instead of `from openai import AsyncOpenAI`
-from huggingface_hub import AsyncInferenceClient
+- from openai import AsyncOpenAI
++ from huggingface_hub import AsyncInferenceClient
 
-# instead of `client = AsyncOpenAI()`
-client = AsyncOpenAI()
+- client = AsyncOpenAI()
++ client = AsyncInferenceClient()
 
 async def main():
     stream = await client.chat.completions.create(
         model="meta-llama/Meta-Llama-3-8B-Instruct",
         messages=[{"role": "user", "content": "Say this is a test"}],
         stream=True,
     )
     async for chunk in stream:
         print(chunk.choices[0].delta.content or "", end="")
 
 asyncio.run(main())
 ```

From dc97b0c9d6fafcfd9b7c5cb70ccd5d260af74deb Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Thu, 11 Jul 2024 12:08:04 +0200
Subject: [PATCH 6/6] why using us

---
 docs/source/en/guides/inference.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/source/en/guides/inference.md b/docs/source/en/guides/inference.md
index 7eb48ec4c5..92207658cd 100644
--- a/docs/source/en/guides/inference.md
+++ b/docs/source/en/guides/inference.md
@@ -200,6 +200,11 @@ async def main():
 asyncio.run(main())
 ```
 
+You might wonder why you would use [`InferenceClient`] instead of OpenAI's client. There are a few reasons for that:
+1. [`InferenceClient`] is configured for Hugging Face services. You don't need to provide a `base_url` to run models on the serverless Inference API. You also don't need to provide a `token` or `api_key` if your machine is already correctly logged in.
+2. [`InferenceClient`] is tailored for both Text-Generation-Inference (TGI) and `transformers` frameworks, meaning you are assured it will always be on par with the latest updates.
+3. [`InferenceClient`] is integrated with our Inference Endpoints service, making it easier to launch an Inference Endpoint, check its status, and run inference on it. Check out the [Inference Endpoints](./inference_endpoints.md) guide for more details.
+
 <Tip>
 
 `InferenceClient.chat.completions.create` is simply an alias for `InferenceClient.chat_completion`. Check out the package reference of [`~InferenceClient.chat_completion`] for more details. `base_url` and `api_key` parameters when instantiating the client are also aliases for `model` and `token`. These aliases have been defined to reduce friction when switching from `OpenAI` to `InferenceClient`.
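
The proxy object that makes `client.chat.completions.create` resolve to `chat_completion` is referenced by `_adapt_proxy_client` in the first patch, but its definition is not part of this excerpt. The sketch below is purely illustrative (the class names are hypothetical and not the actual implementation); only the constructor signature mirrors the one rewritten by the async code generator.

```python
# Illustrative sketch only: these proxy classes are NOT the ones added by this patch series.
# The constructor signature matches the string that `_adapt_proxy_client` replaces
# ("def __init__(self, client: InferenceClient):" -> "... AsyncInferenceClient ...").
from huggingface_hub import InferenceClient


class _ProxyClientChatCompletions:
    def __init__(self, client: InferenceClient):
        self._client = client

    def create(self, *args, **kwargs):
        # Forward everything to `chat_completion`: same parameters, same return values,
        # which is why both syntaxes are documented as strict aliases.
        return self._client.chat_completion(*args, **kwargs)


class _ProxyClientChat:
    def __init__(self, client: InferenceClient):
        self._client = client

    @property
    def completions(self) -> "_ProxyClientChatCompletions":
        return _ProxyClientChatCompletions(self._client)
```

With a `chat` attribute on the client returning such a proxy, `client.chat.completions.create(...)` ends up calling `client.chat_completion(...)`; the generated async client then only differs by the type annotation, which is exactly what the one-line string replacement in `_adapt_proxy_client` takes care of.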
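
As a quick usage sketch of the aliases and guards introduced in this series (with the patches applied): the endpoint URL and token below are placeholders, mirroring the values used in the new tests.

```python
from huggingface_hub import InferenceClient

# `base_url` / `api_key` are drop-in aliases for `model` / `token` (placeholder values below).
client = InferenceClient(base_url="http://127.0.0.1:8000", api_key="hf_***")

# Passing both members of an alias pair is rejected by the new constructor checks.
try:
    InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct", base_url="http://127.0.0.1:8000")
except ValueError as err:
    print(err)  # "Received both `model` and `base_url` arguments. Please provide only one of them. ..."
```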