From 4bc785fad3034d10376462bf3cea551e5138fed6 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 10 Jul 2024 11:47:34 +0200 Subject: [PATCH 1/6] True OpenAI drop-in replacement by InferenceClient --- src/huggingface_hub/inference/_client.py | 85 ++++++--- .../inference/_generated/_async_client.py | 86 ++++++--- ...mpatibility.test_base_url_and_api_key.yaml | 75 ++++++++ ...AICompatibility.test_with_stream_true.yaml | 173 ++++++++++++++++++ ...AICompatibility.test_without_base_url.yaml | 72 ++++++++ ...ai_compatibility_base_url_and_api_key.yaml | 39 ++++ ...openai_compatibility_with_stream_true.yaml | 148 +++++++++++++++ ...openai_compatibility_without_base_url.yaml | 37 ++++ tests/test_inference_async_client.py | 57 ++++++ tests/test_inference_client.py | 49 +++++ utils/generate_async_inference_client.py | 10 + 11 files changed, 787 insertions(+), 44 deletions(-) create mode 100644 tests/cassettes/TestOpenAICompatibility.test_base_url_and_api_key.yaml create mode 100644 tests/cassettes/TestOpenAICompatibility.test_with_stream_true.yaml create mode 100644 tests/cassettes/TestOpenAICompatibility.test_without_base_url.yaml create mode 100644 tests/cassettes/test_openai_compatibility_base_url_and_api_key.yaml create mode 100644 tests/cassettes/test_openai_compatibility_with_stream_true.yaml create mode 100644 tests/cassettes/test_openai_compatibility_without_base_url.yaml diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py index fbba38bc5c..48262b5929 100644 --- a/src/huggingface_hub/inference/_client.py +++ b/src/huggingface_hub/inference/_client.py @@ -113,6 +113,7 @@ get_session, hf_raise_for_status, ) +from huggingface_hub.utils._deprecation import _deprecate_positional_args if TYPE_CHECKING: @@ -148,26 +149,40 @@ class InferenceClient: Values in this dictionary will override the default values. cookies (`Dict[str, str]`, `optional`): Additional cookies to send to the server. + base_url (`str`, `optional`): + Base URL to run inference. This is a duplicated argument from `model` to make [`InferenceClient`] + follow the same pattern as `openai.OpenAI` client. Cannot be used if `model` is set. Defaults to None. + api_key (`str`, `optional`): + Token to use for authentication. This is a duplicated argument from `token` to make [`InferenceClient`] + follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None. 
""" + @_deprecate_positional_args(version="0.26") def __init__( self, model: Optional[str] = None, + *, token: Union[str, bool, None] = None, timeout: Optional[float] = None, headers: Optional[Dict[str, str]] = None, cookies: Optional[Dict[str, str]] = None, proxies: Optional[Any] = None, + # OpenAI compatibility + base_url: Optional[str] = None, + api_key: Optional[str] = None, ) -> None: self.model: Optional[str] = model - self.token: Union[str, bool, None] = token - self.headers = CaseInsensitiveDict(build_hf_headers(token=token)) # contains 'authorization' + 'user-agent' + self.token: Union[str, bool, None] = token or api_key + self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent' if headers is not None: self.headers.update(headers) self.cookies = cookies self.timeout = timeout self.proxies = proxies + # OpenAI compatibility + self.base_url = base_url + def __repr__(self): return f"" @@ -441,7 +456,6 @@ def chat_completion( # type: ignore tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> ChatCompletionOutput: ... @overload @@ -465,7 +479,6 @@ def chat_completion( # type: ignore tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> Iterable[ChatCompletionStreamOutput]: ... @overload @@ -489,7 +502,6 @@ def chat_completion( tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]: ... def chat_completion( @@ -513,7 +525,6 @@ def chat_completion( tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]: """ A method for completing conversations using a specified language model. @@ -525,6 +536,9 @@ def chat_completion( The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used. See https://huggingface.co/tasks/text-generation for more details. + + If `model` is a model ID, it is passed to the server as the `model` parameter. If you want to define a + custom URL while setting `model` in the request payload, you must set `base_url` when initializing [`InferenceClient`]. frequency_penalty (`float`, *optional*): Penalizes new tokens based on their existing frequency in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0. @@ -568,10 +582,6 @@ def chat_completion( tools (List of [`ChatCompletionInputTool`], *optional*): A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. - model_id (`str`, *optional*): - The model ID to use for chat-completion. Only used when `model` is a URL to a deployed Text Generation Inference server. - It is passed to the server as the `model` parameter. This parameter has no impact on the URL that will be used to - send the request. 
Returns: [`ChatCompletionOutput`] or Iterable of [`ChatCompletionStreamOutput`]: @@ -625,8 +635,10 @@ def chat_completion( ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504) (...) ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504) + ``` - # Chat example with tools + Example using tools: + ```py >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct") >>> messages = [ ... { @@ -708,8 +720,11 @@ def chat_completion( ) ``` """ - # determine model - model = model or self.model or self.get_recommended_model("text-generation") + # Determine model + # `self.xxx` takes precedence over the method argument only in `chat_completion` + # since `chat_completion(..., model=xxx)` is also a payload parameter for the + # server, we need to handle it differently + model = self.base_url or self.model or model or self.get_recommended_model("text-generation") if _is_chat_completion_server(model): # First, let's consider the server has a `/v1/chat/completions` endpoint. @@ -718,14 +733,13 @@ def chat_completion( if not model_url.endswith("/chat/completions"): model_url += "/v1/chat/completions" - # `model_id` sent in the payload. Not used by the server but can be useful for debugging/routing. - if model_id is None: - if not model.startswith("http") and model.count("/") == 1: - # If it's a ID on the Hub => use it - model_id = model - else: - # Otherwise, we use a random string - model_id = "tgi" + # `model` is sent in the payload. Not used by the server but can be useful for debugging/routing. 
+ if not model.startswith("http") and model.count("/") == 1: + # If it's a ID on the Hub => use it + model_id = model + else: + # Otherwise, we use a random string + model_id = "tgi" try: data = self.post( @@ -2562,7 +2576,7 @@ def zero_shot_image_classification( return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response) def _resolve_url(self, model: Optional[str] = None, task: Optional[str] = None) -> str: - model = model or self.model + model = model or self.model or self.base_url # If model is already a URL, ignore `task` and return directly if model is not None and (model.startswith("http://") or model.startswith("https://")): @@ -2754,3 +2768,30 @@ def get_model_status(self, model: Optional[str] = None) -> ModelStatus: compute_type=response_data["compute_type"], framework=response_data["framework"], ) + + @property + def chat(self) -> "ProxyClientChat": + return ProxyClientChat(self) + + +class _ProxyClient: + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + def __init__(self, client: InferenceClient): + self._client = client + + +class ProxyClientChat(_ProxyClient): + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + @property + def completions(self) -> "ProxyClientChatCompletions": + return ProxyClientChatCompletions(self._client) + + +class ProxyClientChatCompletions(_ProxyClient): + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + @property + def create(self): + return self._client.chat_completion diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py index 502e79155b..c47b7ec2b7 100644 --- a/src/huggingface_hub/inference/_generated/_async_client.py +++ b/src/huggingface_hub/inference/_generated/_async_client.py @@ -96,6 +96,7 @@ from huggingface_hub.utils import ( build_hf_headers, ) +from huggingface_hub.utils._deprecation import _deprecate_positional_args from .._common import _async_yield_from, _import_aiohttp @@ -133,26 +134,40 @@ class AsyncInferenceClient: Values in this dictionary will override the default values. cookies (`Dict[str, str]`, `optional`): Additional cookies to send to the server. + base_url (`str`, `optional`): + Base URL to run inference. This is a duplicated argument from `model` to make [`InferenceClient`] + follow the same pattern as `openai.OpenAI` client. Cannot be used if `model` is set. Defaults to None. + api_key (`str`, `optional`): + Token to use for authentication. This is a duplicated argument from `token` to make [`InferenceClient`] + follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None. 
""" + @_deprecate_positional_args(version="0.26") def __init__( self, model: Optional[str] = None, + *, token: Union[str, bool, None] = None, timeout: Optional[float] = None, headers: Optional[Dict[str, str]] = None, cookies: Optional[Dict[str, str]] = None, proxies: Optional[Any] = None, + # OpenAI compatibility + base_url: Optional[str] = None, + api_key: Optional[str] = None, ) -> None: self.model: Optional[str] = model - self.token: Union[str, bool, None] = token - self.headers = CaseInsensitiveDict(build_hf_headers(token=token)) # contains 'authorization' + 'user-agent' + self.token: Union[str, bool, None] = token or api_key + self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent' if headers is not None: self.headers.update(headers) self.cookies = cookies self.timeout = timeout self.proxies = proxies + # OpenAI compatibility + self.base_url = base_url + def __repr__(self): return f"" @@ -442,7 +457,6 @@ async def chat_completion( # type: ignore tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> ChatCompletionOutput: ... @overload @@ -466,7 +480,6 @@ async def chat_completion( # type: ignore tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> AsyncIterable[ChatCompletionStreamOutput]: ... @overload @@ -490,7 +503,6 @@ async def chat_completion( tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]: ... async def chat_completion( @@ -514,7 +526,6 @@ async def chat_completion( tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, - model_id: Optional[str] = None, ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]: """ A method for completing conversations using a specified language model. @@ -526,6 +537,9 @@ async def chat_completion( The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. If not provided, the default recommended model for chat-based text-generation will be used. See https://huggingface.co/tasks/text-generation for more details. + + If `model` is a model ID, it is passed to the server as the `model` parameter. If you want to define a + custom URL while setting `model` in the request payload, you must set `base_url` when initializing [`InferenceClient`]. frequency_penalty (`float`, *optional*): Penalizes new tokens based on their existing frequency in the text so far. Range: [-2.0, 2.0]. Defaults to 0.0. @@ -569,10 +583,6 @@ async def chat_completion( tools (List of [`ChatCompletionInputTool`], *optional*): A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. - model_id (`str`, *optional*): - The model ID to use for chat-completion. Only used when `model` is a URL to a deployed Text Generation Inference server. - It is passed to the server as the `model` parameter. This parameter has no impact on the URL that will be used to - send the request. 
Returns: [`ChatCompletionOutput`] or Iterable of [`ChatCompletionStreamOutput`]: @@ -627,8 +637,11 @@ async def chat_completion( ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504) (...) ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504) + ``` - # Chat example with tools + Example using tools: + ```py + # Must be run in an async context >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-70B-Instruct") >>> messages = [ ... { @@ -710,8 +723,11 @@ async def chat_completion( ) ``` """ - # determine model - model = model or self.model or self.get_recommended_model("text-generation") + # Determine model + # `self.xxx` takes precedence over the method argument only in `chat_completion` + # since `chat_completion(..., model=xxx)` is also a payload parameter for the + # server, we need to handle it differently + model = self.base_url or self.model or model or self.get_recommended_model("text-generation") if _is_chat_completion_server(model): # First, let's consider the server has a `/v1/chat/completions` endpoint. @@ -720,14 +736,13 @@ async def chat_completion( if not model_url.endswith("/chat/completions"): model_url += "/v1/chat/completions" - # `model_id` sent in the payload. Not used by the server but can be useful for debugging/routing. - if model_id is None: - if not model.startswith("http") and model.count("/") == 1: - # If it's a ID on the Hub => use it - model_id = model - else: - # Otherwise, we use a random string - model_id = "tgi" + # `model` is sent in the payload. Not used by the server but can be useful for debugging/routing. 
+ if not model.startswith("http") and model.count("/") == 1: + # If it's a ID on the Hub => use it + model_id = model + else: + # Otherwise, we use a random string + model_id = "tgi" try: data = await self.post( @@ -2597,7 +2612,7 @@ async def zero_shot_image_classification( return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response) def _resolve_url(self, model: Optional[str] = None, task: Optional[str] = None) -> str: - model = model or self.model + model = model or self.model or self.base_url # If model is already a URL, ignore `task` and return directly if model is not None and (model.startswith("http://") or model.startswith("https://")): @@ -2795,3 +2810,30 @@ async def get_model_status(self, model: Optional[str] = None) -> ModelStatus: compute_type=response_data["compute_type"], framework=response_data["framework"], ) + + @property + def chat(self) -> "ProxyClientChat": + return ProxyClientChat(self) + + +class _ProxyClient: + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + def __init__(self, client: AsyncInferenceClient): + self._client = client + + +class ProxyClientChat(_ProxyClient): + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + @property + def completions(self) -> "ProxyClientChatCompletions": + return ProxyClientChatCompletions(self._client) + + +class ProxyClientChatCompletions(_ProxyClient): + """Proxy class to be able to call `client.chat.completion.create(...)` as OpenAI client.""" + + @property + def create(self): + return self._client.chat_completion diff --git a/tests/cassettes/TestOpenAICompatibility.test_base_url_and_api_key.yaml b/tests/cassettes/TestOpenAICompatibility.test_base_url_and_api_key.yaml new file mode 100644 index 0000000000..4a4fd8456c --- /dev/null +++ b/tests/cassettes/TestOpenAICompatibility.test_base_url_and_api_key.yaml @@ -0,0 +1,75 @@ +interactions: +- request: + body: '{"model": "tgi", "messages": [{"role": "system", "content": "You are a + helpful assistant."}, {"role": "user", "content": "Count to 10"}], "frequency_penalty": + null, "logit_bias": null, "logprobs": null, "max_tokens": 1024, "n": null, "presence_penalty": + null, "seed": null, "stop": null, "temperature": null, "tool_choice": null, + "tool_prompt": null, "tools": null, "top_logprobs": null, "top_p": null, "stream": + false}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Connection: + - keep-alive + Content-Length: + - '419' + Content-Type: + - application/json + X-Amzn-Trace-Id: + - 0edbb1d5-e548-491b-8781-3c413bff44e9 + authorization: + - Bearer my-api-key + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: '{"id":"","object":"text_completion","created":1720603621,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"message":{"role":"assistant","content":"1, + 2, 3, 4, 5, 6, 7, 8, 9, 10!"},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":25,"completion_tokens":30,"total_tokens":55}}' + headers: + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Wed, 10 Jul 2024 09:27:01 GMT + Transfer-Encoding: + - chunked + access-control-allow-credentials: + - 'true' + access-control-allow-origin: + - '*' + vary: + - origin, Origin, 
Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-characters: + - '209' + x-compute-time: + - '0.669188581' + x-compute-type: + - 2-a10-g + x-generated-tokens: + - '30' + x-inference-time: + - '655' + x-prompt-tokens: + - '26' + x-queue-time: + - '13' + x-request-id: + - N9kl_-NDF8oojD5QSbqPN + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + x-time-per-token: + - '21' + x-total-time: + - '669' + x-validation-time: + - '0' + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestOpenAICompatibility.test_with_stream_true.yaml b/tests/cassettes/TestOpenAICompatibility.test_with_stream_true.yaml new file mode 100644 index 0000000000..0ab91101cf --- /dev/null +++ b/tests/cassettes/TestOpenAICompatibility.test_with_stream_true.yaml @@ -0,0 +1,173 @@ +interactions: +- request: + body: '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": + "system", "content": "You are a helpful assistant."}, {"role": "user", "content": + "Count to 10"}], "frequency_penalty": null, "logit_bias": null, "logprobs": + null, "max_tokens": 1024, "n": null, "presence_penalty": null, "seed": null, + "stop": null, "temperature": null, "tool_choice": null, "tool_prompt": null, + "tools": null, "top_logprobs": null, "top_p": null, "stream": true}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Connection: + - keep-alive + Content-Length: + - '450' + Content-Type: + - application/json + X-Amzn-Trace-Id: + - 70c5f6b7-a74b-411a-94cd-c632b0d856e7 + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: 'data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"Here"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + it"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + goes"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":":\n\n"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"1"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"2"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"3"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"4"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"5"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"6"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"7"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"8"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"9"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"10"},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":"eos_token"}]} + + + ' + headers: + Connection: + - keep-alive + Content-Type: + - text/event-stream + Date: + - Wed, 10 Jul 2024 09:39:59 GMT + Transfer-Encoding: + - chunked + access-control-allow-credentials: + - 'true' + access-control-allow-origin: + - '*' + cache-control: + - no-cache + vary: + - origin, Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-accel-buffering: + - 'no' + x-compute-characters: + - '209' + x-compute-type: + - 2-a10-g + x-request-id: + - sZyDhIrhOAL-aoenNyHaw + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestOpenAICompatibility.test_without_base_url.yaml b/tests/cassettes/TestOpenAICompatibility.test_without_base_url.yaml new file mode 100644 index 0000000000..b16bd835fb --- /dev/null +++ b/tests/cassettes/TestOpenAICompatibility.test_without_base_url.yaml @@ -0,0 +1,72 @@ +interactions: +- request: + body: '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": + "system", "content": "You are a helpful assistant."}, {"role": "user", "content": + "Count to 10"}], "frequency_penalty": null, "logit_bias": null, "logprobs": + null, "max_tokens": 1024, "n": null, "presence_penalty": null, "seed": null, + "stop": null, "temperature": null, "tool_choice": null, "tool_prompt": null, + "tools": null, "top_logprobs": null, "top_p": null, "stream": false}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Connection: + - keep-alive + Content-Length: + - '451' + Content-Type: + - application/json + X-Amzn-Trace-Id: + - c1242095-c567-48d6-9438-f311c11b7c4b + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: '{"id":"","object":"text_completion","created":1720603859,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"message":{"role":"assistant","content":"1, 2, 3, 4, 5, 6, 7, 8, 9, 10!"},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":25,"completion_tokens":34,"total_tokens":59}}' + headers: + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Wed, 10 Jul 2024 09:30:59 GMT + Transfer-Encoding: + - chunked + access-control-allow-credentials: + - 'true' + access-control-allow-origin: + - '*' + vary: + - origin, Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-characters: + - '209' + x-compute-time: + - '0.79170697' + x-compute-type: + - 2-a10-g + x-generated-tokens: + - '34' + x-inference-time: + - '791' + x-prompt-tokens: + - '26' + x-queue-time: + - '0' + x-request-id: + - LlOpMjxIPx4kEhMdUYLfC + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + x-time-per-token: + - '23' + x-total-time: + - '791' + x-validation-time: + - '0' + status: + code: 200 + message: OK +version: 1 diff 
--git a/tests/cassettes/test_openai_compatibility_base_url_and_api_key.yaml b/tests/cassettes/test_openai_compatibility_base_url_and_api_key.yaml new file mode 100644 index 0000000000..5bd733cd6d --- /dev/null +++ b/tests/cassettes/test_openai_compatibility_base_url_and_api_key.yaml @@ -0,0 +1,39 @@ +interactions: +- request: + body: null + headers: + authorization: + - Bearer my-api-key + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: '{"id":"","object":"text_completion","created":1720603621,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"message":{"role":"assistant","content":"1, 2, 3, 4, 5, 6, 7, 8, 9, 10!"},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":25,"completion_tokens":30,"total_tokens":55}}' + headers: + Access-Control-Allow-Credentials: + - 'true' + Connection: + - keep-alive + Content-Length: + - '356' + Content-Type: + - application/json + Date: + - Wed, 10 Jul 2024 09:36:14 GMT + Vary: + - Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-time: + - '0.669188581' + x-compute-type: + - cache + x-request-id: + - XoBs6zJdqFuEeJmIE7apH + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_openai_compatibility_with_stream_true.yaml b/tests/cassettes/test_openai_compatibility_with_stream_true.yaml new file mode 100644 index 0000000000..e7b049ec78 --- /dev/null +++ b/tests/cassettes/test_openai_compatibility_with_stream_true.yaml @@ -0,0 +1,148 @@ +interactions: +- request: + body: null + headers: + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: 'data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"Here"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + it"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + goes"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":":\n\n"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"1"},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604399,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"2"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"3"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"4"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"5"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"6"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"7"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"8"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"9"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + "},"logprobs":null,"finish_reason":null}]} + + + 
data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"10"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"!"},"logprobs":null,"finish_reason":null}]} + + + data:{"id":"","object":"text_completion","created":1720604400,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":"eos_token"}]} + + + ' + headers: + Access-Control-Allow-Credentials: + - 'true' + Connection: + - keep-alive + Content-Length: + - '8690' + Content-Type: + - text/event-stream + Date: + - Wed, 10 Jul 2024 09:43:22 GMT + Vary: + - Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-type: + - cache + x-request-id: + - 5ZjP6frO9z95MsZ37xFRk + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_openai_compatibility_without_base_url.yaml b/tests/cassettes/test_openai_compatibility_without_base_url.yaml new file mode 100644 index 0000000000..0ed0c7cf48 --- /dev/null +++ b/tests/cassettes/test_openai_compatibility_without_base_url.yaml @@ -0,0 +1,37 @@ +interactions: +- request: + body: null + headers: + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions + response: + body: + string: '{"id":"","object":"text_completion","created":1720603859,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"message":{"role":"assistant","content":"1, 2, 3, 4, 5, 6, 7, 8, 9, 10!"},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":25,"completion_tokens":34,"total_tokens":59}}' + headers: + Access-Control-Allow-Credentials: + - 'true' + Connection: + - keep-alive + Content-Length: + - '371' + Content-Type: + - application/json + Date: + - Wed, 10 Jul 2024 09:37:03 GMT + Vary: + - Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-time: + - '0.79170697' + x-compute-type: + - cache + x-request-id: + - 5i2r4xvmzRAE5J2qdwQOi + x-sha: + - e1945c40cd546c78e41f1151f4db032b271faeaa + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_inference_async_client.py b/tests/test_inference_async_client.py index 047c708da7..61d67c99ba 100644 --- a/tests/test_inference_async_client.py +++ b/tests/test_inference_async_client.py @@ -367,3 +367,60 @@ async def test_close_connection_on_post_error(mock_close: Mock, mock_post: Mock) await async_client.post(model="http://127.0.0.1/api", json={}) mock_close.assert_called_once() + + +@pytest.mark.vcr +@pytest.mark.asyncio +@with_production_testing +async def test_openai_compatibility_base_url_and_api_key(): + client = AsyncInferenceClient( + base_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct", + api_key="my-api-key", + ) + output = await client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful 
assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=False, + max_tokens=1024, + ) + assert output.choices[0].message.content == "1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" + + +@pytest.mark.vcr +@pytest.mark.asyncio +@with_production_testing +async def test_openai_compatibility_without_base_url(): + client = AsyncInferenceClient() + output = await client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=False, + max_tokens=1024, + ) + assert output.choices[0].message.content == "1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" + + +@pytest.mark.vcr +@pytest.mark.asyncio +@with_production_testing +async def test_openai_compatibility_with_stream_true(): + client = AsyncInferenceClient(token="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") + output = await client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=True, + max_tokens=1024, + ) + + chunked_text = [chunk.choices[0].delta.content async for chunk in output] + assert len(chunked_text) == 34 + assert "".join(chunked_text) == "Here it goes:\n\n1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" diff --git a/tests/test_inference_client.py b/tests/test_inference_client.py index 06fdc705c2..1f17b06527 100644 --- a/tests/test_inference_client.py +++ b/tests/test_inference_client.py @@ -854,3 +854,52 @@ def test_list_deployed_models_single_frameworks(self) -> None: self.assertIn("text-generation", models_by_task) self.assertIn("bigscience/bloom", models_by_task["text-generation"]) + + +@pytest.mark.vcr +@with_production_testing +class TestOpenAICompatibility(unittest.TestCase): + def test_base_url_and_api_key(self): + client = InferenceClient( + base_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct", + api_key="my-api-key", + ) + output = client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=False, + max_tokens=1024, + ) + assert output.choices[0].message.content == "1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" + + def test_without_base_url(self): + client = InferenceClient() + output = client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=False, + max_tokens=1024, + ) + assert output.choices[0].message.content == "1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" + + def test_with_stream_true(self): + client = InferenceClient() + output = client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=True, + max_tokens=1024, + ) + + chunked_text = [chunk.choices[0].delta.content for chunk in output] + assert len(chunked_text) == 34 + assert "".join(chunked_text) == "Here it goes:\n\n1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" 
diff --git a/utils/generate_async_inference_client.py b/utils/generate_async_inference_client.py index d9b40c427c..cb8136f951 100644 --- a/utils/generate_async_inference_client.py +++ b/utils/generate_async_inference_client.py @@ -68,6 +68,9 @@ def generate_async_client_code(code: str) -> str: # Adapt /info and /health endpoints code = _adapt_info_and_health_endpoints(code) + # Adapt the proxy client (for client.chat.completions.create) + code = _adapt_proxy_client(code) + return code @@ -482,6 +485,13 @@ def _adapt_info_and_health_endpoints(code: str) -> str: return code.replace(health_sync_snippet, health_async_snippet) +def _adapt_proxy_client(code: str) -> str: + return code.replace( + "def __init__(self, client: InferenceClient):", + "def __init__(self, client: AsyncInferenceClient):", + ) + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( From 7215d61ed90e62fdcd1d0c897b8e9126ffcbcf67 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 10 Jul 2024 12:09:45 +0200 Subject: [PATCH 2/6] boulet --- tests/test_inference_async_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_inference_async_client.py b/tests/test_inference_async_client.py index 61d67c99ba..c9ad204ec3 100644 --- a/tests/test_inference_async_client.py +++ b/tests/test_inference_async_client.py @@ -410,7 +410,7 @@ async def test_openai_compatibility_without_base_url(): @pytest.mark.asyncio @with_production_testing async def test_openai_compatibility_with_stream_true(): - client = AsyncInferenceClient(token="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") + client = AsyncInferenceClient() output = await client.chat.completions.create( model="meta-llama/Meta-Llama-3-8B-Instruct", messages=[ From 0316248823cd4d2303f9f0b49295e79a77d7c6a3 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 10 Jul 2024 15:03:24 +0200 Subject: [PATCH 3/6] document openai compatibility --- docs/source/en/guides/inference.md | 61 ++++++++++++++++++- src/huggingface_hub/inference/_client.py | 56 ++++++++++++++++- .../inference/_generated/_async_client.py | 57 ++++++++++++++++- tests/test_inference_client.py | 8 +++ 4 files changed, 177 insertions(+), 5 deletions(-) diff --git a/docs/source/en/guides/inference.md b/docs/source/en/guides/inference.md index c2e2ffbd5b..4f20e4213e 100644 --- a/docs/source/en/guides/inference.md +++ b/docs/source/en/guides/inference.md @@ -43,7 +43,7 @@ Let's get started with a text-to-image task: In the example above, we initialized an [`InferenceClient`] with the default parameters. The only thing you need to know is the [task](#supported-tasks) you want to perform. By default, the client will connect to the Inference API and select a model to complete the task. In our example, we generated an image from a text prompt. The returned value is a `PIL.Image` object that can be saved to a file. For more details, check out the [`~InferenceClient.text_to_image`] documentation. -Let's now see an example using the `chat_completion` API. This task uses an LLM to generate a response from a list of messages: +Let's now see an example using the [~`InferenceClient.chat_completion`] API. This task uses an LLM to generate a response from a list of messages: ```python >>> from huggingface_hub import InferenceClient @@ -147,6 +147,65 @@ endpoints. +## OpenAI compatibility + +The `chat_completion` task follows [OpenAI's Python client](https://github.com/openai/openai-python) syntax. What does it mean for you? 
It means that if you are used to working with `OpenAI`'s APIs, you will be able to switch to `huggingface_hub.InferenceClient` to work with open-source models by updating just 2 lines of code!
+
+```py
+# instead of `from openai import OpenAI`
+from huggingface_hub import InferenceClient
+
+# instead of `client = OpenAI(...)`
+client = InferenceClient(
+    base_url=...,
+    api_key=...,
+)
+
+
+output = client.chat.completions.create(
+    model="meta-llama/Meta-Llama-3-8B-Instruct",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Count to 10"},
+    ],
+    stream=True,
+    max_tokens=1024,
+)
+
+for chunk in output:
+    print(chunk.choices[0].delta.content)
+```
+
+And that's it! The only required changes are to replace `from openai import OpenAI` with `from huggingface_hub import InferenceClient` and `client = OpenAI(...)` with `client = InferenceClient(...)`. You can choose any LLM from the Hugging Face Hub by passing its model id as the `model` parameter. [Here is a list](https://huggingface.co/models?pipeline_tag=text-generation&other=conversational,text-generation-inference&sort=trending) of supported models. For authentication, you should pass a valid [User Access Token](https://huggingface.co/settings/tokens) as `api_key` or authenticate using `huggingface_hub` (see the [authentication guide](https://huggingface.co/docs/huggingface_hub/quick-start#authentication)).
+
+All input parameters and the output format are strictly the same. In particular, you can pass `stream=True` to receive tokens as they are generated. You can also use the [`AsyncInferenceClient`] to run inference using `asyncio`:
+
+```py
+import asyncio
+# instead of `from openai import AsyncOpenAI`
+from huggingface_hub import AsyncInferenceClient
+
+# instead of `client = AsyncOpenAI()`
+client = AsyncInferenceClient()
+
+async def main():
+    stream = await client.chat.completions.create(
+        model="meta-llama/Meta-Llama-3-8B-Instruct",
+        messages=[{"role": "user", "content": "Say this is a test"}],
+        stream=True,
+    )
+    async for chunk in stream:
+        print(chunk.choices[0].delta.content or "", end="")
+
+asyncio.run(main())
+```
+
+<Tip>
+
+`InferenceClient.chat.completions.create` is simply an alias for `InferenceClient.chat_completion`. Check out the package reference of [`~InferenceClient.chat_completion`] for more details. The `base_url` and `api_key` parameters used when instantiating the client are also aliases for `model` and `token`. These aliases have been defined to reduce friction when switching from `OpenAI` to `InferenceClient`.
+
+</Tip>
+
 ## Supported tasks
 
 [`InferenceClient`]'s goal is to provide the easiest interface to run inference on Hugging Face models. It
diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
index 48262b5929..b7e431e26e 100644
--- a/src/huggingface_hub/inference/_client.py
+++ b/src/huggingface_hub/inference/_client.py
@@ -135,12 +135,16 @@ class InferenceClient:
 
     Args:
         model (`str`, `optional`):
-            The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `bigcode/starcoder`
+            The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `meta-llama/Meta-Llama-3-8B-Instruct`
             or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
             automatically selected for the task.
+            Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2
+            arguments are mutually exclusive and have the exact same behavior.
token (`str` or `bool`, *optional*): Hugging Face token. Will default to the locally saved token if not provided. Pass `token=False` if you don't want to send your token to the server. + Note: for better compatibility with OpenAI's client, `token` has been aliased as `api_key`. Those 2 + arguments are mutually exclusive and have the exact same behavior. timeout (`float`, `optional`): The maximum number of seconds to wait for a response from the server. Loading a new model in Inference API can take up to several minutes. Defaults to None, meaning it will loop until the server is available. @@ -171,6 +175,19 @@ def __init__( base_url: Optional[str] = None, api_key: Optional[str] = None, ) -> None: + if model is not None and base_url is not None: + raise ValueError( + "Received both `model` and `base_url` arguments. Please provide only one of them." + " `base_url` is an alias for `model` to make the API compatible with OpenAI's client." + " It has the exact same behavior as `model`." + ) + if token is not None and api_key is not None: + raise ValueError( + "Received both `token` and `api_key` arguments. Please provide only one of them." + " `api_key` is an alias for `token` to make the API compatible with OpenAI's client." + " It has the exact same behavior as `token`." + ) + self.model: Optional[str] = model self.token: Union[str, bool, None] = token or api_key self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent' @@ -529,6 +546,15 @@ def chat_completion( """ A method for completing conversations using a specified language model. + + + The `client.chat_completion` method is aliased as `client.chat.completions.create` for compatibility with OpenAI's client. + Inputs and outputs are strictly the same and using either syntax will yield the same results. + Check out the [Inference guide](https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility) + for more details about OpenAI's compatibility. + + + Args: messages (List[Union[`SystemMessage`, `UserMessage`, `AssistantMessage`]]): Conversation history consisting of roles and content pairs. 
@@ -637,6 +663,32 @@ def chat_completion( ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504) ``` + Example using OpenAI's syntax: + ```py + # instead of `from openai import OpenAI` + from huggingface_hub import InferenceClient + + # instead of `client = OpenAI(...)` + client = InferenceClient( + base_url=..., + api_key=..., + ) + + + output = client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=True, + max_tokens=1024, + ) + + for chunk in output: + print(chunk.choices[0].delta.content) + ``` + Example using tools: ```py >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct") @@ -2744,7 +2796,7 @@ def get_model_status(self, model: Optional[str] = None) -> ModelStatus: ```py >>> from huggingface_hub import InferenceClient >>> client = InferenceClient() - >>> client.get_model_status("bigcode/starcoder") + >>> client.get_model_status("meta-llama/Meta-Llama-3-8B-Instruct") ModelStatus(loaded=True, state='Loaded', compute_type='gpu', framework='text-generation-inference') ``` """ diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py index c47b7ec2b7..7193076c08 100644 --- a/src/huggingface_hub/inference/_generated/_async_client.py +++ b/src/huggingface_hub/inference/_generated/_async_client.py @@ -120,12 +120,16 @@ class AsyncInferenceClient: Args: model (`str`, `optional`): - The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `bigcode/starcoder` + The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `meta-llama/Meta-Llama-3-8B-Instruct` or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is automatically selected for the task. + Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2 + arguments are mutually exclusive and have the exact same behavior. token (`str` or `bool`, *optional*): Hugging Face token. Will default to the locally saved token if not provided. Pass `token=False` if you don't want to send your token to the server. + Note: for better compatibility with OpenAI's client, `token` has been aliased as `api_key`. Those 2 + arguments are mutually exclusive and have the exact same behavior. timeout (`float`, `optional`): The maximum number of seconds to wait for a response from the server. Loading a new model in Inference API can take up to several minutes. Defaults to None, meaning it will loop until the server is available. @@ -156,6 +160,19 @@ def __init__( base_url: Optional[str] = None, api_key: Optional[str] = None, ) -> None: + if model is not None and base_url is not None: + raise ValueError( + "Received both `model` and `base_url` arguments. Please provide only one of them." + " `base_url` is an alias for `model` to make the API compatible with OpenAI's client." + " It has the exact same behavior as `model`." + ) + if token is not None and api_key is not None: + raise ValueError( + "Received both `token` and `api_key` arguments. Please provide only one of them." + " `api_key` is an alias for `token` to make the API compatible with OpenAI's client." + " It has the exact same behavior as `token`." 
+ ) + self.model: Optional[str] = model self.token: Union[str, bool, None] = token or api_key self.headers = CaseInsensitiveDict(build_hf_headers(token=self.token)) # 'authorization' + 'user-agent' @@ -530,6 +547,15 @@ async def chat_completion( """ A method for completing conversations using a specified language model. + + + The `client.chat_completion` method is aliased as `client.chat.completions.create` for compatibility with OpenAI's client. + Inputs and outputs are strictly the same and using either syntax will yield the same results. + Check out the [Inference guide](https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility) + for more details about OpenAI's compatibility. + + + Args: messages (List[Union[`SystemMessage`, `UserMessage`, `AssistantMessage`]]): Conversation history consisting of roles and content pairs. @@ -639,6 +665,33 @@ async def chat_completion( ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504) ``` + Example using OpenAI's syntax: + ```py + # Must be run in an async context + # instead of `from openai import OpenAI` + from huggingface_hub import AsyncInferenceClient + + # instead of `client = OpenAI(...)` + client = AsyncInferenceClient( + base_url=..., + api_key=..., + ) + + + output = await client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Count to 10"}, + ], + stream=True, + max_tokens=1024, + ) + + for chunk in output: + print(chunk.choices[0].delta.content) + ``` + Example using tools: ```py # Must be run in an async context @@ -2785,7 +2838,7 @@ async def get_model_status(self, model: Optional[str] = None) -> ModelStatus: # Must be run in an async context >>> from huggingface_hub import AsyncInferenceClient >>> client = AsyncInferenceClient() - >>> await client.get_model_status("bigcode/starcoder") + >>> await client.get_model_status("meta-llama/Meta-Llama-3-8B-Instruct") ModelStatus(loaded=True, state='Loaded', compute_type='gpu', framework='text-generation-inference') ``` """ diff --git a/tests/test_inference_client.py b/tests/test_inference_client.py index 1f17b06527..43c91ff3d2 100644 --- a/tests/test_inference_client.py +++ b/tests/test_inference_client.py @@ -903,3 +903,11 @@ def test_with_stream_true(self): chunked_text = [chunk.choices[0].delta.content for chunk in output] assert len(chunked_text) == 34 assert "".join(chunked_text) == "Here it goes:\n\n1, 2, 3, 4, 5, 6, 7, 8, 9, 10!" 
+
+    def test_token_and_api_key_mutually_exclusive(self):
+        with self.assertRaises(ValueError):
+            InferenceClient(token="my-token", api_key="my-api-key")
+
+    def test_model_and_base_url_mutually_exclusive(self):
+        with self.assertRaises(ValueError):
+            InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct", base_url="http://127.0.0.1:8000")

From 5f71531305f3925c3281fee313548ddfba3ea2ee Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Wed, 10 Jul 2024 15:18:06 +0200
Subject: [PATCH 4/6] typo

---
 src/huggingface_hub/inference/_client.py                  | 1 -
 src/huggingface_hub/inference/_generated/_async_client.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
index b7e431e26e..5f3cca1b1b 100644
--- a/src/huggingface_hub/inference/_client.py
+++ b/src/huggingface_hub/inference/_client.py
@@ -674,7 +674,6 @@ def chat_completion(
         api_key=...,
     )
 
-
     output = client.chat.completions.create(
         model="meta-llama/Meta-Llama-3-8B-Instruct",
         messages=[
diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py
index 7193076c08..982adf618e 100644
--- a/src/huggingface_hub/inference/_generated/_async_client.py
+++ b/src/huggingface_hub/inference/_generated/_async_client.py
@@ -677,7 +677,6 @@ async def chat_completion(
         api_key=...,
     )
 
-
     output = await client.chat.completions.create(
         model="meta-llama/Meta-Llama-3-8B-Instruct",
         messages=[

From 3e30e6f390fbe09812cb5ce43811cbefd50ec38c Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Wed, 10 Jul 2024 16:41:10 +0200
Subject: [PATCH 5/6] use diff in code snippets

---
 docs/source/en/guides/inference.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/source/en/guides/inference.md b/docs/source/en/guides/inference.md
index 4f20e4213e..7eb48ec4c5 100644
--- a/docs/source/en/guides/inference.md
+++ b/docs/source/en/guides/inference.md
@@ -151,12 +151,12 @@ endpoints.
 The `chat_completion` task follows [OpenAI's Python client](https://github.com/openai/openai-python) syntax. What does it mean for you? It means that if you are used to playing with `OpenAI`'s APIs you will be able to switch to `huggingface_hub.InferenceClient` to work with open-source models by updating just 2 lines of code!
 
-```py
-# instead of `from openai import OpenAI`
-from huggingface_hub import InferenceClient
+```diff
+- from openai import OpenAI
++ from huggingface_hub import InferenceClient
 
-# instead of `client = OpenAI(...)`
-client = InferenceClient(
+- client = OpenAI(
++ client = InferenceClient(
     base_url=...,
     api_key=...,
 )
@@ -180,13 +180,13 @@ And that's it! The only required changes are to replace `from openai import Open
 All input parameters and output format are strictly the same. In particular, you can pass `stream=True` to receive tokens as they are generated.
You can also use the [`AsyncInferenceClient`] to run inference using `asyncio`:
 
-```py
+```diff
 import asyncio
-# instead of `from openai import AsyncOpenAI`
-from huggingface_hub import AsyncInferenceClient
+- from openai import AsyncOpenAI
++ from huggingface_hub import AsyncInferenceClient
 
-# instead of `client = AsyncOpenAI()`
-client = AsyncOpenAI()
+- client = AsyncOpenAI()
++ client = AsyncInferenceClient()
 
 async def main():
     stream = await client.chat.completions.create(
         model="meta-llama/Meta-Llama-3-8B-Instruct",
         messages=[{"role": "user", "content": "Say this is a test"}],
         stream=True,
     )
     async for chunk in stream:
         print(chunk.choices[0].delta.content or "", end="")
 
 asyncio.run(main())
 ```

From dc97b0c9d6fafcfd9b7c5cb70ccd5d260af74deb Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Thu, 11 Jul 2024 12:08:04 +0200
Subject: [PATCH 6/6] why using us

---
 docs/source/en/guides/inference.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/source/en/guides/inference.md b/docs/source/en/guides/inference.md
index 7eb48ec4c5..92207658cd 100644
--- a/docs/source/en/guides/inference.md
+++ b/docs/source/en/guides/inference.md
@@ -200,6 +200,11 @@ async def main():
 asyncio.run(main())
 ```
 
+You might wonder why you would use [`InferenceClient`] instead of OpenAI's client. There are a few reasons for that:
+1. [`InferenceClient`] is configured for Hugging Face services. You don't need to provide a `base_url` to run models on the serverless Inference API. You also don't need to provide a `token` or `api_key` if your machine is already correctly logged in.
+2. [`InferenceClient`] is tailored for both Text-Generation-Inference (TGI) and `transformers` frameworks, meaning you are assured it will always be on par with the latest updates.
+3. [`InferenceClient`] is integrated with our Inference Endpoints service, making it easier to launch an Inference Endpoint, check its status, and run inference on it. Check out the [Inference Endpoints](./inference_endpoints.md) guide for more details.
+
 <Tip>
 
 `InferenceClient.chat.completions.create` is simply an alias for `InferenceClient.chat_completion`. Check out the package reference of [`~InferenceClient.chat_completion`] for more details. `base_url` and `api_key` parameters when instantiating the client are also aliases for `model` and `token`. These aliases have been defined to reduce friction when switching from `OpenAI` to `InferenceClient`.
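
The proxy object that makes `client.chat.completions.create` resolve to `chat_completion` is referenced by `_adapt_proxy_client` in the first patch, but its definition is not part of this excerpt. The sketch below is purely illustrative (the class names are hypothetical and not the actual implementation); only the constructor signature mirrors the one rewritten by the async code generator.

```python
# Illustrative sketch only: these proxy classes are NOT the ones added by this patch series.
# The constructor signature matches the string that `_adapt_proxy_client` replaces
# ("def __init__(self, client: InferenceClient):" -> "... AsyncInferenceClient ...").
from huggingface_hub import InferenceClient


class _ProxyClientChatCompletions:
    def __init__(self, client: InferenceClient):
        self._client = client

    def create(self, *args, **kwargs):
        # Forward everything to `chat_completion`: same parameters, same return values,
        # which is why both syntaxes are documented as strict aliases.
        return self._client.chat_completion(*args, **kwargs)


class _ProxyClientChat:
    def __init__(self, client: InferenceClient):
        self._client = client

    @property
    def completions(self) -> "_ProxyClientChatCompletions":
        return _ProxyClientChatCompletions(self._client)
```

With a `chat` attribute on the client returning such a proxy, `client.chat.completions.create(...)` ends up calling `client.chat_completion(...)`; the generated async client then only differs by the type annotation, which is exactly what the one-line string replacement in `_adapt_proxy_client` takes care of.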
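
As a quick usage sketch of the aliases and guards introduced in this series (with the patches applied): the endpoint URL and token below are placeholders, mirroring the values used in the new tests.

```python
from huggingface_hub import InferenceClient

# `base_url` / `api_key` are drop-in aliases for `model` / `token` (placeholder values below).
client = InferenceClient(base_url="http://127.0.0.1:8000", api_key="hf_***")

# Passing both members of an alias pair is rejected by the new constructor checks.
try:
    InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct", base_url="http://127.0.0.1:8000")
except ValueError as err:
    print(err)  # "Received both `model` and `base_url` arguments. Please provide only one of them. ..."
```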