
Commit

Complete tool call support
Signed-off-by: Xue, Chendi <chendi.xue@intel.com>
xuechendi committed Mar 11, 2024
1 parent fcf254a commit 3a0e574
Showing 13 changed files with 350 additions and 163 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,2 +1,3 @@
 # with [tools.setuptools] in pyproject.toml, the configs below work in both baremetal and container
 include inference/**/*.yaml
+include inference/**/*.jinja
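These include directives make the YAML configs and the new Jinja templates ship inside the built wheel as well as work from a source checkout. A minimal sketch of reading such a packaged file (the package and helper names here are assumptions, not code from this commit):

from importlib import resources

def read_packaged_file(name: str) -> str:
    # importlib.resources resolves data files whether the package was
    # installed from a wheel or is running from a source tree.
    return resources.files("llm_on_ray.inference").joinpath(name).read_text()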
@@ -81,7 +81,7 @@ def get_current_weather(location, unit):
"location": location,
"temperature": "78",
"unit": unit,
"forecast": ["sunny", "with a chance of meatballs"],
"forecast": ["sunny", "with a chance of rain"],
}
return weather_info

@@ -126,3 +126,4 @@ def _arun(self, location: str, unit: str):
 agent = create_openai_tools_agent(tools=tools, llm=llm, prompt=prompt)
 agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
 agent_executor.invoke({"input": "what is the weather today in Boston?"})
+agent_executor.invoke({"input": "tell me a short joke?"})

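For context, the _arun method in the hunk header above belongs to a custom LangChain tool. A minimal sketch of such a class, under assumed names rather than this file's exact code:

from langchain.tools import BaseTool

class WeatherTool(BaseTool):
    name = "get_current_weather"
    description = "Get the current weather in a given location"

    def _run(self, location: str, unit: str):
        # Synchronous path used by AgentExecutor by default.
        return get_current_weather(location, unit)

    async def _arun(self, location: str, unit: str):
        # Async counterpart; mirrors _run here.
        return get_current_weather(location, unit)

tools = [WeatherTool()]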
This file was deleted.

@@ -29,6 +29,8 @@
action="store_true",
help="Whether to enable streaming response",
)
parser.add_argument("--max_tokens", default=256, help="The maximum numbers of tokens to generate")


args = parser.parse_args()

@@ -52,6 +54,7 @@
     model_name=args.model_name,
     openai_api_key=openai_api_key,
     streaming=args.streaming_response,
+    max_tokens=args.max_tokens,
 )
 
 prompt = PromptTemplate(template="list 3 {things}", input_variables=["things"])
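Since --max_tokens arrives from the command line, type=int matters: argparse would otherwise hand ChatOpenAI a string. A standalone sketch of the coercion behavior, independent of this repo:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--max_tokens",
    default=256,
    type=int,  # without type=int, "--max_tokens 512" would arrive as the string "512"
    help="The maximum number of tokens to generate",
)

args = parser.parse_args(["--max_tokens", "512"])
assert args.max_tokens == 512  # coerced to int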
42 changes: 30 additions & 12 deletions examples/inference/api_server_openai/openai_tools_call_query.py
@@ -74,17 +74,35 @@
     }
 ]
 messages = [
-    {"role": "system", "content": "You are a helpful assistant"},
-    {"role": "user", "content": "What's the weather like in Boston today?"},
+    [
+        {"role": "user", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "What's the weather like in Boston today?"},
+    ],
+    [
+        {"role": "user", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "Tell me a short joke?"},
+    ],
 ]
-
-chat_completion = client.chat.completions.create(
-    model=args.model_name,
-    messages=messages,
-    max_tokens=args.max_new_tokens,
-    tools=tools,
-    tool_choice="auto",
-    stream=args.streaming_response,
-)
-
-print(repr(chat_completion.choices[0].message.model_dump()))
+for message in messages:
+    print(f"User: {message[1]['content']}")
+    print("Assistant:", end=" ", flush=True)
+    chat_completion = client.chat.completions.create(
+        model=args.model_name,
+        messages=message,
+        max_tokens=args.max_new_tokens,
+        tools=tools,
+        tool_choice="auto",
+        stream=args.streaming_response,
+    )
+
+    if args.streaming_response:
+        for chunk in chat_completion:
+            content = chunk.choices[0].delta.content
+            if content is not None:
+                print(content, end="", flush=True)
+            tool_calls = chunk.choices[0].delta.tool_calls
+            if tool_calls is not None:
+                print(tool_calls, end="", flush=True)
+        print("")
+    else:
+        print(repr(chat_completion.choices[0].message.model_dump()))
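The script now stops at printing the tool call or streaming it token by token. A natural follow-up, not part of this commit, is to execute the requested function and feed the result back in a role="tool" message. A minimal sketch of that non-streaming round trip, reusing the script's client, tools, args, and one message list, with a stand-in weather helper:

import json

def get_current_weather(location, unit="fahrenheit"):
    # Stand-in for a real lookup; mirrors the example tool's schema.
    return json.dumps({"location": location, "temperature": "78", "unit": unit})

response = client.chat.completions.create(
    model=args.model_name, messages=message, tools=tools, tool_choice="auto"
)
msg = response.choices[0].message
if msg.tool_calls:
    call = msg.tool_calls[0]
    result = get_current_weather(**json.loads(call.function.arguments))
    followup = client.chat.completions.create(
        model=args.model_name,
        messages=message + [msg, {"role": "tool", "tool_call_id": call.id, "content": result}],
    )
    print(followup.choices[0].message.content)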
12 changes: 6 additions & 6 deletions llm_on_ray/inference/api_openai_backend/openai_protocol.py
@@ -164,9 +164,14 @@ def __str__(self):
         return self.role
 
 
+class DeltaEOS(BaseModel):
+    class Config:
+        extra = "forbid"
+
+
 class DeltaContent(BaseModel):
     content: str
-    tool_calls: Optional[List[Dict[str, Any]]] = None
+    tool_calls: Optional[List[ToolCall]] = None
 
     def __str__(self):
         if self.tool_calls:
@@ -175,11 +180,6 @@ def __str__(self):
         return str(self.dict())
 
 
-class DeltaEOS(BaseModel):
-    class Config:
-        extra = "forbid"
-
-
 class DeltaChoices(BaseModel):
     delta: Union[DeltaRole, DeltaContent, DeltaEOS]
     index: int
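A streamed choice's delta progresses from DeltaRole to DeltaContent chunks and ends with an empty DeltaEOS, which is why all three appear in the DeltaChoices union, and the tool_calls field is now typed as ToolCall rather than raw dicts. A minimal sketch of building one content chunk; the ToolCall field names here are assumptions based on the OpenAI wire format, not taken from this diff:

delta = DeltaContent(
    content="",
    tool_calls=[
        ToolCall(
            id="call_0",
            type="function",
            function={
                "name": "get_current_weather",
                "arguments": '{"location": "Boston", "unit": "fahrenheit"}',
            },
        )
    ],
)
print(str(delta))  # the __str__ above surfaces tool_calls when present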
3 changes: 1 addition & 2 deletions llm_on_ray/inference/api_openai_backend/query_client.py
@@ -48,7 +48,6 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep
         else:
             raise HTTPException(404, f"Could not find model with id {model}")
 
-        prompt_content = prompt.prompt
         request_config = prompt.parameters
         temperature = request_config.get("temperature", 1.0)
         top_p = request_config.get("top_p", 1.0)
@@ -64,7 +63,7 @@
             async_iterator=deploy_handle.options(stream=True)
             .openai_call.options(stream=True, use_new_handle_api=True)
             .remote(
-                prompt_content,
+                prompt.prompt,
                 gen_config,
                 streaming_response=streaming_reponse,
                 tools=prompt.tools,
31 changes: 17 additions & 14 deletions llm_on_ray/inference/api_openai_backend/router_app.py
@@ -162,13 +162,15 @@ async def _chat_completions_wrapper(
                 finish_reason=None,
             )
         ]
-        yield "data: " + ChatCompletionResponse(
+        chunk = ChatCompletionResponse(
             id=completion_id,
             object="chat.completion.chunk",
             model=body.model,
             choices=choices,
             usage=None,
-        ).json() + "\n\n"
+        )
+        data = chunk.json()
+        yield f"data: {data}\n\n"
 
         all_results = []
         async for results in generator:
@@ -198,13 +200,16 @@
                     finish_reason=None,
                 )
             ]
-            yield "data: " + ChatCompletionResponse(
+            chunk = ChatCompletionResponse(
                 id=completion_id,
                 object="chat.completion.chunk",
                 model=body.model,
                 choices=choices,
                 usage=None,
-            ).json() + "\n\n"
+            )
+            # data = chunk.json(exclude_unset=True, ensure_ascii=False)
+            data = chunk.json()
+            yield f"data: {data}\n\n"
             if had_error:
                 # Return early in case of an error
                 break
@@ -221,13 +226,15 @@
             if all_results
             else None
         )
-        yield "data: " + ChatCompletionResponse(
+        chunk = ChatCompletionResponse(
             id=completion_id,
             object="chat.completion.result",
             model=body.model,
             choices=choices,
             usage=usage,
-        ).json() + "\n\n"
+        )
+        data = chunk.json()
+        yield f"data: {data}\n\n"
         yield "data: [DONE]\n\n"


@@ -333,17 +340,13 @@ async def chat(
     Returns:
         A response object with completions.
     """
-    tools = body.tools
-    tool_choice = body.tool_choice
-    # Doing this to remove them from sampling params
-    body.tools = None
-    body.tool_choice = None
-
     prompt = Prompt(
-        prompt=body.messages, parameters=dict(body), tools=tools, tool_choice=tool_choice
+        prompt=body.messages,
+        parameters=dict(body),
+        tools=body.tools,
+        tool_choice=body.tool_choice,
     )
     request_id = f"chatcmpl-{str(uuid.uuid4().hex)}"
 
     if body.stream:
         return StreamingResponse(
             _chat_completions_wrapper(
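Each yield above emits one server-sent-events frame of the form "data: <json>\n\n", terminated by "data: [DONE]". A minimal client-side sketch of consuming that stream with requests; the endpoint path and model name are assumptions:

import json
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",  # endpoint path assumed
    json={
        "model": "my-model",  # placeholder model id
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": True,
    },
    stream=True,
)
for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    delta = chunk["choices"][0]["delta"]
    print(delta.get("content", ""), end="", flush=True)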
@@ -0,0 +1,60 @@
{%- set func_call_token = "!function_call:" -%} {#- The special prefix to functions calls, be aware of extra space or new lines ! -#}

{%- if CONTEXT == CALL_TOKEN -%} {#- return only the func_call_token value. Needed by the implementation. No data included -#}
{{- func_call_token -}}
{%- endif -%} {#- CONTEXT == CALL_TOKEN -#}

{%- if CONTEXT == CALLS_NOTIF -%} {#- Format the notification of the function call. Data: tool_calls = ToolCall -#}
{%- for call in tool_calls -%}
{%- if call.function.arguments == None or call.function.arguments|count == 0 -%}
{{- call.id }} was called with no argument
{%- else -%}
{{- call.id }} was called with arguments : {{- call.function.arguments -}}
{%- endif -%}
{%- raw %}
{% endraw -%}
{%- endfor -%}
{%- endif -%} {#- CONTEXT == CALLS_NOTIF -#}

{%- if CONTEXT == TOOL_RESPONSE -%} {#- Format of the response of the function call. Data: message = ChatMessage -#}
{{- message.content -}}
{%- endif -%} {#- CONTEXT == TOOL_RESPONSE -#}

{%- if CONTEXT == FORCE_CALL -%} {#- One tool call defined request. Data: tool = ToolCall -#}
You must call the following function at least one time to answer the question. You may call it multiple times if needed:
{%- if tool.function.parameters == None or tool.function.parameters|count == 0 -%} {#- without parameter #}
{'name': "{{tool.function.name}}", 'description': "{{tool.function.description}}", 'arguments': null},
{%- else -%} {#- with parameters #}
{'name': "{{tool.function.name}}", 'description': "{{tool.function.description}}", 'arguments': { {{tool.function.parameters}} {{ '}}' }},
{%- endif %} {#- tool.function.parameters #}
{%- endif -%} {#- CONTEXT == FORCE_CALL -#}

{%- if CONTEXT == FUNCTIONS_LIST -%} {#- Functions list generation Data: tools_list = List[ToolCall] -#}
{%- raw -%}The following is a list of external functions that may be called to complete certain tasks:
[
{%- endraw -%}
{%- for tool in tools_list -%}
{%- if tool.function.parameters == None or tool.function.parameters|count == 0 -%} {#- without parameter #}
{'name': "{{tool.function.name}}", 'description': "{{tool.function.description}}", 'arguments': null},
{%- else -%} {#- with parameters #}
{'name': "{{tool.function.name}}", 'description': "{{tool.function.description}}", 'arguments': { {{tool.function.parameters}} {{ '}}' }},
{% endif -%} {#- tool.function.parameters #}
{%- endfor -%}
{%- raw %}
]
End of list

* Whenever the user asks you something, you can either respond directly or invoke a function if it is present in the previous list.
* The decision to invoke a function is yours, only invoke a function if it is necessary to answer the user's question
* If you need to call at least one function, your message should contain only a list of function calls and nothing else; the function calls are the response.
{%- endraw %}
{%- endif -%} {#- CONTEXT == FUNCTIONS_LIST -#}

{%- if CONTEXT == FORCE_CALL or CONTEXT == FUNCTIONS_LIST -%}
To call a function, the message must start by "{{func_call_token}}" followed by a json like this:
* With arguments:
{{func_call_token}}{"name": "function_name", "arguments": {"arg1": "value1"}}
* Without arguments:
{{func_call_token}}{"name": "function_name", "arguments": null}
End of functions instructions
{%- endif -%} {#- CONTEXT == FORCE_CALL or CONTEXT == FUNCTIONS_LIST -#}
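This template multiplexes several prompt fragments on a CONTEXT variable compared against mode names. A minimal sketch of driving it with jinja2; the file path and the convention of binding each mode name to itself are assumptions, since the loader code is not part of this diff:

from jinja2 import Environment

MODES = ["CALL_TOKEN", "CALLS_NOTIF", "TOOL_RESPONSE", "FORCE_CALL", "FUNCTIONS_LIST"]

with open("tools_template.jinja") as f:  # hypothetical path
    template = Environment().from_string(f.read())

# Binding each mode variable to its own name makes the equality
# checks like {% if CONTEXT == CALL_TOKEN %} select one fragment.
prefix = template.render(CONTEXT="CALL_TOKEN", **{m: m for m in MODES})
print(prefix)  # -> !function_call: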

