Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 11 additions & 18 deletions docs/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -1500,7 +1500,7 @@
"streaming_query"
],
"summary": "Streaming Query Endpoint Handler",
"description": "Handle request to the /streaming_query endpoint using Agent API.\n\nThis is a wrapper around streaming_query_endpoint_handler_base that provides\nthe Agent API specific retrieve_response and response generator functions.\n\nReturns:\n StreamingResponse: An HTTP streaming response yielding\n SSE-formatted events for the query lifecycle.\n\nRaises:\n HTTPException: Returns HTTP 500 if unable to connect to the\n Llama Stack server.",
"description": "Handle request to the /streaming_query endpoint using Agent API.\n\nReturns a streaming response using Server-Sent Events (SSE) format with\ncontent type text/event-stream.\n\nReturns:\n StreamingResponse: An HTTP streaming response yielding\n SSE-formatted events for the query lifecycle with content type\n text/event-stream.\n\nRaises:\n HTTPException:\n - 401: Unauthorized - Missing or invalid credentials\n - 403: Forbidden - Insufficient permissions or model override not allowed\n - 404: Not Found - Conversation, model, or provider not found\n - 422: Unprocessable Entity - Request validation failed\n - 429: Too Many Requests - Quota limit exceeded\n - 500: Internal Server Error - Configuration not loaded or other server errors\n - 503: Service Unavailable - Unable to connect to Llama Stack backend",
"operationId": "streaming_query_endpoint_handler_v1_streaming_query_post",
"requestBody": {
"content": {
Expand All @@ -1514,16 +1514,14 @@
},
"responses": {
"200": {
"description": "Streaming response (Server-Sent Events)",
"description": "Successful response",
"content": {
"application/json": {
"schema": {}
},
"text/event-stream": {
"schema": {
"type": "string"
"type": "string",
"format": "text/event-stream"
},
"example": "data: {\"event\": \"start\", \"data\": {\"conversation_id\": \"123e4567-e89b-12d3-a456-426614174000\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 0, \"token\": \"Hello\"}}\n\ndata: {\"event\": \"end\", \"data\": {\"referenced_documents\": [], \"truncated\": null, \"input_tokens\": 0, \"output_tokens\": 0}, \"available_quotas\": {}}\n\n"
"example": "data: {\"event\": \"start\", \"data\": {\"conversation_id\": \"123e4567-e89b-12d3-a456-426614174000\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 0, \"token\": \"No Violation\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 1, \"token\": \"\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 2, \"token\": \"Hello\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 3, \"token\": \"!\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 4, \"token\": \" How\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 5, \"token\": \" can\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 6, \"token\": \" I\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 7, \"token\": \" assist\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 8, \"token\": \" you\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 9, \"token\": \" today\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 10, \"token\": \"?\"}}\n\ndata: {\"event\": \"turn_complete\", \"data\": {\"token\": \"Hello! How can I assist you today?\"}}\n\ndata: {\"event\": \"end\", \"data\": {\"rag_chunks\": [], \"referenced_documents\": [], \"truncated\": null, \"input_tokens\": 11, \"output_tokens\": 19, \"available_quotas\": {}}}\n\n"
}
}
},
Expand Down Expand Up @@ -3719,7 +3717,7 @@
"streaming_query_v2"
],
"summary": "Streaming Query Endpoint Handler V2",
"description": "Handle request to the /streaming_query endpoint using Responses API.\n\nThis is a wrapper around streaming_query_endpoint_handler_base that provides\nthe Responses API specific retrieve_response and response generator functions.\n\nReturns:\n StreamingResponse: An HTTP streaming response yielding\n SSE-formatted events for the query lifecycle.\n\nRaises:\n HTTPException: Returns HTTP 500 if unable to connect to the\n Llama Stack server.",
"description": "Handle request to the /streaming_query endpoint using Responses API.\n\nReturns a streaming response using Server-Sent Events (SSE) format with\ncontent type text/event-stream.\n\nReturns:\n StreamingResponse: An HTTP streaming response yielding\n SSE-formatted events for the query lifecycle with content type\n text/event-stream.\n\nRaises:\n HTTPException:\n - 401: Unauthorized - Missing or invalid credentials\n - 403: Forbidden - Insufficient permissions or model override not allowed\n - 404: Not Found - Conversation, model, or provider not found\n - 422: Unprocessable Entity - Request validation failed\n - 429: Too Many Requests - Quota limit exceeded\n - 500: Internal Server Error - Configuration not loaded or other server errors\n - 503: Service Unavailable - Unable to connect to Llama Stack backend",
"operationId": "streaming_query_endpoint_handler_v2_v2_streaming_query_post",
"requestBody": {
"content": {
Expand All @@ -3733,19 +3731,14 @@
},
"responses": {
"200": {
"description": "Streaming response with Server-Sent Events",
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "string",
"example": "data: {\"event\": \"start\", \"data\": {\"conversation_id\": \"123e4567-e89b-12d3-a456-426614174000\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 0, \"token\": \"Hello\"}}\n\ndata: {\"event\": \"end\", \"data\": {\"referenced_documents\": [], \"truncated\": null, \"input_tokens\": 0, \"output_tokens\": 0}, \"available_quotas\": {}}\n\n"
}
},
"text/plain": {
"text/event-stream": {
"schema": {
"type": "string",
"example": "Hello world!\n\n---\n\nReference: https://example.com/doc"
}
"format": "text/event-stream"
},
"example": "data: {\"event\": \"start\", \"data\": {\"conversation_id\": \"123e4567-e89b-12d3-a456-426614174000\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 0, \"token\": \"No Violation\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 1, \"token\": \"\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 2, \"token\": \"Hello\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 3, \"token\": \"!\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 4, \"token\": \" How\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 5, \"token\": \" can\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 6, \"token\": \" I\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 7, \"token\": \" assist\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 8, \"token\": \" you\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 9, \"token\": \" today\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 10, \"token\": \"?\"}}\n\ndata: {\"event\": \"turn_complete\", \"data\": {\"token\": \"Hello! How can I assist you today?\"}}\n\ndata: {\"event\": \"end\", \"data\": {\"rag_chunks\": [], \"referenced_documents\": [], \"truncated\": null, \"input_tokens\": 11, \"output_tokens\": 19, \"available_quotas\": {}}}\n\n"
}
}
},
Expand Down
41 changes: 19 additions & 22 deletions src/app/endpoints/streaming_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
NotFoundResponse,
QuotaExceededResponse,
ServiceUnavailableResponse,
StreamingQueryResponse,
UnauthorizedResponse,
UnprocessableEntityResponse,
)
Expand All @@ -76,22 +77,7 @@


streaming_query_responses: dict[int | str, dict[str, Any]] = {
200: {
"description": "Streaming response (Server-Sent Events)",
"content": {
"text/event-stream": {
"schema": {"type": "string"},
"example": (
'data: {"event": "start", '
'"data": {"conversation_id": "123e4567-e89b-12d3-a456-426614174000"}}\n\n'
'data: {"event": "token", "data": {"id": 0, "token": "Hello"}}\n\n'
'data: {"event": "end", "data": {"referenced_documents": [], '
'"truncated": null, "input_tokens": 0, "output_tokens": 0}, '
'"available_quotas": {}}\n\n'
),
}
},
},
200: StreamingQueryResponse.openapi_response(),
401: UnauthorizedResponse.openapi_response(
examples=["missing header", "missing token"]
),
Expand Down Expand Up @@ -937,7 +923,11 @@ async def error_generator() -> AsyncGenerator[str, None]:
return StreamingResponse(error_generator(), media_type=content_type)


@router.post("/streaming_query", responses=streaming_query_responses)
@router.post(
"/streaming_query",
response_class=StreamingResponse,
responses=streaming_query_responses,
)
@authorize(Action.STREAMING_QUERY)
async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals,too-many-statements
request: Request,
Expand All @@ -948,16 +938,23 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals,t
"""
Handle request to the /streaming_query endpoint using Agent API.

This is a wrapper around streaming_query_endpoint_handler_base that provides
the Agent API specific retrieve_response and response generator functions.
Returns a streaming response using Server-Sent Events (SSE) format with
content type text/event-stream.

Returns:
StreamingResponse: An HTTP streaming response yielding
SSE-formatted events for the query lifecycle.
SSE-formatted events for the query lifecycle with content type
text/event-stream.

Raises:
HTTPException: Returns HTTP 500 if unable to connect to the
Llama Stack server.
HTTPException:
- 401: Unauthorized - Missing or invalid credentials
- 403: Forbidden - Insufficient permissions or model override not allowed
- 404: Not Found - Conversation, model, or provider not found
- 422: Unprocessable Entity - Request validation failed
- 429: Too Many Requests - Quota limit exceeded
- 500: Internal Server Error - Configuration not loaded or other server errors
- 503: Service Unavailable - Unable to connect to Llama Stack backend
"""
return await streaming_query_endpoint_handler_base(
request=request,
Expand Down
49 changes: 19 additions & 30 deletions src/app/endpoints/streaming_query_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
NotFoundResponse,
QuotaExceededResponse,
ServiceUnavailableResponse,
StreamingQueryResponse,
UnauthorizedResponse,
UnprocessableEntityResponse,
)
Expand All @@ -58,30 +59,7 @@
auth_dependency = get_auth_dependency()

streaming_query_v2_responses: dict[int | str, dict[str, Any]] = {
200: {
"description": "Streaming response with Server-Sent Events",
"content": {
"application/json": {
"schema": {
"type": "string",
"example": (
'data: {"event": "start", '
'"data": {"conversation_id": "123e4567-e89b-12d3-a456-426614174000"}}\n\n'
'data: {"event": "token", "data": {"id": 0, "token": "Hello"}}\n\n'
'data: {"event": "end", "data": {"referenced_documents": [], '
'"truncated": null, "input_tokens": 0, "output_tokens": 0}, '
'"available_quotas": {}}\n\n'
),
}
},
"text/plain": {
"schema": {
"type": "string",
"example": "Hello world!\n\n---\n\nReference: https://example.com/doc",
}
},
},
},
200: StreamingQueryResponse.openapi_response(),
401: UnauthorizedResponse.openapi_response(
examples=["missing header", "missing token"]
),
Expand Down Expand Up @@ -313,7 +291,11 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
return response_generator


@router.post("/streaming_query", responses=streaming_query_v2_responses)
@router.post(
"/streaming_query",
response_class=StreamingResponse,
responses=streaming_query_v2_responses,
)
@authorize(Action.STREAMING_QUERY)
async def streaming_query_endpoint_handler_v2( # pylint: disable=too-many-locals
request: Request,
Expand All @@ -324,16 +306,23 @@ async def streaming_query_endpoint_handler_v2( # pylint: disable=too-many-local
"""
Handle request to the /streaming_query endpoint using Responses API.

This is a wrapper around streaming_query_endpoint_handler_base that provides
the Responses API specific retrieve_response and response generator functions.
Returns a streaming response using Server-Sent Events (SSE) format with
content type text/event-stream.

Returns:
StreamingResponse: An HTTP streaming response yielding
SSE-formatted events for the query lifecycle.
SSE-formatted events for the query lifecycle with content type
text/event-stream.

Raises:
HTTPException: Returns HTTP 500 if unable to connect to the
Llama Stack server.
HTTPException:
- 401: Unauthorized - Missing or invalid credentials
- 403: Forbidden - Insufficient permissions or model override not allowed
- 404: Not Found - Conversation, model, or provider not found
- 422: Unprocessable Entity - Request validation failed
- 429: Too Many Requests - Quota limit exceeded
- 500: Internal Server Error - Configuration not loaded or other server errors
- 503: Service Unavailable - Unable to connect to Llama Stack backend
"""
return await streaming_query_endpoint_handler_base(
request=request,
Expand Down
73 changes: 71 additions & 2 deletions src/models/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from quota.quota_exceed_error import QuotaExceedError
from models.config import Action, Configuration

SUCCESSFUL_RESPONSE_DESCRIPTION = "Successful response"
BAD_REQUEST_DESCRIPTION = "Invalid request format"
UNAUTHORIZED_DESCRIPTION = "Unauthorized"
FORBIDDEN_DESCRIPTION = "Permission denied"
Expand Down Expand Up @@ -52,7 +53,7 @@ def openapi_response(cls) -> dict[str, Any]:
content = {"application/json": {"example": example_value}}

return {
"description": "Successful response",
"description": SUCCESSFUL_RESPONSE_DESCRIPTION,
"model": cls,
"content": content,
}
Expand Down Expand Up @@ -449,6 +450,74 @@ class QueryResponse(AbstractSuccessfulResponse):
}


class StreamingQueryResponse(AbstractSuccessfulResponse):
    """Documentation-only model for streaming query responses using Server-Sent Events (SSE)."""

    # The single example below is one complete SSE exchange: a "start" event,
    # a run of "token" events, a "turn_complete" event and a closing "end"
    # event. Each event is terminated by a blank line ("\n\n").
    model_config = {
        "json_schema_extra": {
            "examples": [
                (
                    'data: {"event": "start", '
                    '"data": {"conversation_id": "123e4567-e89b-12d3-a456-426614174000"}}\n\n'
                    'data: {"event": "token", '
                    '"data": {"id": 0, "token": "No Violation"}}\n\n'
                    'data: {"event": "token", '
                    '"data": {"id": 1, "token": ""}}\n\n'
                    'data: {"event": "token", '
                    '"data": {"id": 2, "token": "Hello"}}\n\n'
                    'data: {"event": "token", '
                    '"data": {"id": 3, "token": "!"}}\n\n'
                    'data: {"event": "token", '
                    '"data": {"id": 4, "token": " How"}}\n\n'
                    'data: {"event": "token", '
                    '"data": {"id": 5, "token": " can"}}\n\n'
                    'data: {"event": "token", '
                    '"data": {"id": 6, "token": " I"}}\n\n'
                    'data: {"event": "token", '
                    '"data": {"id": 7, "token": " assist"}}\n\n'
                    'data: {"event": "token", '
                    '"data": {"id": 8, "token": " you"}}\n\n'
                    'data: {"event": "token", '
                    '"data": {"id": 9, "token": " today"}}\n\n'
                    'data: {"event": "token", '
                    '"data": {"id": 10, "token": "?"}}\n\n'
                    'data: {"event": "turn_complete", '
                    '"data": {"token": "Hello! How can I assist you today?"}}\n\n'
                    'data: {"event": "end", '
                    '"data": {"rag_chunks": [], "referenced_documents": [], '
                    '"truncated": null, "input_tokens": 11, "output_tokens": 19, '
                    '"available_quotas": {}}}\n\n'
                ),
            ]
        }
    }

    @classmethod
    def openapi_response(cls) -> dict[str, Any]:
        """Build the OpenAPI description of the 200 response for SSE streaming.

        The first entry of the model's JSON-schema ``examples`` is published
        as the example payload under the ``text/event-stream`` media type.

        This dict is meant for OpenAPI documentation only: the endpoint itself
        returns a StreamingResponse object, not an instance of this model.

        Returns:
            dict[str, Any]: A mapping with the keys ``description`` and
            ``content``.

        Raises:
            SchemaError: If the model's JSON schema carries no examples.
        """
        examples = cls.model_json_schema().get("examples")
        if not examples:
            raise SchemaError(f"Examples not found in {cls.__name__}")

        sse_media_type = {
            "schema": {"type": "string", "format": "text/event-stream"},
            "example": examples[0],
        }

        # Deliberately no "model" key: this model is never serialized as the
        # response body, it only supplies the documented example.
        return {
            "description": SUCCESSFUL_RESPONSE_DESCRIPTION,
            "content": {"text/event-stream": sse_media_type},
        }


class InfoResponse(AbstractSuccessfulResponse):
"""Model representing a response to an info request.

Expand Down Expand Up @@ -806,7 +875,7 @@ def openapi_response(cls) -> dict[str, Any]:
content = {"application/json": {"examples": named_examples or None}}

return {
"description": "Successful response",
"description": SUCCESSFUL_RESPONSE_DESCRIPTION,
"model": cls,
"content": content,
}
Expand Down
Loading
Loading