lightspeed-stack.yaml (1 change: 1 addition & 0 deletions)
@@ -3,3 +3,4 @@ llama_stack:
   use_as_library_client: false
   url: http://localhost:8321
   api_key: xyzzy
+  chat_completion_mode: true
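
For context, a minimal sketch (not part of this change) of how this YAML could be loaded into the LLamaStackConfiguration model shown later in this diff. The file path and the top-level llama_stack key follow the hunk header above; the loading code itself is an assumption, not code from this repository.

import yaml  # assumption: PyYAML is available

from models.config import LLamaStackConfiguration

with open("lightspeed-stack.yaml", encoding="utf-8") as config_file:
    data = yaml.safe_load(config_file)

# Build the Pydantic model from the llama_stack section of the YAML.
llama_stack_config = LLamaStackConfiguration(**data["llama_stack"])
print(llama_stack_config.chat_completion_mode)  # True, given the line added above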
src/app/endpoints/query.py (56 changes: 43 additions & 13 deletions)
@@ -4,6 +4,7 @@
 from typing import Any

 from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+from llama_stack_client.lib.agents.agent import Agent
 from llama_stack_client import LlamaStackClient

 from fastapi import APIRouter, Request
@@ -12,7 +13,7 @@
 from models.config import LLamaStackConfiguration
 from models.responses import QueryResponse

-logger = logging.getLogger(__name__)
+logger = logging.getLogger("app.endpoints.handlers")
 router = APIRouter(tags=["models"])


@@ -28,10 +29,8 @@
 def info_endpoint_handler(request: Request, query: str) -> QueryResponse:
     llama_stack_config = configuration.llama_stack_configuration
     logger.info("LLama stack config: %s", llama_stack_config)
-    client = getLLamaStackClient(llama_stack_config)
-    client = LlamaStackClient(
-        base_url=llama_stack_config.url, api_key=llama_stack_config.api_key
-    )
+
+    client = get_llama_stack_client(llama_stack_config)

     # retrieve list of available models
     models = client.models.list()
@@ -42,24 +41,55 @@ def info_endpoint_handler(request: Request, query: str) -> QueryResponse:

     logger.info("Model: %s", model_id)

-    response = client.inference.chat_completion(
-        model_id=model_id,
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": query},
-        ],
+    response = retrieve_response(
+        client, model_id, llama_stack_config.chat_completion_mode, query
     )
-    return QueryResponse(query=query, response=str(response.completion_message.content))
+
+    return QueryResponse(query=query, response=response)
+
+
+def retrieve_response(
+    client: LlamaStackClient, model_id: str, chat_completion_mode: bool, query: str
+) -> str:
+    if chat_completion_mode:
+        logger.info("Chat completion mode enabled")
+        response = client.inference.chat_completion(

Review comment (Contributor):
IMO all interactions with llama-stack will be through an agent.

Here you're calling the inference provider directly.

"Chat" can be implemented like this:

agent = Agent(client, ...)
# [session/conversation persistence but this is something else for the future...]
# session_id = agent.create_session("test-session")

prompt = input("Enter a prompt: ")
response = agent.create_turn(
  messages=[UserMessage(
    role="user",
    content=prompt,
  )],
  session_id=session_id,
)

for log in EventLogger().log(response):
  log.print()

Review comment (@manstis, May 15, 2025):

Using the inference provider on its own means you're skipping the Agent's support for "safety/shields (Question validity, Answer redaction)", RAG and other things. You'd be left to implement support for these things yourself - which was OK in road-core but is supported "out of the box" with llama-stack.

+            model_id=model_id,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": query},
+            ],
+        )
+        return str(response.completion_message.content)
+    else:
+        logger.info("Chat completion mode disabled")
+        agent = Agent(
+            client,
+            model=model_id,
+            instructions="You are a helpful assistant",
+            tools=[],
+        )
+
+        prompt = "How do you do great work?"
+
+        response = agent.create_turn(
+            messages=[{"role": "user", "content": prompt}],
+            session_id=agent.create_session("rag_session"),
+            stream=False,
+        )
+        return str(response.output_message.content)
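
Following up on the review comments above: a minimal, hedged sketch of how the agent branch could attach shields and a RAG tool, which the direct inference.chat_completion path bypasses. It assumes the Agent constructor in this client version accepts shield and tool arguments; the shield identifier, vector database id, and tool name are placeholders for illustration, not configuration introduced by this PR.

# Sketch only: shield and tool identifiers are assumed placeholders.
agent = Agent(
    client,
    model=model_id,
    instructions="You are a helpful assistant",
    input_shields=["llama_guard"],   # question validity checks (assumed shield id)
    output_shields=["llama_guard"],  # answer redaction (assumed shield id)
    tools=[
        {
            "name": "builtin::rag/knowledge_search",      # assumed RAG tool name
            "args": {"vector_db_ids": ["my_documents"]},  # assumed vector DB id
        }
    ],
)
response = agent.create_turn(
    messages=[{"role": "user", "content": query}],
    session_id=agent.create_session("query_session"),
    stream=False,
)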


-def getLLamaStackClient(
+def get_llama_stack_client(
     llama_stack_config: LLamaStackConfiguration,
 ) -> LlamaStackClient:
     if llama_stack_config.use_as_library_client is True:
         logger.info("Using Llama stack as library client")
         client = LlamaStackAsLibraryClient("ollama")
         client.initialize()
         return client
     else:
         logger.info("Using Llama stack running as a service")
         return LlamaStackClient(
             base_url=llama_stack_config.url, api_key=llama_stack_config.api_key
         )
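
For orientation, a sketch (not part of the PR) of how the reworked helpers fit together, using the values from lightspeed-stack.yaml in this diff. The import path and the model selection line are assumptions, since the code that picks model_id is collapsed in this view; note also that in the agent branch retrieve_response currently sends a fixed example prompt rather than the caller's query.

from app.endpoints.query import get_llama_stack_client, retrieve_response  # assumed import path
from models.config import LLamaStackConfiguration

config = LLamaStackConfiguration(
    use_as_library_client=False,
    url="http://localhost:8321",
    api_key="xyzzy",
    chat_completion_mode=True,
)

client = get_llama_stack_client(config)        # plain LlamaStackClient, since use_as_library_client is False
model_id = client.models.list()[0].identifier  # assumption: the handler picks a model from models.list()
answer = retrieve_response(client, model_id, config.chat_completion_mode, "How do I configure llama-stack?")
print(answer)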
src/app/endpoints/root.py (5 changes: 4 additions & 1 deletion)
@@ -5,9 +5,11 @@
 from fastapi import APIRouter, Request
 from fastapi.responses import HTMLResponse

-logger = logging.getLogger(__name__)
+logger = logging.getLogger("app.endpoints.handlers")
+
 router = APIRouter(tags=["root"])

+
 index_page = """
 <html>
 <head>
@@ -25,4 +27,5 @@

 @router.get("/", response_class=HTMLResponse)
 def root_endpoint_handler(request: Request) -> HTMLResponse:
+    logger.info("Serving index page")
     return HTMLResponse(index_page)
src/models/config.py (1 change: 1 addition & 0 deletions)
@@ -10,6 +10,7 @@ class LLamaStackConfiguration(BaseModel):
     url: Optional[str] = None
     api_key: Optional[str] = None
     use_as_library_client: Optional[bool] = None
+    chat_completion_mode: bool = False

     @model_validator(mode="after")
     def check_llama_stack_model(self) -> Self:
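
A short sketch (not part of this change) of what the new field means for existing configurations, assuming the truncated check_llama_stack_model validator accepts a URL-plus-API-key setup like the one in lightspeed-stack.yaml.

from models.config import LLamaStackConfiguration

# Without the new key the mode defaults to False, so retrieve_response takes
# the agent branch and existing configurations keep their current behaviour.
cfg = LLamaStackConfiguration(
    use_as_library_client=False, url="http://localhost:8321", api_key="xyzzy"
)
assert cfg.chat_completion_mode is False

# Setting it mirrors the chat_completion_mode: true line added to the YAML.
cfg = LLamaStackConfiguration(
    use_as_library_client=False, url="http://localhost:8321", api_key="xyzzy",
    chat_completion_mode=True,
)
assert cfg.chat_completion_mode is True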