From fc1a86d5c66a6916fdac5704e4f2aaafefd3874c Mon Sep 17 00:00:00 2001
From: Pavel Tisnovsky
Date: Tue, 13 May 2025 10:41:32 +0200
Subject: [PATCH] Use llama-stack to retrieve LLM output

---
 src/app/endpoints/query.py | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py
index f3bde3c8..da4164db 100644
--- a/src/app/endpoints/query.py
+++ b/src/app/endpoints/query.py
@@ -3,8 +3,11 @@
 import logging
 from typing import Any
 
+from llama_stack_client import LlamaStackClient
+
 from fastapi import APIRouter, Request
 
+from configuration import configuration
 from models.responses import QueryResponse
 
 logger = logging.getLogger(__name__)
@@ -19,6 +22,28 @@
 }
 
 
-@router.get("/query", responses=query_response)
-def info_endpoint_handler(request: Request) -> QueryResponse:
-    return QueryResponse(query="foo", response="bar")
+@router.post("/query", responses=query_response)
+def info_endpoint_handler(request: Request, query: str) -> QueryResponse:
+    llama_stack_config = configuration.llama_stack_configuration
+    logger.info("LLama stack config: %s", llama_stack_config)
+    client = LlamaStackClient(
+        base_url=llama_stack_config.url, api_key=llama_stack_config.api_key
+    )
+
+    # retrieve list of available models
+    models = client.models.list()
+
+    # select the first LLM
+    llm = next(m for m in models if m.model_type == "llm")
+    model_id = llm.identifier
+
+    logger.info("Model: %s", model_id)
+
+    response = client.inference.chat_completion(
+        model_id=model_id,
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": query},
+        ],
+    )
+    return QueryResponse(query=query, response=str(response.completion_message.content))
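
Not part of the patch itself: a minimal sketch of how the reworked endpoint could be exercised once the service is running. The host, port, and use of the requests library are assumptions for illustration only. Because the handler declares "query: str" without a Body annotation, FastAPI reads it from the query string even though the route is now POST.

import requests

# The handler takes "query" as a plain query-string parameter on the POST route.
resp = requests.post(
    "http://localhost:8080/query",   # assumed host and port, adjust to your deployment
    params={"query": "Say hello"},
    timeout=60,
)
resp.raise_for_status()

# QueryResponse serializes to {"query": ..., "response": ...}
print(resp.json()["response"])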