In [None]:

from pathlib import Path
import os
import logging
import sys
from uuid import UUID

from langchain_community.chat_models import ChatLiteLLM
from langchain_community.embeddings import AzureOpenAIEmbeddings
from langchain.schema import SystemMessage
from langchain_elasticsearch.vectorstores import ElasticsearchStore, ApproxRetrievalStrategy
from elasticsearch import Elasticsearch

from redbox.models import Settings, File
from redbox.models.settings import ElasticLocalSettings
from redbox.storage import ElasticsearchStorageHandler

from pydantic import BaseModel

from dotenv import find_dotenv, load_dotenv
import json
from humanize import naturaltime

# Parent of the current working directory (assumed to be the repository root
# when the notebook is run from its own folder -- TODO confirm).
ROOT = Path().resolve().parent

# Make ROOT importable. Guarded so that re-running this cell does not keep
# appending the same entry to sys.path.
# NOTE(review): the `redbox` imports at the top of this cell execute before
# this line, so this only helps imports that happen later.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

_ = load_dotenv(find_dotenv(ROOT / '.env'))

# The keyword is `stream`; the previous `steam` typo is rejected by
# logging.basicConfig (ValueError: Unrecognised argument(s)) when the root
# logger has no handlers yet, and is silently ignored otherwise.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
log = logging.getLogger()

# Project settings loaded from the .env file under ROOT, with the MinIO and
# Elasticsearch hosts overridden to localhost.
env = Settings(
    _env_file=ROOT / ".env",
    object_store="minio",
    minio_host="localhost",
    elastic=ElasticLocalSettings(host="localhost"),
)

# Elasticsearch client on localhost; port, scheme and credentials come from
# the settings object.
es = Elasticsearch(
    hosts=[dict(host="localhost", port=env.elastic.port, scheme=env.elastic.scheme)],
    basic_auth=(env.elastic.user, env.elastic.password),
)

# Pick the retrieval strategy from the configured Elasticsearch subscription
# level: hybrid retrieval is enabled only for "platinum" and "enterprise".
if env.elastic.subscription_level == "basic":
    strategy = ApproxRetrievalStrategy(hybrid=False)
elif env.elastic.subscription_level in ["platinum", "enterprise"]:
    strategy = ApproxRetrievalStrategy(hybrid=True)
else:
    # Without this branch `strategy` would stay undefined and the failure
    # would only surface later as a confusing NameError.
    raise ValueError(
        f"Unsupported Elasticsearch subscription level: "
        f"{env.elastic.subscription_level!r} "
        "(expected 'basic', 'platinum' or 'enterprise')"
    )

# Azure OpenAI embedding model, configured from the project settings.
embedding_model = AzureOpenAIEmbeddings(
    azure_endpoint=env.azure_openai_endpoint,
    model=env.azure_embedding_model,
    openai_api_key=env.azure_openai_api_key,
    openai_api_version=env.openai_api_version,
    chunk_size=2048,
    # Options explicitly left unset.
    openai_api_base=None,
    openai_organization=None,
    azure_ad_token=None,
)

# Vector store over the integration chunk index, queried through the
# "azure_embedding" vector field.
vector_store = ElasticsearchStore(
    index_name="redbox-data-integration-chunk",
    es_connection=es,
    embedding=embedding_model,
    vector_query_field="azure_embedding",
    strategy=strategy,
)

# Workaround (details in core_api.dependencies): expose the OpenAI API version
# through the AZURE_API_VERSION environment variable.
os.environ["AZURE_API_VERSION"] = env.openai_api_version

# Streaming chat model, capped at 1,024 output tokens.
llm = ChatLiteLLM(
    model=env.azure_openai_model,
    api_base=env.azure_openai_endpoint,
    azure_key=env.azure_openai_api_key,
    max_tokens=1_024,
    streaming=True,
)

# Storage handler pointed at the "-integration" variant of the root index.
storage_handler = ElasticsearchStorageHandler(
    es_client=es,
    root_index=env.elastic_root_index + "-integration",
)

In [None]:
# Smoke test: run one similarity search to confirm the embedding model and
# vector store respond. `user_query` is reused by the prompt-building cell.

user_query = "What can we do to improve school inspections?"

vector_store.similarity_search(user_query, k=10)

Three approaches to develop:
- `LIST`: List available files and have LLM select
- `LIST_SUMMARY`: List available files with a brief summary description and have LLM select
- `PARENT_DOC_RETRIEVER`: Do a similarity search on chunks and retrieve the most relevant parent files
  - `DENSITY_PARENT_DOC_RETRIEVER`: Perform a density analysis to determine which files have the most relevant content

In [None]:
# LIST approach

# UUID of the user whose files are listed; passed as `user_uuid` when reading
# the files from the storage handler.
# NOTE(review): hard-coded value, presumably a local test user -- confirm.
USER_UUID = "5b57d820-1b6b-41db-b265-ca046e7a9674"

def simplify_file(file: File) -> dict:
    """
    Flatten a File into a plain dictionary suitable for embedding in an LLM prompt.

    Fields the LLM does not need are dropped, the creation time is exposed both
    as an ISO-8601 string and as a human-readable relative time, and the UUID
    is converted to a string.

    Args:
        file (File): The file object to simplify

    Returns:
        dict: A simplified dictionary representation of the file
    """
    # Fields that are not needed by the LLM
    hidden_fields = ("model_type", "creator_user_uuid", "ingest_status", "bucket")

    simple_file = file.model_dump()
    for field_name in hidden_fields:
        del simple_file[field_name]

    created = file.created_datetime
    simple_file.update(
        # Relative to now, e.g. for questions that refer to a file by its age
        created_datetime_relative=naturaltime(created),
        created_datetime=created.isoformat(),
        uuid=str(file.uuid),
    )
    return simple_file

# Pydantic model to parse the LLM response of selected files.
# Its JSON schema is appended to the prompt, so it is documented with `#`
# comments only: pydantic copies a class docstring into the generated schema,
# which would change the prompt text.

class FileSelectionResponse(BaseModel):
    # Free-text explanation of the selection; defaults to an empty string.
    reasoning: str = ""
    # UUIDs of the selected files; defaults to an empty list.
    selected_files: list[UUID] = []
    

# Get the list of all the files available to the user
files = storage_handler.read_all_items(model_type="File", user_uuid=USER_UUID)

# Keep only files whose ingest_status is "complete"
files = [f for f in files if f.ingest_status == "complete"]

# Reduce each file to the plain dictionary shown to the LLM
simplified_files = [simplify_file(f) for f in files]

# Create a prompt for the LLM: instructions, the user query, one entry per
# available file, then the JSON schema the response must conform to.
prompt = "File Selection Instructions: \n\n"

prompt += f"User Query: '{user_query}'\n\n"

prompt += "The following files are available for selection:\n\n"

for simple_file in simplified_files:
    # Single quotes inside the f-string: reusing the enclosing double quote
    # is a SyntaxError on Python versions before 3.12.
    prompt += f"{simple_file['uuid']} - {json.dumps(simple_file)}\n\n"

# NOTE(review): the backslash continuations keep the next line's leading
# indentation inside the string, so those spaces are part of the prompt.
prompt += "Please select only relevant file UUIDs to be used in answering this question. \
    If they reference time of a file, use the datetime to infer which file they might mean. \
    Only respond using this following this JSON response schema. Do not repeat the schema, but respond with a JSON response that conforms\n\n"

# model_json_schema is a classmethod, so no instance needs to be created.
prompt += json.dumps(FileSelectionResponse.model_json_schema(), indent=2)

In [None]:
# Send the file-selection prompt as a single system message.
messages = [
    SystemMessage(content=prompt)
]

# Use .invoke(): calling the chat model object directly (`llm(messages)`) is
# deprecated in LangChain in favour of the Runnable interface.
resp = llm.invoke(messages)

print(resp.content)

In [None]:
# Convert the above into pure langchain code



from langchain_core.runnables import Runnable, RunnableLambda, RunnablePassthrough, chain
from langchain_core.runnables.config import RunnableConfig
from langchain_core.prompts import PromptTemplate


# Prompt template for file selection. Placeholders: {user_query}, {files} and
# {schema}. The template starts with a newline, and each trailing backslash
# joins that line with the next one inside the string.
# NOTE(review): the {user_query} slot is labelled "Chat History" in the text --
# confirm whether a single query or the whole history is meant to be passed.
FILE_SELECTION_PROMPT = """
File Selection Instructions:

Chat History: '{user_query}'

The following files are available for selection:

{files}

Please select only relevant file UUIDs to be used in answering the latest question from the user. \
If they reference time of a file, use the datetime to infer which file they might mean. \
Only respond using this following this JSON response schema. Do not repeat the schema, but respond with a JSON response that conforms

{schema}"""





def build_file_selection_chain(
        llm: ChatLiteLLM,
        storage_handler: ElasticsearchStorageHandler,
        user_uuid: str,
        env: Settings
) -> Runnable:
    """
    Build a runnable that selects the files relevant to a user query.

    This is a placeholder: the chain has not been written yet.

    Args:
        llm (ChatLiteLLM): Chat model for the chain to use
        storage_handler (ElasticsearchStorageHandler): Storage handler for the chain to use
        user_uuid (str): UUID of the user the selection is for
        env (Settings): Project settings

    Returns:
        Runnable: The file selection chain (once implemented)

    Raises:
        NotImplementedError: Always, until the chain is implemented
    """
    # Fail loudly instead of silently returning None, which contradicts the
    # declared Runnable return type and would only break later at the call site.
    raise NotImplementedError("build_file_selection_chain is not implemented yet")
    
    
    
    
