In [1]:
from transformers import AutoModel, AutoTokenizer

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_postgres import PGVector
from langchain.chains import LLMChain
from langchain_core.documents import Document
from langchain.retrievers.document_compressors.base import BaseDocumentCompressor
from langchain_core.callbacks import BaseCallbackHandler
from langchain.retrievers import ContextualCompressionRetriever
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.memory import ConversationBufferMemory

from langchain.tools.retriever import create_retriever_tool
from langchain.agents import initialize_agent, AgentType


from langchain_openai import ChatOpenAI

from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate


from pydantic import Field
from typing import List, Optional, Sequence
import asyncio

import torch
torch.cuda.is_available()
import os
import dotenv

import utils
from utils.helpers import *
from utils.tools import *

PGHOST=os.getenv("PGHOST", "localhost")
PGPORT=os.getenv("PGPORT")
PGUSER=os.getenv("PGUSER")
PGPASSWORD=os.getenv("PGPASSWORD")
PGDATABASE=os.getenv("PGDATABASE")

In [2]:
class MyStreamingHandler(BaseCallbackHandler):
    def __init__(self):
        self.tokens = []

    def on_llm_new_token(self, token: str, **kwargs):
        #print(token, end="", flush=True)  # Print the token to stdout
        self.tokens.append(token)  # Just collect tokens, do not print

    def get_streamed_text(self):
        return "".join(self.tokens)

class LLMReranker(BaseDocumentCompressor):
    llm_chain: object = Field(LLMChain, description="LLM chain to rerank documents")
    document_variable_name: str = "document"

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        *,
        callbacks: Optional[list] = None,
    ) -> List[Document]:
        
        scored_docs = []
        for doc in documents:
            inputs = {
                "query": query,
                self.document_variable_name: doc.page_content,
            }
            output = self.llm_chain.invoke(inputs)
            try:
                score = int(output.strip())
            except Exception:
                score = 0
            scored_docs.append((doc, score))
        scored_docs.sort(key=lambda x: x[[1]], reverse=True)
        return [doc for doc, score in scored_docs]
    
class LLMRerankerBatched(BaseDocumentCompressor):
    """ LLM Reranker that uses LLMChain to rerank documents in batches.
    It passes the documents to the LLM in a single call and expects the LLM to return a list of scores.
    """
    llm_chain: object = Field(LLMChain, description="LLM chain to rerank documents")
    document_variable_name: str = "documents"

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        callbacks: Optional[list] = None,
    ) -> List[Document]:

        
        scored_docs = []
        
        inputs = {
            "query": query,
            self.document_variable_name: [doc.page_content for doc in documents],
        }
        output = self.llm_chain.invoke(inputs)
        try:
            scores = [int(score.strip()) for score in output.split(",")]
        except Exception:
            scores = [0] * len(documents)
        scored_docs = list(zip(documents, scores))
        scored_docs.sort(key=lambda x: x[1], reverse=True)
        return [doc for doc, score in scored_docs]

In [3]:
connection_string = f"postgresql+psycopg://{PGUSER}:{PGPASSWORD}@{PGHOST}:{PGPORT}/{PGDATABASE}"

In [4]:
# Load the model
embedding_model_name = "BAAI/bge-small-en" #"BAAI/bge-large-en-v1.5"
model = AutoModel.from_pretrained(embedding_model_name)
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
# Create the embeddings object
embedding_model = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

In [5]:
llm_streaming = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.5,
    streaming=True,
    callbacks=[MyStreamingHandler()],
)
# llm = ChatOpenAI(
#     model="gpt-4o-mini",
#     temperature=0.5,
#     streaming=False
# )

In [6]:
connection_string = "postgresql+psycopg://username:password@localhost:5432/database"
collection_name = "project_documents"
vector_store = PGVector(
    embeddings=embedding_model,
    collection_name=collection_name,
    connection=connection_string,
    #async_mode=True,
)

In [7]:
retriever = vector_store.as_retriever(search_kwargs={"k": 20})  # retrieve top 20 docs

# Define a template for reranking
rerank_prompt = PromptTemplate(
    input_variables=["query", "document"],
    template=(
        "Given the query:\n{query}\n\n"
        "Rate the relevance of the following document to the query on a scale from 1 to 10:\n"
        "{document}\n\n"
        "Only output the score as an integer."
    ),
)
rerank_batch_prompt = PromptTemplate(
    input_variables=["query", "documents"],
    template=(
        "Given the query:\n{query}\n\n"
        "Rate the relevance of the following documents to the query on a scale from 1 to 10:\n"
        "{documents}\n\n"
        "Only output the scores as a list of integers."
    ),
)

reranker_model_name = "gpt-4o-mini"
llm_reranker = ChatOpenAI(
    model_name=reranker_model_name,
    temperature=0.0,
)

reranker_batched = LLMRerankerBatched(llm_chain=rerank_batch_prompt | llm_reranker)
#reranker = LLMReranker(llm_chain=rerank_prompt | llm_reranker)
# Wrap your base retriever with ContextualCompressionRetriever using the reranker
reranking_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=reranker_batched,
)

# A tool for retrieving files by name and for retrieving relevant documents
retriever_tool = create_retriever_tool(
    reranking_retriever,
    name="project_search",
    description="Searches the project documents for relevant information."
)
tools = [retriever_tool, get_file_by_name_tool, generate_plantuml_diagram_tool]

In [8]:
prompt_template = ChatPromptTemplate.from_template(
    "You are an expert assistant. Use the following project documents to answer the user's question."
    "If the answer is in the documents, provide it and reference the document(s) used."
    "If the answer is not in the documents, provide a general answer based on your knowledge,"
    "and state that the documents do not contain the answer.\n\n"
    "Context:\n"
    "{context}\n\n"
    "Question:\n"
    "{question}\n\n"
)
async def full_chain_stream(query: str):
    # Embed query and retrieve reranked docs as before
    query_embedding = embedding_model.embed_query(query)
    reranked_docs = reranking_retriever.base_retriever.vectorstore.similarity_search_by_vector(query_embedding, k=5)
    context = format_documents(reranked_docs)

    # Prepare prompt input
    prompt_text = prompt_template.format(context=context, question=query)

    # Stream tokens asynchronously from the LLM
    async for token in llm_streaming.astream(prompt_text):
        yield token.content  # yield each token as it arrives

In [9]:
response = ""
async for resp in full_chain_stream("What is the name of the app?"):
    response += resp
    print(resp, end="", flush=True)  # Print each token as it arrives    

The name of the app is the "Eprice App." This information is found in the README.md document.

In [16]:
message_history = InMemoryChatMessageHistory()

prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                "You are an expert assistant. Use the following project documents to answer the user's question. "
                "If the answer is in the documents, provide it and reference the document(s) used. "
                "If the answer is not in the documents, provide a general answer based on your knowledge, "
                "and state that the documents do not contain the answer."
            )
        ),
        MessagesPlaceholder(variable_name="history"),
        ("user", "{question}"),
    ]
)

async def full_chain_with_history_stream(query: str):
    # Embed and retrieve documents as before
    query_embedding = embedding_model.embed_query(query)
    reranked_docs = reranking_retriever.base_retriever.vectorstore.similarity_search_by_vector(query_embedding, k=5)
    context = format_documents(reranked_docs)

    # Add the context as a system message or part of the prompt
    system_msg = SystemMessage(content=f"Context:\n{context}")

    # Get current history messages and append the system context message
    history_msgs = message_history.messages + [system_msg]

    # Add the current user query as a HumanMessage
    user_msg = HumanMessage(content=query)

    # Prepare messages for the LLM: history + current user message
    messages = history_msgs + [user_msg]

    # Stream tokens asynchronously from the LLM and collect the reply
    assistant_reply = ""
    async for token in llm_streaming.astream(messages):
        if token.content:
            assistant_reply += token.content
            yield token.content

    # Update the history with the new turn
    message_history.add_user_message(query)
    message_history.add_ai_message(assistant_reply)

In [18]:
response = ""
async for resp in full_chain_with_history_stream("What did I just ask?"):
    response += resp
    print(resp, end="", flush=True)  # Print each token as it arrives


You asked, "What is the name of the app?"

In [88]:
system_message = (
    "You are an expert assistant. "
    "Always first use the project_search tool to retrieve relevant information from the project documents. "
    "Only if this does not provide sufficient information, then try looking up individual files using the get_file_by_name tool. "
    "If the answer is in the documents/files, provide it and reference the document(s) used. "
    "You also have access to the generate_plantuml_diagram tool to create diagrams based on the information provided. "
    "You can use the tool by calling it with plantuml code as input. "
    "When appropriate, you can use the tool to generate diagrams to give a visual representation of the information. "
    "If the answer is not in the documents/files, provide a general answer based on your knowledge, "
    "and state that the documents do not contain the answer. "
)

memory = ConversationBufferMemory(memory_key="history", return_messages=True, k=5, output_key="output")
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=False,
    memory=memory,
    system_message=system_message,  # <--- This is the key line
)

In [77]:
# Async streaming function
async def agent_stream(query: str):
    # agent.astream yields AgentAction/AgentFinish or message objects
    async for chunk in agent.astream(query):
        # For OpenAI Functions agent, chunk is usually a message object
        # You may want to yield chunk.content or just chunk
        yield chunk.content if hasattr(chunk, "content") else str(chunk)

In [None]:
response = ""
async for resp in agent_stream("What is the name of the app?"):
    response += resp
    print(resp, end="", flush=True)

{'actions': [AgentActionMessageLog(tool='project_search', tool_input={'query': 'app name'}, log="\nInvoking: `project_search` with `{'query': 'app name'}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"query":"app name"}', 'name': 'project_search'}, 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 184, 'total_tokens': 199, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_62a23a81ef', 'id': 'chatcmpl-Bb5HMIitKVeb8vKKQMZkRotjEOaor', 'service_tier': 'default', 'finish_reason': 'function_call', 'logprobs': None}, id='run--f735af07-b624-4af1-ab62-191845ff9270-0', usage_metadata={'input_tokens': 184, 'output_tokens': 15, 'total_tokens': 199, 'input_token_details': {'audio': 0, 'c