In [1]:
from langchain_community.chat_models import ChatOllama

llm = ChatOllama(model='llama3', temperature=0)



In [2]:
# pip install langchain_chroma

In [3]:
import os

# Set the API key
os.environ['OPENAI_API_KEY'] = "sk-"
os.environ['TAVILY_API_KEY'] = "tvly-"

In [4]:
# pip install langchain_text_splitters

In [5]:
# pip install langchain_openai

## Todo
0. vector store에 데이터 저장 [v]
1. 유저로부터 Query 받기 [v]
2. 쿼리로 retrieval 얻기 [v]
3. retrieval 관련성 체크 [v]
4. !통과 못한 경우 웹서치 [v]
5. !web search 관련성 체크 [v]
6. 통과한 경우 RAG + query를 넣어서 답변 [v]
7. 할루시네이션 체크 (근거가 있는 사실인지 물어보기) [v]
8. 할루시네이션이 아닌 경우 바로 답변 [v]
9. 할루시네이션인 경우 다시 답변 생성 (retry 5 times) [v]
10. !답변에 제목 및 출처 추가 [v]

## Split and Load

In [6]:
# Build a sample vectorDB
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

all_splits = []

docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
doc_splits = text_splitter.split_documents(docs_list)

# VectorDB
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(persist_directory="./chroma_db", documents=doc_splits, embedding=embedding)
retriever = vectordb.as_retriever()

## Query

In [7]:
# from langchain.retrievers.multi_query import MultiQueryRetriever

# question = "agent memory"
# retriever_from_llm = MultiQueryRetriever.from_llm(
#     retriever=vectordb.as_retriever(), llm=llm
# )

# rags = retriever_from_llm.invoke(question)

In [8]:
### LLM

local_llm = "llama3"

## Relevance Check - LLM as a Judge

In [9]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

llm = ChatOllama(model=local_llm, format="json", temperature=0)

prompt = PromptTemplate(
            template = """
            <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                
            You are developing a retrieval(context) evaluator system. 
            Define the criteria that determine if a retrieved document is relevant to a user's question. 
            Your goal is to create a set of guidelines that the system will follow to assess relevance accurately. 
            First, consider the key elements that indicate relevance between a user's question and a retrieved document. 
            Think about how the system can analyze the content to make this determination effectively.
            Please ANSWER TOTOAL SCORE IN FORMAT 'answer:float' whch has key named answer and value of score.
            <|eot_id|>
            <|start_header_id|>user<|end_header_id|>
            Context : {context}
            User Question : {question}<|eot_id|>
            <|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question", "context"],
        )

retrieval_grader = prompt | llm | JsonOutputParser()
question = "Who are the Bears expected to draft first in the NFL draft?"
docs = retriever.invoke(question)

print(retrieval_grader.invoke({"question": question, "context": docs[0].metadata}))
print(docs)

{'answer': 0.05}
[Document(page_content='follows:', metadata={'description': 'The use of large language models in the real world has strongly accelerated by the launch of ChatGPT. We (including my team at OpenAI, shoutout to them) have invested a lot of effort to build default safe behavior into the model during the alignment process (e.g. via RLHF). However, adversarial attacks or jailbreak prompts could potentially trigger the model to output something undesired.\nA large body of ground work on adversarial attacks is on images, and differently it operates in the continuous, high-dimensional space.', 'language': 'en', 'source': 'https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/', 'title': "Adversarial Attacks on LLMs | Lil'Log"}), Document(page_content='follows:', metadata={'description': 'The use of large language models in the real world has strongly accelerated by the launch of ChatGPT. We (including my team at OpenAI, shoutout to them) have invested a lot of effort to 

## RAG + query로 질문

In [10]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate(
            template = """
            <|begin_of_text|><|start_header_id|>system<|end_header_id|>
            ADD TITLES AND URLS THAT IS USED FROM GIVEN CONTEXT.
            Start by understanding the user's question.
            Analyze the context to determine the appropriate response.
            Tailor the response based on the context provided.
            Provide helpful and relevant information to the user within the specified context.
            <|eot_id|>
            <|start_header_id|>user<|end_header_id|>
            Context
            {context}
            '''
            User Question
            {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question", "context"],
        )

llm = ChatOllama(model=local_llm, temperature=0)

# Chain
rag_chain = prompt | llm | StrOutputParser()

# Run
question = "agent memory"
docs = retriever.invoke(question)
generation = rag_chain.invoke({"context": docs, "question": question})
print(generation)

Based on the provided context, it appears that you are asking about the concept of an "agent memory" in the context of LLM-powered autonomous agents.

From the given text, I can infer that the "Memory stream" is a long-term memory module (external database) that records a comprehensive list of agents' experience in natural language. Each element in this memory stream is an observation or event directly provided by the agent. Additionally, inter-agent communication can trigger new natural language statements.

In this context, the "agent memory" refers to the Memory Stream, which serves as a repository for storing and retrieving information about the agent's experiences, observations, and events. This memory stream plays a crucial role in informing the agent's behavior by surfacing relevant context based on factors such as relevance, recency, and importance.

If you have any specific questions or would like further clarification on this topic, please feel free to ask!


## Hallucination Checker

In [11]:
# LLM
llm = ChatOllama(model=local_llm, format="json", temperature=0)

# Prompt
prompt = PromptTemplate(
            template = """
            <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                
            You are a hallucination checker AI. Your task is to assess whether a given answer generated by the LLM model corresponds to a user's question and determine if it exhibits signs of hallucination. 
            If the answer is deemed hallucinatory, provide guidance on how to identify and correct such responses. If the answer is coherent, acknowledge its clarity and relevance to the question.
            
            Take a deep breath and let's take this step by step.
            
            Analyze the user's question carefully.
            Evaluate the LLM-generated answer in relation to the question.
            Determine if the answer shows signs of hallucination or if it directly addresses the question.
            If hallucination is detected, provide feedback on how to improve the response.
            If the answer is coherent, acknowledge its relevance and clarity.
            Please ANSWER TOTOAL SCORE IN JSON FORMAT 'is_hallucinated:boolean'.
            <|eot_id|>
            <|start_header_id|>user<|end_header_id|>
            LLM-generated answer
            {generation}
            ''
            User question
            {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["generation", "question"],
        )

hallucination_grader = prompt | llm | JsonOutputParser()
hallucination_grader.invoke({"question": question, "generation": generation})


{'is_hallucinated': False}

In [12]:
### Search
from langchain_community.tools.tavily_search import TavilySearchResults

web_search_tool = TavilySearchResults(k=3)

In [13]:
from pprint import pprint
from typing import List

from langchain_core.documents import Document
from typing_extensions import TypedDict

from langgraph.graph import END, StateGraph
import copy

### State


class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        question: question
        generation: LLM generation
        web_search: whether to add search
        documents: list of documents
        retry: retry count
        useful: whether the result is useful
    """

    question: str
    generation: str
    web_search: str
    documents: List[str]
    retry: int


### Nodes


def retrieve(state):
    """
    Retrieve documents from vectorstore

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): New key added to state, documents, that contains retrieved documents
    """
    print("---RETRIEVE---")
    question = state["question"]

    # Retrieval
    documents = retriever.invoke(question)
    
    transformed_documents = []
    for detailed_doc in documents:
        metadata = detailed_doc.metadata
        transformed_doc = {
            'url': metadata.get('source', ''),
            'title': metadata.get('title', 'Untitled Document'),
            'content': detailed_doc.page_content
        }
        transformed_documents.append(transformed_doc)
    
    return {"documents": transformed_documents, "question": question, "retry": 0}


def generate(state):
    """
    Generate answer using RAG on retrieved documents

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): New key added to state, generation, that contains LLM generation
    """
    print("---GENERATE---")
    question = state["question"]
    documents = state["documents"]

    # RAG generation
    generation = rag_chain.invoke({"context": documents, "question": question})
    return {"documents": documents, "question": question, "generation": generation, "retry": state["retry"] + 1}


def grade_documents(state):
    """
    Determines whether the retrieved documents are relevant to the question
    If any document is not relevant, we will set a flag to run web search

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Filtered out irrelevant documents
    """

    print("---CHECK DOCUMENT RELEVANCE TO QUESTION---")
    question = state["question"]
    documents = state["documents"]

    # Score each doc
    filtered_docs = []
    for d in documents:
        score = retrieval_grader.invoke(
            {"question": question, "context": d}
        )
        grade = score["answer"]
        # Document relevant
        if grade > 0.5:
            print("---GRADE: DOCUMENT RELEVANT---")
            filtered_docs.append(d)
        # Document not relevant
        else:
            print("---GRADE: DOCUMENT NOT RELEVANT---")
            # We do not include the document in filtered_docs
            # We set a flag to indicate that we want to run web search
            continue
    return {"documents": filtered_docs, "question": question}


def web_search(state):
    """
    Web search based based on the question

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Appended web results to documents
    """

    print("---WEB SEARCH---")
    question = state["question"]
    documents = state["documents"]

    # Web search
    docs = web_search_tool.invoke({"query": question})
    transformed_documents = []
    for doc in docs:
        transformed_doc = {
            'url': doc.get('url', ''),
            'title': doc.get('title', 'Untitled Document'),
            'content': doc.get('content', '')
        }
        transformed_documents.append(transformed_doc)
    if documents is not None:
        documents= copy.deepcopy(transformed_documents)
    else:
        documents = [web_results]
    return {"documents": documents, "question": question}


def decide_to_generate(state):
    """
    Determines whether to generate an answer, or add web search

    Args:
        state (dict): The current graph state

    Returns:
        str: Binary decision for next node to call
    """

    print("---ASSESS GRADED DOCUMENTS---")
    state["question"]
    filtered_documents = state["documents"]

    if len(filtered_documents) == 0:
        # All documents have been filtered check_relevance
        # We will re-generate a new query
        print(
            "---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, INCLUDE WEB SEARCH---"
        )
        return "websearch"
    else:
        # We have relevant documents, so generate answer
        print("---DECISION: GENERATE---")
        return "generate"


### Conditional edge


def grade_generation_v_documents_and_question(state):
    """
    Determines whether the generation is grounded in the document and answers question.

    Args:
        state (dict): The current graph state

    Returns:
        str: Decision for next node to call
    """

    print("---CHECK HALLUCINATIONS---")
    question = state["question"]
    documents = state["documents"]
    generation = state["generation"]

    score = hallucination_grader.invoke(
        {"question": question, "generation": generation}
    )
    is_hallucinated = score["is_hallucinated"]

    # Check hallucination
    if state["retry"] == 5:
        print("---NOT ANSWERABLE---")
        return "useful"
    elif is_hallucinated:
        pprint("---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY---")
        return "not useful"
    else:
        print("---DECISION: GENERATION ADDRESSES QUESTION---")
        return "useful"


workflow = StateGraph(GraphState)

# Define the nodes
workflow.add_node("retrieve", retrieve)  # retrieve
workflow.add_node("websearch", web_search)  # web search
workflow.add_node("grade_documents", grade_documents)  # grade documents
workflow.add_node("generate", generate)  # generatae

## Graph Build

In [14]:
# Build graph
workflow.set_entry_point("retrieve")
workflow.add_edge("retrieve", "grade_documents")
workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {
        "websearch": "websearch",
        "generate": "generate",
    },
)
workflow.add_edge("websearch", "grade_documents")
workflow.add_conditional_edges(
    "generate",
    grade_generation_v_documents_and_question,
    {
        "useful": END,
        "not useful": "generate"
    },
)

In [15]:
# # Compile
# app = workflow.compile()

# # Test

# inputs = {"question": "What are the types of agent memory?"}
# for output in app.stream(inputs):
#     for key, value in output.items():
#         pprint(f"Finished running: {key}:")
# pprint(value["generation"])

---RETRIEVE---
'Finished running: retrieve:'
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
'Finished running: grade_documents:'
---GENERATE---
---CHECK HALLUCINATIONS---
---DECISION: GENERATION ADDRESSES QUESTION---
'Finished running: generate:'
("Based on the provided context, it appears that you're looking for "
 'information about the types of memory used in LLM-powered autonomous '
 'agents. \n'
 '\n'
 'According to the text, there are two main types of memory mentioned:\n'
 '\n'
 '1. **Short-term memory**: This type of memory is utilized through in-context '
 'learning (Prompt Engineering) and allows the model to learn within a '
 'specific context.\n'
 '2. **Long-term memory**: This type of memory enables the agent to retain and '
 'recall information over extended periods by leveraging an external vector '

In [18]:
from pprint import pprint

# Compile
app = workflow.compile()
inputs = {"question": "Who are the Bears expected to draft first in the NFL draft?"}
for output in app.stream(inputs):
    for key, value in output.items():
        pprint(f"Finished running: {key}:")
if value["retry"] != 5:
    pprint(value["generation"])

---RETRIEVE---
'Finished running: retrieve:'
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, INCLUDE WEB SEARCH---
'Finished running: grade_documents:'
---WEB SEARCH---
'Finished running: websearch:'
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
'Finished running: grade_documents:'
---GENERATE---
---CHECK HALLUCINATIONS---
---DECISION: GENERATION ADDRESSES QUESTION---
'Finished running: generate:'
('Based on the provided context, it appears that the Chicago Bears have '
 'already made their selection for the first round of the 2024 NFL Draft. '
 'According to the a

In [17]:
# from pprint import pprint

# # Compile
# app = workflow.compile()
# inputs = {"question": "who is ceo of tesla?"}
# for output in app.stream(inputs):
#     for key, value in output.items():
#         pprint(f"Finished running: {key}:")
# if value["retry"] != 5:
#     pprint(value["generation"])