In [1]:
%%capture --no-stderr
%pip install langchain langchain-openai langchain-openai langchain_chroma langchain-text-splitters langchain_community

In [2]:
%%capture --no-stderr
%pip install langchainhub

In [3]:
%%capture --no-stderr
%pip install langchain-nomic tiktoken chromadb tavily-python gpt4all arxiv

# 위 과정을 한번에 진행

In [4]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate



In [5]:
# import getpass
import os

# Fetch both API keys by name through google.colab's `userdata` helper so that
# no secret is written into the notebook itself.
# NOTE(review): presumably these are entries in the Colab secrets panel — confirm.
from google.colab import userdata
api_key = userdata.get('api_key')
tavily_api_key = userdata.get('tavily_api_key')

# Alternative kept for reference: prompt for the key interactively instead.
# os.environ["OPENAI_API_KEY"] = getpass.getpass()
# Export the OpenAI key as an environment variable.
# NOTE(review): presumably picked up by the OpenAI-backed objects created later
# without an explicit key (ChatOpenAI, OpenAIEmbeddings) — confirm.
os.environ['OPENAI_API_KEY'] = api_key

- tavily 써보기

In [6]:
from tavily import TavilyClient

In [7]:
# Tavily client authenticated with the key loaded in the setup cell.
tavily = TavilyClient(api_key=tavily_api_key)

In [8]:
query = "Where does Messi play right now?"

# Plain search: request two results and keep only each result's URL and content.
response = tavily.search(query=query, max_results=2)
context = [
    {"url": hit["url"], "content": hit["content"]}
    for hit in response["results"]
]

# Search context bounded by a token budget, ready to drop straight into a RAG prompt.
# The response is a string of the context within the max_tokens limit.
response_context = tavily.get_search_context(
    query=query,
    search_depth="advanced",
    max_tokens=500,
)

# One-call question answering; the short answer is handy as a baseline.
response_qna = tavily.qna_search(query=query)

In [9]:
context

[{'url': 'https://www.usatoday.com/story/sports/mls/2023/06/07/lionel-messi-inter-miami-schedule-tickets/70299298007/',
  'content': 'Lionel Messi joins Inter Miami: Full schedule, MLS tickets to see Messi play in US\nLionel Messi\xa0is taking his talents to South Beach.\nMessi,\xa0the 2022 World Cup champion, announced on Wednesday that he will join Major League Soccer\'s Inter Miami CF, a pro soccer club owned by David Beckham, after exiting Ligue 1\'s Paris Saint-Germain following two seasons.\n Tickets to Inter Miami\'s game on June 10 range from $40-$55, but the price tag to see Inter Miami play LigaMX\'s Cruz Azul on July 21 soared to $495 in anticipation of what\'s expected to be Messi\'s first home game, TicketSmarter CEO Jeff Goodman told USA TODAY Sports.\n Each team will play a minimum of two games in the group stage, similar to the World Cup format, with the possibility of more games if the team advances to the knockout rounds.\n "\nAccording to Goodman, nearly 2,000 Inter 

In [10]:
response_context

'"[\\"{\\\\\\"url\\\\\\": \\\\\\"https://www.nbcsports.com/soccer/news/inter-miami-schedule-when-is-lionel-messis-next-mls-game\\\\\\", \\\\\\"content\\\\\\": \\\\\\"Messi has now played less than a season\'s worth of games in an Inter Miami shirt when you add his appearances across all competitions. His numbers are, in true Messi form, stunning to the eye. ... *Wednesday, April 3: 1-2 loss vs Monterrey - Messi did not play Saturday, April 6: 2-2 draw vs Colorado Rapids - 1 goal for Messi, sets up 2nd goal\\\\\\"}\\", \\"{\\\\\\"url\\\\\\": \\\\\\"https://www.usatoday.com/story/sports/mls/2023/06/07/lionel-messi-inter-miami-schedule-tickets/70299298007/\\\\\\", \\\\\\"content\\\\\\": \\\\\\"Lionel Messi joins Inter Miami: Full schedule, MLS tickets to see Messi play in US\\\\\\\\nLionel Messi\\\\\\\\u00a0is taking his talents to South Beach.\\\\\\\\nMessi,\\\\\\\\u00a0the 2022 World Cup champion, announced on Wednesday that he will join Major League Soccer\'s Inter Miami CF, a pro socc

In [11]:
response_qna

'Messi currently plays for Inter Miami CF in Major League Soccer (MLS).'

# 과제 시작

In [12]:
from langchain_openai import ChatOpenAI

# Single chat model shared by the relevance-check, answer, and
# hallucination-check chains built in the next cell.
llm = ChatOpenAI(model="gpt-4o-mini")

In [13]:
# Build the RAG index.
# Web pages that are loaded, split, and embedded into the vector store below.
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

def format_docs(docs):
    """Return the `page_content` of every document in `docs`, joined by blank lines."""
    page_texts = [document.page_content for document in docs]
    paragraph_break = "\n\n"
    return paragraph_break.join(page_texts)

# Load the pages listed in `urls`, parsing only the elements whose CSS class is
# one of post-content / post-title / post-header.
loader = WebBaseLoader(
    web_paths=urls,
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()
# Split into chunks of at most 1000 units with a 200-unit overlap; the unit is
# whatever `length_function` measures, i.e. characters (len) here.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
splited_docs = text_splitter.split_documents(docs)
# Embed the chunks with OpenAI embeddings into a Chroma vector store and expose
# it as a retriever for the question-answering function.
vectorstore = Chroma.from_documents(documents=splited_docs, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

# Build the relevance-check chain: prompt -> LLM -> JSON parser.
# The prompt asks the model for a JSON object with a "relevance" key whose value
# is "yes" or "no"; check_relevance() reads that key from the parsed output.
rel_check_prompt = ChatPromptTemplate(
    input_variables=["context", "question"],
    messages=[
        HumanMessagePromptTemplate(
            prompt=PromptTemplate(
                input_variables=["context", "question"],
                template="""
                You are an assistant for relevance check tasks.
                Use the following pieces of retrieved context and question, check whether context and question is relevanced.
                If they are relevanced return "relevance":"yes", if not return "relevance": "no". Return format should be json format.
                - Question: {question}
                - Context: {context}
                """
                )
            )
        ]
)
rel_check_chain = rel_check_prompt | llm | JsonOutputParser()

# Build the answer chain: the "rlm/rag-prompt" prompt pulled via `hub` -> LLM ->
# plain-string output. Callers invoke it with "context" and "question" keys.
retrive_prompt = hub.pull("rlm/rag-prompt")
rag_chain = retrive_prompt | llm | StrOutputParser()

# Build the hallucination-check chain: prompt -> LLM -> JSON parser.
# The prompt asks the model for a JSON object with a "hallucination" key whose
# value is "yes" or "no"; check_hallucination() reads that key from the parsed output.
hallucination_check_prompt = ChatPromptTemplate(
    input_variables=["context", "answer"],
    messages=[
        HumanMessagePromptTemplate(
            prompt=PromptTemplate(
                input_variables=["context", "answer"],
                template="""
                You are an assistant for hallucination check tasks.
                Use the following pieces of retrieved context and answer, check whether answer is from context well.
                If answer is from context well, return "hallucination":"no", if not return "hallucination": "yes". Return format should be json format.
                - Context: {context}
                - Answer: {answer}
                """
                )
            )
        ]
)
hal_check_chain = hallucination_check_prompt | llm | JsonOutputParser()


# Create the Tavily search client used for the web-search fallback.
tavily = TavilyClient(api_key=tavily_api_key)

# for DRY!
def check_relevance(chain, context, question):
  """Invoke `chain` with the context and question and return its "relevance" value.

  Args:
    chain: Object whose `invoke` method accepts a dict with "context" and
      "question" keys and returns a mapping containing "relevance".
    context: Context to be judged.
    question: Question the context is judged against.

  Returns:
    The value stored under the "relevance" key of the chain's output.
  """
  payload = {"context": context, "question": question}
  verdict = chain.invoke(payload)
  return verdict["relevance"]

def check_hallucination(chain, context, answer):
  """Invoke `chain` with the context and answer and return its "hallucination" value.

  Args:
    chain: Object whose `invoke` method accepts a dict with "context" and
      "answer" keys and returns a mapping containing "hallucination".
    context: Context the answer should be grounded in.
    answer: Generated answer to be judged.

  Returns:
    The value stored under the "hallucination" key of the chain's output.
  """
  payload = {"context": context, "answer": answer}
  verdict = chain.invoke(payload)
  return verdict["hallucination"]

def check_hallucination_max_twice(rag_answer_chain, hallunication_chain, context, question, first_answer):
  """Return an answer that passes the hallucination check, retrying generation once.

  Args:
    rag_answer_chain: Chain invoked with "context" and "question" to produce a
      replacement answer when the first one is flagged.
    hallunication_chain: Chain passed to `check_hallucination` to judge an answer.
    context: Context the answer must be grounded in.
    question: The user question, used when regenerating the answer.
    first_answer: The answer that was already generated.

  Returns:
    `first_answer` if it is not flagged; otherwise the regenerated answer if
    that one is not flagged; otherwise None, after printing an apology.
  """
  # Accept the first answer unless it is explicitly flagged with "yes".
  if check_hallucination(hallunication_chain, context, first_answer) != "yes":
    return first_answer

  # One retry: regenerate from the same context and judge the new answer.
  retry_answer = rag_answer_chain.invoke({"context": context, "question": question})
  if check_hallucination(hallunication_chain, context, retry_answer) != "yes":
    return retry_answer

  print(">> Sorry, I don't know. 🥲")
  return None

# LLM 서비스 진행

In [14]:
# For convenience this function uses the notebook-level globals directly
# (retriever, rel_check_chain, rag_chain, hal_check_chain, tavily).
def stupid_mason_llm(query, num_re_search = 3):
  """Answer `query` from the vector store, falling back to a Tavily web search.

  The retrieved context is first checked for relevance. If it is relevant, the
  answer is generated from it. Otherwise up to `num_re_search` web results are
  requested and the first one judged relevant is used as context. Every answer
  is passed through `check_hallucination_max_twice` before it is printed.

  Args:
    query: The user question.
    num_re_search: Maximum number of web results to request and try. A value
      of 0 or less disables the web-search fallback.

  Returns:
    None. The question and then either the answer (plus the source URL for
    web-based answers) or an apology message are printed.
  """
  print(f"?? question: {query}")

  # Fetch context from the RAG retriever.
  context_from_rag = format_docs(retriever.invoke(query))

  if check_relevance(rel_check_chain, context_from_rag, query) != "no":
    # Retrieved context is relevant: answer from it, guarded by the hallucination check.
    first_answer = rag_chain.invoke({"context": context_from_rag, "question": query})
    result = check_hallucination_max_twice(rag_chain, hal_check_chain, context_from_rag, query, first_answer)
    if result is not None:
      print(f">> answer: {result}")
    return

  if num_re_search <= 0:
    print(">> Sorry, I don't know. Turn on web search! 🥲")
    return

  # Nothing relevant in the vector store: search the web with Tavily.
  print(">> searching in web...")
  tavily_response = tavily.search(query=query, max_results=num_re_search)
  url_and_context_from_tavily = [{"url": obj["url"], "content": obj["content"]} for obj in tavily_response['results']]

  # Pick the first web result judged relevant. Iterating over the results that
  # actually came back (rather than range(num_re_search)) avoids an IndexError
  # when fewer results are returned, and the None sentinel makes the
  # "nothing relevant" case detectable; the old `i == num_re_search` test could
  # never be true, so an irrelevant last result was silently used.
  context_from_tavily = None
  for candidate in url_and_context_from_tavily:
    if check_relevance(rel_check_chain, candidate, query) != "no":
      context_from_tavily = candidate
      break

  if context_from_tavily is None:
    print(">> Sorry, I don't know. 🥲")
    return

  # Answer from the web result, guarded by the hallucination check.
  web_content = context_from_tavily['content']
  first_answer = rag_chain.invoke({"context": web_content, "question": query})
  result = check_hallucination_max_twice(rag_chain, hal_check_chain, web_content, query, first_answer)
  if result is not None:
    print(f">> answer: {result}")
    print(f">> reference: {context_from_tavily['url']}")

In [15]:
stupid_mason_llm("What is agent memory?")

?? question: What is agent memory?
>> answer: Agent memory refers to the processes that allow an agent to acquire, store, retain, and retrieve information over time. It encompasses both short-term memory, which involves in-context learning, and long-term memory, which utilizes an external database to retain extensive experiences and observations. This memory system helps inform the agent's behavior based on relevance, recency, and importance of past events.


In [16]:
stupid_mason_llm("Where does Messi play right now?", 0)

?? question: Where does Messi play right now?
>> Sorry, I don't know. Turn on web search! 🥲


In [17]:
stupid_mason_llm("Where does Messi play right now?")

?? question: Where does Messi play right now?
>> searching in web...
>> answer: Lionel Messi currently plays for Inter Miami CF in Major League Soccer (MLS). He joined the team after leaving Paris Saint-Germain.
>> reference: https://www.usatoday.com/story/sports/mls/2023/06/07/lionel-messi-inter-miami-schedule-tickets/70299298007/


In [18]:
stupid_mason_llm("What is the biggest city in Korea?")

?? question: What is the biggest city in Korea?
>> searching in web...
>> answer: The biggest city in Korea is Seoul, which is also the capital of South Korea. It has a population of approximately 10 million people.
>> reference: https://www.discoverytheworld.com/discover-the-20-largest-cities-in-south-korea/
