In [1]:
%%capture --no-stderr
%pip install langchain langchain-openai langchain-openai langchain_chroma langchain-text-splitters langchain_community

In [2]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

 ········


## RAG 실습-1: 웹페이지 크롤링(RecSys 논문 작성법 검색)

In [3]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load, chunk and index the contents for the publication guidelines of RecSys`24
loader = WebBaseLoader(
    web_paths=("https://recsys.acm.org/recsys24/call/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            # class_=("post-content", "post-title", "post-header"),
            class_=("tabs-content",),
        )
    ),
)
docs = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

In [5]:
# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

  prompt = loads(json.dumps(prompt_object.manifest))


In [6]:
rag_chain.invoke("How many pages we are allowed to submit a short paper?")

'You are allowed to submit a short paper of up to 4 pages, plus an additional page for references.'

## RAG 실습-2: 로컬 파일 검색(카카오 베네핏)

In [7]:
!echo "카카오 베네핏: 크루들의 업무 몰입 뿐 아니라 자기개발, 여가/취미 활동 지원을 위한 카카오베네핏! 연 360만원까지 사용할 수 있는 개인별 베네핏 카드를 지급합니다." > info.txt

In [8]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

loader = TextLoader("info.txt")

documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(texts, embeddings)

retriever = vectorstore.as_retriever()

docs = retriever.invoke("카카오 베테핏")

In [9]:
# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



In [10]:
rag_chain.invoke("카카오 베네핏에 대해 알려줘")

'카카오 베네핏은 크루들의 업무 몰입과 자기개발, 여가/취미 활동을 지원하기 위한 프로그램입니다. 개인별로 연 360만원까지 사용할 수 있는 베네핏 카드를 지급합니다. 이를 통해 다양한 활동을 지원받을 수 있습니다.'

## Day 2 프로젝트

In [11]:
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

In [12]:
loader = WebBaseLoader(
    web_paths=urls,
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header"),
        )
    ),
)
docs = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(texts, embeddings)

retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

Created a chunk of size 2731, which is longer than the specified 1000
Created a chunk of size 1538, which is longer than the specified 1000
Created a chunk of size 1380, which is longer than the specified 1000
Created a chunk of size 2352, which is longer than the specified 1000
Created a chunk of size 1953, which is longer than the specified 1000
Created a chunk of size 1067, which is longer than the specified 1000
Created a chunk of size 1475, which is longer than the specified 1000
Created a chunk of size 2881, which is longer than the specified 1000
Created a chunk of size 1980, which is longer than the specified 1000
Created a chunk of size 4145, which is longer than the specified 1000
Created a chunk of size 2159, which is longer than the specified 1000
Created a chunk of size 1317, which is longer than the specified 1000
Created a chunk of size 1112, which is longer than the specified 1000
Created a chunk of size 1043, which is longer than the specified 1000
Created a chunk of s

In [13]:
def rag_handler(query: str, check_hallucination: bool = True) -> None:
    PROMPT_FOR_RELEVANCE_CHECK = """
    Would you examine whether QUERY and ANSWER are relevant with each other?
    If QUERY and ANSWER are relevant, answer 'yes', otherwise 'no'.
    
    QUERY: \'{QUERY}\'
    
    ANSWER: \'{ANSWER}\'
    """
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    answer = rag_chain.invoke(query)
    output = llm.invoke(
        PROMPT_FOR_RELEVANCE_CHECK.format(QUERY=query, ANSWER=answer)
    ).content.lower()
    if "yes" in output:
        raws = '\n\n'.join([doc.page_content for doc in retriever.invoke(query)])
        output = llm.invoke(
            PROMPT_FOR_RELEVANCE_CHECK.format(QUERY=query, ANSWER=raws)
        ).content.lower() if check_hallucination else "yes"
        if output in "yes":
            print(answer)
        else:
            print("hallucination!")
    elif "no" in output:
        print("Sorry. No found relavant information.")
    else:
        raise Exception(f"{output=}, \n\n {answer=}")

In [14]:
rag_handler("What is your name?")

Sorry. No found relavant information.


In [15]:
rag_handler("I want to know how old superman is.")

Sorry. No found relavant information.


In [16]:
rag_handler("Could you explain what is the agent system?")

hallucination!


In [17]:
rag_handler("In case we deploy a recommender system, what aspects should be consider to improve CTR?", check_hallucination=False)

To improve click-through rates (CTR) in a recommender system, consider the diversity of recommendations and the biases present in the model. Incorporating human feedback can enhance the training process by filtering out harmful content and refining the model's outputs. Additionally, calibrating label probabilities and addressing biases such as majority label bias and recency bias can significantly impact performance.


In [18]:
rag_handler("In case we deploy a recommender system, what aspects should be consider to improve CTR?", check_hallucination=True)

hallucination!
