# Vector Store Retriver

In [21]:
# Load data -> Text split

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

# Read the PDF through PyMuPDF; `data` holds whatever the loader returns.
loader = PyMuPDFLoader('pdf.pdf')
data = loader.load()

# Splitter built via the tiktoken helper with the cl100k_base encoding:
# chunk size 1000, overlap 200 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name='cl100k_base',
    chunk_size=1000,
    chunk_overlap=200,
)

# Split the loaded data into chunks and display how many were produced.
documents = text_splitter.split_documents(data)
len(documents)

25

In [22]:
# Store the document embeddings in a vector store
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model: loaded on CPU, with `normalize_embeddings` switched on
# for the encode step.
embeddings_model = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sbert-nli',
    model_kwargs=dict(device='cpu'),
    encode_kwargs=dict(normalize_embeddings=True),
)

# Embed the chunks in `documents` and index them in FAISS, requesting the
# cosine distance strategy.
vectorstore = FAISS.from_documents(
    documents,
    embedding=embeddings_model,
    distance_strategy=DistanceStrategy.COSINE,
)


In [23]:
# Search query
query = '카카오뱅크의 환경목표와 세부추진내용을 알려줘'

# Retrieve only the single most similar chunk (k=1)
retriever = vectorstore.as_retriever(search_kwargs={'k': 1})

# Retrievers are Runnables: `invoke` supersedes the deprecated
# `get_relevant_documents` and returns the same list of Documents.
docs = retriever.invoke(query)
print(len(docs))
docs[0]

1


Document(metadata={'source': 'pdf.pdf', 'file_path': 'pdf.pdf', 'page': 12, 'total_pages': 19, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.25', 'creationDate': 'D:20241113010901Z', 'modDate': "D:20241204093544+09'00'", 'trapped': ''}, page_content='Figure 8: Correctness achieved by prompting LLaMA with various numbers of documents retrieved\nwith BGE-base and ColBERT, k, included in the prompts. Optimal performance is observed with\nk = 4 or 5 for ASQA and NQ, while optimal performance for QAMPARI is achieved with k = 10.\nTable 8: Correctness and citation quality on ASQA achieved with LLaMA with various numbers of\nColBERT retrieved documents, k, included in the prompt.\nRet.\nEM Recall\nCitation Recall\nCitation Precision\nk\nRec@k\nMean\n95% CI\nMean\n95% CI\nMean\n95% CI\ngold\n1\n47.466\n45.304 - 49.426\n46.326\n44.123 - 48.524\n46.294\n44.082 - 48.686\n0\n0\n23.327\n21.355 - 25.252\n-\n-

In [24]:
# MMR - takes diversity into account. `lambda_mult` is not passed here, so
# the library default applies (0.5 according to the original note).
# fetch_k=50 candidates are requested, of which k=5 are returned.
retriever = vectorstore.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 5, 'fetch_k': 50}
)

# `invoke` supersedes the deprecated `get_relevant_documents`.
docs = retriever.invoke(query)
print(len(docs))
docs[0]


5


Document(metadata={'source': 'pdf.pdf', 'file_path': 'pdf.pdf', 'page': 12, 'total_pages': 19, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.25', 'creationDate': 'D:20241113010901Z', 'modDate': "D:20241204093544+09'00'", 'trapped': ''}, page_content='Figure 8: Correctness achieved by prompting LLaMA with various numbers of documents retrieved\nwith BGE-base and ColBERT, k, included in the prompts. Optimal performance is observed with\nk = 4 or 5 for ASQA and NQ, while optimal performance for QAMPARI is achieved with k = 10.\nTable 8: Correctness and citation quality on ASQA achieved with LLaMA with various numbers of\nColBERT retrieved documents, k, included in the prompt.\nRet.\nEM Recall\nCitation Recall\nCitation Precision\nk\nRec@k\nMean\n95% CI\nMean\n95% CI\nMean\n95% CI\ngold\n1\n47.466\n45.304 - 49.426\n46.326\n44.123 - 48.524\n46.294\n44.082 - 48.686\n0\n0\n23.327\n21.355 - 25.252\n-\n-

In [25]:
# Filter on the Document metadata: only chunks whose `format` field equals
# 'PDF 1.6' are eligible. No `k` is given, so the retriever's default
# result count is used.
retriever = vectorstore.as_retriever(
    search_kwargs={'filter': {'format':'PDF 1.6'}}
)

# `invoke` supersedes the deprecated `get_relevant_documents`.
docs = retriever.invoke(query)
print(len(docs))
docs[0]

4


Document(metadata={'source': 'pdf.pdf', 'file_path': 'pdf.pdf', 'page': 12, 'total_pages': 19, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.25', 'creationDate': 'D:20241113010901Z', 'modDate': "D:20241204093544+09'00'", 'trapped': ''}, page_content='Figure 8: Correctness achieved by prompting LLaMA with various numbers of documents retrieved\nwith BGE-base and ColBERT, k, included in the prompts. Optimal performance is observed with\nk = 4 or 5 for ASQA and NQ, while optimal performance for QAMPARI is achieved with k = 10.\nTable 8: Correctness and citation quality on ASQA achieved with LLaMA with various numbers of\nColBERT retrieved documents, k, included in the prompt.\nRet.\nEM Recall\nCitation Recall\nCitation Precision\nk\nRec@k\nMean\n95% CI\nMean\n95% CI\nMean\n95% CI\ngold\n1\n47.466\n45.304 - 49.426\n46.326\n44.123 - 48.524\n46.294\n44.082 - 48.686\n0\n0\n23.327\n21.355 - 25.252\n-\n-

filter가 exhautive search 일지 궁금함

In [26]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Retrieval: MMR with lambda_mult=0.15, i.e. weighted towards diversity
# (NOTE(review): per the LangChain docs 0 is maximum diversity and 1 is
# minimum - confirm against the installed version).
retriever = vectorstore.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 5, 'lambda_mult': 0.15}
)

# `invoke` supersedes the deprecated `get_relevant_documents`.
docs = retriever.invoke(query)

# Prompt: instructs the model to answer from the supplied context only.
# Placeholders filled at run time: {context} and {question}.
template = (
    'Answer the question based only on the following context:\n'
    '{context}\n'
    '\n'
    'Question: {question}\n'
)

prompt = ChatPromptTemplate.from_template(template)

# Model: gpt-4o-mini with temperature 0 and max_tokens set to 500.
llm = ChatOpenAI(
    temperature=0,
    max_tokens=500,
    model='gpt-4o-mini',
)


def format_docs(docs):
    """Concatenate the text of several documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in input order, separated by a blank
        line. An empty iterable yields an empty string.
    """
    separator = '\n\n'
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

# Chain: prompt -> chat model -> plain-string output
chain = prompt | llm | StrOutputParser()

# Run: fill the prompt with the retrieved text and the original query,
# then display the answer.
response = chain.invoke(
    {
        'context': format_docs(docs),
        'question': query,
    }
)
response

'제공된 문맥에는 카카오뱅크의 환경목표와 세부 추진 내용에 대한 정보가 포함되어 있지 않습니다. 추가적인 정보가 필요합니다.'

솔직히 이게 될지는 몰랐다.

# Multi Query Retriever

In [27]:
# Generate multiple queries
from langchain.retrievers.multi_query import MultiQueryRetriever

question = 'SK의 ESG 전략은 무엇인가요?'

llm = ChatOpenAI(
    model='gpt-3.5-turbo-0125',
    temperature=0,
    max_tokens=500,
)

# Wrap the default vector-store retriever so that the LLM first produces
# alternative phrasings of the question (they are logged at INFO level).
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), llm=llm
)

# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

# `invoke` supersedes the deprecated `get_relevant_documents`; the input is
# passed positionally because `invoke` has no `query` keyword.
unique_docs = retriever_from_llm.invoke(question)
len(unique_docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. What is the ESG strategy of SK?', "2. Can you explain SK's ESG strategy?", '3. How does SK approach ESG in its business strategy?']


4

In [28]:
# Inspect the second document returned by the multi-query retriever
unique_docs[1]

Document(metadata={'source': 'pdf.pdf', 'file_path': 'pdf.pdf', 'page': 13, 'total_pages': 19, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.25', 'creationDate': 'D:20241113010901Z', 'modDate': "D:20241204093544+09'00'", 'trapped': ''}, page_content='Table 9: Correctness and citation quality on NQ achieved with Mistral with various numbers of\nBGE-base retrieved documents, k, included in the prompt.\nRet.\nEM Recall\nCitation Recall\nCitation Precision\nk\nRec@k\nMean\n95% CI\nMean\n95% CI\nMean\n95% CI\ngold\n1\n84.646\n83.222 - 86.042\n74.66\n73.349 - 76.006\n64.407\n63.179 - 65.615\n0\n46.696\n44.836 - 48.467\n-\n-\n-\n-\n1\n0.072\n37.871\n36.164 - 39.726\n31.101\n29.567 - 32.776\n36.4\n34.695 - 38.181\n2\n0.117\n40.925\n39.02 - 42.792\n43.51\n41.923 - 45.18\n39.554\n38.067 - 41.129\n3\n0.152\n45.301\n43.497 - 47.127\n47.423\n45.796 - 48.944\n40.574\n39.101 - 42.005\n4\n0.181\n49.735\n47.902 

In [29]:
# Same multi-query retrieval as before, this time driven by gpt-4o-mini.
question = 'SK의 ESG 전략은 무엇인가요?'

llm = ChatOpenAI(
    model='gpt-4o-mini',
    temperature=0,
    max_tokens=500,
)

retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), llm=llm
)

# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

# `invoke` supersedes the deprecated `get_relevant_documents`; the input is
# passed positionally because `invoke` has no `query` keyword.
unique_docs = retriever_from_llm.invoke(question)
len(unique_docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['SK의 ESG 전략에 대해 설명해 주실 수 있나요?  ', 'SK 그룹이 채택하고 있는 ESG 접근 방식은 어떤 것들이 있나요?  ', 'SK의 환경, 사회, 지배구조(ESG) 관련 정책과 목표는 무엇인가요?']


6

api 모델의 다국어 처리능력에 따른 query 변환도 해주는 듯? 못하는게 없는것 같어

# Contextual compression

In [33]:
# Base retriever

question = 'how to search optimal k for retriever?'

llm = ChatOpenAI(
    model='gpt-4o-mini',
    temperature=0,
    max_tokens=500,
)

# MMR search: fetch_k=20 candidates are requested, k=7 are returned.
base_retriever = vectorstore.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 7, 'fetch_k': 20},
)

# `invoke` supersedes the deprecated `get_relevant_documents`.
docs = base_retriever.invoke(question)
print(len(docs))


7


In [34]:
# Attach a document compressor to the base retriever

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# The compressor is built from the chat model; the wrapper retriever runs
# `base_retriever` and passes its results through the compressor.
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=base_retriever
)

# `invoke` supersedes the deprecated `get_relevant_documents`.
compressed_docs = compression_retriever.invoke(question)
print(len(compressed_docs))

3


In [35]:
# Display the documents returned by the compression retriever
compressed_docs

[Document(metadata={'source': 'pdf.pdf', 'file_path': 'pdf.pdf', 'page': 3, 'total_pages': 19, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.25', 'creationDate': 'D:20241113010901Z', 'modDate': "D:20241204093544+09'00'", 'trapped': ''}, page_content='We first analyze how many retrieved documents should be included in the LLM context window to maximize correctness on the selected QA tasks. This is shown as a function of the number of retrieved nearest neighbors, k. Incorporating the retrieved documents narrows the performance disparity between the closed-book scenario (k=0) and the gold-document-only ceiling. However, the performance of the evaluated retrieval systems still significantly lags behind the ideal. ColBERT usually outperforms BGE by a small margin. Optimal performance is observed with k = 10 or 20. Correctness on QA begins to plateau around 5-10 documents. We find that Mistral perform

llm prompt를 통한 generation 느낌의 압축은 아니고 MRC의 position 위치를 정해주는 압축 같음, 실제 PDF와 완전히 matching 하는 내용들임