In [1]:
import openai
import os
from dotenv import load_dotenv
from trulens_eval import Tru

# load_dotenv() (python-dotenv) is expected to copy entries from a local .env
# file into the process environment; the key is then read from there so it is
# never hard-coded in the notebook. The lookup yields None when the variable
# is not set.
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')

In [2]:
import os

from llama_index import (
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.node_parser import HierarchicalNodeParser
from llama_index.node_parser import get_leaf_nodes
from llama_index import StorageContext, load_index_from_storage
from llama_index.retrievers import AutoMergingRetriever
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.query_engine import RetrieverQueryEngine


def build_automerging_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index",
    chunk_sizes=None,
):
    """Build an auto-merging vector index, or reload it if already persisted.

    When ``save_dir`` already exists, the index is loaded from that directory
    and ``documents`` / ``chunk_sizes`` are not used at all. Otherwise the
    documents are split with ``HierarchicalNodeParser``, every node is added
    to the docstore, only the leaf nodes are indexed, and the result is
    persisted to ``save_dir``.

    Args:
        documents: Documents handed to the hierarchical node parser.
        llm: LLM passed to ``ServiceContext.from_defaults``.
        embed_model: Embedding model passed to ``ServiceContext.from_defaults``.
        save_dir: Directory the index is persisted to / loaded from.
        chunk_sizes: Chunk sizes for the hierarchical parser; a falsy value
            (``None`` or an empty list) selects ``[2048, 512, 128]``.

    Returns:
        The newly built ``VectorStoreIndex``, or the object returned by
        ``load_index_from_storage`` when ``save_dir`` already exists.

    Note:
        An existing ``save_dir`` is trusted as-is: it is not compared against
        ``documents`` or ``chunk_sizes``. Remove the directory to force a
        rebuild.
    """
    merging_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
    )

    # Fast path: a persisted index exists, so skip parsing entirely. Parsing
    # and filling a fresh docstore here would be thrown away, because the
    # loaded index uses the storage context read from disk.
    if os.path.exists(save_dir):
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=merging_context,
        )

    chunk_sizes = chunk_sizes or [2048, 512, 128]
    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
    nodes = node_parser.get_nodes_from_documents(documents)
    leaf_nodes = get_leaf_nodes(nodes)

    # The docstore receives every node (not just the leaves) while the vector
    # index below is built from the leaf nodes only.
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    automerging_index = VectorStoreIndex(
        leaf_nodes, storage_context=storage_context, service_context=merging_context
    )
    automerging_index.storage_context.persist(persist_dir=save_dir)
    return automerging_index


def get_automerging_query_engine(
    automerging_index,
    similarity_top_k=12,
    rerank_top_n=6,
):
    """Assemble a query engine: index retriever -> auto-merging -> rerank.

    Args:
        automerging_index: Index providing ``as_retriever`` and
            ``storage_context``.
        similarity_top_k: Forwarded to ``automerging_index.as_retriever``.
        rerank_top_n: ``top_n`` given to the ``SentenceTransformerRerank``
            post-processor.

    Returns:
        The engine produced by ``RetrieverQueryEngine.from_args``.
    """
    # Stage 1: plain retriever over the index, wrapped by the auto-merging
    # retriever that is given the index's storage context.
    leaf_retriever = automerging_index.as_retriever(
        similarity_top_k=similarity_top_k
    )
    merging_retriever = AutoMergingRetriever(
        leaf_retriever, automerging_index.storage_context, verbose=True
    )

    # Stage 2: reranker applied as a node post-processor.
    reranker = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )

    return RetrieverQueryEngine.from_args(
        merging_retriever, node_postprocessors=[reranker]
    )

In [6]:
from llama_index import SimpleDirectoryReader

# Read the example PDF from disk into a list of Document objects.
documents = SimpleDirectoryReader(input_files=["./document/chess.pdf"]).load_data()

In [10]:
# Sanity-check the loader result: container type and number of entries,
# each followed by a blank line.
print(f"{type(documents)} \n")
print(f"{len(documents)} \n")
# Element type, then the printed form of one sample entry (index 4).
print(type(documents[0]))
print(documents[4])

<class 'list'> 

8 

<class 'llama_index.schema.Document'>
Doc ID: 0888b928-3f67-4b62-8f33-6373751d1f4c
Text:


In [11]:
# Bare expression: the notebook renders the repr of the entire `documents`
# list as this cell's output, which can be very long.
documents

[Document(id_='10f9328d-98a9-49ab-8a3f-2003101ddd73', embedding=None, metadata={'page_label': '1', 'file_name': 'chess.pdf', 'file_path': 'document\\chess.pdf', 'file_type': 'application/pdf', 'file_size': 3104682, 'creation_date': '2023-12-08', 'last_modified_date': '2023-12-08', 'last_accessed_date': '2023-12-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, hash='4b1ddaa2a3bf33f848531eb676164f93bcee035d3c3282649ee1ad0d50fa2af5', text='', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='dc9e8ce2-37a9-463a-b9e6-df40efba618f', embedding=None, metadata={'page_label': '2', 'file_name': 'chess.pdf', 'file_path': 'document\\chess.pdf', 'file_type': 