In [5]:
import asyncio
from llama_index.core.agent.workflow import FunctionAgent
from llama_index.llms.openai import OpenAI

# Load environment variables from the shared tutorial .env file (path is relative to
# this notebook). The call's return value is what the cell displays.
# NOTE(review): presumably this supplies the OpenAI API key used by later cells — confirm.
import  load_dotenv
load_dotenv.load_dotenv("../../All_LLM_tutorial/.env")

True

몇가지 컨셉으로 RAG를 다룬다고 한다  
1. Loading == Data ingestion: 다양한 데이터 소스로부터 데이터를 얻어냄
2. Indexing: Make Vector embedding
3. Storing: 인덱스(임베딩)와 메타데이터를 저장해 두어 re-index 없이 재사용하기
4. Querying: 서브쿼리, 멀티쿼리, Hybrid 쿼리 등 다양한 패턴으로 질문
5. Evaluation: query에 대한 답변이 얼마나 정확하고 빠르게 생성되었는지 평가  

그중 LlamaIndex는 아래 세 가지를 중점적으로 다룬다고 함
### 1. Loading
### 2. Indexing
### 3. Querying

# Loading stage
- Document: 데이터 파일, API output, DB 등 어떤 소스에서 얻은 데이터든 보관하는 컨테이너  
- Node: 청크를 나타내는 단위, 메타데이터에 추가로 **원본 문서와의 연관을 보관함**->다른 노드연계가능
- Reader: data connector라는 추상 개념의 구현체로, Ingestion(데이터 로딩)을 담당하는 듯함

In [6]:
from llama_index.core import Document, VectorStoreIndex

# Wrap each raw string in its own Document.
text_list = ["Hello", "World"]
documents = [Document(text=raw_text) for raw_text in text_list]

# Build a vector index directly from the Document objects.
index = VectorStoreIndex.from_documents(documents)

In [7]:
from llama_index.core.node_parser import SentenceSplitter

# Documents: reuse the `documents` list created in the previous cell.

# Nodes: split the documents with a default-configured SentenceSplitter.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(documents=documents)

# Index: build the vector index from the nodes rather than from the documents.
index = VectorStoreIndex(nodes=nodes)

In [8]:
# Display the nodes returned by parser.get_nodes_from_documents(...) in the previous cell.
nodes

[TextNode(id_='6f35e2c5-4f27-4bec-a9fd-22a60a3b4644', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='53685ca2-ea73-485e-b01d-5d34fbfaa7da', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='52c87cd40ccfbd7873af4180fced6d38803d4c3684ed60f6513e8d16077e5b8e')}, metadata_template='{key}: {value}', metadata_separator='\n', text='Hello', mimetype='text/plain', start_char_idx=0, end_char_idx=5, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'),
 TextNode(id_='ce766cdc-7d1b-4349-b277-400f396cd6b5', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='786aaf15-2a06-452f-a9ae-f4bb6de6b9c3', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='ff5897aa373b6f1f671166b31e806183334dbfd64eb05f8970ca9555a8643e6e')}, metadata_template='{key}: {va

In [9]:
from llama_index.core import SimpleDirectoryReader

# Load the files found in ./data as Document objects.
documents = SimpleDirectoryReader(input_dir="./data").load_data()

## Inject metadata

In [10]:
# Metadata can be supplied up front through the constructor.
document = Document(
    metadata={
        "filename": "<doc_file_name>",
        "category": "<category>",
    },
    text="text",
)

In [11]:
# Metadata can also be set after construction. This assigns a brand-new dict,
# so it replaces (not merges with) whatever metadata the document had before.
document.metadata = dict(filename="<doc_file_name>")

In [12]:
from llama_index.core import SimpleDirectoryReader


def filename_fn(filename):
    """Build the metadata dict for one file: the given value under "file_name"."""
    return {"file_name": filename}


# `file_metadata` is a hook: the reader automatically sets the metadata of each
# document according to filename_fn (presumably calling it with each file's path).
reader = SimpleDirectoryReader("./data", file_metadata=filename_fn)
documents = reader.load_data()

## ID 지정

In [13]:
from llama_index.core import SimpleDirectoryReader

# With filename_as_id=True the printed doc_ids are based on each file's path
# (see the cell output) rather than being random identifiers.
documents = SimpleDirectoryReader(
    input_dir="./data",
    filename_as_id=True,
).load_data()
print([doc.doc_id for doc in documents])

['/home/work/enssel_test/yhkim/llamaindex_study/official_document/data/paul_graham_essay.txt', '/home/work/enssel_test/yhkim/llamaindex_study/official_document/data/temp.csv_part_0']


In [20]:
# Debugging aid (left disabled): uncomment to inspect the attributes of the first document.
# documents[0].__dict__

# Nodes  

In [15]:
from llama_index.core.node_parser import SentenceSplitter

# Split the documents loaded from ./data into nodes with a default SentenceSplitter.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(documents=documents)

In [16]:
# Reference example (left disabled): build TextNode objects by hand and link them
# with NEXT / PREVIOUS relationships instead of using a node parser.
# NOTE: both nodes use the same "<node_id>" placeholder; give each node a distinct id
# before enabling this, and be aware it would overwrite `nodes` from the cell above.
# from llama_index.core.schema import TextNode, NodeRelationship, RelatedNodeInfo

# node1 = TextNode(text="<text_chunk>", id_="<node_id>")
# node2 = TextNode(text="<text_chunk>", id_="<node_id>")
# # set relationships
# node1.relationships[NodeRelationship.NEXT] = RelatedNodeInfo(
#     node_id=node2.node_id
# )
# node2.relationships[NodeRelationship.PREVIOUS] = RelatedNodeInfo(
#     node_id=node1.node_id
# )
# nodes = [node1, node2]

# Ingestion

In [17]:
from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache

# Transformations are applied in list order: sentence splitting, title
# extraction, then OpenAI embedding.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_overlap=0, chunk_size=25),
        TitleExtractor(),
        OpenAIEmbedding(),
    ],
)

# Run the pipeline on the built-in example document and keep the resulting nodes.
nodes = pipeline.run(documents=[Document.example()])

Metadata length (9) is close to chunk size (25). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:08<00:00,  8.23s/it]


In [18]:
from llama_index.core import Document
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.vector_stores.qdrant import QdrantVectorStore

import qdrant_client

# In-memory Qdrant instance backing a collection named "test_store".
client = qdrant_client.QdrantClient(location=":memory:")
vector_store = QdrantVectorStore(collection_name="test_store", client=client)

# Same transformations as the previous pipeline, but with a vector store attached.
pipeline = IngestionPipeline(
    vector_store=vector_store,
    transformations=[
        SentenceSplitter(chunk_overlap=0, chunk_size=25),
        TitleExtractor(),
        OpenAIEmbedding(),
    ],
)

# Ingest directly into the vector db (the returned nodes are not kept here).
pipeline.run(documents=[Document.example()])

# Create the index on top of the already-populated vector store.
index = VectorStoreIndex.from_vector_store(vector_store)

Metadata length (9) is close to chunk size (25). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


100%|██████████| 1/1 [00:05<00:00,  5.31s/it]
  self._client.create_payload_index(


In [19]:
# Save the current pipeline's state to disk.
pipeline.persist("./pipeline_storage")

# Build a fresh pipeline and restore the persisted state into it.
# It has no embedding step and no vector store attached.
new_pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=25, chunk_overlap=0),
        TitleExtractor(),
    ],
)
new_pipeline.load("./pipeline_storage")

# Run the *restored* pipeline so the loaded cache is actually exercised.
# (Calling `pipeline.run` here would bypass `new_pipeline` entirely and would
# re-embed and re-insert the same document into the Qdrant vector store.)
nodes = new_pipeline.run(documents=[Document.example()])

Loading llama_index.core.storage.kvstore.simple_kvstore from pipeline_storage/llama_cache.
