In [3]:
import nest_asyncio
nest_asyncio.apply()

import os
import getpass
import openai
import logging
import sys
from pathlib import Path
from llama_index import SimpleDirectoryReader, SummaryIndex, ServiceContext
from llama_index.schema import IndexNode

# Route INFO-and-above log records to stdout through a single root handler.
# basicConfig installs a stdout StreamHandler on its own; attaching a second
# StreamHandler makes every record print twice. force=True (Python 3.8+)
# replaces whatever handlers are already on the root logger, so re-running
# this cell does not stack up additional handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [4]:
# Ask for the OpenAI key interactively (input is not echoed) and export it to
# the process environment, so the key never has to be written into the notebook.
os.environ.update({"OPENAI_API_KEY": getpass.getpass("Enter Your OpenAI API Key:")})

In [5]:
# (Re)configure logging so INFO-and-above records go to stdout exactly once.
# Calling basicConfig and then addHandler again would pile another stdout
# handler onto the root logger and duplicate every log line; force=True
# (Python 3.8+) instead drops the existing root handlers and installs one.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [8]:
from llama_index.llms import OpenAI
from llama_index.callbacks import LlamaDebugHandler, CallbackManager

# Chat model shared by every index and query engine built in this notebook.
llm = OpenAI(model="gpt-3.5-turbo")

# Attach a LlamaDebugHandler so callback events (and their traces) are recorded.
callback_manager = CallbackManager(handlers=[LlamaDebugHandler()])

# Bundle the LLM, the callbacks and a chunk size of 256 into one ServiceContext
# that is passed to the index constructors further down.
service_context = ServiceContext.from_defaults(
    chunk_size=256,
    callback_manager=callback_manager,
    llm=llm,
)

In [9]:
# Only files with these extensions are picked up by the reader.
required_exts = [".txt"]

# Read ../data (including sub-directories). filename_as_id=True presumably
# makes each document id derive from its source file -- confirm against the
# installed llama_index version.
reader = SimpleDirectoryReader(
    input_dir="../data",
    recursive=True,
    required_exts=required_exts,
    filename_as_id=True,
)
docs = reader.load_data()

print(f"Loaded {len(docs)} docs")

Loaded 4 docs


# Metadata Filters + Auto-Retrieval



In [13]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores import ChromaVectorStore

In [17]:
import chromadb
from llama_index.storage.storage_context import StorageContext

# Persistent Chroma client stored at ../chroma_db; the "quickstart" collection
# is created on first use and reused afterwards.
db = chromadb.PersistentClient(path="../chroma_db")
chroma_collection = db.get_or_create_collection(name="quickstart")

# Expose the collection to llama_index as a vector store and wrap it in a
# StorageContext.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


In [32]:
# define top-level nodes and vector retrievers
nodes = []
vector_query_engines = {}
vector_retrievers = {}

for doc in docs:
    # build vector index
    vector_index = VectorStoreIndex.from_documents(doc, 
                                                   service_context=service_context
    )
    # define query engines
    vector_query_engine = vector_index.as_query_engine()
    vector_query_engines[doc] = vector_query_engine
    vector_retrievers[doc] = vector_index.as_retriever()

    # save summaries
    out_path = Path("summaries") / f"{doc.id_}.txt"
    if not out_path.exists():
        # use LLM-generated summary
        summary_index = SummaryIndex.from_documents(doc, 
                                                    service_context=service_context
        )

        summarizer = summary_index.as_query_engine(response_mode="tree_summarize")
        response = await summarizer.aquery(f"Give me a summary of {doc.id_}")

        doc_summary = response.response
        Path("summaries").mkdir(exist_ok=True)
        with open(out_path, "w") as fp:
            fp.write(doc)
    else:
        with open(out_path, "r") as fp:
            doc = fp.read()

    print(f"**Summary for {doc}: {doc_summary}")
    node = IndexNode(text=doc_summary, index_id=doc)
    nodes.append(node)

**********
Trace: index_construction
    |_CBEventType.EXCEPTION ->  0.0 seconds
**********


AttributeError: 'tuple' object has no attribute 'get_doc_id'