# Multidoc Autoretrieval Pack

This is the LlamaPack version of our structured hierarchical retrieval guide in the [core repo](https://docs.llamaindex.ai/en/stable/examples/query_engine/multi_doc_auto_retrieval/multi_doc_auto_retrieval.html).

## Setup and Download Data

In this section, we'll load issues from the LlamaIndex GitHub repository.

In [None]:
import nest_asyncio

# Allow asyncio event loops to be nested; presumably needed because the
# notebook kernel already runs a loop while later cells await coroutines.
nest_asyncio.apply()

In [None]:
import os

# Placeholder: fill in a GitHub access token before running.
# NOTE(review): presumably read by GitHubIssuesClient below — confirm.
os.environ["GITHUB_TOKEN"] = ""

In [None]:
import os

from llama_hub.github_repo_issues import (
    GitHubRepositoryIssuesReader,
    GitHubIssuesClient,
)

# Reader for the issue tracker of the run-llama/llama_index repository.
github_client = GitHubIssuesClient()
loader = GitHubRepositoryIssuesReader(
    github_client,
    owner="run-llama",
    repo="llama_index",
    verbose=True,
)

orig_docs = loader.load_data()

# Upper bound on how many issues are processed in the rest of the notebook.
limit = 100

# Keep only the first `limit` issues and stamp each kept document with its
# own id under "index_id". Documents beyond the limit are left untouched.
docs = orig_docs[:limit]
for doc in docs:
    doc.metadata["index_id"] = doc.id_

In [None]:
from copy import deepcopy
import asyncio
from tqdm.asyncio import tqdm_asyncio
from llama_index import SummaryIndex, Document, ServiceContext
from llama_index.llms import OpenAI
from llama_index.async_utils import run_jobs


async def aprocess_doc(doc, include_summary: bool = True):
    """Build a summary document with flat, filterable metadata for one issue.

    The creation date, state, assignee and size label are pulled out of the
    source document's metadata, and a one-sentence LLM summary of the issue
    becomes the text of the returned document.

    Args:
        doc: Source document. Its metadata must contain ``created_at`` (a
            string whose part before ``"T"`` is ``YYYY-MM-DD``), ``state``
            and ``labels``; ``assignee`` is optional.
        include_summary: Accepted for interface compatibility. This
            implementation does not consult it and always summarizes.

    Returns:
        A new ``Document`` whose text is the summary and whose metadata holds
        ``state``, ``year``, ``month``, ``day``, ``assignee``, ``size`` and
        ``index_id`` (the source document's id).

    Raises:
        KeyError: If ``created_at``, ``state`` or ``labels`` is missing.
    """
    print(f"Processing {doc.id_}")
    metadata = doc.metadata

    # Only the date part (before "T") of the timestamp is used.
    date_tokens = metadata["created_at"].split("T")[0].split("-")
    year = int(date_tokens[0])
    month = int(date_tokens[1])
    day = int(date_tokens[2])

    assignee = metadata.get("assignee", "")

    # The first label containing "size:" wins; the text after the first colon
    # is the size. No such label means an empty size.
    size = next(
        (label.split(":")[1] for label in metadata["labels"] if "size:" in label),
        "",
    )

    new_metadata = {
        "state": metadata["state"],
        "year": year,
        "month": month,
        "day": day,
        "assignee": assignee,
        "size": size,
        "index_id": doc.id_,
    }

    # Summarize the issue with a throwaway single-document summary index.
    summary_index = SummaryIndex.from_documents([doc])
    query_str = "Give a one-sentence concise summary of this issue."
    query_engine = summary_index.as_query_engine(
        service_context=ServiceContext.from_defaults(llm=OpenAI(model="gpt-3.5-turbo"))
    )
    # Await the async query so this coroutine yields while the LLM call is in
    # flight; the blocking `query` would stall every other scheduled job.
    summary_txt = str(await query_engine.aquery(query_str))

    return Document(text=summary_txt, metadata=new_metadata)


async def aprocess_docs(docs):
    """Run ``aprocess_doc`` over every document and return the results.

    One coroutine is created per input document; the whole batch is handed
    to ``run_jobs`` with ``workers=5`` and a progress bar enabled.
    """
    pending = [aprocess_doc(item) for item in docs]
    return await run_jobs(pending, show_progress=True, workers=5)

In [None]:
# Top-level `await` relies on the notebook's running event loop.
new_docs = await aprocess_docs(docs)

In [None]:
# Spot-check the metadata extracted for one processed document.
new_docs[5].metadata

## Setup Weaviate Indices

In [None]:
from llama_index.vector_stores import WeaviateVectorStore
from llama_index.storage import StorageContext
from llama_index import VectorStoreIndex

In [None]:
import weaviate

# Connect to a hosted (cloud) Weaviate cluster.
# Placeholders: supply a real API key and replace <weaviate-cluster> in the URL.
auth_config = weaviate.AuthApiKey(api_key="")
client = weaviate.Client(
    "https://<weaviate-cluster>.weaviate.network",
    auth_client_secret=auth_config,
)

# Weaviate class names passed to the pack below: one for the per-document
# metadata/summary index, one for the document-chunk index.
doc_metadata_index_name = "LlamaIndex_auto"
doc_chunks_index_name = "LlamaIndex_AutoDoc"

In [None]:
# Optional: delete both Weaviate classes so the indices start from a clean
# slate. NOTE(review): presumably destructive for any data already stored
# under these class names — skip this cell to keep existing data.
client.schema.delete_class(doc_metadata_index_name)
client.schema.delete_class(doc_chunks_index_name)

### Setup Metadata Schema

This schema is required for auto-retrieval: it is inserted into the prompt so the LLM knows which metadata fields are available for filtering.

In [28]:
from llama_index.vector_stores.types import MetadataInfo, VectorStoreInfo


# Describe each filterable metadata field as (name, description, type); the
# field names match the metadata keys written by `aprocess_doc`.
vector_store_info = VectorStoreInfo(
    content_info="Github Issues",
    metadata_info=[
        MetadataInfo(name=field_name, description=field_desc, type=field_type)
        for field_name, field_desc, field_type in (
            ("state", "Whether the issue is `open` or `closed`", "string"),
            ("year", "The year issue was created", "integer"),
            ("month", "The month issue was created", "integer"),
            ("day", "The day issue was created", "integer"),
            ("assignee", "The assignee of the ticket", "string"),
            ("size", "How big the issue is (XS, S, M, L, XL, XXL)", "string"),
        )
    ],
)

## Download LlamaPack

In [None]:
from llama_index.llama_pack import download_llama_pack

# Fetch the pack by name; the returned object is used as the pack class below.
# NOTE(review): the second argument is presumably the local download
# directory — confirm against the download_llama_pack documentation.
MultiDocAutoRetrieverPack = download_llama_pack(
    "MultiDocAutoRetrieverPack", "./multidoc_autoretriever_pack"
)

In [None]:
# Build the pack from the Weaviate client, the two index names, the summary
# docs (`new_docs`), the original issue docs (`docs`) and the metadata schema.
# NOTE(review): arguments are positional — confirm the order against the
# pack's constructor signature.
pack = MultiDocAutoRetrieverPack(
    client,
    doc_metadata_index_name,
    doc_chunks_index_name,
    new_docs,
    docs,
    vector_store_info,
    # Options forwarded to the auto-retriever.
    auto_retriever_kwargs={
        "verbose": True,
        "similarity_top_k": 2,
        "empty_query_top_k": 10,
    },
    verbose=True,
)

## Run LlamaPack

Now let's try the LlamaPack on some queries! 

In [None]:
# Date-style query. NOTE(review): presumably exercises the month/day metadata
# fields declared in the schema — check the verbose output to confirm.
response = pack.run("Tell me about some issues on 12/11")
print(str(response))

In [None]:
# Query combining a state constraint ("open") with a topic ("agents").
response = pack.run("Tell me about some open issues related to agents")
print(str(response))

### Retriever-only

We can also get the retriever module and just run that.

In [None]:
# Pull the retriever out of the pack's module dict and call it directly,
# returning retrieved nodes instead of a response from `pack.run`.
retriever = pack.get_modules()["recursive_retriever"]
nodes = retriever.retrieve("Tell me about some open issues related to agents")
print(f"Number of source nodes: {len(nodes)}")
# Inspect the metadata attached to the first retrieved node.
nodes[0].node.metadata