# Querying LlamaIndex Documents

## Figure out the best way to grab the doc structure

### Use Sphinx

In [84]:
import collections

In [33]:
import pickle

def read_doctree(file_path):
    """Unpickle and return the object stored in a Sphinx ``.doctree`` file.

    Args:
        file_path: Location of the pickled doctree; it is opened in binary mode.

    Returns:
        Whatever object the pickle contains.

    Note:
        Unpickling can execute arbitrary code, so only point this at
        doctrees produced by a docs build you trust.
    """
    with open(file_path, 'rb') as doctree_file:
        loaded = pickle.load(doctree_file)
    return loaded

# Point the path below at any .doctree file from the local Sphinx build.
# Other doctrees that were inspected the same way:
#   /Users/sasha/github/LlamaIndex/llama_index/docs/_build/doctrees/index.doctree
#   /Users/sasha/github/LlamaIndex/llama_index/docs/_build/doctrees/optimizing/advanced_retrieval/advanced_retrieval.doctree
installation_doctree_path = '/Users/sasha/github/LlamaIndex/llama_index/docs/_build/doctrees/getting_started/installation.doctree'
installation_doctree = read_doctree(installation_doctree_path)

# Printing the doctree dumps the whole tree as markup (see the cell output).
print(installation_doctree)

<document source="/Users/sasha/github/LlamaIndex/llama_index/docs/getting_started/installation.md"><section ids="installation-and-setup" myst-anchor="getting_started/installation.md#installation-and-setup" names="installation\ and\ setup"><title>Installation and Setup</title><section ids="installation-from-pip" myst-anchor="getting_started/installation.md#installation-from-pip" names="installation\ from\ pip"><title>Installation from Pip</title><paragraph>Install from pip:</paragraph><literal_block language="default" xml:space="preserve">pip install llama-index
</literal_block><paragraph><strong>NOTE:</strong> LlamaIndex may download and store local files for various packages (NLTK, HuggingFace, …). Use the environment variable “LLAMA_INDEX_CACHE_DIR” to control where these files are saved.</paragraph><paragraph>If you prefer to install from source, see below.</paragraph></section><section ids="important-openai-environment-setup" myst-anchor="getting_started/installation.md#important-o

In [100]:
# Survey which node classes the values of a doctree's `ids` mapping can have,
# keeping every node seen, grouped by class, in `type_examples`.
# Author's notes so far: the root object is a sphinx.addnodes.document; links are
# docutils.nodes.target and sections are docutils.nodes.section, and `ids` holds
# the linkable content (targets and sections).
import glob

unique_types = set()
# NOTE(review): with recursive=True the `**` segment also matches zero directories,
# so the top-level *.doctree files are presumably already in the first result and
# get read a second time via the line after it. Left in place so the iteration
# order, which later cells rely on, is unchanged.
doctree_files = glob.glob('/Users/sasha/github/LlamaIndex/llama_index/docs/_build/doctrees/**/*.doctree', recursive=True)
doctree_files += glob.glob('/Users/sasha/github/LlamaIndex/llama_index/docs/_build/doctrees/*.doctree')

type_examples = {}

for file_path in doctree_files:
    doctree = read_doctree(file_path)
    # Iterate the nodes directly: one lookup per node, and no loop variable named
    # `id` left behind to shadow the builtin id() for the rest of the session.
    for node in doctree.ids.values():
        type_id = type(node)
        unique_types.add(type_id)
        type_examples.setdefault(type_id, []).append(node)

print(unique_types)
for type_id, examples in type_examples.items():
    print(f"\nExamples for {type_id}:")
    # A list is only created when its first node is appended, so [0] always exists.
    print(examples[0])



{<class 'docutils.nodes.section'>, <class 'sphinx.addnodes.desc_signature'>, <class 'docutils.nodes.problematic'>, <class 'docutils.nodes.target'>, <class 'docutils.nodes.system_message'>}

Examples for <class 'docutils.nodes.section'>:
<section ids="welcome-to-llamaindex" names="welcome\ to\ llamaindex\ 🦙\ !"><title>Welcome to LlamaIndex 🦙 !</title><paragraph>LlamaIndex is a data framework for <reference name="LLM" refuri="https://en.wikipedia.org/wiki/Large_language_model">LLM</reference><target ids="['llm']" names="['llm']" refuri="https://en.wikipedia.org/wiki/Large_language_model"/>-based applications to ingest, structure, and access private or domain-specific data. It’s available in Python (these docs) and <reference name="Typescript" refuri="https://ts.llamaindex.ai/">Typescript</reference><target ids="['typescript']" names="['typescript']" refuri="https://ts.llamaindex.ai/"/>.</paragraph><section ids="why-llamaindex" names="🚀\ why\ llamaindex?"><title>🚀 Why LlamaIndex?</title><

Discovered: the values stored in a doctree's `ids` mapping can be any of these node types:
- docutils.nodes.section,
- docutils.nodes.target,
- sphinx.addnodes.desc_signature,
- docutils.nodes.system_message,
- docutils.nodes.problematic

#### Inspect Sections

In [83]:
# Find each node class's slot in `type_examples` by its qualified class name
# instead of a hard-coded position: dict order follows the order in which files
# and ids were visited, so fixed values 0..4 only hold for one traversal.
position_by_type_name = {
    f"{node_type.__module__}.{node_type.__qualname__}": position
    for position, node_type in enumerate(type_examples)
}
# Required below; raises KeyError if no section node was collected at all.
section_index = position_by_type_name["docutils.nodes.section"]
# Informational only; None means no node of that class was collected.
target_index = position_by_type_name.get("docutils.nodes.target")
desc_index = position_by_type_name.get("sphinx.addnodes.desc_signature")
system_message_index = position_by_type_name.get("docutils.nodes.system_message")
problematic_index = position_by_type_name.get("docutils.nodes.problematic")

all_sections = list(type_examples.values())[section_index]
first_section = all_sections[0]
print("Structure")
print(first_section.__dict__)
print(first_section.__dict__.keys())
# Last expression: the cell displays the first section's direct child nodes.
first_section.children

Structure
{'rawsource': '', 'children': [<title: <#text: 'Welcome to Lla ...'>>, <paragraph: <#text: 'LlamaIndex is  ...'><reference...><target "llm" ...>, <section "🚀 why llamaindex?": <title...><paragraph...><paragraph...><paragraph...><bul ...>, <section "🦙 how can llamaindex help?": <title...><paragraph...><bullet_list...>>, <section "👨‍👩‍👧‍👦 who is llamaindex for?": <title...><paragraph...><paragraph...><paragraph...>>, <section "getting started": <title...><paragraph...><paragraph...><paragraph...>>, <section "🗺️ ecosystem": <title...><paragraph...><bullet_list...><section "commun ...>], 'attributes': {'ids': ['welcome-to-llamaindex'], 'classes': [], 'names': ['welcome to llamaindex 🦙 !'], 'dupnames': [], 'backrefs': []}, 'tagname': 'section', 'parent': <document: <section "welcome to llamaindex 🦙 !"...>>, 'document': <document: <section "welcome to llamaindex 🦙 !"...>>, 'source': '/Users/sasha/github/LlamaIndex/llama_index/docs/index.rst', 'line': 2}
dict_keys(['rawsource', 'chi

[<title: <#text: 'Welcome to Lla ...'>>,
 <paragraph: <#text: 'LlamaIndex is  ...'><reference...><target "llm" ...>,
 <section "🚀 why llamaindex?": <title...><paragraph...><paragraph...><paragraph...><bul ...>,
 <section "🦙 how can llamaindex help?": <title...><paragraph...><bullet_list...>>,
 <section "👨‍👩‍👧‍👦 who is llamaindex for?": <title...><paragraph...><paragraph...><paragraph...>>,
 <section "getting started": <title...><paragraph...><paragraph...><paragraph...>>,
 <section "🗺️ ecosystem": <title...><paragraph...><bullet_list...><section "commun ...>]

In [85]:
# Bucket every direct child of every collected section by the child's node class.
section_children_type = collections.defaultdict(list)
for parent_section in all_sections:
    for child_node in parent_section.children:
        child_class = type(child_node)
        section_children_type[child_class].append(child_node)

In [86]:
# Display which node classes occur as direct children of the collected sections.
section_children_type.keys()



In [87]:
# Peek at a single entry only: the first child class collected and the
# attribute dict of the first node recorded for it.
for key, values in list(section_children_type.items())[:1]:
    first_value = values[0]
    print(f"Key: {key}")
    print(f"First Value Dict: {first_value.__dict__}")


Key: <class 'docutils.nodes.title'>
First Value Dict: {'rawsource': 'Welcome to LlamaIndex 🦙 !', 'children': [<#text: 'Welcome to LlamaIndex 🦙 !'>], 'attributes': {'ids': [], 'classes': [], 'names': [], 'dupnames': [], 'backrefs': []}, 'tagname': 'title', 'parent': <section "welcome to llamaindex 🦙 !": <title...><paragraph...><section "🚀 why llamaindex?"...> ...>, 'document': <document: <section "welcome to llamaindex 🦙 !"...>>, 'source': '/Users/sasha/github/LlamaIndex/llama_index/docs/index.rst', 'line': 2}


Notes: Inspecting the sections is interesting, but I think what would benefit me more right now is to look at the structure of one document holistically, which I will experiment with in the next section.

#### Holistically look at the doctree for index.doctree

In [103]:
# Load index.doctree explicitly instead of reusing the `doctree` variable left
# behind by the survey loop, which is simply whichever file was read last.
index_doctree = read_doctree('/Users/sasha/github/LlamaIndex/llama_index/docs/_build/doctrees/index.doctree')

# "ecosystem" is a section id. In the build inspected here its child at
# position 3 is the nested "Community" section, whose own children are displayed.
index_doctree.ids["ecosystem"].children[3].children

[<title: <#text: 'Community'>>,
 <paragraph: <#text: 'Need help? Hav ...'>>,
 <bullet_list: <list_item...><list_item...>>]

Notes: this tells me that I can build a parser that auto-populates the content of each document to store/index.

### Use Langchain?

In [106]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# NOTE: despite the variable name, this is a reStructuredText (.rst) file being
# handed to the Markdown loader.
markdown_path = "/Users/sasha/github/LlamaIndex/llama_index/docs/index.rst"
loader = UnstructuredMarkdownLoader(markdown_path)

# `data` is indexable; the next cell reads the first item's page_content.
data = loader.load()

In [109]:
# Display the text stored under "page_content" on the first loaded item.
data[0].__dict__["page_content"]

'Welcome to LlamaIndex 🦙 !\n\nLlamaIndex is a data framework for LLM <https://en.wikipedia.org/wiki/Large_language_model>-based applications to ingest, structure, and access private or domain-specific data. It\'s available in Python (these docs) and Typescript <https://ts.llamaindex.ai/>.\n\n🚀 Why LlamaIndex?\n\nLLMs offer a natural language interface between humans and data. Widely available models come pre-trained on huge amounts of publicly available data like Wikipedia, mailing lists, textbooks, source code and more.\n\nHowever, while LLMs are trained on a great deal of data, they are not trained on your data, which may be private or specific to the problem you\'re trying to solve. It\'s behind APIs, in SQL databases, or trapped in PDFs and slide decks.\n\nYou may choose to fine-tune a LLM with your data, but:\n\nTraining a LLM is expensive.\n\nDue to the cost to train, it\'s hard to update a LLM with latest information.\n\nObservability is lacking. When you ask a LLM a question, i

Note: this does okay, but it did not capture the fact that "use_cases/agents.md" etc. are links.

### Use Unstructured?

##### Observe index.rst

In [111]:
from unstructured.partition.md import partition_md
# Same experiment with unstructured's Markdown partitioner, again pointed at the
# .rst index page; the cell displays the resulting list of elements.
elements = partition_md(filename="/Users/sasha/github/LlamaIndex/llama_index/docs/index.rst")
elements

[<unstructured.documents.elements.NarrativeText at 0x1bc36e190>,
 <unstructured.documents.elements.NarrativeText at 0x1bc3dc810>,
 <unstructured.documents.elements.Title at 0x1b82da3d0>,
 <unstructured.documents.elements.NarrativeText at 0x1b857ce90>,
 <unstructured.documents.elements.NarrativeText at 0x179965d90>,
 <unstructured.documents.elements.NarrativeText at 0x1bba39f50>,
 <unstructured.documents.elements.ListItem at 0x1bba3abd0>,
 <unstructured.documents.elements.ListItem at 0x1bb9828d0>,
 <unstructured.documents.elements.ListItem at 0x1bb981350>,
 <unstructured.documents.elements.NarrativeText at 0x1bb9822d0>,
 <unstructured.documents.elements.ListItem at 0x1bb982650>,
 <unstructured.documents.elements.ListItem at 0x1bb982910>,
 <unstructured.documents.elements.ListItem at 0x1bb982290>,
 <unstructured.documents.elements.NarrativeText at 0x1bb982d50>,
 <unstructured.documents.elements.ListItem at 0x1bb981050>,
 <unstructured.documents.elements.ListItem at 0x1bb982750>,
 <unstru

In [154]:
print(len(elements))
# Element 52 is a toctree directive (see output). Author's observation: everything
# after it in index.rst is structure too, so the next step is to observe another file.
elements[52].text

69


'.. toctree::\n   :maxdepth: 1\n   :caption: Getting Started\n   :hidden:'

##### Observe another random md file

In [155]:
from unstructured.partition.md import partition_md
# NOTE: this rebinds `elements`, replacing the index.rst result from the earlier cell.
elements = partition_md(filename="/Users/sasha/github/LlamaIndex/llama_index/docs/module_guides/loading/documents_and_nodes/usage_documents.md")
elements 

[<unstructured.documents.elements.Title at 0x17796f550>,
 <unstructured.documents.elements.Title at 0x1b868c790>,
 <unstructured.documents.elements.NarrativeText at 0x179a9fe50>,
 <unstructured.documents.elements.NarrativeText at 0x179a9c1d0>,
 <unstructured.documents.elements.Title at 0x179a9e250>,
 <unstructured.documents.elements.Title at 0x177406810>,
 <unstructured.documents.elements.NarrativeText at 0x177406a90>,
 <unstructured.documents.elements.Title at 0x177404050>,
 <unstructured.documents.elements.NarrativeText at 0x177404850>,
 <unstructured.documents.elements.NarrativeText at 0x177404990>,
 <unstructured.documents.elements.Title at 0x177404510>,
 <unstructured.documents.elements.Title at 0x177404310>,
 <unstructured.documents.elements.NarrativeText at 0x177404a90>,
 <unstructured.documents.elements.Title at 0x177406510>,
 <unstructured.documents.elements.NarrativeText at 0x177404c50>,
 <unstructured.documents.elements.NarrativeText at 0x177406a50>,
 <unstructured.documents