## Load documents

In [1]:
import glob
import json
import os
from tqdm import tqdm
from llama_index.readers.web import SimpleWebPageReader
from llama_index.readers.file import HTMLTagReader
from llama_index.core import (
    SimpleDirectoryReader,
    Document,
    Settings,
    VectorStoreIndex,
    StorageContext,
    SummaryIndex
)

from llama_index.core.node_parser import HTMLNodeParser, MarkdownNodeParser
from llama_index.core.storage.docstore import SimpleDocumentStore
from crawl_data import html_to_md
from IPython.display import display, HTML, Markdown

### Load HTML documents

In [2]:
# Directory reader over the saved HTML blog posts.
# filename_as_id=True requests ids based on the source file rather than
# generated ones (presumably path-based; verify against the loaded `id_`s).
html_loader = SimpleDirectoryReader(input_dir="./data/llama-blogs-html/", filename_as_id=True)

In [3]:
# Load every file found by the HTML reader, using 8 workers and a progress bar.
html_documents = html_loader.load_data(num_workers=8, show_progress=True)

In [4]:
# Pick one loaded HTML document (index 10) as a sample to inspect in the next cells.
# NOTE(review): the index is arbitrary and assumes at least 11 documents were loaded.
test_doc = html_documents[10]

In [5]:
# Show the sample document's metadata dict (file path/name, MIME type, size, dates).
test_doc.metadata

{'file_path': '/workspace/projects/LlamindexHelper/data/llama-blogs-html/bridging-the-gap-in-crisis-counseling-introducing-counselor-copilot-db42e26ab4f3.html',
 'file_name': 'bridging-the-gap-in-crisis-counseling-introducing-counselor-copilot-db42e26ab4f3.html',
 'file_type': 'text/html',
 'file_size': 11333,
 'creation_date': '2024-08-08',
 'last_modified_date': '2024-08-08'}

In [6]:
# Render the sample document's raw text as HTML in the cell output.
# NOTE(review): the text is rendered unescaped — confirm the files on disk come
# from a trusted source before displaying them this way.
display(HTML(test_doc.text))

### Load Markdown documents

In [7]:
# Directory reader over the Markdown versions of the blog posts.
# filename_as_id=True requests ids based on the source file rather than
# generated ones (presumably path-based; verify against the loaded `id_`s).
md_loader = SimpleDirectoryReader(input_dir="./data/llama-blogs-md/", filename_as_id=True)

In [8]:
# Load every file found by the Markdown reader, using 8 workers and a progress bar.
md_documents = md_loader.load_data(num_workers=8, show_progress=True)

In [9]:
# Show the metadata dict of the Markdown document at index 10.
# NOTE(review): this is a different blog post than html_documents[10]; the two
# lists are not aligned by index, so do not compare them position-by-position.
md_documents[10].metadata

{'file_path': '/workspace/projects/LlamindexHelper/data/llama-blogs-md/a-new-document-summary-index-for-llm-powered-qa-systems-9a32ece2f9ec.md',
 'file_name': 'a-new-document-summary-index-for-llm-powered-qa-systems-9a32ece2f9ec.md',
 'file_type': 'text/markdown',
 'file_size': 8526,
 'creation_date': '2024-08-08',
 'last_modified_date': '2024-08-08'}

In [11]:
# Display the full Document repr for the same entry: id_, metadata, the
# metadata keys excluded from embedding/LLM input, relationships and text.
# NOTE(review): the repr is very long; inspecting `.id_` or `.metadata` alone
# may be enough here.
md_documents[10]

Document(id_='/workspace/projects/LlamindexHelper/data/llama-blogs-md/a-new-document-summary-index-for-llm-powered-qa-systems-9a32ece2f9ec.md_part_1', embedding=None, metadata={'file_path': '/workspace/projects/LlamindexHelper/data/llama-blogs-md/a-new-document-summary-index-for-llm-powered-qa-systems-9a32ece2f9ec.md', 'file_name': 'a-new-document-summary-index-for-llm-powered-qa-systems-9a32ece2f9ec.md', 'file_type': 'text/markdown', 'file_size': 8526, 'creation_date': '2024-08-08', 'last_modified_date': '2024-08-08'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='\n\nBackground\nOne of the core use cases of Large Language Models (LLMs) is question\\-answering over your own data. To do this, we pair the LLM with a “retrieval” model that can perform in

In [10]:
# Render the text of the Markdown document at index 10 as formatted Markdown.
display(Markdown(md_documents[10].text))



Background
One of the core use cases of Large Language Models (LLMs) is question\-answering over your own data. To do this, we pair the LLM with a “retrieval” model that can perform information retrieval over a knowledge corpus, and perform response synthesis over the retrieved texts using the LLM. This overall framework is called Retrieval\-Augmented Generation.
Most users building LLM\-powered QA systems today tend to do some form of the following:
1. Take source documents, split each one into text chunks
2. Store text chunks in a vector db
3. During query\-time, retrieve text chunks by embedding similarity and/or keyword filters.
4. Perform response synthesis
For a variety of reasons, this approach provides limited retrieval performance.

## Loading and Ingestion

### Load documents

In [1]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

In [2]:
# Build a reader over ./data/llama-blogs with default options and load
# everything it finds in one step.
documents = SimpleDirectoryReader(input_dir="./data/llama-blogs").load_data()

In [3]:
# (character count of the first document's text, number of documents loaded)
(len(documents[0].text), len(documents))

(24684, 159)

In [12]:
# Show the metadata dict of the first loaded document.
# NOTE(review): the captured output's file_path does not sit under the
# "./data/llama-blogs" input_dir used above — the output looks stale; re-run
# the notebook from a fresh kernel to confirm.
documents[0].metadata

{'file_path': '/workspace/projects/LlamaRAG/data/a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html',
 'file_name': 'a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html',
 'file_type': 'text/html',
 'file_size': 24708,
 'creation_date': '2024-07-21',
 'last_modified_date': '2024-07-21'}