In [1]:
%%capture
!pip install llama-index html2text

In [2]:
import os

from getpass import getpass
import nest_asyncio

from dotenv import load_dotenv

nest_asyncio.apply()

load_dotenv()

True

Preparing Data for an LLM involves an ingestion pipeline similar to ML Data Cleaning or traditional ETL Processes


Ingestion Pipeline Stages
- Load Data
- Transform Data
- Index and Store data

In [3]:
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader("../data").load_data()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     c:\Users\maizz\Documents\Python Scripts\seerah-
[nltk_data]     llm\seerahllm_env\Lib\site-
[nltk_data]     packages\llama_index\core\_static/nltk_cache...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Failed to load file c:\Users\maizz\Documents\Python Scripts\seerah-llm\notebooks\..\data\the-life-of-the-prophet-muhammad-by-leila-and-aisha.pdf with error: RetryError[<Future at 0x1b42e398ce0 state=finished raised DependencyError>]. Skipping...


In [4]:
len(documents)

2115

In [5]:
type(documents[0])

llama_index.core.schema.Document

In [6]:
documents[3].__dict__

{'id_': '9e6b4073-8fa5-4794-ae4e-7e1f1b4400a5',
 'embedding': None,
 'metadata': {'page_label': '4',
  'file_name': '01-sejarah-hidup-nabi-muhammad-saw.pdf',
  'file_path': 'c:\\Users\\maizz\\Documents\\Python Scripts\\seerah-llm\\notebooks\\..\\data\\01-sejarah-hidup-nabi-muhammad-saw.pdf',
  'file_type': 'application/pdf',
  'file_size': 2559574,
  'creation_date': '2024-10-16',
  'last_modified_date': '2024-10-16'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'text': " 4 o Orientalis dan Kebudayaan Islam   \n• SEBUAH PENGHARGAAN DAN TERIMAKASIH   \no Sebuah Penghargaan dan Terimakasih   \n• DAFTAR PUSTAKA   \no Daftar Pustaka   \n• SEJARAH PEMBENTUKAN MUSHAF AL- QUR’AN MENURUT AHLI \nSEJARAH NON-MUSLIM   \no Pendapat Sir

# Manually Create Documents Objects

In [7]:
from llama_index.core import Document

manual_documents = Document(text="this is an example of a manual")

In [9]:
manual_documents.__dict__

{'id_': '25fa6866-843b-4b1f-ad22-fc4b91f40a86',
 'embedding': None,
 'metadata': {},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'text': 'this is an example of a manual',
 'mimetype': 'text/plain',
 'start_char_idx': None,
 'end_char_idx': None,
 'text_template': '{metadata_str}\n\n{content}',
 'metadata_template': '{key}: {value}',
 'metadata_seperator': '\n'}

### Adding Metadata

In [11]:
manual_document_with_metadata = Document(
    text="This is an example of a manual document",
    metadata={"filename": "made-up-file-name", "category": "imaginary-category"}
)

In [13]:
manual_document_with_metadata.__dict__

{'id_': '238425bc-3a52-4193-9637-47bc3b48b2ed',
 'embedding': None,
 'metadata': {'filename': 'made-up-file-name',
  'category': 'imaginary-category'},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'text': 'This is an example of a manual document',
 'mimetype': 'text/plain',
 'start_char_idx': None,
 'end_char_idx': None,
 'text_template': '{metadata_str}\n\n{content}',
 'metadata_template': '{key}: {value}',
 'metadata_seperator': '\n'}

In [17]:
manual_documents.metadata={"filename": "made-up-file-name", "category": "imaginary-categorsy"}
manual_documents.__dict__

{'id_': '25fa6866-843b-4b1f-ad22-fc4b91f40a86',
 'embedding': None,
 'metadata': {'filename': 'made-up-file-name',
  'category': 'imaginary-categorsy'},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'text': 'this is an example of a manual',
 'mimetype': 'text/plain',
 'start_char_idx': None,
 'end_char_idx': None,
 'text_template': '{metadata_str}\n\n{content}',
 'metadata_template': '{key}: {value}',
 'metadata_seperator': '\n'}

### Transform the Data

After loading, the pre-processing and transormation data is executed for retrieval. We need to transform the list of Document objects into Node objects

In [18]:
from llama_index.core.node_parser import SentenceSplitter

parser = SentenceSplitter(
    chunk_size=128, # in tokens
    chunk_overlap=16, #in tokens
    paragraph_separator="\n\n"
)

nodes = parser.get_nodes_from_documents(documents, show_progress=True)

Parsing nodes:   0%|          | 0/2115 [00:00<?, ?it/s]

In [19]:
nodes

[TextNode(id_='d8d739ca-d98b-47f2-9be2-5b4a33b802b7', embedding=None, metadata={'page_label': '1', 'file_name': '01-sejarah-hidup-nabi-muhammad-saw.pdf', 'file_path': 'c:\\Users\\maizz\\Documents\\Python Scripts\\seerah-llm\\notebooks\\..\\data\\01-sejarah-hidup-nabi-muhammad-saw.pdf', 'file_type': 'application/pdf', 'file_size': 2559574, 'creation_date': '2024-10-16', 'last_modified_date': '2024-10-16'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7057229c-42f1-4cdb-914d-6a5f1a6dc3cf', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': '01-sejarah-hidup-nabi-muhammad-saw.pdf', 'file_path': 'c:\\Users\\maizz\\Documents\\Python Scripts\\seerah-llm\\notebooks\\..\\data\\01-sej

In [20]:
type(nodes[42])

llama_index.core.schema.TextNode

In [22]:
nodes[4300].__dict__

{'id_': 'b14fa9f5-3e01-4b15-bca5-1b917a7ac452',
 'embedding': None,
 'metadata': {'page_label': '216',
  'file_name': '01-sejarah-hidup-nabi-muhammad-saw.pdf',
  'file_path': 'c:\\Users\\maizz\\Documents\\Python Scripts\\seerah-llm\\notebooks\\..\\data\\01-sejarah-hidup-nabi-muhammad-saw.pdf',
  'file_type': 'application/pdf',
  'file_size': 2559574,
  'creation_date': '2024-10-16',
  'last_modified_date': '2024-10-16'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e18d36a1-d00e-482d-8af3-720ff6f1cbc6', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '216', 'file_name': '01-sejarah-hidup-nabi-muhammad-saw.pdf', 'file_path': 'c:\\Users\\maizz\\Documents\\Pytho

## NodeRelationships

You can set relationships between nodes.

- 🌐 NodeRelationships assign connections between chunks of text. It's useful for:
  - Documents organized in a hierarchical manner (e.g., book, chapter, section, subsection)
  - Maintaining sequential order
  - Other complex relationships (ie, in legal documents for links a clause or other cases) 

- 🔍 NodeRelationships help retrieve not just the relevant section, but also related sections that might provide additional context or information.

In [23]:
!rm -rf ./data

'rm' is not recognized as an internal or external command,
operable program or batch file.
