In [1]:
import os

from getpass import getpass
import nest_asyncio

from dotenv import load_dotenv

# Patch asyncio to allow nested event loops (the purpose of nest_asyncio);
# presumably needed because the notebook kernel already runs a loop.
nest_asyncio.apply()


# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where CO_API_KEY comes from — confirm.
load_dotenv()

True

In [12]:
from llama_index.core import SimpleDirectoryReader

# Load the contents of the ../data directory.
# NOTE(review): the saved output for this cell is a KeyboardInterrupt, so the
# load was not completed in the recorded session.
documents = SimpleDirectoryReader("../data").load_data()

KeyboardInterrupt: 

In [27]:
# Load just the one PDF by explicit path rather than the whole data directory.
sejarah = SimpleDirectoryReader(
    input_files=["../data/01-sejarah-hidup-nabi-muhammad-saw.pdf"],
).load_data()

In [2]:
# Subscript access (not .get): raises KeyError right here if the variable is unset.
CO_API_KEY = os.environ['CO_API_KEY']

# 🗂️ Indexing

An `Index` is a data structure that allows for the quick retrieval of relevant context for a user query. 

It is the core foundation for retrieval-augmented generation (RAG) use-cases. Indexes are built from `Documents` and are used to build Retrievers, Query Engines and Chat Engines, all of which enable question answering and chat over your data.

- 📂 After loading your data, you're ready to construct an `Index`.

- 🌐 **Vector Store Index:** The most common Index type. It segments your `Documents` into `Nodes` and generates vector embeddings for each node's text, prepping them for LLM queries.

- 🔄 **Vector Store Index Process:** Parse raw texts into document objects, split document objects into chunks/nodes, then convert all your nodes into embeddings and store them in a vector database.

### ⚙️ Embedding Text

First, let's see what an embedding is.


In [3]:
from llama_index.embeddings.cohere import CohereEmbedding

# Three Cohere embedding models, compared further down by output vector size.
# The recorded outputs show 1024 dimensions for this model.
embed_v3 = CohereEmbedding(model_name="embed-english-v3.0")

# The recorded outputs show 384 dimensions for the "light" variant.
embed_v3_light = CohereEmbedding(model_name="embed-english-light-v3.0")

# The recorded outputs show 4096 dimensions for the v2 model.
embed_v2 = CohereEmbedding(model_name="embed-english-v2.0") 

[nltk_data] Downloading package punkt_tab to
[nltk_data]     c:\Users\maizz\Documents\Python Scripts\seerah-
[nltk_data]     llm\seerahllm_env\Lib\site-
[nltk_data]     packages\llama_index\core\_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
# Sample inputs of increasing length: a single character, one sentence, and a
# full paragraph. They are embedded below to compare vector sizes.
string = "A"

string_2 = "This is a complete sentence."

# Multi-line paragraph; the line breaks and trailing spaces are part of the text.
string_3 = """In the pursuit of a life well-lived, one must recognize the transient nature of the 
material world and the enduring value of virtue. The Sikh Gurus taught us that the Divine Light 
resides within all, and thus, we are united in our essence beyond the superficial distinctions of 
caste, creed, or status. Similarly, the Stoics emphasized the cultivation of inner virtues such as courage, 
temperance, and wisdom, understanding that true freedom lies in mastery over one's own perceptions and actions. 
As we navigate the vicissitudes of life, let us remember that our choices are our own, and in choosing virtue, 
we align ourselves with the cosmic order and the teachings of the Gurus. It is through selfless service, 
compassion, and the relentless pursuit of truth that we may attain a state of inner peace and contribute 
to the harmony of the world, embodying the principles of both Sikhism and Stoicism in our daily lives
"""

In [5]:
# Embed the one-character sample with the v3 model.
example_embedding = embed_v3.get_text_embedding(string)

In [6]:
# Number of entries in the embedding (1024 in the recorded output).
len(example_embedding)

1024

In [7]:
def get_embedding_dimensions(embed_model, list_of_strings):
    """Return the length of the embedding produced for each input string.

    Parameters:
    - embed_model: Any object exposing ``get_text_embedding_batch(texts)``,
      which is called once with the whole list.
    - list_of_strings: The texts to embed.

    Returns:
    - A list with ``len(embedding)`` for each embedding returned by the
      model, in the order the model returned them.
    """
    embeddings = embed_model.get_text_embedding_batch(list_of_strings)
    return [len(embedding) for embedding in embeddings]

In [8]:
# Same vector size for all three inputs, whatever their length (recorded output: 1024 each).
get_embedding_dimensions(embed_v3, [string, string_2, string_3])

[1024, 1024, 1024]

In [9]:
# The light model yields shorter vectors (recorded output: 384 each).
get_embedding_dimensions(embed_v3_light, [string, string_2, string_3])

[384, 384, 384]

In [10]:
# The v2 model yields the longest vectors of the three (recorded output: 4096 each).
get_embedding_dimensions(embed_v2, [string, string_2, string_3])

[4096, 4096, 4096]

In [11]:
# Cosine similarity between an inline paragraph and the short sentence in
# string_2, both embedded with the v3 model (recorded output: about 0.19).
embed_v3.similarity(
    embed_v3.get_text_embedding("""In embracing both the wisdom of the Sikh Gurus and the Stoic philosophers, 
                              we find a path to tranquility by accepting what is beyond our control and focusing 
                              our efforts on living virtuously and with purpose."""), 
    embed_v3.get_text_embedding(string_2),
    mode="cosine"
    )

0.18940321498701687

In [31]:
import requests

def load_text_from_url(url: str, timeout: float = 30.0) -> str:
    """
    Fetches and returns the text content from the specified URL.

    Parameters:
    - url: The URL of the text file to fetch.
    - timeout: Seconds to wait for the server before giving up. Without a
      timeout, requests can block indefinitely on an unresponsive host.

    Returns:
    - The text content of the file if the request is successful; otherwise, an error message.

    Note:
    - Failures (including timeouts) are reported in-band as a string starting
      with "Failed to load content from", not raised. Callers that go on to
      process the result should check for that prefix.
    """
    try:
        # Bound the wait so a stalled connection cannot hang the notebook kernel.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # This will raise an HTTPError if the response was an error
        return response.text
    except requests.RequestException as e:
        return f"Failed to load content from {url}. Error: {e}"

# Plain-text file hosted on gutenberg.org, used as the sample corpus below.
url = "https://www.gutenberg.org/files/10763/10763.txt"

# NOTE: on a failed download this holds an error message, not the book text.
text_content = load_text_from_url(url)

In [32]:
from llama_index.core import Document, VectorStoreIndex

# Wrap the entire downloaded text in a single Document.
full_document = Document(text=text_content)

# A 10,000-character slice (characters 50000 to 59999), used for the indexing examples below.
partial_document = Document(text=text_content[50000:60000])

In [33]:
# Build a vector index straight from the document, leaving the chunking to
# from_documents and embedding the chunks with the Cohere v3 model.
index = VectorStoreIndex.from_documents(
    [partial_document],  # must be a list, even for a single document
    embed_model=embed_v3,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/3 [00:00<?, ?it/s]

In [34]:
from llama_index.core.node_parser import SentenceSplitter

# Configure the node parser explicitly instead of relying on the defaults
# that from_documents applies.
splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=16,
    paragraph_separator="\n\n\n\n",
)

# The node parser also takes a list of documents.
nodes = splitter.get_nodes_from_documents([partial_document])

# Build the index directly from the pre-split nodes.
index_from_nodes = VectorStoreIndex(
    nodes,
    embed_model=embed_v3,
    show_progress=True,
)

Generating embeddings:   0%|          | 0/6 [00:00<?, ?it/s]