## Load the libraries

In [None]:
from loguru import logger
import os
from dotenv import load_dotenv
from database import get_engine, get_session, skkuMd
from sqlalchemy.exc import SQLAlchemyError
from llama_index.core import Document
from llama_index.core.node_parser import MarkdownNodeParser

## Create logger and load environment variables

In [None]:
# Remove the handlers currently registered on the loguru logger so that the
# file sink added below is the only destination for log records.
logger.remove()
# File sink; the "10 MB" rotation setting starts a new file at that size.
logger.add("logs/md-indexer.log", rotation="10 MB")
# Load variables from a .env file into the process environment before reading them.
load_dotenv()

# Azure OpenAI settings. os.getenv returns None for any variable that is unset.
aoi_api_key = os.getenv("AZURE_OPENAI_API_KEY")
aoi_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
aoi_emb_model = os.getenv("AZURE_EMBEDDING_MODEL")
# NOTE(review): this value is later passed as `api_version` to the embedding
# client, yet it is read from AZURE_GENERATION_MODEL_VERSION — confirm that the
# variable really holds the API version intended for embeddings.
aoi_version = os.getenv("AZURE_GENERATION_MODEL_VERSION")

# Postgres connection string, later handed to the Supabase vector store.
connection_string = os.getenv("DATABASE_URL")

## Create database connection engine

In [None]:
# Build the engine and session with the helpers imported from the project's
# `database` module. The module-level `session` is the one that
# get_markdown_list() queries.
engine = get_engine()
session = get_session(engine)

## Create a document store from the Markdown records in our database

In [None]:
def get_markdown_list():
    """
    Retrieve every record of the SkkuMd table as a list of dictionaries.

    Each dictionary carries the keys 'url', 'markdown', 'md_wrap_hash' and
    'generation_date', copied from the attributes of the same name on the
    record. The query runs on the module-level ``session``.

    Returns:
        The list of dictionaries, or an empty list when the query raises
        SQLAlchemyError (the error is logged together with its traceback).
    """
    try:
        # Query all records from the SkkuMd table
        records = session.query(skkuMd).all()

        # Build the list of dictionaries
        markdown_list = [
            {
                'url': record.url,
                'markdown': record.markdown,
                'md_wrap_hash': record.md_wrap_hash,
                'generation_date': record.generation_date,
            }
            for record in records
        ]

        logger.info(f"Successfully retrieved {len(markdown_list)} records from table skku_md.")
        return markdown_list

    except SQLAlchemyError:
        # loguru does not interpret the stdlib-style `exc_info=True` flag, so
        # the traceback was never written; logger.exception() logs at ERROR
        # level and attaches the active exception.
        logger.exception("An error occurred while querying the database.")
        return []

def create_document_store(markdown_list):
    """
    Build one Document per entry of markdown_list.

    For every entry, the 'markdown' value becomes the document's text and the
    'url' value becomes its id_. The documents are returned as a list in the
    same order as the input entries.
    """
    documents = [
        Document(text=entry['markdown'], id_=entry['url'])
        for entry in markdown_list
    ]
    logger.info(f"Successfully created a document store with {len(documents)} documents.")
    return documents

In [None]:
if __name__ == "__main__":
    # Load the Markdown rows from the database, then wrap each row in a Document.
    # NOTE(review): `document_store` is assigned only inside this guard, but
    # later top-level code reads it unconditionally — importing this file as a
    # module would therefore fail with a NameError.
    markdown_list = get_markdown_list()
    document_store = create_document_store(markdown_list)

## Let LlamaIndex chunk the documents into smaller pieces (nodes)

In [None]:
def parse_documents_to_nodes(document_store):
    """
    Split the given documents into nodes with a MarkdownNodeParser.

    Args:
        document_store: The documents handed to
            ``MarkdownNodeParser.get_nodes_from_documents``.

    Returns:
        The nodes produced by the parser, or an empty list when any exception
        is raised during parsing (the error is logged with its traceback).
    """
    try:
        parser = MarkdownNodeParser()
        nodes = parser.get_nodes_from_documents(document_store)
        logger.info(f"Successfully created {len(nodes)} nodes from the document store.")
        return nodes
    except Exception:
        # loguru does not interpret the stdlib-style `exc_info=True` flag, so
        # the traceback was never written; logger.exception() attaches it.
        logger.exception("An error occurred while parsing documents to nodes.")
        return []

# Chunk the documents into nodes; the result is an empty list if parsing failed.
# NOTE(review): `document_store` is only assigned under the
# `if __name__ == "__main__":` guard earlier in this file.
nodes = parse_documents_to_nodes(document_store)

## Use LlamaIndex and Azure OpenAI to calculate embeddings for each node and store them in a **Vector Store**

In [None]:
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import Settings

# Azure OpenAI embedding client, configured from the environment values read
# at the top of this file.
# NOTE(review): `api_version` comes from AZURE_GENERATION_MODEL_VERSION —
# confirm that this is the API version intended for the embedding deployment.
embed_model = AzureOpenAIEmbedding(
    model=aoi_emb_model,
    api_key=aoi_api_key,
    azure_endpoint=aoi_endpoint,
    api_version=aoi_version,
)
# Register the client on LlamaIndex's global Settings object.
Settings.embed_model = embed_model

In [None]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.supabase import SupabaseVectorStore

# Vector store for the "md_kingo" collection, reached through the Postgres
# connection string read from DATABASE_URL.
vector_store = SupabaseVectorStore(
    postgres_connection_string=connection_string,
    collection_name="md_kingo",
)
try:
    # Build the index over the parsed nodes, using the Supabase vector store
    # as the storage backend.
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex(nodes, storage_context=storage_context)
    logger.info("Successfully created VectorStoreIndex.")
except Exception:
    # loguru does not interpret the stdlib-style `exc_info=True` flag, so the
    # traceback was never written; logger.exception() attaches it.
    # Note that `index` stays undefined when this branch runs.
    logger.exception("An error occurred while creating VectorStoreIndex.")