In [26]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
import shutil
import chromadb

In [27]:
# Hugging Face identifier of the embedding model.
# NOTE(review): no code in the cells shown here reads this constant — confirm
# whether the index is meant to be built with this model.
HF_EMBEDDING_MODEL_PATH = "Salesforce/SFR-Embedding-Mistral"

In [28]:
# Directory scanned for the markdown files to index.
SOURCE_PATH = "./markdown_pages"
# Directory where the Chroma vector store is persisted.
CHROMADB_FILEPATH = "./chroma"

In [29]:
def generate_data_store():
    """Run the full ingestion pipeline: load, split, then persist to Chroma."""
    save_to_chroma(split_text(load_documents()))


def load_documents():
    """Load every ``*.md`` file matched under SOURCE_PATH and return the documents."""
    return DirectoryLoader(SOURCE_PATH, glob="*.md").load()


def split_text(documents: list[Document]):
    """Split documents into overlapping chunks and print a short summary.

    The splitter is configured with ``chunk_size=300`` and ``chunk_overlap=100``,
    measures length with ``len``, and is created with ``add_start_index=True``.

    Args:
        documents: Documents to split.

    Returns:
        The list of chunks produced by the splitter (possibly empty).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Preview one chunk as a sanity check. The index is clamped so that small
    # corpora (fewer than 11 chunks) no longer raise IndexError, and the preview
    # is skipped entirely when nothing was produced.
    if chunks:
        document = chunks[min(10, len(chunks) - 1)]
        print(document.page_content)
        print(document.metadata)

    return chunks


def save_to_chroma(chunks: list[Document], embedding_model_name=None):
    """Rebuild the Chroma store at CHROMADB_FILEPATH from ``chunks``.

    Any existing directory at CHROMADB_FILEPATH is removed before the new store
    is written, so the store only reflects the chunks passed to this call.

    Args:
        chunks: Documents to embed and index.
        embedding_model_name: Optional model name forwarded as
            ``HuggingFaceEmbeddings(model_name=...)``. When left as ``None`` the
            embeddings object is created with the library defaults.
    """
    # NOTE(review): HF_EMBEDDING_MODEL_PATH is declared in this notebook but is
    # not applied here by default; pass it as ``embedding_model_name`` if the
    # index should be built with that model — confirm the intended model.
    #
    # Build the embeddings *before* deleting anything: if the model cannot be
    # loaded, the previously persisted store is left intact.
    if embedding_model_name is None:
        embeddings = HuggingFaceEmbeddings()
    else:
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

    # Clear out the database first.
    if os.path.exists(CHROMADB_FILEPATH):
        shutil.rmtree(CHROMADB_FILEPATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, embeddings, persist_directory=CHROMADB_FILEPATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMADB_FILEPATH}.")

In [30]:
# Splitter for the persisted index: chunk_size 1000 with an overlap of 500,
# lengths measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=500, length_function=len, add_start_index=True
)

# Load the markdown sources, split them, and rebuild the Chroma store.
chunks = text_splitter.split_documents(load_documents())
save_to_chroma(chunks)

  from .autonotebook import tqdm as notebook_tqdm


Saved 362 chunks to ./chroma.
