In [9]:
from langchain_community.document_loaders import DirectoryLoader, PDFMinerLoader, PyPDFLoader, PyMuPDFLoader, UnstructuredMarkdownLoader
from langchain_objectbox.vectorstores import ObjectBox

from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from glob import glob

In [10]:
# Embedding model shared by the notebook; generate_data_store() hands this object
# to ObjectBox.from_documents together with embedding_dimensions=1024.
# NOTE(review): assumes "mxbai-embed-large" produces 1024-dimensional vectors and
# that the model is available to the Ollama backend — confirm.
embeddings = OllamaEmbeddings(model="mxbai-embed-large")

In [11]:
def generate_data_store():
    """Load all source documents, chunk them, and index the chunks in ObjectBox.

    PDF documents are gathered first and the Markdown documents are appended
    after them; the combined list is chunked by ``split_text`` and passed to
    ``ObjectBox.from_documents`` with the module-level ``embeddings`` object and
    ``embedding_dimensions=1024``. The created store is not returned.
    """
    source_docs = load_pdfs_as_docs()
    source_docs.extend(load_mds_as_docs())
    doc_chunks = split_text(source_docs)
    ObjectBox.from_documents(doc_chunks, embeddings, embedding_dimensions=1024)

def load_pdfs_as_docs(pattern: str = './**/*.pdf'):
    """Load every PDF matched by ``pattern`` into one flat list of documents.

    Args:
        pattern: Glob pattern, expanded with ``recursive=True`` so ``**`` spans
            nested folders. The default matches every ``*.pdf`` at or below the
            current working directory.

    Returns:
        A list containing whatever ``PyMuPDFLoader(path).load()`` yields for
        each matched file, concatenated in sorted path order. The list is
        empty when nothing matches.
    """
    # NOTE(review): an earlier comment here said only files with 'instr.pdf' in
    # the name were wanted, but the pattern has always matched all PDFs. Pass a
    # narrower pattern if that filter is actually intended.
    # glob() yields paths in arbitrary, filesystem-dependent order; sorting keeps
    # the document order the same from run to run.
    pdf_paths = sorted(glob(pattern, recursive=True))
    all_docs = []
    for pdf_path in pdf_paths:
        all_docs.extend(PyMuPDFLoader(pdf_path).load())
    return all_docs

def load_mds_as_docs(
    pattern: str = '/workspaces/MigraChat/model_playground/resources/Federal/Legislation/markdown_pages/*.md',
):
    """Load every Markdown file matched by ``pattern`` into one flat list of documents.

    Args:
        pattern: Glob pattern (non-recursive) selecting the Markdown files. The
            default is the absolute path this notebook was written against;
            pass a different pattern when running from another checkout.

    Returns:
        A list containing whatever ``UnstructuredMarkdownLoader(path).load()``
        yields for each matched file, concatenated in sorted path order. The
        list is empty when nothing matches.
    """
    # glob() yields paths in arbitrary, filesystem-dependent order; sorting keeps
    # the document order the same from run to run.
    md_paths = sorted(glob(pattern))
    all_docs = []
    for md_path in md_paths:
        all_docs.extend(UnstructuredMarkdownLoader(md_path).load())
    return all_docs

def split_text(documents: list):
    """Split ``documents`` into overlapping chunks and return the chunks.

    A ``RecursiveCharacterTextSplitter`` is configured with ``chunk_size=500``
    and ``chunk_overlap=200``, both measured with ``len``, and with
    ``add_start_index=True``. The return value is whatever the splitter's
    ``split_documents`` produces for ``documents``.
    """
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 200,
        "length_function": len,
        "add_start_index": True,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

In [12]:
# Build the vector store: loads the PDF and Markdown documents, splits them into
# chunks, and passes the chunks to ObjectBox.from_documents.
generate_data_store()

