In [1]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from qdrant_client import QdrantClient
import tqdm
from pathlib import Path
from fastembed.embedding import FlagEmbedding
from pydantic import BaseModel, Field
from typing import Union, List
import abc

In [2]:
# Create the parent folder for local Qdrant storage. Pure-Python replacement for
# `!mkdir -p`, so the cell does not depend on a POSIX shell being available.
Path("~/.cache/qdrant").expanduser().mkdir(parents=True, exist_ok=True)

## Configure indexing

In [3]:
# Name of the Qdrant collection; it is also the last component of the storage path below.
collection_name = "org"

# Local storage location handed to the Qdrant client.
qdrant_path = Path(f"~/.cache/qdrant/{collection_name}").expanduser()

# Directory to index and the glob pattern selecting files inside it.
# NOTE(review): `dir` shadows the builtin of the same name; it is kept because
# a later cell reads it.
dir = Path("~/Projects/org").expanduser()
glob_pattern = "**/*.org"

In [4]:
class DirectoryConfig(BaseModel):
    """Describes what to index: a root directory plus a glob pattern."""

    # Root directory to read from; either a plain string or a ``Path``.
    dir: Union[str, Path]
    # Glob pattern passed to the document loader to select files.
    glob_pattern: str


class DirectoryDocumentLoader(abc.ABC):
    """Interface for objects that turn a directory into indexable documents."""

    @abc.abstractmethod
    def load_docs(self, dir_config: DirectoryConfig) -> List[dict]:
        """Load the documents described by ``dir_config``.

        Implementations return a list of dicts; the indexer in this notebook
        reads the ``"document"`` and ``"metadata"`` keys of each entry.
        """


class LangchainDirectoryDocumentLoader(BaseModel, DirectoryDocumentLoader):
    """Directory loader built on LangChain.

    Files are read with ``DirectoryLoader`` and split with
    ``RecursiveCharacterTextSplitter``; each resulting chunk is returned as a
    ``{"document": ..., "metadata": ...}`` dict.
    """

    # Splitter settings; lengths are measured with ``len``.
    chunk_size: int = Field(default=512)
    chunk_overlap: int = Field(default=128)
    # Forwarded to ``DirectoryLoader(show_progress=...)``.
    show_progress: bool = Field(default=True)

    def load_raw_docs(self, dir, glob_pattern):
        """Load every file under ``dir`` that matches ``glob_pattern``, unchunked."""
        return DirectoryLoader(
            dir,
            glob=glob_pattern,
            show_progress=self.show_progress,
        ).load()

    def chunk_docs(self, docs):
        """Split ``docs`` into chunks and convert each chunk to a plain dict."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        pieces = splitter.split_documents(docs)
        return list(map(self._langchain_doc_to_dict_doc, pieces))

    @classmethod
    def _langchain_doc_to_dict_doc(cls, doc):
        """Convert a LangChain document into a ``document``/``metadata`` dict."""
        return {"document": doc.page_content, "metadata": doc.metadata}

    def load_docs(self, dir_config: DirectoryConfig):
        """Load the files described by ``dir_config`` and return them chunked."""
        return self.chunk_docs(
            self.load_raw_docs(dir_config.dir, dir_config.glob_pattern)
        )

In [5]:
class QdrantIndexer(BaseModel):
    """Loads documents through a ``DirectoryDocumentLoader`` and adds them to Qdrant.

    Attributes:
        document_loader: Produces the list of ``{"document", "metadata"}`` dicts.
        qdrant_client: Client whose ``add`` method receives the documents.
        collection_name: Target collection.
        batch_size: Forwarded to ``qdrant_client.add`` (default 64).
        parallel: Forwarded to ``qdrant_client.add`` (default 6).
    """

    document_loader: DirectoryDocumentLoader
    qdrant_client: QdrantClient
    collection_name: str
    # Upload tuning knobs; the defaults are the values that used to be hard-coded.
    batch_size: int = Field(default=64)
    parallel: int = Field(default=6)

    # Declared once (it was previously duplicated). Needed because
    # ``QdrantClient`` and the loader ABC are not pydantic types.
    class Config:
        arbitrary_types_allowed = True

    def index_docs_from_directory(self, dir_config: DirectoryConfig):
        """Load everything described by ``dir_config`` and index it.

        Args:
            dir_config: Directory and glob pattern handed to the document loader.
        """
        docs = self.document_loader.load_docs(dir_config)
        self.index_docs(
            self.qdrant_client,
            docs,
            self.collection_name,
            batch_size=self.batch_size,
            parallel=self.parallel,
        )

    @classmethod
    def index_docs(
        cls,
        qdrant_client,
        docs: List[dict],
        collection_name: str,
        batch_size: int = 64,
        parallel: int = 6,
    ):
        """Add ``docs`` to ``collection_name`` via ``qdrant_client.add``.

        Args:
            qdrant_client: Client exposing ``add(...)``.
            docs: Dicts with a ``"document"`` text and a ``"metadata"`` value.
            collection_name: Target collection.
            batch_size: Passed through to ``add``.
            parallel: Passed through to ``add``.
        """
        doc_contents = [doc["document"] for doc in docs]
        metadatas = [doc["metadata"] for doc in docs]
        qdrant_client.add(
            collection_name=collection_name,
            documents=doc_contents,
            metadata=metadatas,
            # Ids are list positions 0..n-1.
            # NOTE(review): presumably a re-run overwrites points with the same
            # id, and points from a longer earlier run remain — confirm.
            ids=range(len(docs)),
            batch_size=batch_size,
            parallel=parallel,
        )

In [6]:
# What to index: the directory and glob pattern from the configuration cell.
dir_config = DirectoryConfig(dir=dir, glob_pattern=glob_pattern)

# How to read it: LangChain loader with its default chunking settings.
loader = LangchainDirectoryDocumentLoader()

# Where to store it: a Qdrant client pointed at the configured storage path.
client = QdrantClient(path=qdrant_path)

indexer = QdrantIndexer(
    document_loader=loader,
    qdrant_client=client,
    collection_name=collection_name,
)

## Index your directory

In [10]:
%%time
# Load and chunk the files described by dir_config, then add the chunks to the
# Qdrant collection. The recorded run below took about 3.5 minutes of wall time.
indexer.index_docs_from_directory(dir_config=dir_config)


  d+1}} | <v,x>_L = 0}
                     ^
  unexpected '}'
  expecting "\\bangle", "\\brace", "\\brack", "\\choose", "\\displaystyle", "{", letter, digit, ".", "\\mbox", "\\text", "\\textbf", "\\textit", "\\textrm", "\\textsf", "\\texttt", "\\bm", "\\boldsymbol", "\\mathbb", "\\mathbf", "\\mathbfcal", "\\mathbffrak", "\\mathbfit", "\\mathbfscr", "\\mathbfsfit", "\\mathbfsfup", "\\mathbfup", "\\mathbold", "\\mathcal", "\\mathds", "\\mathfrak", "\\mathit", "\\mathrm", "\\mathscr", "\\mathsf", "\\mathsfit", "\\mathsfup", "\\mathtt", "\\mathup", "\\pmb", "\\symbf", "\\texttt", "\\sqrt", "\\surd", "\\mspace", "\\hspace", "\\mathop", "\\mathrel", "\\mathbin", "\\mathord", "\\mathopen", "\\mathclose", "\\mathpunct", "\\phantom", "\\boxed", "\\overset", "\\stackrel", "\\underset", "\\frac", "\\tfrac", "\\dfrac", "\\binom", "\\genfrac", "\\substack", "_", "^", "\\begin", "\\ensuremath", "\\bigg", "\\Bigg", "\\big", "\\Big", "\\biggr", "\\Biggr", "\\bigr", "\\Bigr", "\\biggl", "\\Biggl", "\

CPU times: user 23.1 s, sys: 2.49 s, total: 25.6 s
Wall time: 3min 30s


## Check whether querying works

In [11]:
# Smoke test: run one text query against the collection that was just indexed.
# The collection name comes from the indexer rather than a repeated "org"
# literal, so this check follows any change made in the configuration cell.
search_result = indexer.qdrant_client.query(
    collection_name=indexer.collection_name,
    query_text="How to index documents with llamaindex?",
)
print(search_result)

[QueryResponse(id=1954, embedding=None, metadata={'document': '"""\n\nindex knows searchable documents but does not need to know actual documents\n\n"""\n\ndef index_documents(index: Index[RetrievalContextType], documents: Collection[RetrievalContextType, Document[RetrievalContextType]]):\n\npass\n\nQA DSL\n\n???', 'source': '/home/kuba/Projects/org/roam/20230523164215-qa_dsl.org'}, document='"""\n\nindex knows searchable documents but does not need to know actual documents\n\n"""\n\ndef index_documents(index: Index[RetrievalContextType], documents: Collection[RetrievalContextType, Document[RetrievalContextType]]):\n\npass\n\nQA DSL\n\n???', score=0.8762011488301409), QueryResponse(id=3439, embedding=None, metadata={'document': 'llama index response synthesis', 'source': '/home/kuba/Projects/org/roam/20230319113823-llm_response_synthesis.org'}, document='llama index response synthesis', score=0.8728227293344956), QueryResponse(id=3374, embedding=None, metadata={'document': 'task="text-