In [1]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from qdrant_client import QdrantClient
import tqdm
from pathlib import Path
from fastembed.embedding import FlagEmbedding

In [2]:
def load_docs(dir, glob_pattern="**/*.py"):
    loader = DirectoryLoader(dir, glob=glob_pattern, show_progress=True)
    docs = loader.load()
    return docs

def prepare_docs(docs, chunk_size=512, chunk_overlap=128):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap  = chunk_overlap,
        length_function = len,
        is_separator_regex = False,
    )
    return text_splitter.split_documents(docs)


In [3]:
raw_docs = load_docs(Path("~/Projects/forks/dspy").expanduser())

  0%|▌                                                                                                                                                                        | 95/27511 [00:04<21:48, 20.96it/s]


In [4]:
lc_docs = prepare_docs(tqdm.tqdm(raw_docs))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 95/95 [00:00<00:00, 1481259.78it/s]


In [5]:
embedding_model = FlagEmbedding(model_name="BAAI/bge-small-en", max_length=512) 

In [6]:
docs = lc_docs
doc_contents = [doc.page_content for doc in docs]

In [8]:
%%time
embeddings = list(embedding_model.embed(doc_contents, batch_size=64, parallel=12))

CPU times: user 16.8 ms, sys: 9.92 ms, total: 26.7 ms
Wall time: 19.3 s


In [23]:
client = QdrantClient(path="/tmp/dspy_qdrant")

In [24]:
def index_langchain_docs(qdrant_client, docs, collection_name="dspy"):
    doc_contents = [doc.page_content for doc in docs]
    metadatas = [doc.metadata for doc in docs]
    qdrant_client.add(
        collection_name=collection_name,
        documents=doc_contents,
        metadata=metadatas,
        ids=range(len(docs)),
        batch_size=64,
        parallel=6
    )

In [25]:
%%time
index_langchain_docs(client, lc_docs)

CPU times: user 112 ms, sys: 172 ms, total: 284 ms
Wall time: 21.9 s


In [26]:
search_result = client.query(
    collection_name="dspy",
    query_text="How to index documents with llamaindex?"
)
print(search_result)

[QueryResponse(id=707, embedding=None, metadata={'document': "topk = [] for rank, hit in enumerate(hits, start=1): if self.dataset is not None: row = self.dataset_id_to_index[hit.docid] text = ' '.join(self.dataset[field][row] for field in self.text_fields) pid = self.dataset[self.id_field][row] else: # Pyserini prebuilt faiss indexes can perform docid lookup psg = json.loads(self.searcher.doc(hit.docid).raw()) text = ' '.join(psg[field] for field in self.text_fields) pid = psg[self.id_field]\n\ntopk.append({\n\n'text': text,\n\n'long_text': text,\n\n'pid': pid,", 'source': '/home/kuba/Projects/forks/dspy/dsp/modules/pyserini.py'}, document="topk = [] for rank, hit in enumerate(hits, start=1): if self.dataset is not None: row = self.dataset_id_to_index[hit.docid] text = ' '.join(self.dataset[field][row] for field in self.text_fields) pid = self.dataset[self.id_field][row] else: # Pyserini prebuilt faiss indexes can perform docid lookup psg = json.loads(self.searcher.doc(hit.docid).raw(

In [21]:
search_result[0].metadata["source"]

'/home/kuba/Projects/forks/dspy/dsp/modules/pyserini.py'

In [22]:
snapshot_info = client.create_snapshot(collection_name="dspy")


NotImplementedError: Snapshots are not supported in the local Qdrant. Please use server Qdrant if you need full snapshots.

In [None]:
search_result[0].metadata["document"]