In [23]:
import os
import uuid
from sentence_transformers import SentenceTransformer
from qdrant_client.models import PointStruct
from qdrant_client import QdrantClient
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Root directory of the Markdown knowledge base.
# Defaults to the original location; set the KNOWLEDGE_REPO_DIR environment
# variable to point the notebook at another checkout without editing this cell.
root_directory = os.environ.get(
    "KNOWLEDGE_REPO_DIR", "/Users/hungcq/projects/knowledge-repo"
)

# Initialize Langchain components: the splitter used to chunk every loaded file
# (chunk_size=400, chunk_overlap=20).
splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)

def load_docs_from_directory(directory):
    """Recursively load and chunk every Markdown file under ``directory``.

    Each ``.md`` file is read with LangChain's ``TextLoader`` and split with the
    module-level ``splitter``. Every chunk's text is prefixed with
    ``"search_document: "`` before being collected.

    Args:
        directory: Root folder to walk.

    Returns:
        A ``(docs, paths)`` tuple of equal-length lists: ``docs[i]`` is the
        prefixed chunk text and ``paths[i]`` is the path of the file the chunk
        came from, relative to ``directory``.
    """
    docs = []
    paths = []

    for subdir, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".md"):  # You can change this to the file types you need
                continue

            file_path = os.path.join(subdir, file)
            # Relative to the argument (not the global root_directory) so the
            # function returns correct paths for any directory it is given.
            relative_path = os.path.relpath(file_path, directory)

            # Read as UTF-8 explicitly instead of relying on the platform's
            # default encoding.
            loader = TextLoader(file_path, encoding="utf-8")

            for chunk in splitter.split_documents(loader.load()):
                docs.append("search_document: " + chunk.page_content)
                paths.append(relative_path)

    return docs, paths

# Load and chunk every Markdown file under root_directory.
# docs and paths are parallel lists with one entry per chunk: the prefixed
# chunk text and the relative path of the file it came from.
docs, paths = load_docs_from_directory(root_directory)

In [14]:
# Initialize the SentenceTransformer model used for both document and query embeddings.
# NOTE(review): trust_remote_code=True presumably lets the library run code
# shipped with the model repository — confirm this is acceptable for this
# environment.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

<All keys matched successfully>


In [24]:
# Encode documents into embeddings using the SentenceTransformer model
doc_embeddings = model.encode(docs)

# Prepare data for upsert into Qdrant.
# IDs are derived deterministically (UUIDv5) from the file path and chunk text,
# so re-running the notebook overwrites the same points instead of inserting a
# fresh duplicate of every chunk under a new random ID.
# Note: two byte-identical chunks from the same file map to the same ID and are
# therefore stored as a single point.
points = [
    PointStruct(
        id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{path}\n{doc}")),
        vector=embedding.tolist(),  # Convert numpy array to list if necessary
        payload={"file_path": path, "doc": doc},
    )
    for embedding, path, doc in zip(doc_embeddings, paths, docs)
]

# Embedding dimensionality of the first encoded chunk
print(len(doc_embeddings[0]))

768


In [28]:
# Initialize the Qdrant client against the server URL below
client = QdrantClient(url="http://localhost:6333")
# Name of the collection the points are written to and queried from.
# NOTE(review): no cell visible here creates this collection; the upsert
# presumably relies on it already existing with a matching vector size — confirm.
collection = "knowledge_repo_768"
# Number of points sent per upsert request
batch_size = 500
def batch_points(points, batch_size):
    """Yield consecutive slices of ``points`` holding at most ``batch_size`` items.

    Args:
        points: A sequence supporting ``len()`` and slicing (e.g. a list).
        batch_size: Maximum number of items per batch; must be positive.

    Yields:
        Slices of ``points`` in order; the last one may be shorter.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop every point;
    # fail loudly instead (zero already raised ValueError from range()).
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    for start in range(0, len(points), batch_size):
        yield points[start:start + batch_size]

# Upsert points in batches.
# operation_info is initialised up front so the final print does not raise
# NameError when there are no points (the loop body never runs in that case).
operation_info = None
for batch in batch_points(points, batch_size):
    operation_info = client.upsert(
        collection_name=collection,
        wait=True,
        points=batch,  # Upserting a batch of points
    )
    print(f"Batch of {len(batch)} points upserted successfully.")

# Print the result of the last upsert (None if nothing was upserted)
print(operation_info)


Batch of 500 points upserted successfully.
Batch of 500 points upserted successfully.
Batch of 500 points upserted successfully.
Batch of 500 points upserted successfully.
Batch of 500 points upserted successfully.
Batch of 500 points upserted successfully.
Batch of 500 points upserted successfully.
Batch of 481 points upserted successfully.
operation_id=9 status=<UpdateStatus.COMPLETED: 'completed'>


In [29]:
# Embed the search query. The text carries a "search_query: " prefix, the
# counterpart of the "search_document: " prefix added to every indexed chunk.
# model.encode is given a one-element list, so the query vector is
# query_embeddings[0].
query = 'search_query: how can I deprecate probably?'
query_embeddings = model.encode([query])

In [30]:
# Query the collection with the query embedding, asking for up to 10 points
# and their payloads (each payload holds "file_path" and "doc").
search_result = client.query_points(
    collection_name=collection,
    query=query_embeddings[0],
    with_payload=True,
    limit=10
).points

# Dump the returned points as plain text
print(search_result)

