In [1]:
from chonkie import SemanticChunker, SemanticChunk
from sentence_transformers import SentenceTransformer
import os
from qdrant_client import QdrantClient, models
import numpy
from typing import List, Dict, Any
import uuid
from dotenv import load_dotenv
print("Import Successful")

Import Successful


In [2]:
load_dotenv()

True

In [3]:
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"  # Or your preferred model

def process_document_chunks(doc_text: str, doc_id: str, model_name: str = "all-MiniLM-L6-v2") -> List[SemanticChunk]:
    """
    Process a document into SemanticChunks using a SemanticChunker.
    Returns a list of SemanticChunk objects with embeddings.
    """
    encoder = SentenceTransformer(model_name)
    
    semantic_chunker = SemanticChunker(
        embedding_model=embedding_model
    )
    
    chunks = semantic_chunker.chunk(doc_text)

    
    print(f"Document split into {len(chunks)} semantic chunks")
    return chunks

def semantic_chunk_to_qdrant_point(chunk: SemanticChunk, doc_id: str, chunk_id: int, 
                                  use_sentence_vectors: bool = False) -> models.PointStruct:
    """
    Convert a SemanticChunk to a Qdrant PointStruct.
    
    Args:
        chunk: The SemanticChunk to convert
        doc_id: Document identifier
        chunk_id: Chunk identifier within the document
        use_sentence_vectors: If True, use individual sentence embeddings as named vectors
                             If False, use the chunk's embedding as a single vector
    """

    point_id = str(uuid.uuid4())
    
    payload = {
        "doc_id": doc_id,
        "chunk_id": chunk_id,
        "chunk_text": chunk.text,
        "sentence_texts": [s.text for s in chunk.sentences],
        "sentence_start_indices": [s.start_index for s in chunk.sentences],
        "sentence_end_indices": [s.end_index for s in chunk.sentences],
    }
    
    if use_sentence_vectors:
        named_vectors = {
            f"sentence_{i}": sentence.embedding.tolist()
            for i, sentence in enumerate(chunk.sentences)
            if sentence.embedding is not None
        }
        
        return models.PointStruct(
            id=point_id,
            vector=named_vectors,  # Named vectors approach
            payload=payload
        )
    else:

        chunk_embedding = None
        if hasattr(chunk, 'embedding') and chunk.embedding is not None:
            chunk_embedding = chunk.embedding.tolist()
        elif chunk.sentences and chunk.sentences[0].embedding is not None:
            chunk_embedding = chunk.sentences[0].embedding.tolist()
        else:
            # Fallback - should rarely happen if using proper SemanticChunker
            raise ValueError("No embedding found for chunk or its sentences")
            
        return models.PointStruct(
            id=point_id,
            vector=chunk_embedding,  # Single vector approach
            payload=payload
        )

def create_standard_collection(client: QdrantClient, collection_name: str, vector_size: int = 384):
    """
    Create a standard Qdrant collection with single vectors per point.
    """
    collections = client.get_collections().collections
    collection_names = [collection.name for collection in collections]
    
    if collection_name not in collection_names:
        client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
        print(f"Standard collection '{collection_name}' created.")
    else:
        print(f"Collection '{collection_name}' already exists.")

def create_named_vectors_collection(client: QdrantClient, collection_name: str, max_sentences: int = 20):
    """
    Create a Qdrant collection that supports named vectors for sentence-level embeddings.
    """
    collections = client.get_collections().collections
    collection_names = [collection.name for collection in collections]
    
    if collection_name not in collection_names:
        client.create_collection(
            collection_name=collection_name,
            vectors_config={
                **{f"sentence_{i}": models.VectorParams(size=384, distance=models.Distance.COSINE) 
                   for i in range(max_sentences)},  # Support up to max_sentences per chunk
            }
        )
        print(f"Named vectors collection '{collection_name}' created with support for {max_sentences} sentences per chunk.")
    else:
        print(f"Collection '{collection_name}' already exists.")

def upsert_chunks_to_qdrant(client: QdrantClient, collection_name: str, 
                           chunks: List[SemanticChunk], doc_id: str, use_sentence_vectors: bool = False):
    """
    Upload chunks to Qdrant, with option to use sentence-level vectors.
    
    Args:
        client: QdrantClient instance
        collection_name: Name of the collection to upload to
        chunks: List of SemanticChunk objects
        doc_id: Document identifier
        use_sentence_vectors: If True, use individual sentence embeddings as named vectors
    """
    points = []
    for i, chunk in enumerate(chunks):
        point = semantic_chunk_to_qdrant_point(chunk, doc_id, i, use_sentence_vectors)
        points.append(point)
    
    client.upsert(
        collection_name=collection_name,
        points=points
    )
    
    vector_type = "sentence-level named vectors" if use_sentence_vectors else "chunk-level vectors"
    print(f"Uploaded {len(points)} chunks to collection '{collection_name}' using {vector_type}")

def setup_qdrant_client():
    """Initialize the Qdrant client with credentials from .env file"""
    return QdrantClient(
        url=os.environ.get("QDRANT_URL"), 
        api_key=os.environ.get("QDRANT_API_KEY"),
    )

def load_plain_text(file_path: str) -> str:
    """Utility: read a plain‐text document"""
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

In [4]:
if __name__ == "__main__":
    # Define the path to the "data" folder in the same directory as the script
    input_folder = os.path.join(os.getcwd(), "data")
    
    # Check if the data folder exists
    if not os.path.exists(input_folder):
        print(f"Error: The folder '{input_folder}' does not exist.")
        exit(1)
        
    all_chunks = []
    for fname in os.listdir(input_folder):
        if not fname.endswith(".txt"):
            continue
        path = os.path.join(input_folder, fname)
        doc_text = load_plain_text(path)
        doc_id = os.path.splitext(fname)[0]
        print(f"Chunking '{doc_id}' ...")
        semantic_chunks = process_document_chunks(doc_text, doc_id, model_name=embedding_model)
        all_chunks.extend(semantic_chunks)
    print(f"Total chunks created: {len(all_chunks)}")

Chunking 'ai in games' ...
Document split into 32 semantic chunks
Chunking 'CV' ...
Document split into 9 semantic chunks
Chunking 'marriott international' ...
Document split into 19 semantic chunks
Total chunks created: 60


In [5]:
qdrant_client = setup_qdrant_client()

In [6]:
if __name__ == "__main__":
    # Initialize the Qdrant client
    
    # OPTION 1: Use standard collection with one vector per chunk
    standard_collection = "ragchatbot_standard"
    create_standard_collection(qdrant_client, standard_collection)
    upsert_chunks_to_qdrant(qdrant_client, standard_collection, all_chunks, doc_id, use_sentence_vectors=False)
    
    # OPTION 2: Use named vectors collection with sentence-level vectors
    named_vectors_collection = "ragchatbot_named_vectors"
    create_named_vectors_collection(qdrant_client, named_vectors_collection)
    upsert_chunks_to_qdrant(qdrant_client, named_vectors_collection, all_chunks, doc_id, use_sentence_vectors=True)

Standard collection 'ragchatbot_standard' created.
Uploaded 60 chunks to collection 'ragchatbot_standard' using chunk-level vectors
Named vectors collection 'ragchatbot_named_vectors' created with support for 20 sentences per chunk.
Uploaded 60 chunks to collection 'ragchatbot_named_vectors' using sentence-level named vectors
