In [5]:
from qdrant_client import QdrantClient, models
from tqdm import tqdm

# Count the total number of characters stored in the 'content' payload field
# across every point of a Qdrant collection, scrolling through it in batches.

# Initialize the client
client = QdrantClient(url='http://localhost:6333')  # Replace with your Qdrant URL if different

collection_name = 'nederland_cohere'  # Replace with your collection name


def _content_length(content):
    """Return the number of characters held by one 'content' payload value.

    Strings are measured directly; lists are measured item by item, each item
    converted with ``str``; ``None`` contributes zero; any other value is
    measured by the length of its ``str`` form.
    """
    if content is None:
        # A null field holds no text; str(None) would wrongly add 4 characters.
        return 0
    if isinstance(content, str):
        return len(content)
    if isinstance(content, list):
        return sum(len(str(item)) for item in content)
    return len(str(content))


# Get total count first; it is only used to size the progress bar.
# NOTE(review): the count reported by the collection info may be approximate —
# confirm against the Qdrant docs if an exact bar total matters.
collection_info = client.get_collection(collection_name=collection_name)
total_points = collection_info.points_count

# Scroll state: `offset` is the cursor returned by the previous page.
offset = None
total_chars = 0

# The context manager closes the bar even if a scroll request raises.
with tqdm(total=total_points, desc="Processing points") as pbar:
    while True:
        # Scroll through the collection to retrieve points in batches
        points, offset = client.scroll(
            collection_name=collection_name,
            limit=10000,                    # Adjust the limit based on your memory constraints
            with_payload=models.PayloadSelectorInclude(include=["content"]),  # Only return 'content' field
            with_vectors=False,
            offset=offset
        )

        # Break the loop if no more points are returned
        if not points:
            break

        # Accumulate the character count of every 'content' field in the batch
        for point in points:
            payload = point.payload
            if payload and 'content' in payload:
                total_chars += _content_length(payload['content'])

        # One update per batch avoids a progress-bar call for every point.
        pbar.update(len(points))

        # If there's no next page, exit the loop
        if offset is None:
            break

print(f"Total number of characters in 'content' fields: {total_chars}")

Processing points: 360000it [01:18, 4585.13it/s]00:00<?, ?it/s]
Processing points: 100%|██████████| 52677937/52677937 [1:09:49<00:00, 12575.10it/s]

Total number of characters in 'content' fields: 58268988426



