In [13]:
from qdrant_client import QdrantClient, models
from tqdm import tqdm
import pandas as pd

# Initialize the client
client = QdrantClient(url='http://localhost:6333', timeout=60)  # Replace with your Qdrant URL if different

collection_name = 'nederland_2025_02_01_cohere'  # Replace with your collection name

# Get total count first
collection_info = client.get_collection(collection_name=collection_name)
total_points = collection_info.points_count
print(f"Total points in collection: {total_points}")

Total points in collection: 44623674


In [11]:
points, offset = client.scroll(
        collection_name=collection_name,
        limit=1,
        with_payload=models.PayloadSelectorInclude(include=["meta.page_number", "meta.page_count", "meta.source_id"]),
        with_vectors=False,
        offset=0
    )
points[0].payload
source_id = points[0].payload.get('meta', {}).get('source_id')
page_number = points[0].payload.get('meta', {}).get('page_number')
page_count = points[0].payload.get('meta', {}).get('page_count')

print(source_id, page_number, page_count)

b5fa6b2ec58043779dddecb5ddea0e791358e73c 199 277


In [15]:
# Initialize variables for scrolling through all points
offset = None
records = []

# Create progress bar with total count
pbar = tqdm(total=total_points, desc="Processing points")

while True:
    # Scroll through the collection to retrieve points in batches
    points, offset = client.scroll(
        collection_name=collection_name,
        limit=100000,
        with_payload=models.PayloadSelectorInclude(include=["meta.page_number", "meta.page_count", "meta.source_id"]),
        with_vectors=False,
        offset=offset
    )

    # Break the loop if no more points are returned
    if not points:
        break

    # Process each point in the batch
    for point in points:
        source_id = point.payload.get('meta', {}).get('source_id')
        page_number = point.payload.get('meta', {}).get('page_number')
        page_count = point.payload.get('meta', {}).get('page_count')
        
        # Skip if either value is None
        if source_id is None or page_number is None:
            continue
            
        records.append({
            'source_id': source_id,
            'page_number': page_number,
            'page_count': page_count
        })
    
    # Update progress bar
    pbar.update(len(points))
    
    # If there's no next page, exit the loop
    if offset is None:
        break

pbar.close()
print(f"\nTotal records collected: {len(records)}")

Processing points:   4%|▍         | 2000000/44623674 [01:26<30:36, 23213.22it/s]
Processing points: 100%|██████████| 44623674/44623674 [31:30<00:00, 23610.18it/s]


Total records collected: 44623674





In [16]:
# Create DataFrame from all records
df = pd.DataFrame(records)

# Debug information
print("\nDataFrame columns:", df.columns.tolist())
print("\nFirst few rows of DataFrame:")
print(df.head())

if df.empty:
    print("\nNo records found to analyze")
else:
    # Group by source_id and page_number to find duplicates
    duplicates = df.groupby(['source_id', 'page_number']).size().reset_index(name='duplicate_count')
    duplicates = duplicates[duplicates['duplicate_count'] > 1]
    
    # Group by source_id to get final result
    result = duplicates.groupby('source_id')['duplicate_count'].sum().reset_index()
    
    if not result.empty:
        print("\nDuplicate counts by source_id:")
        print(result.sort_values('duplicate_count', ascending=False))
        print(f"\nTotal number of source_ids with duplicates: {len(result)}")
    else:
        print("\nNo duplicates found")


DataFrame columns: ['source_id', 'page_number', 'page_count']

First few rows of DataFrame:
                                  source_id  page_number  page_count
0  b5fa6b2ec58043779dddecb5ddea0e791358e73c          199         277
1  8bce38ab9130fffbc5db0fbf6a982d2cf0d6c5c9          922        1127
2  080cf1643d94a5518458aa9a90f329033de49de4          113         182
3  f0aed2b672cab36019e3daf18dac1a8c488a33aa            1           4
4  e20f2a525b7c906e156c554573a9c25ae6ef10f1           21          66

Duplicate counts by source_id:
                                       source_id  duplicate_count
96713   41bb5a11f7427ad806fb63a192a45211a641a60f            14956
246917  a75e18acfdbb80b09aa77d1c573effd7211b481b            13980
278486  bc965c6b83d6245201b0273c493d1ed92d30348b            13622
359943  f3f86c4d59ec701e4a92b3e8c639e1c0784abeff            13592
207853  8cca197a061a03a89eb3eff19f90634322d26940            12772
...                                          ...              ...

In [19]:
source_id = "41bb5a11f7427ad806fb63a192a45211a641a60f"

# Get points for specific source_id from Qdrant
points = client.scroll(
    collection_name="nederland_2025_02_01_cohere",
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="meta.source_id",
                match=models.MatchValue(value=source_id)
            ),
            models.FieldCondition(
                key="meta.page_number",
                match=models.MatchValue(value=1)
            ),
        ]
    ),
    limit=100000  # Adjust batch size as needed
)

# Print number of points found
print(len(points[0]))

# Print first few points for inspection
for point in points[0][:5]:
    print(f"\nPoint ID: {point.id}")
    print(f"Payload: {point.payload}")


5

Point ID: 21159b4f-5d19-446d-a172-21bf30ae4ec8
Payload: {'content': '- 1  Voor de toepassing van deze regeling en de uitwerkingsovereenkomst wordt verstaan\n                                                      onder:\n                                                   \n                                                      a ambtenaar: hij die door of vanwege de gemeente is aangesteld om in openbare dienst werkzaam\n                                                            te zijn alsmede hij met wie een arbeidsovereenkomst naar burgerlijk recht is aangegaan;\n                                                      \n                                                      b betrekking: het geheel van werkzaamheden dat door de ambtenaar is te verrichten;\n                                                      \n                                                      c pensioenwet: de Algemene burgerlijke pensioenwet, zoals die gold tot en met 31 december 1995;\n                          

In [4]:
from cohere import ClientV2 as CohereClient
from qdrant_client import QdrantClient, models

cohere_client = CohereClient(api_key="leBUANLdJzox27RHfrolRkiCzWIEMmyeBTeTKmsE")
qdrant_client = QdrantClient(url='http://localhost:6333', timeout=60)

dense_vector = cohere_client.embed(
    texts=['Ravijnjaar 2026'], 
    input_type="search_query", 
    model="embed-multilingual-v3.0",
    embedding_types=["uint8"]
).embeddings.uint8[0]

DENSE_VECTORS_NAME = "text-dense"
SPARSE_VECTORS_NAME = "text-sparse"

qdrant_documents = qdrant_client.query_points(
    collection_name="nederland_2025_02_01_cohere",
    prefetch=[
        models.Prefetch(
            query=dense_vector,
            using=DENSE_VECTORS_NAME,
            limit=700
        ),
    ],
    limit=200,
    score_threshold=None,
    with_payload=True,
    with_vectors=True,
    timeout=120,  # Increase timeout to 120 seconds
).points
                
# Print number of points found
print(len(qdrant_documents[0]))

# Print first few points for inspection
for point in qdrant_documents[0][:5]:
    print(f"\nPoint ID: {point.id}")
    print(f"Payload: {point.payload}")


NameError: name 'models' is not defined