In [4]:
from elasticsearch import Elasticsearch
from tqdm import tqdm

# Initialize the Elasticsearch client
es = Elasticsearch("http://localhost:9200")

index_name = "bron_2024_10_11"
unique_pairs = set()

# Get the total count of documents
total_docs = es.count(index=index_name)['count']

# Use a scroll or search_after to retrieve all documents
resp = es.search(index=index_name, body={"query": {"match_all": {}}, "size": 1000}, scroll='1m')

sid = resp['_scroll_id']
hits = resp['hits']['hits']

# Initialize tqdm with total_docs
with tqdm(total=total_docs, desc="Processing documents") as pbar:
    while hits:
        for doc in hits:
            source = doc['_source']
            location = source.get('location')
            location_name = source.get('location_name')
            if location and location_name:
                unique_pairs.add((location, location_name))
            pbar.update(1)  # Update the progress bar for each processed document

        resp = es.scroll(scroll_id=sid, scroll='1m')
        # Update the scroll ID and hits
        sid = resp['_scroll_id']
        hits = resp['hits']['hits']

# Convert to a list if needed and print
unique_pairs_list = list(unique_pairs)
for pair in unique_pairs_list:
    print(pair)

Processing documents: 100%|██████████| 3446697/3446697 [07:02<00:00, 8156.42it/s] 

('GM0141', 'Almelo')
('GM1896', 'Zwartewaterland')
('GM0180', 'Staphorst')





In [6]:
len(unique_pairs_list)

3

In [2]:
from qdrant_client import QdrantClient
qdrant_client = QdrantClient(host="localhost", port=6333, timeout=1000000)

collection_name="nederland_2025_02_01_cohere"

query_result =qdrant_client.query(
    collection_name=collection_name,
    with_payload=['meta.source_id'],
    with_vectors=False,
)

ids = []

for point in query_result.result:
    ids.append(point.payload['meta']['source_id'])

ids = list(set(ids))
len(ids)
qdrant_client.scroll(
                    collection_name=collection_name,
                    limit=100,
                    with_payload=['meta.source_id'],
                    with_vectors=False,
                )

TypeError: QdrantFastembedMixin.query() missing 1 required positional argument: 'query_text'

In [2]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Filter, FieldCondition, MatchValue, ScoredPoint
import time
from tqdm import tqdm

# Qdrant configuration
QDRANT_HOST = "http://localhost:6333"  # Change if necessary
COLLECTION_NAME = "nederland_2025_02_01_cohere"  # Change to your collection name
BATCH_SIZE = 10000  # Adjust batch size based on performance

# Initialize client
qdrant_client = QdrantClient(QDRANT_HOST)

def get_unique_source_ids():
    """ Fetch unique source_ids from Qdrant collection efficiently """
    unique_source_ids = set()
    offset = None  # Qdrant uses `offset` for pagination
    retrieved = 0

    doc_count = qdrant_client.count(COLLECTION_NAME, timeout=1000000).count
    pbar = tqdm(desc="Retrieving points", unit="docs", dynamic_ncols=True, total=doc_count)
    
    while True:
        # Perform a scroll-like query to fetch points iteratively
        response = qdrant_client.scroll(
            collection_name=COLLECTION_NAME,
            with_payload=["meta.source_id"],
            limit=BATCH_SIZE,
            offset=offset  # Use offset for pagination
        )

        points, next_offset = response  # Qdrant returns (List[ScoredPoint], Offset)

        # Extract unique source_ids
        for point in points:
            source_id = point.payload['meta']['source_id']
            if source_id:
                unique_source_ids.add(source_id)

        retrieved += len(points)
        
        pbar.update(len(points))  # Update tqdm progress bar
        pbar.set_postfix(unique_ids=len(unique_source_ids))

        # Stop when there's no more data
        if next_offset is None or not points:
            break

        # Update offset for the next batch
        offset = next_offset
        
        # Write current batch of IDs to file
        with open('source_ids.txt', 'a') as f:
            for point in points:
                source_id = point.payload['meta']['source_id']
                if source_id:
                    f.write(f"{source_id}\n")
    
    return unique_source_ids

ids = get_unique_source_ids()
len(ids)



KeyboardInterrupt: 

