In [1]:
import os
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import time
import concurrent.futures
from tabulate import tabulate
from tqdm import tqdm
from qdrant_client import QdrantClient, models
from haystack_integrations.components.embedders.fastembed import FastembedSparseDocumentEmbedder
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack import Document, Pipeline
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentSplitter
from haystack_integrations.components.embedders.cohere import CohereDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.preprocessors import NLTKDocumentSplitter
from haystack.utils import Secret

# Connect to clients
es = Elasticsearch("http://localhost:9200")
qdrant_client = QdrantClient(host="localhost", port=6333, timeout=600)

OLD_COLLECTION = "overijssel_cohere"
NEW_COLLECTION = f"{OLD_COLLECTION}_v2"

# Create new collection with same parameters as old one
collection_info = qdrant_client.get_collection(OLD_COLLECTION)
qdrant_client.recreate_collection(
    collection_name=NEW_COLLECTION,
    vectors_config=collection_info.config.params.vectors,
    sparse_vectors_config=collection_info.config.params.sparse_vectors
)

print(f"Created new collection: {NEW_COLLECTION}")

batch_size = 1000

# Prepare ES query
query = {
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": "description"}},
                {"exists": {"field": "doc_url"}}
            ],
            "minimum_should_match": 1,
            "should": [
                    {"match_phrase": {"location": "GM0193"}},
                    {"match_phrase": {"location": "GM0141"}},
                    {"match_phrase": {"location": "GM0166"}},
                    {"match_phrase": {"location": "GM1774"}},
                    {"match_phrase": {"location": "GM0183"}},
                    {"match_phrase": {"location": "GM0173"}},
                    {"match_phrase": {"location": "GM0150"}},
                    {"match_phrase": {"location": "GM1700"}},
                    {"match_phrase": {"location": "GM0164"}},
                    {"match_phrase": {"location": "GM0153"}},
                    {"match_phrase": {"location": "GM0148"}},
                    {"match_phrase": {"location": "GM1708"}},
                    {"match_phrase": {"location": "GM0168"}},
                    {"match_phrase": {"location": "GM0160"}},
                    {"match_phrase": {"location": "GM0189"}},
                    {"match_phrase": {"location": "GM0177"}},
                    {"match_phrase": {"location": "GM1742"}},
                    {"match_phrase": {"location": "GM0180"}},
                    {"match_phrase": {"location": "GM1896"}},
                    {"match_phrase": {"location": "GM0175"}},
                    {"match_phrase": {"location": "GM1735"}},
                    {"match_phrase": {"location": "GM0147"}},
                    {"match_phrase": {"location": "GM0163"}},
                    {"match_phrase": {"location": "GM0158"}},
                    {"match_phrase": {"location": "GM1773"}}
            ]
        }
    }
}

total_docs = es.count(index="bron_2024_10_11", body=query)['count']

# Get ES documents and create lookup dictionary
print(f"Fetching {total_docs} ES documents...")

es_docs = {}
for doc in scan(es, query=query, index="bron_2024_10_11", scroll='10m'):
    if 'doc_url' in doc['_source']:
        es_docs[doc['_id']] = doc['_source']['doc_url']

print(f"Found {len(es_docs)} documents with doc_url in ES")

# Fetch and process points from old collection
print("Processing points from old collection")
total_count = qdrant_client.count(collection_name=OLD_COLLECTION).count
batch_size = 1000
offset = None  # Initialize offset to None

with tqdm(desc="Processing points", total=total_count) as pbar:
    while True:
        batch_points, next_offset = qdrant_client.scroll(
            collection_name=OLD_COLLECTION,
            offset=offset,
            limit=batch_size,
            with_payload=True,
            with_vectors=True
        )
        if not batch_points:
            break
        batch_to_upsert = []
        for point in batch_points:
            # Get source_id from meta
            source_id = point.payload.get('meta', {}).get('source_id')

            # If we have matching ES doc with doc_url, add it to meta
            if source_id in es_docs:
                if 'meta' not in point.payload:
                    point.payload['meta'] = {}
                point.payload['meta']['doc_url'] = es_docs[source_id]
            else:
                point.payload['meta']['doc_url'] = ""

            # Create new point with all data
            batch_to_upsert.append(models.PointStruct(
                id=point.id,
                vector=point.vector,
                payload=point.payload
            ))
        # Upload batch to new collection
        if batch_to_upsert:
            qdrant_client.upsert(
                collection_name=NEW_COLLECTION,
                points=batch_to_upsert,
                wait=True
            )
        pbar.update(len(batch_points))

        # Update offset for the next iteration
        offset = next_offset
        if offset is None:
            break

print(f"Processed total of {total_count} points")




sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/bizon/.config/sagemaker/config.yaml


  qdrant_client.recreate_collection(


Created new collection: overijssel_cohere_v2
Fetching 63424 ES documents...
Found 63424 documents with doc_url in ES
Processing points from old collection


Processing points: 100%|██████████| 3161684/3161684 [56:31<00:00, 932.36it/s] 

Processed total of 3161684 points





In [68]:
# update_operations = []

# update_operations.append(
#     models.SetPayloadOperation(
#         set_payload=models.SetPayload(
#             payload={
#                 "meta": {
#                     "doc_url": "blablua1"
#                 }
#             },
#             filter=models.Filter(
#                 must=[
#                     models.FieldCondition(
#                         key="meta.source_id",
#                         match=models.MatchValue(value="fbfbdf577fa66d2660c37d5a32a7d395fac70eb0"),
#                     ),
#                 ],
#             ),
#         )
#     )
# )


# result = qdrant_client.batch_update_points(
#     collection_name="1_gemeente_cohere",
#     update_operations=update_operations,                
# )

# qdrant_client.overwrite_payload(
#     collection_name="1_gemeente_cohere",
#     payload={
#         "meta": {
#             "doc_url": "overwrite1"
#         }
#     },
#     points=models.Filter(
#         must=[
#             models.FieldCondition(
#                 key="meta.source_id",
#                 match=models.MatchValue(value="689e55a7653fadb13916237cf4e742a7877477d2"),
#             ),
#         ],
#     ),
#     wait=False
# )

qdrant_client.set_payload(
    collection_name="1_gemeente_cohere",
    payload={
        "doc_url": "set6.key"
    },   
    key="meta",
    points=models.Filter(
        must=[
            models.FieldCondition(
                key="meta.source_id",
                match=models.MatchValue(value="16437c51ef734a755701cb5cff1fd8cc894847e9"),
            ),
        ],
    ),
    wait=False
)

UpdateResult(operation_id=6496, status=<UpdateStatus.ACKNOWLEDGED: 'acknowledged'>)

In [84]:
import os
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import time
import concurrent.futures
from tabulate import tabulate
from tqdm import tqdm
import time
import concurrent.futures
from tabulate import tabulate
from unstructured.partition.html import partition_html
from qdrant_client import QdrantClient, models
from tqdm import tqdm

# Connect to Elasticsearch
es = Elasticsearch("http://localhost:9200")
qdrant_client = QdrantClient(host="localhost", port=6333, timeout=600)

query = {
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": "description"}},
                {"exists": {"field": "doc_url" }}
            ],
            "minimum_should_match": 1,
            "should": [
                {"match_phrase": {"location": "GM0141"}},
                # {"match_phrase": {"location": "GM0141"}},
                # {"match_phrase": {"location": "GM0166"}},
                # {"match_phrase": {"location": "GM1774"}},
                # {"match_phrase": {"location": "GM0183"}},
                # {"match_phrase": {"location": "GM0173"}},
                # {"match_phrase": {"location": "GM0150"}},
                # {"match_phrase": {"location": "GM1700"}},
                # {"match_phrase": {"location": "GM0164"}},
                # {"match_phrase": {"location": "GM0153"}},
                # {"match_phrase": {"location": "GM0148"}},
                # {"match_phrase": {"location": "GM1708"}},
                # {"match_phrase": {"location": "GM0168"}},
                # {"match_phrase": {"location": "GM0160"}},
                # {"match_phrase": {"location": "GM0189"}},
                # {"match_phrase": {"location": "GM0177"}},
                # {"match_phrase": {"location": "GM1742"}},
                # {"match_phrase": {"location": "GM0180"}},
                # {"match_phrase": {"location": "GM1896"}},
                # {"match_phrase": {"location": "GM0175"}},
                # {"match_phrase": {"location": "GM1735"}},
                # {"match_phrase": {"location": "GM0147"}},
                # {"match_phrase": {"location": "GM0163"}},
                # {"match_phrase": {"location": "GM0158"}},
                # {"match_phrase": {"location": "GM1773"}}
            ]
        }
    }
}

print("Getting ES docs...")

docs = list(scan(es, query=query, index="bron_2024_10_11", scroll='10m'))
doc_ids = [doc['_id'] for doc in docs]

print(f"Got ES Docs: {len(docs)}")


print("Getting Qdrant docs...")

# Get all mappings in one batch operation
search_results = qdrant_client.scroll(
    collection_name="1_gemeente_cohere",
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="meta.source_id",
                match=models.MatchAny(any=doc_ids)
            )
        ]
    ),
    limit=len(doc_ids)  # Get all matches at once
)[0]

print(f"Got Qdrant Docs: {len(search_results)}")

# Create id mapping more efficiently
id_mapping = {
    point.payload['meta']['source_id']: point.id 
    for point in search_results
}

print(f"Points mapped {len(id_mapping)} now...")

# Prepare all update operations at once
update_operations = [
    models.SetPayloadOperation(
        set_payload=models.SetPayload(
            payload={"doc_url": doc['_source']['doc_url']},
            key="meta",
            points=[id_mapping[doc['_id']]]
        )
    )
    for doc in docs
    if doc['_id'] in id_mapping and doc['_source'].get('doc_url')
]

print(f"Staring {len(update_operations)} now...")

# Process in larger batches
batch_size = 10  # Increased batch size significantly
with tqdm(total=len(update_operations), desc="Updating documents") as pbar:
    for i in range(0, len(update_operations), batch_size):
        batch = update_operations[i:i + batch_size]
        result = qdrant_client.batch_update_points(
            collection_name="1_gemeente_cohere",
            update_operations=batch,
            wait=False  # Wait for completion to ensure consistency
        )
        pbar.update(len(batch))

print(f"Updated {len(update_operations)} documents")



Getting ES docs...
Got ES Docs: 8055
Getting Qdrant docs...
Got Qdrant Docs: 8055
Points mapped 2051 now...
Staring 2051 now...


Updating documents:   0%|          | 10/2051 [01:21<4:37:12,  8.15s/it]


KeyboardInterrupt: 

In [17]:
print(es.info())

{'name': 'dl', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'n4QZQRBgTBS-CA5bLkHhQA', 'version': {'number': '8.15.1', 'build_flavor': 'default', 'build_type': 'deb', 'build_hash': '253e8544a65ad44581194068936f2a5d57c2c051', 'build_date': '2024-09-02T22:04:47.310170297Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}
