In [2]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# treat it as the project root that later cells import from and read data under.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))

# Register the root only once so re-running the cell does not grow sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

# Point both PySpark interpreter settings at the interpreter running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [3]:
from datapipeline.utils.spark_session import get_spark_session

# Obtain the Spark session through the project helper.
# NOTE(review): the string argument is presumably the Spark application name —
# confirm against get_spark_session in datapipeline.utils.spark_session.
spark = get_spark_session("Semantic_Vector_Index")

In [1]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

# HTTP endpoint of the Qdrant instance running on this machine.
QDRANT_URL = "http://localhost:6333"

# Client shared by every later cell (collection setup, upserts and queries).
client = QdrantClient(url=QDRANT_URL)

In [4]:
# Display the collections currently present on the Qdrant server; this also
# serves as a quick check that the server configured above is reachable.
client.get_collections()

CollectionsResponse(collections=[])

In [5]:
# Name of the Qdrant collection that holds the article vectors.
collection_name = "news_embeddings"

# Dimensionality of the vectors stored in the collection. It has to match the
# length of every embedding upserted into it and of every query vector.
EMBEDDING_DIM = 384

# Ask the server whether the collection exists instead of listing every
# collection and scanning the names client-side. Guarding the create call keeps
# this cell safe to re-run against an already-populated collection.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=EMBEDDING_DIM,
            distance=Distance.COSINE,
        ),
    )

In [6]:
# Location of the enriched articles Delta table, resolved under the project root.
embeddings_path = os.path.join(project_root, "sanewsstorage/gold/articles_enriched")

# The only columns the index needs: the key used to derive point ids, the
# vector itself, and the two fields stored as payload.
index_columns = ["bronze_hash", "embedding", "published_at", "entities"]

# Read the Delta table, then narrow it to the columns above.
articles = spark.read.format("delta").load(embeddings_path)
df = articles.select(*index_columns)

In [7]:
# Print the column schema first; the row count is the cell's last expression,
# so it is what the notebook displays as the cell result.
df.printSchema()
df.count()

root
 |-- bronze_hash: long (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- published_at: timestamp (nullable = true)
 |-- entities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- entity: string (nullable = true)
 |    |    |-- label: string (nullable = true)



27663

In [8]:
# Collect the whole Spark DataFrame into a pandas DataFrame.
# NOTE(review): this pulls every row, including its embedding array, into local
# memory; that is workable at the 27,663 rows counted above, but revisit it if
# the table grows substantially.
pdf = df.toPandas()

In [9]:
from qdrant_client.models import PointStruct
import uuid

import pandas as pd

# Number of points sent to Qdrant per upsert request.
batch_size = 500


def _entity_pairs(entities):
    """Normalise one row's ``entities`` cell to a list of ``[entity, label]`` pairs.

    Depending on how Spark converted the array-of-struct column, each element
    may arrive as a tuple-like Row or as a dict (assumption: the dict form is
    what an Arrow-based conversion yields — confirm for the Spark version in
    use). Both are reduced to the same plain-list shape so the stored payload
    does not depend on the conversion path.

    Args:
        entities: Iterable of entity structs for one article, or ``None``.

    Returns:
        A list of ``[entity, label]`` lists (``None`` entries are kept as
        ``None``), or ``None`` when the cell itself is null.
    """
    if entities is None:
        return None

    pairs = []
    for item in entities:
        if item is None:
            pairs.append(None)
        elif isinstance(item, dict):
            pairs.append([item["entity"], item["label"]])
        else:
            # Tuple-like struct: field order follows the schema (entity, label).
            pairs.append(list(item))
    return pairs


def _build_point(record):
    """Build the Qdrant point for one row of ``pdf``.

    Args:
        record: A named tuple from ``pdf.itertuples(index=False)`` exposing
            ``bronze_hash``, ``embedding``, ``published_at`` and ``entities``.

    Returns:
        A ``PointStruct`` whose id is a UUIDv5 derived from ``bronze_hash``, so
        re-running the cell overwrites existing points instead of duplicating
        them.
    """
    published_at = record.published_at
    return PointStruct(
        id=str(uuid.uuid5(uuid.NAMESPACE_DNS, str(record.bronze_hash))),
        # Plain Python floats, whether the cell holds a list or a numpy array.
        vector=[float(component) for component in record.embedding],
        payload={
            # A missing timestamp is stored as null rather than the text "NaT".
            "published_at": None if pd.isna(published_at) else str(published_at),
            "entities": _entity_pairs(record.entities),
        },
    )


# A row without a vector cannot be indexed. Check before the first write so a
# bad row does not abort the loop after earlier batches were already upserted.
assert pdf["embedding"].notna().all(), "rows without an embedding cannot be indexed"

for start in range(0, len(pdf), batch_size):
    batch = pdf.iloc[start:start + batch_size]

    # itertuples avoids building a pandas Series for every row, unlike iterrows.
    points = [_build_point(record) for record in batch.itertuples(index=False)]

    client.upsert(
        collection_name=collection_name,
        points=points
    )

In [10]:
from sentence_transformers import SentenceTransformer

# Checkpoint used to embed the free-text query.
# NOTE(review): assumes the stored article embeddings were produced with this
# same model — confirm against the pipeline that writes the `embedding` column.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Number of nearest neighbours to return.
TOP_K = 5

model = SentenceTransformer(MODEL_NAME)

query = "Israel Gaza ceasefire talks"

# Encode the query, then convert the result to a plain list for the client.
query_embedding = model.encode(query)
query_vector = query_embedding.tolist()

results = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=TOP_K,
)

# Last expression of the cell: display the raw query response.
results



QueryResponse(points=[ScoredPoint(id='e7653518-4053-5cc5-8c63-a8970f2b0f5b', version=23, score=0.7116135, payload={'published_at': '2026-02-04 18:26:30', 'entities': [['Israel', 'LOC'], ['Gaza', 'LOC'], ['Hamas', 'ORG'], ['DEIR AL-BALAH', 'MISC'], ['Gaza Strip', 'LOC'], ['AP', 'LOC'], ['Israeli', 'MISC'], ['Gaza on Wednesday', 'MISC'], ['Palestinians', 'MISC'], ['href="https://whdh.com/news/', 'MISC'], ['israel-strikes-gaza-killing-21', 'LOC'], ['DEIR AL-BALAH', 'MISC'], ['Gaza Strip', 'LOC'], ['AP', 'LOC'], ['Israeli', 'MISC'], ['Gaza on Wednesday', 'MISC'], ['Palestinians', 'MISC']]}, vector=None, shard_key=None, order_value=None), ScoredPoint(id='66eb7e64-fa62-5f96-9e23-5957fc17d244', version=13, score=0.6806458, payload={'published_at': '2026-02-08 12:06:30', 'entities': [['Газа', 'PER'], ['Израиль', 'LOC'], ['ONLY', 'ORG'], ['AVAILABLE IN PAID', 'ORG']]}, vector=None, shard_key=None, order_value=None), ScoredPoint(id='f1b807dc-fd81-5065-ad94-dea25c9fc90f', version=29, score=0.6715