In [1]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# make it importable so project packages can be loaded from this notebook.
project_root = os.path.abspath(os.path.join("..", ".."))

if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Set both PySpark interpreter variables to the interpreter running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [2]:
from datapipeline.utils.spark_session import get_spark_session
# Obtain the Spark session from the project's helper. The string is passed as
# the helper's only argument (presumably the application name — confirm in
# datapipeline.utils.spark_session).
spark = get_spark_session("Semantic_Vector_Index")

In [3]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer
import uuid

In [4]:
# Connect to the Qdrant instance on localhost and create the target
# collection the first time this notebook runs against that instance.
collection_name = "news_embeddings"
client = QdrantClient(url="http://localhost:6333")

collections = [info.name for info in client.get_collections().collections]

collection_missing = collection_name not in collections
if collection_missing:
    # 384-dimensional vectors compared by cosine distance.
    vector_settings = VectorParams(size=384, distance=Distance.COSINE)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vector_settings,
    )

In [5]:
# Location of the enriched-articles Delta table under the project root.
embeddings_path = os.path.join(project_root, "sanewsstorage/gold/articles_enriched")

# Columns carried into the vector index: the point key, the vector itself,
# and the two payload fields.
indexed_columns = ["bronze_hash", "embedding", "published_at", "entities"]

# Rows without an embedding cannot be indexed, so they are dropped here.
df = (
    spark.read.format("delta")
    .load(embeddings_path)
    .select(*indexed_columns)
    .filter("embedding IS NOT NULL")
)

In [6]:
# Accumulator for the IDs of points already stored in the collection, plus
# the first page of the scroll over that collection.
# Only the IDs are used, so neither vectors nor payloads are requested
# (scroll returns payloads unless told otherwise).
existing_ids = set()

scroll_result = client.scroll(
    collection_name=collection_name,
    limit=10000,
    with_payload=False,
    with_vectors=False
)

In [7]:
# Walk the remaining scroll pages, recording every point ID.
# Each scroll result is a (points, next_page_offset) pair.
while scroll_result[0]:
    points, next_page = scroll_result
    existing_ids.update(p.id for p in points)
    # Compare against None explicitly: an offset is a point ID, and a valid
    # falsy one (integer ID 0) would otherwise end pagination early.
    if next_page is None:
        break
    scroll_result = client.scroll(
        collection_name=collection_name,
        offset=next_page,
        limit=10000,
        # Only IDs are needed; skip payload transfer as well as vectors.
        with_payload=False,
        with_vectors=False
    )

In [8]:
def make_id(bronze_hash):
    """Return a deterministic UUIDv5 string for ``bronze_hash``.

    The value is stringified and hashed in the DNS namespace, so the same
    input always maps to the same point ID.
    """
    hash_text = str(bronze_hash)
    point_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, hash_text)
    return str(point_uuid)

# Collect the Spark frame to pandas and attach the point ID derived from
# each row's bronze_hash.
pdf = df.toPandas().assign(
    point_id=lambda frame: frame["bronze_hash"].apply(make_id)
)

# Keep only rows whose point ID is not already present in the collection.
already_indexed = pdf["point_id"].isin(existing_ids)
pdf = pdf.loc[~already_indexed]

In [9]:
# Report the "nothing to do" case without raising. Raising SystemExit here
# makes the cell fail, which aborts Run All and leaves the Spark session
# running. No explicit stop is needed: iterating over an empty frame in
# batches performs no upserts.
if pdf.empty:
    print("No new vectors to index.")

In [10]:
# Number of points sent to Qdrant per upsert call.
batch_size = 500

for start in range(0, len(pdf), batch_size):
    chunk = pdf.iloc[start:start + batch_size]

    # One PointStruct per row: the derived point ID, the embedding vector,
    # and a payload with the fields returned by search.
    points = [
        PointStruct(
            id=record["point_id"],
            vector=record["embedding"],
            payload={
                "bronze_hash": record["bronze_hash"],
                "published_at": str(record["published_at"]),
                "entities": record["entities"],
            },
        )
        for _, record in chunk.iterrows()
    ]

    client.upsert(collection_name=collection_name, points=points)

print(f"Indexed {len(pdf)} new vectors.")

Indexed 27663 new vectors.


In [11]:
# Sentence-embedding model used to encode search queries.
# NOTE(review): its output must match the collection's configured vector
# size (384) — presumably the same model that produced the stored
# embeddings; confirm against the enrichment pipeline.
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(model_name)

def semantic_search(query, limit=10):
    """Embed ``query`` and return the nearest points from the collection.

    Args:
        query: Text passed to ``model.encode``.
        limit: Maximum number of hits requested from Qdrant.

    Returns:
        A list of dicts, one per hit in the order Qdrant returned them, with
        the keys ``bronze_hash``, ``score``, ``published_at`` and
        ``entities``. Payload fields missing from a hit are ``None``.
    """
    encoded_query = model.encode(query).tolist()

    response = client.query_points(
        collection_name=collection_name,
        query=encoded_query,
        limit=limit,
    )

    return [
        {
            "bronze_hash": hit.payload.get("bronze_hash"),
            "score": hit.score,
            "published_at": hit.payload.get("published_at"),
            "entities": hit.payload.get("entities"),
        }
        for hit in response.points
    ]



In [12]:
# Example query against the index; the returned list of the top 5 hits is
# displayed as the cell output.
semantic_search("Israel Gaza ceasefire talks", limit=5)

[{'bronze_hash': -1687600092685131584,
  'score': 0.7116135,
  'published_at': '2026-02-04 18:26:30',
  'entities': [['Israel', 'LOC'],
   ['Gaza', 'LOC'],
   ['Hamas', 'ORG'],
   ['DEIR AL-BALAH', 'MISC'],
   ['Gaza Strip', 'LOC'],
   ['AP', 'LOC'],
   ['Israeli', 'MISC'],
   ['Gaza on Wednesday', 'MISC'],
   ['Palestinians', 'MISC'],
   ['href="https://whdh.com/news/', 'MISC'],
   ['israel-strikes-gaza-killing-21', 'LOC'],
   ['DEIR AL-BALAH', 'MISC'],
   ['Gaza Strip', 'LOC'],
   ['AP', 'LOC'],
   ['Israeli', 'MISC'],
   ['Gaza on Wednesday', 'MISC'],
   ['Palestinians', 'MISC']]},
 {'bronze_hash': 2382599949308452721,
  'score': 0.6806458,
  'published_at': '2026-02-08 12:06:30',
  'entities': [['Газа', 'PER'],
   ['Израиль', 'LOC'],
   ['ONLY', 'ORG'],
   ['AVAILABLE IN PAID', 'ORG']]},
 {'bronze_hash': 3920387014603932467,
  'score': 0.6715852,
  'published_at': '2026-02-04 04:29:48',
  'entities': [['Israel', 'LOC'],
   ['Has Killed at Least 529 Palestinians', 'MISC'],
   ['Gaza

In [13]:
# Stop the Spark session created at the top of the notebook.
spark.stop()