In [13]:
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain.chains import RetrievalQA
# from langchain.schema import Document
from langchain_core.documents import Document
# from qdrant_client import QdrantClient
from qdrant_client import QdrantClient, models as qm
from qdrant_client.http import models
import json

In [2]:
QDRANT_URL = "http://127.0.0.1:6333"
COLLECTION_NAME = "hybrid_docs"
DATA_PATH = "../data/processed/proc_docx.jsonl"
EMBED_MODEL_NAME = 'intfloat/multilingual-e5-large'
EMBED_DIM = 1024

In [3]:
docs = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        text = data.get("text", "")
        meta = {
            "doc_id": data.get("doc_id", ""),
            "file_path": data.get("file_path", "")
        }
        if text.strip():
            docs.append({"text": text, "metadata": meta})

In [4]:
len(docs)

31

In [5]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts, metadatas = [], []
for doc in docs:
    chunks = splitter.split_text(doc["text"])
    texts.extend(chunks)
    metadatas.extend([doc["metadata"]] * len(chunks))

In [6]:
len(texts), len(metadatas)

(53, 53)

In [7]:
texts = ['passage: ' + t for t in texts]

In [14]:
emb = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)

In [18]:
client = QdrantClient(url=QDRANT_URL)

# # если коллекция уже есть — пересоздадим
# collections = [c.name for c in client.get_collections().collections]
# if COLLECTION_NAME in collections:
#     client.delete_collection(COLLECTION_NAME)
# # создаём коллекцию под гибридный поиск
# client.create_collection(
#     collection_name=COLLECTION_NAME,
#     vectors_config=models.VectorParams(size=EMBED_DIM, distance=models.Distance.COSINE),
# )
# # включаем BM25 для текстового поля
# client.create_payload_index(
#     collection_name=COLLECTION_NAME,
#     field_name="text",
#     field_schema=models.PayloadSchemaType.TEXT,
# )

# создаём коллекцию
client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={"dense": qm.VectorParams(size=EMBED_DIM, distance="Cosine")},
)

# создаём текстовый индекс (BM25)
client.create_payload_index(
    collection_name=COLLECTION_NAME,
    field_name="text",
    field_schema=qm.PayloadSchemaType.TEXT,
)

points = []
for idx, d in enumerate(docs):
    vec = emb.embed_query(f"passage: {d['text']}")
    points.append(
        qm.PointStruct(
            id=idx,
            vector={"dense": vec},
            payload=d
        )
    )
client.upsert(collection_name=COLLECTION_NAME, points=points)

  client.recreate_collection(


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [19]:
# qdrant = Qdrant(
#     client=client,
#     collection_name=COLLECTION_NAME,
#     embeddings=embeddings,
# )
# qdrant.add_texts(texts=texts, metadatas=[{"text": t, **m} for t, m in zip(texts, metadatas)])

In [27]:
# def hybrid_search(query: str, alpha: float = 0.5, k: int = 5):
#     q_vec = emb.embed_query(f"query: {query}")
#     hits = client.query_points(
#         collection_name=COLLECTION_NAME,
#         query=q_vec,
#         using=["dense", "text"],    # dense + встроенный BM25
#         limit=k,
#         with_payload=True,
#         # search_params=qm.SearchParams(hybrid=qm.HybridSearch(alpha=alpha))
#         search_params=qm.SearchParams(hybrid_params=qm.HybridParams(alpha=alpha))
#     )
#     return hits

def hybrid_search(query: str, alpha: float = 0.5, k: int = 5):
    q_vec = emb.embed_query(f"query: {query}")
    return client.query_points(
        collection_name=COLLECTION_NAME,
        query=q_vec,
        using=["dense", "text"],
        limit=k,
        with_payload=True,
        search_params=qm.SearchParams(
            fusion=qm.FusionQuery(mode="relative_score", alpha=alpha)
        ),
    )

In [28]:
# for alpha in [0.0, 0.3, 0.5, 0.8, 1.0]:
for alpha in [0.3]:
    print(f"\n⚖️ alpha={alpha}")
    results = hybrid_search("Как оформить скидку по QR-коду?", alpha)
    for r in results.points:
        print(f"- {r.payload.get('file_path')} | score={r.score:.4f}")
        print(f"  {r.payload['text'][:100]}...\n")


⚖️ alpha=0.3


ValidationError: 3 validation errors for FusionQuery
fusion
  Field required [type=missing, input_value={'mode': 'relative_score', 'alpha': 0.3}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
mode
  Extra inputs are not permitted [type=extra_forbidden, input_value='relative_score', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/extra_forbidden
alpha
  Extra inputs are not permitted [type=extra_forbidden, input_value=0.3, input_type=float]
    For further information visit https://errors.pydantic.dev/2.12/v/extra_forbidden