# OpenSearch: Индексация корпуса в makar_sdek1

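Этот ноутбук создаёт индекс OpenSearch с кастомными русскими анализаторами и полем `knn_vector`, затем индексирует документы из `corpus.json` с векторами размерности 256. Имя индекса и путь к корпусу задаются в ячейке конфигурации (`INDEX_NAME`, `CORPUS_JSON`); обратите внимание, что сейчас там указан индекс `makar_ozon`, а не `makar_sdek1` из заголовка.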



In [None]:
# If needed, install dependencies (uncomment to run)
# pip install -q opensearch-py requests python-dotenv fastembed sentence-transformers scikit-learn numpy tqdm


In [None]:
import os
import json
from typing import List, Dict, Any, Iterable, Optional
from dotenv import load_dotenv

import numpy as np
from tqdm import tqdm
from sklearn.decomposition import PCA

import requests
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests.auth import HTTPBasicAuth

load_dotenv()



In [None]:
# Config: where the corpus lives and which index receives it
BASE_DIR = "/Users/admin/СДЭК"
CORPUS_JSON = os.path.join(BASE_DIR, "corpus.json")
INDEX_NAME = "makar_ozon"

# Connection settings are taken from environment variables.
OPENSEARCH_URL, OPENSEARCH_USER, OPENSEARCH_PASSWORD = (
    os.getenv(_name)
    for _name in ("OPENSEARCH_URL", "OPENSEARCH_USER", "OPENSEARCH_PASSWORD")
)

# Fail fast on the first missing (or empty) setting, checked in a fixed order.
for _name, _value in (
    ("OPENSEARCH_URL", OPENSEARCH_URL),
    ("OPENSEARCH_USER", OPENSEARCH_USER),
    ("OPENSEARCH_PASSWORD", OPENSEARCH_PASSWORD),
):
    if not _value:
        raise ValueError(f"{_name} must be set in environment variables")

# Echo the effective (non-secret) configuration.
print(f"Using OpenSearch: {OPENSEARCH_URL}")
print(f"Index: {INDEX_NAME}")
print(f"Corpus: {CORPUS_JSON}")



Using OpenSearch: https://localhost:9200
Index: makar_ozon
Corpus: /Users/admin/СДЭК/corpus.json


In [None]:
# Show which account is in use; the password is only ever reported as a mask.
masked_password = "***" if OPENSEARCH_PASSWORD else "<empty>"
print("Auth user:", OPENSEARCH_USER)
print("Auth password:", masked_password)


Auth user: admin
Auth password: ***


In [None]:
# Yandex credentials are mandatory: stop right away if either one is missing.
YANDEX_API_KEY = os.getenv("YANDEX_API_KEY")
YANDEX_FOLDER_ID = os.getenv("YANDEX_FOLDER_ID")

for _var, _val in (
    ("YANDEX_API_KEY", YANDEX_API_KEY),
    ("YANDEX_FOLDER_ID", YANDEX_FOLDER_ID),
):
    if not _val:
        raise ValueError(f"{_var} must be set in environment variables")

# Write the effective settings back into os.environ. A value that is already
# present in the environment is kept; otherwise the default given here is used.
os.environ.update({
    "LLM_PROVIDER": os.getenv("LLM_PROVIDER", "yandex"),
    "YANDEX_API_KEY": YANDEX_API_KEY,
    "YANDEX_FOLDER_ID": YANDEX_FOLDER_ID,
    "YANDEX_LLM_MODEL": os.getenv("YANDEX_LLM_MODEL", "yandexgpt-lite"),
    "YANDEX_COMPLETION_URL": os.getenv(
        "YANDEX_COMPLETION_URL",
        "https://llm.api.cloud.yandex.net/foundationModels/v1/completion",
    ),
    "YANDEX_EMBED_MODEL": os.getenv("YANDEX_EMBED_MODEL", "text-search-doc"),
})

# Report the resulting configuration; the API key is shown only as a mask.
api_key_mask = "***" if os.environ.get("YANDEX_API_KEY") else "<empty>"
print("LLM provider:", os.environ["LLM_PROVIDER"])
print("Yandex folder:", os.environ["YANDEX_FOLDER_ID"])
print("Yandex embed model:", os.environ["YANDEX_EMBED_MODEL"])
print("Yandex API key:", api_key_mask)


LLM provider: yandex
Yandex folder: b1gql4st0j9joerfcttt
Yandex embed model: text-search-doc
Yandex API key: ***


In [17]:
# Index body (as requested, adapted to a Python dict).
# Settings: one shard, no replicas, k-NN enabled, custom Russian analysis chain.
# Mappings: analysed `text`, keyword `source` / `chunk_id`, 256-dim `text_vector`.
INDEX_BODY: Dict[str, Any] = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "knn": True,
            # NOTE(review): the OpenSearch k-NN docs describe this index-level
            # setting as nmslib-only, while `text_vector` below uses faiss —
            # confirm whether ef_search should be a method parameter instead.
            "knn.algo_param.ef_search": 100,
            "similarity": {
                # NOTE(review): defined but not referenced by any field; the
                # `text` field below uses the built-in "BM25" similarity.
                "custom_similarity": {
                    "type": "BM25",
                    "k1": 1.2,
                    "b": 0.75,
                    "discount_overlaps": "true",
                }
            },
            "analysis": {
                "filter": {
                    "russian_stemmer": {"type": "stemmer", "language": "russian"},
                    "unique_pos": {"type": "unique", "only_on_same_position": False},
                    "my_multiplexer": {
                        "type": "multiplexer",
                        "filters": [
                            "keyword_repeat",
                            "russian_stemmer",
                            "remove_duplicates",
                        ],
                    },
                },
                "analyzer": {
                    # Used at query time by the hybrid search cell.
                    "search_text_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "my_multiplexer", "unique_pos"],
                        "char_filter": ["e_mapping"],
                    },
                    "ru_international_translit_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "lowercase",
                            "russian_stemmer",
                        ],
                        "char_filter": ["transliteration_filter", "e_mapping"],
                    },
                    # Index-time analyzer of the `text` field.
                    "text_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "lowercase",
                            "russian_stemmer",
                        ],
                        "char_filter": ["e_mapping"],
                    },
                    "exact_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase"],
                        "char_filter": ["e_mapping"],
                    },
                    "text_standard": {"type": "standard"},
                    "text_whitespace": {"type": "whitespace"},
                    "text_lowercase": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase"],
                    },
                },
                "char_filter": {
                    # Latin -> Cyrillic transliteration.
                    # NOTE(review): the table stops at "p"; the remaining
                    # letters (r, s, t, u, ...) are not mapped.
                    "transliteration_filter": {
                        "type": "mapping",
                        "mappings": [
                            "a => а",
                            "b => б",
                            "v => в",
                            "g => г",
                            "d => д",
                            "e => е",
                            "ye => ё",
                            "zh => ж",
                            "z => з",
                            "i => и",
                            "j => й",
                            "k => к",
                            "l => л",
                            "m => м",
                            "n => н",
                            "o => о",
                            "p => п",
                        ],
                    },
                    # Normalise "ё" to "е" so both spellings match each other.
                    # The previous rule ("e => ё") went the wrong way: it
                    # rewrote the letter "e" into "ё" and left "ё" untouched.
                    # Both cases are listed because char filters run before
                    # the lowercase token filter.
                    "e_mapping": {"type": "mapping", "mappings": ["ё => е", "Ё => Е"]},
                },
            },
        }
    },
    "mappings": {
        "properties": {
            "text": {
                "type": "text",
                "analyzer": "text_analyzer",
                "similarity": "BM25",
            },
            "source": {"type": "keyword"},
            "chunk_id": {"type": "keyword"},
            "text_vector": {
                "type": "knn_vector",
                "dimension": 256,
                "space_type": "cosinesimil",
                "method": {
                    "name": "hnsw",
                    "engine": "faiss",
                    "parameters": {"ef_construction": 512, "m": 64},
                },
            },
        }
    },
}
print("Index body prepared")


Index body prepared


In [18]:
def create_client(
    url: str,
    user: Optional[str],
    password: Optional[str],
    *,
    verify_certs: bool = False,
) -> OpenSearch:
    """Build an OpenSearch client for *url*.

    Args:
        url: Base URL of the cluster; TLS is enabled when it starts with
            ``https://``.
        user: Basic-auth user name. Authentication is used only when both
            *user* and *password* are non-empty.
        password: Basic-auth password.
        verify_certs: Whether to verify the server's TLS certificate. Defaults
            to ``False`` to keep the notebook's previous behaviour; pass
            ``True`` when the cluster has a trusted certificate.

    Returns:
        A configured ``OpenSearch`` client (60 s timeout, up to 3 retries,
        retrying on timeouts as well).
    """
    auth = HTTPBasicAuth(user, password) if user and password else None
    return OpenSearch(
        hosts=[url],
        http_compress=True,
        http_auth=auth,
        use_ssl=url.startswith("https://"),
        verify_certs=verify_certs,
        connection_class=RequestsHttpConnection,
        timeout=60,
        max_retries=3,
        retry_on_timeout=True,
    )


client = create_client(OPENSEARCH_URL, OPENSEARCH_USER, OPENSEARCH_PASSWORD)
print("Client ready")


Client ready


In [19]:
# Recreate the index from scratch: drop any existing copy, then create it anew.
# NOTE: this discards every document currently stored under INDEX_NAME.
index_already_present = client.indices.exists(index=INDEX_NAME)
if index_already_present:
    print(f"Index {INDEX_NAME} exists. Deleting...")
    client.indices.delete(index=INDEX_NAME)

print(f"Creating index {INDEX_NAME}...")
client.indices.create(index=INDEX_NAME, body=INDEX_BODY)
print("Index created")


Creating index makar_ozon...
Index created




In [None]:
# Load corpus: read the JSON file, then flatten every record into an index doc.
if not os.path.exists(CORPUS_JSON):
    raise FileNotFoundError(f"Corpus not found: {CORPUS_JSON}")

with open(CORPUS_JSON, encoding="utf-8") as corpus_file:
    corpus = json.load(corpus_file)

print(f"Loaded {len(corpus)} records")

# Normalize: keep only records with non-blank text. The fallback id uses the
# record's position in the original corpus (skipped records still count).
docs: List[Dict[str, Any]] = []
for position, record in enumerate(corpus):
    body_text = (record.get("text") or "").strip()
    if not body_text:
        continue
    meta = record.get("metadata") or {}
    doc_key = meta.get("chunk_id") or f"doc-{position}"
    docs.append(
        {
            "_id": doc_key,
            "text": body_text,
            "source": meta.get("source") or "unknown",
            "chunk_id": doc_key,
        }
    )

print(f"Prepared {len(docs)} docs for indexing")


Prepared 167 markdown paragraphs from 14 files in '/Users/admin/Downloads/Telegram Desktop/Документация 2'


In [22]:
# Embedding pipeline via Yandex Foundation Models (default).
# Set EMBED_PROVIDER to anything other than "yandex" to use a local backend
# (FastEmbed first, SentenceTransformers as a fallback).
TARGET_DIM = 256
EMBED_PROVIDER = os.getenv("EMBED_PROVIDER", "yandex").lower()

YANDEX_API_KEY = os.getenv("YANDEX_API_KEY")
YANDEX_FOLDER_ID = os.getenv("YANDEX_FOLDER_ID")
YANDEX_EMBED_MODEL = os.getenv("YANDEX_EMBED_MODEL", "text-search-doc")  # or "text-search-query"
YANDEX_EMBEDDINGS_URL = os.getenv(
    "YANDEX_EMBEDDINGS_URL",
    "https://llm.api.cloud.yandex.net/foundationModels/v1/textEmbedding",
)


def _fit_to_target_dim(X: np.ndarray, target_dim: int) -> np.ndarray:
    """Return *X* as float32 with exactly *target_dim* columns.

    Wider embeddings are reduced with PCA, narrower ones are zero-padded, and
    an empty array is returned unchanged. The action taken is printed.

    PCA cannot yield more components than there are rows, so for a corpus
    smaller than *target_dim* the number of components is capped at the row
    count and the remaining columns are zero-padded.

    NOTE(review): PCA is fitted on the document vectors only. If this branch
    is ever taken, query vectors need the same transform; hybrid_search_rrf
    in this notebook sends the raw query embedding.
    """
    if not X.size:
        print(f"Using native dim: {target_dim}")
        return X.astype(np.float32)

    n_samples, raw_dim = X.shape
    if raw_dim > target_dim:
        n_components = min(target_dim, n_samples)
        pca = PCA(n_components=n_components, random_state=42)
        fitted = pca.fit_transform(X).astype(np.float32)
        print(f"Applied PCA: {raw_dim} -> {n_components}")
    else:
        fitted = X.astype(np.float32)
        if raw_dim == target_dim:
            print(f"Using native dim: {raw_dim}")

    width = fitted.shape[1]
    if width < target_dim:
        pad = np.zeros((fitted.shape[0], target_dim - width), dtype=np.float32)
        fitted = np.hstack([fitted, pad]).astype(np.float32)
        print(f"Zero-padded: {width} -> {target_dim}")
    return fitted


if EMBED_PROVIDER == "yandex":
    if not (YANDEX_API_KEY and YANDEX_FOLDER_ID):
        raise ValueError("YANDEX_API_KEY and YANDEX_FOLDER_ID are required for Yandex embeddings")

    model_uri = f"emb://{YANDEX_FOLDER_ID}/{YANDEX_EMBED_MODEL}/latest"
    headers = {
        "Authorization": f"Api-Key {YANDEX_API_KEY}",
        "x-folder-id": YANDEX_FOLDER_ID,
        "Content-Type": "application/json",
    }

    def yandex_embed_one(text: str) -> List[float]:
        """Request the embedding of *text* from the Yandex embeddings endpoint.

        Raises an HTTP error for a non-2xx response and RuntimeError when the
        response JSON has no embedding in either accepted location.
        """
        body = {"modelUri": model_uri, "text": text}
        resp = requests.post(YANDEX_EMBEDDINGS_URL, headers=headers, json=body, timeout=60)
        resp.raise_for_status()
        data = resp.json()
        # The vector is accepted either at the top level or under "result".
        emb = data.get("embedding") or (data.get("result") or {}).get("embedding")
        if emb is None:
            raise RuntimeError(f"Bad embedding response: {data}")
        return emb

    # One request per document, in corpus order.
    vectors: List[List[float]] = [
        yandex_embed_one(d["text"]) for d in tqdm(docs, desc="Yandex embeddings")
    ]
    X = np.asarray(vectors, dtype=np.float32)
    provider_label = "Yandex"

else:
    # Local fallback (FastEmbed / SentenceTransformers)
    EMBED_MODEL_NAME = os.getenv("EMBED_MODEL", "intfloat/multilingual-e5-small")
    embed_backend = None
    try:
        from fastembed import TextEmbedding
        _fe = TextEmbedding(model_name=EMBED_MODEL_NAME)

        def embed_batch(texts: List[str]) -> np.ndarray:
            """Embed *texts* with FastEmbed; one float32 row per text."""
            embs = list(_fe.embed(texts))
            return np.asarray(embs, dtype=np.float32)

        embed_backend = f"fastembed:{EMBED_MODEL_NAME}"
    except Exception as e:
        # Any failure (missing package, model that cannot be loaded) falls
        # through to SentenceTransformers.
        try:
            from sentence_transformers import SentenceTransformer
            _hf = SentenceTransformer(EMBED_MODEL_NAME)

            def embed_batch(texts: List[str]) -> np.ndarray:
                """Embed *texts* with SentenceTransformers (normalized output)."""
                embs = _hf.encode(texts, batch_size=64, show_progress_bar=False, normalize_embeddings=True)
                return np.asarray(embs, dtype=np.float32)

            embed_backend = f"sbert:{EMBED_MODEL_NAME}"
        except Exception as ee:
            raise RuntimeError(f"No embedding backend available: {e} | {ee}") from ee

    print("Embedding backend:", embed_backend)
    batch_size = 256
    all_embeddings: List[np.ndarray] = []
    for i in tqdm(range(0, len(docs), batch_size)):
        batch = [d["text"] for d in docs[i:i + batch_size]]
        if not batch:
            continue
        all_embeddings.append(embed_batch(batch))

    X = np.vstack(all_embeddings) if all_embeddings else np.zeros((0, TARGET_DIM), dtype=np.float32)
    provider_label = "local"

# Shared post-processing: bring vectors to the index dimension and attach them.
print("Raw embedding shape:", X.shape)
X_reduced = _fit_to_target_dim(X, TARGET_DIM)

for i, d in enumerate(docs):
    d["text_vector"] = X_reduced[i].tolist()

print(f"Prepared vectors for docs ({provider_label}):", len(docs))


Yandex embeddings: 100%|██████████| 167/167 [00:36<00:00,  4.60it/s]

Raw embedding shape: (167, 256)
Using native dim: 256
Prepared vectors for docs (Yandex): 167





In [None]:
# Sanity check: number of documents prepared for indexing.
len(docs)

167

In [23]:
# Bulk indexing: send all documents through the _bulk endpoint in chunks and
# report how many were actually accepted.
BULK_ENDPOINT = f"/{INDEX_NAME}/_bulk"

# Build the NDJSON lines: one action line followed by one source line per doc.
lines: List[str] = []
for d in docs:
    # Do not mutate original doc; support re-runs safely
    doc_id = d.get("_id") or f"{d.get('source', 'unknown')}::{d.get('chunk_id', '')}"
    src = {k: v for k, v in d.items() if k != "_id"}  # contains text, source, chunk_id, text_vector
    meta = {"index": {"_index": INDEX_NAME, "_id": doc_id}}
    lines.append(json.dumps(meta, ensure_ascii=False))
    lines.append(json.dumps(src, ensure_ascii=False))

# Chunked send (avoid too large payloads; keep even line count)
chunk_lines = 2000  # lines, not docs
failed_total = 0
for i in tqdm(range(0, len(lines), chunk_lines)):
    payload = "\n".join(lines[i:i + chunk_lines]) + "\n"
    resp = client.transport.perform_request("POST", BULK_ENDPOINT, body=payload)
    if isinstance(resp, dict) and resp.get("errors"):
        # Each response item is keyed by its action; only "index" is sent here.
        failed_items = [
            it.get("index") or {}
            for it in resp.get("items", [])
            if (it.get("index") or {}).get("error")
        ]
        print("Bulk errors in chunk:", len(failed_items))
        if failed_items:
            # One concrete reason is usually enough to diagnose the chunk.
            print("First error:", failed_items[0].get("_id"), failed_items[0].get("error"))
        failed_total += len(failed_items)

# Report real numbers instead of assuming that every document was accepted.
if failed_total:
    print(f"Indexed {len(docs) - failed_total} of {len(docs)} docs into '{INDEX_NAME}' ({failed_total} failed)")
else:
    print(f"Indexed {len(docs)} docs into '{INDEX_NAME}'")


100%|██████████| 1/1 [00:00<00:00,  3.98it/s]

Indexed 167 docs into 'makar_ozon'





# Второй вариант индексации (по одному документу через REST API)

In [12]:
# Per-doc indexing via requests with explicit _id (idempotent).
# A document counts as "ok" only when the server answers with a success status;
# transport failures and HTTP error responses are both counted as errors.
from urllib.parse import quote

INDEX_DOC_BASE = f"{OPENSEARCH_URL.rstrip('/')}/{INDEX_NAME}/_doc"
AUTH = HTTPBasicAuth(OPENSEARCH_USER, OPENSEARCH_PASSWORD)

ok, err = 0, 0
with requests.Session() as session:
    session.auth = AUTH
    session.verify = False  # certificate verification is disabled, as in create_client
    session.headers.update({"Content-Type": "application/json"})

    for idx, d in enumerate(docs):
        doc_id = d.get("_id") or d.get("chunk_id") or f"doc-{idx}"
        # The id goes into the URL path, so every reserved character is escaped.
        url = f"{INDEX_DOC_BASE}/{quote(str(doc_id), safe='')}"
        document = {
            "text": d.get("text", ""),
            "source": d.get("source", "unknown"),
            "chunk_id": d.get("chunk_id", f"doc-{idx}"),
            "text_vector": d.get("text_vector"),
        }
        try:
            resp = session.put(url, json=document, timeout=30)
        except requests.RequestException as e:
            print(f"[{idx}] Error indexing id={doc_id}: {e}")
            err += 1
            continue

        try:
            body = resp.json()
        except ValueError:
            # Non-JSON reply: keep a short excerpt for the log line.
            body = {"raw": resp.text[:1000]}
        if not isinstance(body, dict):
            body = {"raw": body}

        if resp.ok:
            print(f"[{idx}] PUT {resp.status_code} id={doc_id} | result={(body.get('result') or body.get('_result') or '')}")
            ok += 1
        else:
            print(f"[{idx}] PUT {resp.status_code} id={doc_id} | error={(body.get('error') or body.get('raw') or '')}")
            err += 1

print(f"Done. ok={ok} err={err}")




[0] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::0 | result=updated
[1] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::1 | result=updated
[2] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::2 | result=updated
[3] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::3 | result=updated
[4] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::4 | result=updated
[5] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::5 | result=updated
[6] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::6 | result=updated
[7] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::7 | result=updated
[8] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::8 | result=updated
[9] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::9 | result=updated
[10] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::10 | result=updated
[11] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_



[38] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::38 | result=updated
[39] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::39 | result=updated
[40] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::40 | result=updated
[41] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::41 | result=updated
[42] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::42 | result=updated
[43] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::43 | result=updated
[44] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::44 | result=updated
[45] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::45 | result=updated
[46] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::46 | result=updated
[47] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::47 | result=updated
[48] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::48 | result=updated
[49] PUT 200 id=Sbornik_profilak



[79] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::79 | result=updated
[80] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::80 | result=updated
[81] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::81 | result=updated
[82] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::82 | result=updated
[83] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::83 | result=updated
[84] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::84 | result=updated
[85] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::85 | result=updated
[86] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::86 | result=updated
[87] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::87 | result=updated
[88] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::88 | result=updated
[89] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::89 | result=updated
[90] PUT 200 id=Sbornik_profilak



[119] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::119 | result=updated
[120] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::120 | result=updated
[121] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::121 | result=updated
[122] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::122 | result=updated
[123] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::123 | result=updated
[124] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::124 | result=updated
[125] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::125 | result=updated
[126] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::126 | result=updated
[127] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::127 | result=updated
[128] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::128 | result=updated
[129] PUT 200 id=Sbornik_profilaktika_emotsionalnogo_vygoraniya.pdf::129 | result=updated
[130] PUT 



In [24]:
from urllib.parse import quote

def hybrid_search_rrf(query_text: str, top_k: int = 10, bm25_size: int = 50, knn_k: int = 50, k_rrf: int = 60):
    """Run a BM25 search and a kNN search, then fuse them with reciprocal rank fusion.

    Steps, in order:
      1. ``multi_match`` query over ``text`` (boosted x2) and ``source``.
      2. Embed *query_text* through the Yandex embeddings endpoint and run a
         ``knn`` query against ``text_vector``.
      3. Give every hit ``1 / (k_rrf + rank)`` per list it appears in, sum the
         contributions and keep the *top_k* best.

    Args:
        query_text: The user query.
        top_k: Number of fused results to return.
        bm25_size: BM25 candidates to fetch (raised to *top_k* if smaller).
        knn_k: kNN candidates to fetch (raised to *top_k* if smaller).
        k_rrf: Constant added to the rank in the RRF denominator.

    Returns:
        A list of dicts with keys ``rank``, ``rrf_score``, ``bm25_rank``,
        ``knn_rank``, ``source``, ``chunk_id`` and ``text`` (first 240
        characters, newlines replaced by spaces, "..." appended if truncated).

    Raises:
        ValueError: Yandex credentials are missing from the environment.
        RuntimeError: The embedding response contains no vector.
    """
    returned_fields = ["text", "source", "chunk_id"]
    lexical_size = max(bm25_size, top_k)
    vector_k = max(knn_k, top_k)

    # 1) Lexical (BM25) search
    lexical_response = client.search(
        index=INDEX_NAME,
        body={
            "size": lexical_size,
            "query": {
                "multi_match": {
                    "query": query_text,
                    "fields": ["text^2", "source"],
                    "type": "best_fields",
                    "operator": "and",
                    "analyzer": "search_text_analyzer",
                }
            },
            "_source": returned_fields,
        },
    )
    bm25_hits = lexical_response.get("hits", {}).get("hits", [])

    # 2) Yandex query embedding + kNN
    api_key = os.environ.get("YANDEX_API_KEY")
    folder_id = os.environ.get("YANDEX_FOLDER_ID")
    embeddings_url = os.environ.get(
        "YANDEX_EMBEDDINGS_URL",
        "https://llm.api.cloud.yandex.net/foundationModels/v1/textEmbedding",
    )
    query_model = os.getenv("YANDEX_QUERY_EMBED_MODEL", "text-search-query")
    if not api_key or not folder_id:
        raise ValueError("YANDEX_API_KEY and YANDEX_FOLDER_ID are required for Yandex embeddings")

    embed_response = requests.post(
        embeddings_url,
        headers={
            "Authorization": f"Api-Key {api_key}",
            "x-folder-id": folder_id,
            "Content-Type": "application/json",
        },
        json={"modelUri": f"emb://{folder_id}/{query_model}/latest", "text": query_text},
        timeout=30,
    )
    embed_response.raise_for_status()
    embed_payload = embed_response.json()
    # The vector is accepted either at the top level or under "result".
    query_vector = embed_payload.get("embedding") or (embed_payload.get("result") or {}).get("embedding")
    if query_vector is None:
        raise RuntimeError(f"Bad query embedding response: {embed_payload}")

    vector_response = client.search(
        index=INDEX_NAME,
        body={
            "size": vector_k,
            "query": {"knn": {"text_vector": {"vector": query_vector, "k": vector_k}}},
            "_source": returned_fields,
        },
    )
    knn_hits = vector_response.get("hits", {}).get("hits", [])

    # 3) RRF merge
    def hit_key(hit):
        """Identify a hit by its _id, falling back to "source::chunk_id"."""
        fields = hit.get("_source", {})
        return hit.get("_id") or f"{fields.get('source')}::{fields.get('chunk_id')}"

    scores = {}
    hit_by_key = {}
    ranks = {"bm25": {}, "knn": {}}
    # BM25 hits go first so they set the tie-breaking order; when a document
    # is in both lists, the kNN copy of the hit is the one that is kept.
    for label, hits in (("bm25", bm25_hits), ("knn", knn_hits)):
        for position, hit in enumerate(hits, start=1):
            key = hit_key(hit)
            ranks[label][key] = position
            scores[key] = scores.get(key, 0.0) + 1.0 / (k_rrf + position)
            hit_by_key[key] = hit

    # Stable sort: equal scores keep their first-seen order.
    best_keys = sorted(scores, key=scores.get, reverse=True)[:top_k]

    results = []
    for rank, key in enumerate(best_keys, start=1):
        fields = hit_by_key[key].get("_source", {})
        full_text = fields.get("text") or ""
        snippet = full_text[:240].replace("\n", " ")
        if len(full_text) > 240:
            snippet += "..."
        results.append({
            "rank": rank,
            "rrf_score": scores[key],
            "bm25_rank": ranks["bm25"].get(key),
            "knn_rank": ranks["knn"].get(key),
            "source": fields.get("source"),
            "chunk_id": fields.get("chunk_id"),
            "text": snippet,
        })
    return results

# Example run: one hybrid query, printed as a header line, a snippet and a separator.
user_query_text = "профессиональное выгорание профилактика"
TOP_K = 8
res = hybrid_search_rrf(user_query_text, top_k=TOP_K)
for r in res:
    summary = (
        f"[{r['rank']}] rrf={r['rrf_score']:.4f} | bm25={r['bm25_rank']} "
        f"| knn={r['knn_rank']} | {r['source']} #{r['chunk_id']}"
    )
    print(summary, r['text'], "-", sep="\n")




[1] rrf=0.0164 | bm25=None | knn=1 | Жалобы и эскалация инцидентов.md #Жалобы и эскалация инцидентов.md::p6
Коммуникация с конфликтными и эмоциональными клиентами требует особого стиля речи. Оператор избегает фраз, обесценивающих эмоции клиента, и не призывает его «успокоиться», вместо этого признавая, что ситуация действительно может вызывать не...
-
[2] rrf=0.0161 | bm25=None | knn=2 | Жалобы и эскалация инцидентов.md #Жалобы и эскалация инцидентов.md::p4
При оформлении официальной жалобы оператор обязан чётко и нейтрально описать суть претензии. В тексте жалобы фиксируются полные данные клиента, идентификаторы затронутых продуктов, дата и время инцидента, канал взаимодействия и суть жалобы ...
-
[3] rrf=0.0159 | bm25=None | knn=3 | Идентификация клиента и безопасность операций.md #Идентификация клиента и безопасность операций.md::p8
С точки зрения коммуникации оператор всегда открыто объясняет клиенту, зачем проводится проверка и какие риски она предотвращает. Если клиент возмущён

