From e8f446eff62edf12a3b6ff4bcab5d2fe4ceb3d23 Mon Sep 17 00:00:00 2001 From: Mridula Date: Thu, 29 May 2025 14:44:40 +0100 Subject: [PATCH] Migrate Hybrid Search Labs Notebook from RRF to Retrievers API (Elasticsearch 9.x) (#459) This commit migrates the Hybrid Search Labs Notebook to the Retrievers API in Elasticsearch 9.x: the previous top-level `query`/`knn` search with the `rank={"rrf": {}}` parameter is replaced by an `rrf` retriever that combines a `standard` retriever and a `knn` retriever, so results are still fused with Reciprocal Rank Fusion. The necessary changes to the JSON structure of the request have been made to ensure compatibility with the updated API. Note: While the migration covers the JSON and API integration, there may be underlying issues with the notebook execution or the Makefile that are not fully resolved in this commit. Further investigation and testing may be required to ensure smooth operation. --- notebooks/search/02-hybrid-search.ipynb | 36 ++++++++++++++----------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/notebooks/search/02-hybrid-search.ipynb b/notebooks/search/02-hybrid-search.ipynb index 4d7e7a87..5516974d 100644 --- a/notebooks/search/02-hybrid-search.ipynb +++ b/notebooks/search/02-hybrid-search.ipynb @@ -196,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -204,13 +204,13 @@ " if len(response[\"hits\"][\"hits\"]) == 0:\n", " print(\"Your search returned no results.\")\n", " else:\n", - " for hit in response[\"hits\"][\"hits\"]:\n", + " for idx, hit in enumerate(response[\"hits\"][\"hits\"], start=1):\n", " id = hit[\"_id\"]\n", " publication_date = hit[\"_source\"][\"publish_date\"]\n", - " rank = hit[\"_rank\"]\n", + " score = hit[\"_score\"]\n", " title = hit[\"_source\"][\"title\"]\n", " summary = hit[\"_source\"][\"summary\"]\n", - " pretty_output = f\"\\nID: {id}\\nPublication date: {publication_date}\\nTitle: {title}\\nSummary: {summary}\\nRank: {rank}\"\n", + " pretty_output = f\"\\nID: {id}\\nPublication date: {publication_date}\\nTitle: {title}\\nSummary: {summary}\\nRank: 
{idx}\\nScore: {score}\"\n", " print(pretty_output)" ] }, @@ -231,12 +231,12 @@ "\n", "We then use [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html) to balance the scores to provide a final list of documents, ranked in order of relevance. RRF is a ranking algorithm for combining results from different information retrieval strategies.\n", "\n", - "Note that _score is null, and we instead use _rank to show our top-ranked documents." + "Note: With the retriever API, _score contains the document’s relevance score, and the rank is simply the position in the results (first result is rank 1, etc.)." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -280,18 +280,22 @@ "response = client.search(\n", " index=\"book_index\",\n", " size=5,\n", - " query={\"match\": {\"summary\": \"python programming\"}},\n", - " knn={\n", - " \"field\": \"title_vector\",\n", - " \"query_vector\": model.encode(\n", - " \"python programming\"\n", - " ).tolist(), # generate embedding for query so it can be compared to `title_vector`\n", - " \"k\": 5,\n", - " \"num_candidates\": 10,\n", + " retriever={\n", + " \"rrf\": {\n", + " \"retrievers\": [\n", + " {\"standard\": {\"query\": {\"match\": {\"summary\": \"python programming\"}}}},\n", + " {\n", + " \"knn\": {\n", + " \"field\": \"title_vector\",\n", + " \"query_vector\": model.encode(\"python programming\").tolist(),\n", + " \"k\": 5,\n", + " \"num_candidates\": 10,\n", + " }\n", + " },\n", + " ]\n", + " }\n", " },\n", - " rank={\"rrf\": {}},\n", ")\n", - "\n", "pretty_response(response)" ] }