# Elasticsearch Query Configuration Experiment Notebook

## Install Elasticsearch Package

In [1]:
!pip install -qU elasticsearch

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/412.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/412.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m409.6/412.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.6/412.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.8/59.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## Import Library

In [2]:
import os
import json
import time

from elasticsearch import Elasticsearch, helpers, exceptions

## Connect to Elastic Cloud

In [3]:
# Credentials are read from the environment so that secrets are never stored
# in the notebook. Export ES_CLOUD_ID and ES_API_KEY in the environment that
# launches the kernel before running this cell.
ES_CLOUD_ID = os.environ.get("ES_CLOUD_ID")
ES_API_KEY = os.environ.get("ES_API_KEY")

if not ES_CLOUD_ID or not ES_API_KEY:
    raise RuntimeError(
        "Set the ES_CLOUD_ID and ES_API_KEY environment variables before connecting."
    )

es = Elasticsearch(cloud_id=ES_CLOUD_ID, api_key=ES_API_KEY)

# ping() returns a boolean, so a failed connection is reported rather than raised.
if es.ping():
    print("Connected to Elasticsearch on Elastic Cloud")
    print(es.info())
else:
    print("Connection failed")

Connected to Elasticsearch on Elastic Cloud
{'name': 'instance-0000000001', 'cluster_name': '957659816a954240a335712a178b2901', 'cluster_uuid': 'P6T96E1USIO1gnF5NWspHw', 'version': {'number': '8.11.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '6f9ff581fbcde658e6f69d6ce03050f060d1fd0c', 'build_date': '2023-11-11T10:05:59.421038163Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


## ELSER Model Deployment

In [4]:
es_model_id = ".elser_model_2_linux-x86_64"

# Upper bound on how long to wait for the model download, and the polling interval.
MODEL_DOWNLOAD_TIMEOUT_S = 600
MODEL_POLL_INTERVAL_S = 5

try:
    # Step 1: make sure the model configuration exists in the cluster.
    trained_models = es.ml.get_trained_models()
    model_exists = any(
        model["model_id"] == es_model_id
        for model in trained_models.get("trained_model_configs", [])
    )
    if not model_exists:
        print(f"Pre-trained Model '{es_model_id}' doesn't exist. Now start downloading.")
        # Creates the ELSER model configuration. Automatically downloads the model if it doesn't exist.
        es.ml.put_trained_model(
            model_id=es_model_id,
            input={"field_names": ["text_field"]},
        )

    # Step 2: wait until the model definition is complete, but never forever.
    deadline = time.monotonic() + MODEL_DOWNLOAD_TIMEOUT_S
    while True:
        status = es.ml.get_trained_models(
            model_id=es_model_id,
            include="definition_status",
        )
        fully_defined = status["trained_model_configs"][0]["fully_defined"]
        if fully_defined or time.monotonic() >= deadline:
            break
        time.sleep(MODEL_POLL_INTERVAL_S)

    if not fully_defined:
        print(
            f"Timed out after {MODEL_DOWNLOAD_TIMEOUT_S} seconds waiting for "
            f"Model '{es_model_id}' to finish downloading."
        )
    else:
        # Step 3: a model that exists is not necessarily deployed. The stats
        # entry only carries `deployment_stats` when a deployment exists.
        model_stats = es.ml.get_trained_models_stats(model_id=es_model_id)
        is_deployed = any(
            "deployment_stats" in entry
            for entry in model_stats.get("trained_model_stats", [])
        )
        if is_deployed:
            print(f"Pre-trained Model '{es_model_id}' is deployed.")
        else:
            print(f"Pre-trained Model '{es_model_id}' is not deployed yet.")
            print(f"Start deploying the Model '{es_model_id}'.")
            es.ml.start_trained_model_deployment(
                model_id=es_model_id,
                number_of_allocations=1,
            )
            print("ELSER Model is deployed.")

# API-level failures (HTTP error responses) and transport-level failures are
# separate exception families in the 8.x client, so both are caught here.
except (exceptions.ApiError, exceptions.TransportError) as e:
    print(f"An error occurred: {e}")

Pre-trained Model '.elser_model_2_linux-x86_64' is deployed.


## Create Ingestion Pipeline

In [5]:
# Single inference processor: feeds each document's `title` to the model
# named by `es_model_id` and writes the result to `title_embedding`.
elser_title_processor = {
    "inference": {
        "model_id": es_model_id,
        "input_output": [
            {"input_field": "title", "output_field": "title_embedding"},
        ],
    }
}

es.ingest.put_pipeline(
    id="elser-news-ingest-pipeline",
    description="News ingest pipeline for ELSER",
    processors=[elser_title_processor],
)

ObjectApiResponse({'acknowledged': True})

## Delete Index (For Testing Purpose)

In [None]:
# Remove the sample index so the cells below can recreate it from scratch.
es.indices.delete(
    index="sample_news_index",
    ignore_unavailable=True,
)

ObjectApiResponse({'acknowledged': True})

## Create Index (For Testing Purpose)

In [None]:
# Index-level settings: every document indexed without an explicit pipeline
# goes through the ELSER ingest pipeline.
index_setting = {
    "index": {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "default_pipeline": "elser-news-ingest-pipeline"
    }
}

index_mapping = {
    "properties": {
        "title": {"type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}},
        "url": {"type": "keyword"},
        # Date patterns are case-sensitive: `MM` is month-of-year, whereas
        # lowercase `mm` is minute-of-hour and would mis-parse the month.
        "date": {"type": "date", "format": "yyyy-MM-dd"},
        "keywords": {"type": "keyword"},
        "category": {"type": "keyword"},
        # Target field written by the ingest pipeline's inference processor.
        "title_embedding": {"type": "sparse_vector"}
    }
}

es.indices.create(index="sample_news_index", settings=index_setting, mappings=index_mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'sample_news_index'})

## Load Sample News Dataset (For Testing Purpose)

In [None]:
JSON_DIR = '/content/drive/MyDrive/Data-Eng-Proj/sample_news_set'

start_time = time.time()

print("Start loading news files...")

# Bulk actions, one per news file, consumed later by the ingestion cell.
documents = []

for filename in os.listdir(JSON_DIR):
    json_path = os.path.join(JSON_DIR, filename)

    # Skip sub-directories (e.g. `.ipynb_checkpoints`); only regular files hold news JSON.
    if not os.path.isfile(json_path):
        continue

    # Load JSON data from the file; the encoding is explicit so the result
    # does not depend on the platform's default locale.
    with open(json_path, encoding="utf-8") as file:
        data = json.load(file)

    documents.append({
        "_index": "sample_news_index",
        "_source": data,
    })

    if len(documents) % 100 == 0:
        print(f"Loaded {len(documents)} documents")

end_time = time.time()
print(f"Finished file loading. Took {round(end_time - start_time, 3)} seconds")
# Count what was actually collected; this is also well-defined for an empty directory.
print(f"Total number of documents loaded: {len(documents)}")

Start loading news files...
Loaded 100 documents
Loaded 200 documents
Loaded 300 documents
Loaded 400 documents
Loaded 500 documents
Finished file loading. Took 6.868 seconds
Total number of documents loaded: 564


## Ingest Data into Elasticsearch Index (For Testing Purpose)

In [None]:
start_time = time.time()
print("Start ingesting news data...")
# helpers.bulk returns (number of successfully executed actions, errors);
# report the count confirmed by Elasticsearch rather than the input size.
success_count, _ = helpers.bulk(es, documents)
end_time = time.time()
print(f"Data ingestion finished. Took {round(end_time - start_time, 3)} seconds")
print(f"Total number of documents ingested: {success_count}")

Start ingesting news data...
Data ingestion finished. Took 3.8 seconds
Total number of documents ingested: 564


## Query

In [12]:
# Free-text query string referenced by the search cells below.
user_query = "covid mandates"

### Function for Listing Responses

In [7]:
def response_listing(response):
    """Print a numbered, human-readable listing of the hits in a search response.

    Parameters
    ----------
    response : mapping
        Search response read as ``response['hits']['hits']`` (the list of
        hits) and ``response['hits']['total']`` (the total-hits entry).

    Notes
    -----
    Prints 'No search results.' when the hit list is empty. Otherwise prints
    the total count followed by one entry per hit with its score, title,
    date, category and URL. Fields absent from a hit's ``_source`` are shown
    as 'N/A' instead of raising ``KeyError``. When the total is only a lower
    bound (``relation`` is ``'gte'``) the count is suffixed with ``+``.
    """
    hits = response['hits']['hits']
    if len(hits) == 0:
        print('No search results.')
        return

    total = response['hits'].get('total')
    if isinstance(total, dict):
        total_count = total.get('value', len(hits))
        # "gte" means the reported value is a lower bound, not an exact count.
        bound_marker = '+' if total.get('relation') == 'gte' else ''
    else:
        # Total missing or given as a plain number.
        total_count = len(hits) if total is None else total
        bound_marker = ''
    print(f"Total Document Retrieved: {total_count}{bound_marker}\n\n")

    for position, hit in enumerate(hits, start=1):
        print(f"==============={position}===============")
        source = hit.get('_source', {})
        score = hit.get('_score')
        news_title = source.get('title', 'N/A')
        news_date = source.get('date', 'N/A')
        news_cat = source.get('category', 'N/A')
        news_url = source.get('url', 'N/A')
        print(f"Score: {score}\nTitle: {news_title}\nDate: {news_date}\nCategory: {news_cat}\nURL: {news_url}\n")

## Lexical BM25

In [13]:
# Clauses every hit has to satisfy: the document carries a `title_embedding`
# field, plus a `distance_feature` clause on `date` relative to "now".
embedding_exists_clause = {"exists": {"field": "title_embedding"}}
recency_clause = {
    "distance_feature": {
        "field": "date",
        "pivot": "7d",
        "origin": "now",
        "boost": 1000,
    }
}

# Optional lexical clauses built from the user's query text.
keyword_clause = {
    "multi_match": {
        "query": user_query,
        "fields": ["title", "keywords^2", "category", "url"],
        "type": "best_fields",
        "fuzziness": "AUTO",
    }
}
title_phrase_clause = {"match_phrase": {"title": user_query}}

search_query = {
    "query": {
        "bool": {
            "must": [embedding_exists_clause, recency_clause],
            "should": [keyword_clause, title_phrase_clause],
        }
    },
    "from": 0,
    "size": 20,
}

response = es.search(index="cs5481-news-search", body=search_query)

response_listing(response)

Total Document Retrieved: 10000


Score: 47.416252
Title: Gov. Ron DeSantis proposes permanent ban on Covid mandates in Florida
Date: 2023-01-18
Category: ['politics news']
URL: https://www.nbcnews.com/politics/politics-news/florida-gov-desantis-proposes-permanent-ban-covid-mandates-rcna66268

Score: 39.654953
Title: The era of big Covid mandates is ending
Date: 2022-02-09
Category: ['meet the press']
URL: https://www.nbcnews.com/politics/meet-the-press/era-big-covid-mandates-ending-n1288840

Score: 34.359352
Title: Pentagon drops Covid-19 vaccination mandate for troops
Date: 2023-01-11
Category: ['coronavirus']
URL: https://www.nbcnews.com/news/us-news/pentagon-drops-covid-19-vaccination-mandate-troops-rcna65233

Score: 34.155956
Title: 'Chaotic situation': Puerto Ricans indignant at tourists breaking Covid mandates
Date: 2021-03-20
Category: ['coronavirus']
URL: https://www.nbcnews.com/news/latino/chaotic-situation-puerto-ricans-indignant-tourists-breaking-covid-mandates-n1261588

Sc

## Semantic Search with ELSER NLP Model

In [14]:
# All three clauses are mandatory here: the embedding must exist, the
# `distance_feature` clause on `date` applies, and the ELSER text expansion
# of the user's query runs against `title_embedding`.
embedding_exists_clause = {"exists": {"field": "title_embedding"}}
recency_clause = {
    "distance_feature": {
        "field": "date",
        "pivot": "7d",
        "origin": "now",
        "boost": 1000,
    }
}
elser_clause = {
    "text_expansion": {
        "title_embedding": {
            "model_id": es_model_id,
            "model_text": user_query,
        }
    }
}

search_query = {
    "query": {
        "bool": {
            "must": [embedding_exists_clause, recency_clause, elser_clause],
            "should": [],
        }
    },
    "from": 0,
    "size": 25,
}

response = es.search(index="cs5481-news-search", body=search_query)

response_listing(response)


Total Document Retrieved: 10000


Score: 37.835293
Title: Court blocks Covid vaccine mandate for U.S. government workers
Date: 2023-03-24
Category: ['coronavirus']
URL: https://www.nbcnews.com/politics/joe-biden/court-blocks-covid-vaccine-mandate-us-government-workers-rcna76487

Score: 36.8088
Title: Gov. Ron DeSantis proposes permanent ban on Covid mandates in Florida
Date: 2023-01-18
Category: ['politics news']
URL: https://www.nbcnews.com/politics/politics-news/florida-gov-desantis-proposes-permanent-ban-covid-mandates-rcna66268

Score: 35.33735
Title: Should we be worried about Covid this winter?
Date: 2023-08-17
Category: ['news', 'uk-scotland']
URL: http://www.bbc.co.uk/news/uk-scotland-66502573

Score: 35.33735
Title: Should we be worried about Covid this winter?
Date: 2023-08-17
Category: ['news', 'uk-scotland']
URL: http://www.bbc.co.uk/news/uk-scotland-66502573

Score: 34.855644
Title: Could getting Covid raise cholesterol?
Date: 2023-01-25
Category: ['coronavirus']
URL: http

## Hybrid BM25 + ELSER Model with Score Summation (For testing)
>This is a straightforward way to combine lexical and semantic search: the relevance scores of all matching clauses are added together to rank the documents.

In [15]:
# Mandatory clauses: embedding present, plus the `distance_feature` clause on `date`.
embedding_exists_clause = {"exists": {"field": "title_embedding"}}
recency_clause = {
    "distance_feature": {
        "field": "date",
        "pivot": "7d",
        "origin": "now",
        "boost": 1000,
    }
}

# Optional clauses: two lexical ones and the ELSER text expansion, all
# placed side by side in the same `should` list.
keyword_clause = {
    "multi_match": {
        "query": user_query,
        "fields": ["title", "keywords^2", "category", "url"],
        "type": "best_fields",
        "fuzziness": "AUTO",
    }
}
title_phrase_clause = {"match_phrase": {"title": user_query}}
elser_clause = {
    "text_expansion": {
        "title_embedding": {
            "model_id": es_model_id,
            "model_text": user_query,
        }
    }
}

search_query = {
    "query": {
        "bool": {
            "must": [embedding_exists_clause, recency_clause],
            "should": [keyword_clause, title_phrase_clause, elser_clause],
        }
    },
    "size": 20,
}

response = es.search(index="cs5481-news-search", body=search_query)

response_listing(response)

Total Document Retrieved: 10000


Score: 62.471138
Title: Gov. Ron DeSantis proposes permanent ban on Covid mandates in Florida
Date: 2023-01-18
Category: ['politics news']
URL: https://www.nbcnews.com/politics/politics-news/florida-gov-desantis-proposes-permanent-ban-covid-mandates-rcna66268

Score: 56.15783
Title: The era of big Covid mandates is ending
Date: 2022-02-09
Category: ['meet the press']
URL: https://www.nbcnews.com/politics/meet-the-press/era-big-covid-mandates-ending-n1288840

Score: 49.964134
Title: Court blocks Covid vaccine mandate for U.S. government workers
Date: 2023-03-24
Category: ['coronavirus']
URL: https://www.nbcnews.com/politics/joe-biden/court-blocks-covid-vaccine-mandate-us-government-workers-rcna76487

Score: 47.92674
Title: 'Chaotic situation': Puerto Ricans indignant at tourists breaking Covid mandates
Date: 2021-03-20
Category: ['coronavirus']
URL: https://www.nbcnews.com/news/latino/chaotic-situation-puerto-ricans-indignant-tourists-breaking-covid-man

## Hybrid BM25 + ELSER Model with RRF Ranking
>The query below does not work through the Python API, since the client does not currently support the "sub_searches" parameter, but it has been tested in the Elasticsearch console and deployed in our web application.

In [None]:
def _rrf_sub_search(scoring_clause):
    """Wrap one optional scoring clause in the bool query used by each sub-search.

    Every sub-search has the same two `must` clauses (embedding exists and
    the `distance_feature` clause on `date`) and differs only in its single
    `should` clause. Fresh dicts are built on each call so the sub-searches
    share no state.
    """
    return {
        "query": {
            "bool": {
                "must": [
                    {"exists": {"field": "title_embedding"}},
                    {
                        "distance_feature": {
                            "field": "date",
                            "pivot": "7d",
                            "origin": "now",
                            "boost": 1000,
                        }
                    },
                ],
                "should": [scoring_clause],
            }
        }
    }


search_query = {
    "track_total_hits": True,
    # One sub-search per scoring clause: fuzzy multi-field match, ELSER text
    # expansion, and exact title phrase match, in that order.
    "sub_searches": [
        _rrf_sub_search({
            "multi_match": {
                "query": user_query,
                "fields": ["title", "keywords^2", "category", "url"],
                "type": "best_fields",
                "fuzziness": "AUTO",
            }
        }),
        _rrf_sub_search({
            "text_expansion": {
                "title_embedding": {
                    "model_id": es_model_id,
                    "model_text": user_query,
                }
            }
        }),
        _rrf_sub_search({
            "match_phrase": {
                "title": user_query
            }
        }),
    ],
    "rank": {
        "rrf": {
            "window_size": 50
        }
    },
}