# Connect to Elasticsearch

In [11]:
from elasticsearch import Elasticsearch
import os
 
# Connection settings come from the environment so that no credentials are
# stored in the notebook itself.
elastic_user = os.getenv('ES_USER')
elastic_password = os.getenv('ES_PASSWORD')
elastic_endpoint = os.getenv("ES_ENDPOINT")

# Fail early with a clear message; otherwise an unset variable would be
# formatted into the connection URL as the literal text "None".
missing_settings = [
    name
    for name, value in (
        ("ES_USER", elastic_user),
        ("ES_PASSWORD", elastic_password),
        ("ES_ENDPOINT", elastic_endpoint),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Credentials are passed via basic_auth instead of being embedded in the URL:
# a password containing characters such as '@', ':' or '/' would corrupt the
# URL, and a URL with credentials is easy to leak through logs or tracebacks.
url = f"https://{elastic_endpoint}:9200"
es = Elasticsearch(
    url,
    basic_auth=(elastic_user, elastic_password),
    ca_certs="./http_ca.crt",
    verify_certs=True,
)

# Round-trip to the cluster to confirm the connection and credentials work.
print(es.info())

{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'SXGzrN4dSXW1t0pkWXGfjg', 'version': {'number': '8.11.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'd9ec3fa628c7b0ba3d25692e277ba26814820b20', 'build_date': '2023-11-04T10:04:57.184859352Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


# Download and Deploy ELSER Model

In [12]:
# NotFoundError must be imported explicitly: the notebook only imports the
# Elasticsearch class, so the previous `exceptions.NotFoundError` reference
# raised NameError whenever the model was actually missing.
from elasticsearch import NotFoundError

# Delete the model if it was already downloaded and deployed, so the cell
# always starts from a clean state. force=True is passed so the delete is
# not blocked by an existing deployment of the model.
try:
  es.ml.delete_trained_model(model_id=".elser_model_2",force=True)
  print("Model deleted successfully, We will proceed with creating one")
except NotFoundError:
  print("Model doesn't exist, but We will proceed with creating one")

# Creates the ELSER model configuration. Automatically downloads the model if it doesn't exist.
# Kept as the last expression so the API response is displayed as the cell output.
es.ml.put_trained_model(
    model_id=".elser_model_2",
    input={
      "field_names": ["text_field"]
    }
  )


Model deleted successfully, We will proceed with creating one


ObjectApiResponse({'model_id': '.elser_model_2', 'model_type': 'pytorch', 'model_package': {'packaged_model_id': 'elser_model_2', 'model_repository': 'https://ml-models.elastic.co', 'minimum_version': '11.0.0', 'size': 438123914, 'sha256': '2e0450a1c598221a919917cbb05d8672aed6c613c028008fedcd696462c81af0', 'metadata': {}, 'tags': [], 'vocabulary_file': 'elser_model_2.vocab.json'}, 'created_by': 'api_user', 'version': '11.0.0', 'create_time': 1703833983809, 'model_size_bytes': 0, 'estimated_operations': 0, 'license_level': 'platinum', 'description': 'Elastic Learned Sparse EncodeR v2', 'tags': ['elastic'], 'metadata': {}, 'input': {'field_names': ['text_field']}, 'inference_config': {'text_expansion': {'vocabulary': {'index': '.ml-inference-native-000002'}, 'tokenization': {'bert': {'do_lower_case': True, 'with_special_tokens': True, 'max_sequence_length': 512, 'truncate': 'first', 'span': -1}}}}, 'location': {'index': {'name': '.ml-inference-native-000002'}}})

In [13]:
# `time` is imported here because this cell calls time.sleep; without the
# import the loop fails with NameError on a fresh kernel after the first poll.
import time

# Poll the model's definition status every 5 seconds until Elasticsearch
# reports it as fully defined, at which point it can be deployed.
while True:
    status = es.ml.get_trained_models(
        model_id=".elser_model_2",
        include="definition_status"
    )

    # Only the first returned config is inspected; a single model id is requested.
    if status["trained_model_configs"][0]["fully_defined"]:
        print("ELSER Model is downloaded and ready to be deployed.")
        break

    print("ELSER Model is downloaded but not ready to be deployed.")
    time.sleep(5)

ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be deployed.
ELSER Model is downloaded but not ready to be de

In [14]:
import time

# Deployment request for the ELSER model: a single allocation, with the call
# told to wait for the "starting" state. A later cell polls until the
# deployment has actually started.
deployment_options = {
    "model_id": ".elser_model_2",
    "number_of_allocations": 1,
    "wait_for": "starting",
}

# Last expression of the cell, so the API response is shown as the output.
es.ml.start_trained_model_deployment(**deployment_options)

ObjectApiResponse({'assignment': {'task_parameters': {'model_id': '.elser_model_2', 'deployment_id': '.elser_model_2', 'model_bytes': 438123914, 'threads_per_allocation': 1, 'number_of_allocations': 1, 'queue_capacity': 1024, 'cache_size': '438123914b', 'priority': 'normal', 'per_deployment_memory_bytes': 0, 'per_allocation_memory_bytes': 0}, 'routing_table': {'Wfv6REEYTky3JEV-lnLNHQ': {'current_allocations': 1, 'target_allocations': 1, 'routing_state': 'starting', 'reason': ''}}, 'assignment_state': 'starting', 'start_time': '2023-12-29T07:16:33.92542Z', 'max_assigned_allocations': 1}})

In [16]:
# Poll the deployment state every 5 seconds until it reads "started".
while True:
  status = es.ml.get_trained_models_stats(model_id=".elser_model_2")
  # Only the first stats entry is inspected; a single model id is requested.
  deployment_state = status["trained_model_stats"][0]["deployment_stats"]["state"]

  if deployment_state != "started":
    print("ELSER Model is currently being deployed.")
    time.sleep(5)
    continue

  print("ELSER Model has been successfully deployed.")
  break

ELSER Model has been successfully deployed.


# Indexing Documents with ELSER

In [17]:
# Inference processor: runs the ELSER model on each document's "plot" field
# and writes the result to "plot_embedding".
elser_inference_processor = {
    "inference": {
        "model_id": ".elser_model_2",
        "input_output": [
            {"input_field": "plot", "output_field": "plot_embedding"},
        ],
    }
}

# Create (or overwrite) the ingest pipeline. Last expression of the cell, so
# the acknowledgement is displayed as the output.
es.ingest.put_pipeline(
    id="elser-ingest-pipeline",
    description="Ingest pipeline for ELSER",
    processors=[elser_inference_processor],
)

ObjectApiResponse({'acknowledged': True})

# Create index

In [18]:
# Every document written to the index goes through the ELSER ingest pipeline
# unless the request names a different one.
index_settings = {
    "index": {"default_pipeline": "elser-ingest-pipeline"},
}

# "plot" is full text with a keyword sub-field limited to 256 characters;
# "plot_embedding" is the sparse vector field the pipeline writes to.
index_mappings = {
    "properties": {
        "plot": {
            "type": "text",
            "fields": {
                "keyword": {"type": "keyword", "ignore_above": 256},
            },
        },
        "plot_embedding": {"type": "sparse_vector"},
    }
}

# Drop any previous copy so the cell can be re-run; ignore_unavailable keeps
# the delete from failing when the index does not exist yet.
es.indices.delete(index="elser-example-movies", ignore_unavailable=True)

# Last expression of the cell, so the create response is displayed.
es.indices.create(
    index="elser-example-movies",
    settings=index_settings,
    mappings=index_mappings,
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'elser-example-movies'})

# Insert documents

In [19]:
import json
from elasticsearch import helpers
 
# JSON is UTF-8 by specification; state the encoding explicitly so the file is
# read the same way regardless of the platform's default locale encoding.
with open('movies.json', encoding='utf-8') as f:
    data_json = json.load(f)

# Prepare one bulk action per movie, all targeting the ELSER example index.
documents = [
    {
        "_index": "elser-example-movies",
        "_source": doc,
    }
    for doc in data_json
]

# Use helpers.bulk to index
helpers.bulk(es, documents)

# Explicitly refresh the index so the new documents are searchable right away,
# instead of sleeping for a fixed time and hoping a refresh has happened.
es.indices.refresh(index="elser-example-movies")

print("Done indexing documents into `elser-example-movies` index!")

Done indexing documents into `elser-example-movies` index!


# Searching documents

In [20]:
# Semantic search: the text_expansion query expands the query text with the
# ELSER model and matches it against the "plot_embedding" field.
response = es.search(
    index='elser-example-movies',
    size=3,  # only the top three hits are requested
    query={
        "text_expansion": {
            "plot_embedding": {
                "model_id":".elser_model_2",
                "model_text":"fighting movie"
            }
        }
    }
)

# Print score, title and plot for each hit.
for hit in response['hits']['hits']:
    source = hit['_source']
    score = hit['_score']
    title = source['title']
    plot = source['plot']
    print(f"Score: {score}\nTitle: {title}\nPlot: {plot}\n")

Score: 12.763341
Title: Fight Club
Plot: An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.

Score: 9.930414
Title: Pulp Fiction
Plot: The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.

Score: 9.488333
Title: The Matrix
Plot: A computer hacker learns from mysterious rebels about the true nature of his reality and his role in the war against its controllers.

