In [1]:
from elasticsearch import Elasticsearch
import os

In [2]:
# Connection settings are read from the environment so that no credentials
# are stored in the notebook itself.
elastic_user = os.getenv('ES_USER')
elastic_password = os.getenv('ES_PASSWORD')
elastic_endpoint = os.getenv("ES_ENDPOINT")

# Fail early with a readable message instead of trying to connect to
# "https://None:9200" when a variable is not set.
missing_settings = [
    name
    for name, value in (
        ("ES_USER", elastic_user),
        ("ES_PASSWORD", elastic_password),
        ("ES_ENDPOINT", elastic_endpoint),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Credentials are passed via `basic_auth` rather than embedded in the URL:
# a password containing '@', ':' or '/' would otherwise corrupt the URL, and
# the URL (which may end up in logs or tracebacks) no longer carries secrets.
url = f"https://{elastic_endpoint}:9200"
es = Elasticsearch(
    url,
    basic_auth=(elastic_user, elastic_password),
    ca_certs="./http_ca.crt",
    verify_certs=True,
)

# Round-trip to the cluster to confirm that the connection and auth work.
print(es.info())

{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'n1BjmRPcR2GObT6ZMbJ9xA', 'version': {'number': '8.11.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'd9ec3fa628c7b0ba3d25692e277ba26814820b20', 'build_date': '2023-11-04T10:04:57.184859352Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


# Define the ingest pipeline

In [3]:
PIPELINE_ID = "vectorize_books_elser"

# Create (or overwrite) the ingest pipeline that vectorizes every element of
# `synopsis_passages`:
#   * `foreach` runs the nested processor once per array element, exposing the
#     current element as `_ingest._value`.
#   * `inference` feeds the element's `text` to the ELSER model (as the model's
#     `text_field` input) and stores the result under the element's `vector` key.
#   * `on_failure` appends a record to `_ingest.inference_errors` in the document
#     source. The record is built from PIPELINE_ID so it names *this* pipeline
#     (previously it carried the unrelated name 'ml-inference-title-vector').
es.ingest.put_pipeline(
    id=PIPELINE_ID,
    processors=[
        {
            "foreach": {
                "field": "synopsis_passages",
                "processor": {
                    "inference": {
                        "field_map": {
                            "_ingest._value.text": "text_field"
                        },
                        "model_id": ".elser_model_2",
                        "target_field": "_ingest._value.vector",
                        "on_failure": [
                            {
                                "append": {
                                    "field": "_source._ingest.inference_errors",
                                    "value": [
                                        {
                                            # Plain concatenation (not an f-string) keeps the
                                            # mustache braces below literal.
                                            "message": (
                                                "Processor 'inference' in pipeline '"
                                                + PIPELINE_ID
                                                + "' failed with message '{{ _ingest.on_failure_message }}'"
                                            ),
                                            "pipeline": PIPELINE_ID,
                                            "timestamp": "{{{ _ingest.timestamp }}}",
                                        }
                                    ],
                                }
                            }
                        ],
                    }
                },
            }
        }
    ],
)

ObjectApiResponse({'acknowledged': True})

# Define the mapping

In [4]:
# Mapping for the object the ingest pipeline's inference processor writes under
# each passage's `vector` key.
passage_vector_mapping = {
    "properties": {
        "is_truncated": {"type": "boolean"},
        "model_id": {
            "type": "text",
            "fields": {
                "keyword": {"type": "keyword", "ignore_above": 256},
            },
        },
        "predicted_value": {"type": "sparse_vector"},
    }
}

# Top-level book fields plus the nested list of synopsis passages.
mappings = {
    "properties": {
        "title": {"type": "text"},
        "published_date": {"type": "text"},
        "synopsis": {"type": "text"},
        "synopsis_passages": {
            "type": "nested",
            "properties": {"vector": passage_vector_mapping},
        },
    }
}

# Start from a clean slate: drop any existing "books" index (no error if it is
# absent), then create it again with the mapping above.
es.indices.delete(index="books", ignore_unavailable=True)
es.indices.create(index="books", mappings=mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'books'})

In [5]:
import json

# Load the pre-chunked book summaries. The encoding is given explicitly because
# JSON text is UTF-8, while open() would otherwise use the platform's default
# encoding and could mis-decode non-ASCII characters on some systems.
with open('book_summaries_1000_chunked.json', encoding='utf-8') as f:
    books = json.load(f)

# Quick sanity check on how many records were loaded.
print("length of books: %d" % len(books))

length of books: 999


In [6]:
from elasticsearch.helpers import streaming_bulk
count = 0  # NOTE(review): not referenced in the visible cells; confirm whether a later cell uses it


def generate_actions(books, index="books", pipeline="vectorize_books_elser"):
    """Yield one bulk-helper action per book.

    Args:
        books: iterable of book documents; each one becomes the `_source` of
            an action unchanged (the same object, not a copy).
        index: name of the target index, written to the action's `_index` key.
        pipeline: id of the ingest pipeline, written to the action's
            `pipeline` key.

    Yields:
        dict with the keys `_index`, `pipeline` and `_source`.
    """
    for book in books:
        yield {"_index": index, "pipeline": pipeline, "_source": book}


# Stream the books into the "books" index in batches of 10 documents, with a
# 3-minute request timeout and up to 3 retries.
# NOTE(review): the small batch and long timeout are presumably there because
# every document runs through model inference in the ingest pipeline; confirm.
#
# raise_on_error=False is required for the failure report below to work: with
# the helper's default (True) a failed document raises BulkIndexError, so
# `ok` would never be False and one bad document would abort the whole ingest.
for ok, info in streaming_bulk(
    client=es,
    index="books",
    actions=generate_actions(books),
    max_retries=3,
    request_timeout=60 * 3,
    chunk_size=10,
    raise_on_error=False,
):
    if not ok:
        # `info` maps the operation type (e.g. "index") to the per-item result.
        for op_type, result in info.items():
            print(f"Unable to index {result.get('_id')}: {result.get('error')}")


  for ok, info in streaming_bulk(
