In [2]:
from elasticsearch import Elasticsearch
import os

# Connection settings come from the environment so that no credentials are
# stored in the notebook itself.
elastic_user = os.getenv('ES_USER')
elastic_password = os.getenv('ES_PASSWORD')
elastic_endpoint = os.getenv("ES_ENDPOINT")

# Fail fast with a readable message instead of silently building a URL that
# contains the literal string "None".
missing_settings = [
    name
    for name, value in (
        ("ES_USER", elastic_user),
        ("ES_PASSWORD", elastic_password),
        ("ES_ENDPOINT", elastic_endpoint),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Keep the credentials out of the URL: a password containing characters such
# as '@', '/' or ':' would otherwise corrupt it, and the URL can end up in
# logs or tracebacks. `basic_auth` hands them to the client separately.
url = f"https://{elastic_endpoint}:9200"
es = Elasticsearch(
    url,
    basic_auth=(elastic_user, elastic_password),
    ca_certs="./http_ca.crt",
    verify_certs=True,
)

# Round-trip to the cluster to confirm that the connection works.
print(es.info())

{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'SXGzrN4dSXW1t0pkWXGfjg', 'version': {'number': '8.11.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'd9ec3fa628c7b0ba3d25692e277ba26814820b20', 'build_date': '2023-11-04T10:04:57.184859352Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


# Create an ingest pipeline that lowercases the title

In [3]:
# Single processor: apply `lowercase` to the `title` field.
lowercase_title_processor = {"lowercase": {"field": "title"}}

# Store the pipeline under the id `ingest-pipeline-lowercase`.
es.ingest.put_pipeline(
    id="ingest-pipeline-lowercase",
    description="Ingest pipeline to change title to lowercase",
    processors=[lowercase_title_processor],
)

ObjectApiResponse({'acknowledged': True})

# Create index - movies with mappings

In [4]:
# Recreate the `movies` index from scratch. `ignore_unavailable=True` is passed
# so the delete call tolerates the index not existing yet.
es.indices.delete(index="movies", ignore_unavailable=True)

# One shard, one replica, and `ingest-pipeline-lowercase` set as the index's
# `default_pipeline`.
movies_index_settings = {
    "index": {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "default_pipeline": "ingest-pipeline-lowercase",
    }
}

# `plot` is a `text` field with a `keyword` sub-field (`ignore_above` = 256).
movies_plot_mapping = {
    "type": "text",
    "fields": {
        "keyword": {"type": "keyword", "ignore_above": 256},
    },
}

es.indices.create(
    index="movies",
    settings=movies_index_settings,
    mappings={"properties": {"plot": movies_plot_mapping}},
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'})

# Insert Documents

In [8]:
import json
from elasticsearch import helpers
import time
 
# Load the source documents. The encoding is given explicitly so the result
# does not depend on the platform's default locale.
with open('movies.json', encoding='utf-8') as f:
    data_json = json.load(f)

# Wrap every document in a bulk action that targets the `movies` index.
documents = [{"_index": "movies", "_source": doc} for doc in data_json]

# Use helpers.bulk to index
helpers.bulk(es, documents)

# Explicitly refresh the index so the new documents are searchable right away,
# instead of sleeping for a fixed time and hoping a refresh has happened.
es.indices.refresh(index="movies")

print("Done indexing documents into `movies` index!")

Done indexing documents into `movies` index!


# Create a new pipeline with ELSER

In [9]:
# Inference processor: run `.elser_model_2` on the `plot` field and write the
# result to `plot_embedding`.
elser_plot_inference = {
    "inference": {
        "model_id": ".elser_model_2",
        "input_output": [
            {"input_field": "plot", "output_field": "plot_embedding"},
        ],
    }
}

# Store the pipeline under the id `elser-ingest-pipeline`.
es.ingest.put_pipeline(
    id="elser-ingest-pipeline",
    description="Ingest pipeline for ELSER",
    processors=[elser_plot_inference],
)

ObjectApiResponse({'acknowledged': True})

# Create an index with mappings

In [10]:
# Recreate `elser-movies` from scratch. `ignore_unavailable=True` is passed so
# the delete call tolerates the index not existing yet.
es.indices.delete(index="elser-movies", ignore_unavailable=True)

# `plot` stays a `text` field with a `keyword` sub-field (`ignore_above` = 256);
# `plot_embedding` is declared as `sparse_vector`, the field the ELSER
# pipeline writes its output to.
elser_movies_properties = {
    "plot": {
        "type": "text",
        "fields": {
            "keyword": {"type": "keyword", "ignore_above": 256},
        },
    },
    "plot_embedding": {"type": "sparse_vector"},
}

es.indices.create(
    index="elser-movies",
    mappings={"properties": elser_movies_properties},
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'elser-movies'})

# Reindex with updated pipeline

In [11]:
# Copy the documents from `movies` into `elser-movies`, sending each one
# through `elser-ingest-pipeline` on the way.
#
# `wait_for_completion=True` makes the call block until the reindex has
# finished, and `refresh=True` refreshes the destination afterwards so the
# documents are searchable as soon as this cell returns. This replaces a
# fixed `time.sleep`, which only guessed how long that would take.
reindex_result = es.reindex(
    source={"index": "movies"},
    dest={
        "index": "elser-movies",
        "pipeline": "elser-ingest-pipeline",
    },
    wait_for_completion=True,
    refresh=True,
)

# Querying documents with ELSER

In [12]:
# `text_expansion` query against the `plot_embedding` field, using
# `.elser_model_2` with the query text "investigation"; at most 3 hits are
# requested.
response = es.search(
    index='elser-movies',
    size=3,
    query={
        "text_expansion": {
            "plot_embedding": {
                "model_id": ".elser_model_2",
                "model_text": "investigation"
            }
        }
    }
)

# Print score, title and plot for every returned hit.
for hit in response['hits']['hits']:
    source = hit['_source']
    score = hit['_score']
    title = source['title']
    plot = source['plot']
    print(f"Score: {score}\nTitle: {title}\nPlot: {plot}\n")

Score: 6.4037457
Title: Se7en
Plot: Two detectives, a rookie and a veteran, hunt a serial killer who uses the seven deadly sins as his motives.

Score: 3.6703415
Title: The Departed
Plot: An undercover cop and a mole in the police attempt to identify each other while infiltrating an Irish gang in South Boston.

Score: 2.935915
Title: The Usual Suspects
Plot: A sole survivor tells of the twisty events leading up to a horrific gun battle on a boat, which began when five criminals met at a seemingly random police lineup.



# Create a new ingestion pipeline

In [13]:
# Inference processor: run `.elser_model_2` on the `plot` field and write the
# result to `plot_embedding`.
upgrade_plot_inference = {
    "inference": {
        "model_id": ".elser_model_2",
        "input_output": [
            {"input_field": "plot", "output_field": "plot_embedding"},
        ],
    }
}

# Store the pipeline under the id `elser-pipeline-upgrade-demo`.
es.ingest.put_pipeline(
    id="elser-pipeline-upgrade-demo",
    description="Ingest pipeline for ELSER upgrade demo",
    processors=[upgrade_plot_inference],
)

ObjectApiResponse({'acknowledged': True})

# Create a new index with mappings

In [14]:
# Recreate `elser-upgrade-index-demo` from scratch. `ignore_unavailable=True`
# is passed so the delete call tolerates the index not existing yet.
es.indices.delete(index="elser-upgrade-index-demo", ignore_unavailable=True)

# `plot` is a `text` field with a `keyword` sub-field (`ignore_above` = 256);
# `plot_embedding` is declared as `sparse_vector`.
upgrade_demo_properties = {
    "plot": {
        "type": "text",
        "fields": {
            "keyword": {"type": "keyword", "ignore_above": 256},
        },
    },
    "plot_embedding": {"type": "sparse_vector"},
}

es.indices.create(
    index="elser-upgrade-index-demo",
    mappings={"properties": upgrade_demo_properties},
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'elser-upgrade-index-demo'})

# Use Reindex API

In [16]:
# Copy the documents from `elser-movies` into `elser-upgrade-index-demo`,
# dropping the previously generated `plot_embedding` tokens from the source
# and regenerating them through `elser-pipeline-upgrade-demo`.
#
# `wait_for_completion=True` makes the call block until the reindex has
# finished, and `refresh=True` refreshes the destination afterwards so the
# documents are searchable as soon as this cell returns. This replaces a
# fixed `time.sleep`, which only guessed how long that would take.
reindex_result = es.reindex(
    source={
        "index": "elser-movies",  # replace with your index name
        "_source": {
            # replace with the field-name from your index, that has previously generated tokens
            "excludes": ["plot_embedding"]
        },
    },
    dest={
        "index": "elser-upgrade-index-demo",
        "pipeline": "elser-pipeline-upgrade-demo",
    },
    wait_for_completion=True,
    refresh=True,
)

# Querying your data

In [17]:
# `text_expansion` query against the `plot_embedding` field, using
# `.elser_model_2` with the query text "child toy"; at most 3 hits are
# requested.
response = es.search(
    index='elser-upgrade-index-demo',
    size=3,
    query={
        "text_expansion": {
            "plot_embedding": {
                "model_id": ".elser_model_2",
                "model_text": "child toy"
            }
        }
    }
)

# Print score, title and plot for every returned hit.
for hit in response['hits']['hits']:
    source = hit['_source']
    score = hit['_score']
    title = source['title']
    plot = source['plot']
    print(f"Score: {score}\nTitle: {title}\nPlot: {plot}\n")


Score: 3.3168304
Title: Fight Club
Plot: An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.

Score: 1.5777304
Title: The Godfather
Plot: An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.

Score: 1.1162583
Title: The Matrix
Plot: A computer hacker learns from mysterious rebels about the true nature of his reality and his role in the war against its controllers.



In [18]:
# Fetch the definition of the `blogs` index; it is used further down as the
# source of a reindex into `elser-blogs`.
# NOTE(review): no cell visible in this notebook creates or populates `blogs`;
# it presumably has to exist on the cluster already — confirm before running
# the notebook against a fresh cluster.
es.indices.get(index="blogs")

ObjectApiResponse({'blogs': {'aliases': {}, 'mappings': {'properties': {'genre': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'keyScene': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'plot': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'released': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'runtime': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'text_embedding': {'properties': {'is_truncated': {'type': 'boolean'}, 'model_id': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'predicted_value': {'type': 'dense_vector', 'dims': 384, 'index': True, 'similarity': 'l2_norm'}}}, 'title': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}, 'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 

# Create ingestion pipeline

In [19]:
# Inference processor: run `.elser_model_2` on the `title` field and write the
# result to `title_embedding`.
blogs_title_inference = {
    "inference": {
        "model_id": ".elser_model_2",
        "input_output": [
            {"input_field": "title", "output_field": "title_embedding"},
        ],
    }
}

# Store the pipeline under the id `elser-pipeline-blogs`.
es.ingest.put_pipeline(
    id="elser-pipeline-blogs",
    description="Ingest pipeline for ELSER upgrade",
    processors=[blogs_title_inference],
)

ObjectApiResponse({'acknowledged': True})

# Create index with mappings

In [20]:
# Recreate `elser-blogs` from scratch. `ignore_unavailable=True` is passed so
# the delete call tolerates the index not existing yet.
es.indices.delete(index="elser-blogs", ignore_unavailable=True)

# `title` is a `text` field with a `keyword` sub-field (`ignore_above` = 256);
# `title_embedding` is declared as `sparse_vector`.
elser_blogs_properties = {
    "title": {
        "type": "text",
        "fields": {
            "keyword": {"type": "keyword", "ignore_above": 256},
        },
    },
    "title_embedding": {"type": "sparse_vector"},
}

es.indices.create(
    index="elser-blogs",
    mappings={"properties": elser_blogs_properties},
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'elser-blogs'})

# Reindex API

In [21]:
# Copy the documents from `blogs` into `elser-blogs`, leaving out the existing
# `text_embedding` field and sending each document through
# `elser-pipeline-blogs`.
#
# `wait_for_completion=True` makes the call block until the reindex has
# finished, and `refresh=True` refreshes the destination afterwards so the
# documents are searchable as soon as this cell returns. This replaces a
# fixed `time.sleep`, which only guessed how long that would take.
reindex_result = es.reindex(
    source={
        "index": "blogs",
        "_source": {
            "excludes": ["text_embedding"]
        },
    },
    dest={
        "index": "elser-blogs",
        "pipeline": "elser-pipeline-blogs",
    },
    wait_for_completion=True,
    refresh=True,
)

# Querying your data

In [22]:
# `text_expansion` query against the `title_embedding` field, using
# `.elser_model_2` with the query text "Track network connections"; at most
# 3 hits are requested.
response = es.search(
    index='elser-blogs',
    size=3,
    query={
        "text_expansion": {
            "title_embedding": {
                "model_id": ".elser_model_2",
                "model_text": "Track network connections"
            }
        }
    }
)

# Print score and title for every returned hit.
for hit in response['hits']['hits']:
    score = hit['_score']
    title = hit['_source']['title']
    print(f"Score: {score}\nTitle: {title}")

Score: 0.15847498
Title: Se7en
Score: 0.048573207
Title: Inception
Score: 0.0070162327
Title: The Departed
