In [1]:
import json
import os
from getpass import getpass
from urllib.request import urlopen

import pandas as pd
from elasticsearch import Elasticsearch

In [6]:
# Connection settings for the local Elasticsearch cluster.
# The CA certificate path can be overridden through the ES_CA_CERT environment
# variable; the fallback is the path this notebook was originally written with.
ELASTCSEARCH_CERT_PATH = os.environ.get(
    "ES_CA_CERT",
    "/Users/liuxg/elastic/elasticsearch-8.10.0/config/certs/http_ca.crt",
)

# Credentials must not live in the notebook: read them from the environment,
# and prompt interactively for the password when ES_PASSWORD is not set.
ES_USERNAME = os.environ.get("ES_USERNAME", "elastic")
ES_PASSWORD = os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: ")

# `es` is the client object used by every later cell.
es = Elasticsearch(
    ["https://localhost:9200"],
    basic_auth=(ES_USERNAME, ES_PASSWORD),
    ca_certs=ELASTCSEARCH_CERT_PATH,
    verify_certs=True,
)

# Quick connectivity check: prints the cluster info returned by the server.
print(es.info())

{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'lM5ZgEQNTkO8phtT5qZAHQ', 'version': {'number': '8.10.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'e338da74c79465dfdc204971e600342b0aa87b6b', 'build_date': '2023-09-07T08:16:21.960703010Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [7]:
# Identifier of the ingest pipeline that adds an embedding to each document
PIPELINE_ID = "vectorize_blogs"

# Single inference processor: the document's "title" is mapped to the model's
# "text_field" input and the result is written under "text_embedding".
inference_processor = {
    "inference": {
        "model_id": "sentence-transformers__all-minilm-l6-v2",
        "target_field": "text_embedding",
        "field_map": {"title": "text_field"},
    }
}

# Create (or overwrite) the pipeline; the response is shown as the cell output
es.ingest.put_pipeline(id=PIPELINE_ID, processors=[inference_processor])

ObjectApiResponse({'acknowledged': True})

In [12]:
# Name of the index that receives the documents
INDEX_NAME = "blogs"

# When True, an existing index with the same name is dropped before creation
SHOULD_DELETE_INDEX = True

# Index mapping: the title as text plus keyword sub-field, and the object
# written by the ingest pipeline under "text_embedding".
INDEX_MAPPING = {
    "properties": {
        "title": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
        "text_embedding": {
            "properties": {
                "is_truncated": {
                    "type": "boolean"
                },
                "model_id": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                # Indexed dense vector so it can be queried with kNN search
                "predicted_value": {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "l2_norm"
                }
            }
        }
    }
}

# Index settings: every document indexed without an explicit pipeline goes
# through PIPELINE_ID (defined in the pipeline cell above).
INDEX_SETTINGS = {
    "index": {
        "number_of_replicas": "1",
        "number_of_shards": "1",
        "default_pipeline": PIPELINE_ID
    }
}

# Optionally drop the existing index first.
# The original cell called an undefined `client` here; the connection object
# created by this notebook is `es`.
if SHOULD_DELETE_INDEX and es.indices.exists(index=INDEX_NAME):
    print("Deleting existing %s" % INDEX_NAME)
    es.options(ignore_status=[400, 404]).indices.delete(index=INDEX_NAME)

print("Creating index %s" % INDEX_NAME)
# 400/404 responses are deliberately ignored, matching the original best-effort
# behaviour (e.g. the index already exists when deletion was skipped).
es.options(ignore_status=[400, 404]).indices.create(
    index=INDEX_NAME,
    mappings=INDEX_MAPPING,
    settings=INDEX_SETTINGS,
)

Deleting existing blogs
Creating index blogs


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'blogs'})

In [16]:
from elasticsearch import helpers

# Load the documents to index from the local JSON file.
# The encoding is explicit so non-ASCII characters in the data are decoded
# the same way on every platform.
with open('data.json', encoding='utf-8') as f:
    data_json = json.load(f)

print(data_json)

# One bulk action per loaded record, targeting the index created above
# (INDEX_NAME instead of a second hardcoded copy of the name).
documents = [{"_index": INDEX_NAME, "_source": doc} for doc in data_json]

# Index everything in one bulk request.
# The original cell passed an undefined `client`; the connection object created
# by this notebook is `es`.
helpers.bulk(es, documents)

[{'title': 'Pulp Fiction', 'runtime': '154', 'plot': 'The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.', 'keyScene': "John Travolta is forced to inject adrenaline directly into Uma Thurman's heart after she overdoses on heroin.", 'genre': 'Crime, Drama', 'released': '1994'}, {'title': 'The Dark Knight', 'runtime': '152', 'plot': 'When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.', 'keyScene': "Batman angrily responds 'I’m Batman' when asked who he is by Falcone.", 'genre': 'Action, Crime, Drama, Thriller', 'released': '2008'}, {'title': 'Fight Club', 'runtime': '139', 'plot': 'An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.', 'keyScene': 'Brad Pitt explains the rules of 

(12, [])

In [18]:
# Index to search
INDEX_NAME = "blogs"

# Fields returned for each hit (the stored _source is switched off below)
source_fields = ["id", "title"]

# The query text is embedded server-side with the named model
query_vector_builder = {
    "text_embedding": {
        "model_id": "sentence-transformers__all-minilm-l6-v2",
        "model_text": "dark knight",
    }
}

# kNN clause: 10 nearest neighbours taken from 50 candidates
query = {
    "field": "text_embedding.predicted_value",
    "k": 10,
    "num_candidates": 50,
    "query_vector_builder": query_vector_builder,
}

response = es.search(
    index=INDEX_NAME,
    knn=query,
    fields=source_fields,
    source=False,
)

# Flatten the hits into a DataFrame (JSON round trip kept as in the original)
hits = response.body['hits']['hits']
results = pd.json_normalize(json.loads(json.dumps(hits)))

# Show id, score and title for each hit
results[['_id', '_score', 'fields.title']]


Unnamed: 0,_id,_score,fields.title
0,W7_SDIsByaLf0EuTEs2L,0.936852,[The Dark Knight]
1,ZL_SDIsByaLf0EuTEs2L,0.488973,[The Departed]
2,Xr_SDIsByaLf0EuTEs2L,0.467606,[The Matrix]
3,Y7_SDIsByaLf0EuTEs2L,0.466467,[The Godfather]
4,YL_SDIsByaLf0EuTEs2L,0.465213,[Goodfellas]
5,X7_SDIsByaLf0EuTEs2L,0.464054,[The Shawshank Redemption]
6,Wr_SDIsByaLf0EuTEs2L,0.446297,[Pulp Fiction]
7,XL_SDIsByaLf0EuTEs2L,0.443128,[Fight Club]
8,Zb_SDIsByaLf0EuTEs2L,0.431134,[The Usual Suspects]
9,Xb_SDIsByaLf0EuTEs2L,0.426258,[Inception]
