In [1]:
# import modules
import pandas as pd, json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from getpass import getpass
from urllib.request import urlopen
from pprint import pprint

In [3]:
elastic_user = "elastic"
elastic_password = "o6G_pvRL=8P*7on+o6XH"
elastic_endpoint = "localhost"
url = f"https://{elastic_user}:{elastic_password}@{elastic_endpoint}:9200"
es = Elasticsearch(url, ca_certs = "./http_ca.crt", verify_certs = True)

es.info()

ObjectApiResponse({'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'n1BjmRPcR2GObT6ZMbJ9xA', 'version': {'number': '8.11.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'd9ec3fa628c7b0ba3d25692e277ba26814820b20', 'build_date': '2023-11-04T10:04:57.184859352Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

# Ingest pipeline setup

In [4]:
pipeline = {
  "processors": [
    {
      "inference": {
        "field_map": {
          "my_text": "text_field"
        },
        "model_id": "sentence-transformers__all-distilroberta-v1",
        "target_field": "ml.inference.my_vector",
        "on_failure": [
          {
            "append": {
              "field": "_source._ingest.inference_errors",
              "value": [
                {
                  "message": "Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'",
                  "pipeline": "ml-inference-title-vector",
                  "timestamp": "{{{ _ingest.timestamp }}}"
                }
              ]
            }
          }
        ]
      }
    },
    {
      "set": {
        "field": "my_vector",
        "if": "ctx?.ml?.inference != null && ctx.ml.inference['my_vector'] != null",
        "copy_from": "ml.inference.my_vector.predicted_value",
        "description": "Copy the predicted_value to 'my_vector'"
      }
    },
    {
      "remove": {
        "field": "ml.inference.my_vector",
        "ignore_missing": True
      }
    }
  ]
}

pipeline_id = 'vector_embedding_demo'
response = es.ingest.put_pipeline(id=pipeline_id, body=pipeline)

# Print the response
print(response)

{'acknowledged': True}


  response = es.ingest.put_pipeline(id=pipeline_id, body=pipeline)


# Index Mapping / Template setup

In [5]:
index_patterns = [
    "my_vector_index-*"
  ]

order = 1

settings = {
      "number_of_shards": 1,
      "number_of_replicas": 1,
      "index.default_pipeline": pipeline_id
    }

mappings = {
      "properties": {
        "my_vector": {
          "type": "dense_vector",
          "dims": 768,
          "index": True,
          "similarity": "dot_product"
        },
        "my_text": {
          "type": "text"
        }
      },
      "_source": {
        "excludes": [
          "my_vector"
        ]
      }
    }


# Create the index template
response = es.indices.put_template(name="my_vector_index",
                                   index_patterns=index_patterns,
                                   order=order,
                                   settings=settings,
                                   mappings=mappings
                                   )


# Print the response
print(response)

{'acknowledged': True}


  response = es.indices.put_template(name="my_vector_index",


# Indexing Data

In [6]:
index_name = 'my_vector_index-01'

In [7]:
data = [
    ("Hey, careful, man, there's a beverage here!", "The Dude"),
    ("I’m The Dude. So, that’s what you call me. You know, that or, uh, His Dudeness, or, uh, Duder, or El Duderino, if you’re not into the whole brevity thing", "The Dude"),
    ("You don't go out looking for a job dressed like that? On a weekday?", "The Big Lebowski"),
    ("What do you mean brought it bowling, Dude?", "Walter Sobchak"),
    ("Donny was a good bowler, and a good man. He was one of us. He was a man who loved the outdoors... and bowling, and as a surfer he explored the beaches of Southern California, from La Jolla to Leo Carrillo and... up to... Pismo", "Walter Sobchak")
]

actions = [
    {
        "_op_type": "index",
        "_index": "my_vector_index-01",
        "_source": {
            "my_text": text,
            "my_metadata": metadata
        }
    } for text, metadata in data
]

bulk(es, actions)

# Refresh the index to make sure all data is searchable
es.indices.refresh(index="my_vector_index-01")

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

# Querying Data
Approximate k-nearest neighbor (kNN)

In [9]:
knn = {
      "field": "my_vector",
      "k": 1,
      "num_candidates": 5,
      "query_vector_builder": {
        "text_embedding": {
          "model_id": "sentence-transformers__all-distilroberta-v1",
          "model_text": "Watchout I have a drink"
        }
      }
    }

response = es.search(
    index=index_name,
    knn=knn,
    source=True)

pprint(response['hits']['hits'])

[{'_id': '5In5FIwBCEhyMiaWbFjR',
  '_index': 'my_vector_index-01',
  '_score': 0.7817012,
  '_source': {'ml': {'inference': {}},
              'my_metadata': 'The Dude',
              'my_text': "Hey, careful, man, there's a beverage here!"}}]


## Hybrid Searching (kNN + BM25) with RRF

In [10]:
query = {
    "match": {
      "my_text": "bowling"
    }
  }

knn ={
      "field": "my_vector",
      "k": 3,
      "num_candidates": 5,
      "query_vector_builder": {
        "text_embedding": {
          "model_id": "sentence-transformers__all-distilroberta-v1",
          "model_text": "He enjoyed the game"
        }
      }
    }

rank: {
        "rrf": {}
    }

fields = [
    "my_text",
    "my_metadata"
  ]


response = es.search(
    index=index_name,
    fields=fields,
    knn=knn,
    query=query,
    size=2,
    source=False
    )

pprint(response['hits']['hits'])

[{'_id': '54n5FIwBCEhyMiaWbFjR',
  '_index': 'my_vector_index-01',
  '_score': 1.8080788,
  'fields': {'my_metadata': ['Walter Sobchak'],
             'my_text': ['What do you mean brought it bowling, Dude?']}},
 {'_id': '6In5FIwBCEhyMiaWbFjR',
  '_index': 'my_vector_index-01',
  '_score': 1.235873,
  'fields': {'my_metadata': ['Walter Sobchak'],
             'my_text': ['Donny was a good bowler, and a good man. He was one '
                         'of us. He was a man who loved the outdoors... and '
                         'bowling, and as a surfer he explored the beaches of '
                         'Southern California, from La Jolla to Leo Carrillo '
                         'and... up to... Pismo']}}]


## Filtering

In [15]:
knn ={
    "field": "my_vector",
    "k": 1,
    "num_candidates": 5,
    "query_vector_builder": {
      "text_embedding": {
        "model_id": "sentence-transformers__all-distilroberta-v1",
        "model_text": "Did you bring the dog?"
      }
    },
    "filter": {
      "term": {
        "my_metadata.keyword": "The Dude"
      }
    }
  }

fields = [
    "my_text",
    "my_metadata"
  ]

response = es.search(
    index=index_name,
    fields=fields,
    knn=knn,
    source=False
    )

pprint(response['hits']['hits'])

[{'_id': '5In5FIwBCEhyMiaWbFjR',
  '_index': 'my_vector_index-01',
  '_score': 0.59285694,
  'fields': {'my_metadata': ['The Dude'],
             'my_text': ["Hey, careful, man, there's a beverage here!"]}}]


# Aggregrations
and Select fields returned

In [16]:
knn = {
    "field": "my_vector",
    "k": 2,
    "num_candidates": 5,
    "query_vector_builder": {
      "text_embedding": {
        "model_id": "sentence-transformers__all-distilroberta-v1",
        "model_text": "did you bring it?"
      }
    }
  }

aggs = {
    "metadata": {
      "terms": {
        "field": "my_metadata"
      }
    }
  }

fields = [
    "my_text",
    "my_metadata"
  ]

response = es.search(
    index=index_name,
    fields=fields,
    aggs=aggs,
    knn=knn,
    source=False
    )

pprint(response['hits']['hits'])

BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'Fielddata is disabled on [my_metadata] in [my_vector_index-01]. Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default. Please use a keyword field instead. Alternatively, set fielddata=true on [my_metadata] in order to load field data by uninverting the inverted index. Note that this can use significant memory.')