# Connect to Elasticsearch

In [8]:
from elasticsearch import Elasticsearch
from dotenv import load_dotenv
import os
from elasticsearch import Elasticsearch

# Pull connection settings from a local .env file (if any) into the process
# environment so that no credentials have to live in the notebook itself.
load_dotenv()

elastic_user = os.getenv('ES_USER')
elastic_password = os.getenv('ES_PASSWORD')
elastic_endpoint = os.getenv("ES_ENDPOINT")

# Fail early with a readable message instead of silently connecting to
# "https://None:None@None:9200" when a setting is missing.
missing_settings = [
    name
    for name, value in (
        ("ES_USER", elastic_user),
        ("ES_PASSWORD", elastic_password),
        ("ES_ENDPOINT", elastic_endpoint),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Keep credentials out of the URL: a password containing '@', ':', '/' or '#'
# would otherwise corrupt it, and URLs tend to end up in logs and tracebacks.
url = f"https://{elastic_endpoint}:9200"
client = Elasticsearch(
    url,
    basic_auth=(elastic_user, elastic_password),
    ca_certs="./http_ca.crt",  # CA certificate used to verify the server's TLS certificate
    verify_certs=True,
)

# Smoke test: prints the cluster name/version if the connection works.
print(client.info())

{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'h2QwONxsT4Kt-lTRKmPrhg', 'version': {'number': '8.12.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '1665f706fd9354802c02146c1e6b5c0fbcddfbc9', 'build_date': '2024-01-11T10:05:27.953830042Z', 'build_snapshot': False, 'lucene_version': '9.9.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


# Load Model from Hugging Face

In [9]:
MODEL_ID = "sentence-transformers__all-minilm-l6-v2"

!eland_import_hub_model --url https://elastic:xnLj56lTrH98Lf_6n76y@localhost:9200 \
	--hub-model-id sentence-transformers/all-MiniLM-L6-v2 \
	--task-type text_embedding \
	--ca-cert ./http_ca.crt \
	--clear-previous \
	--start

2024-02-08 09:30:35,871 INFO : Establishing connection to Elasticsearch
2024-02-08 09:30:35,886 INFO : Connected to cluster named 'elasticsearch' (version: 8.12.0)
2024-02-08 09:30:35,886 INFO : Loading HuggingFace transformer tokenizer and model 'sentence-transformers/all-MiniLM-L6-v2'
STAGE:2024-02-08 09:30:38 8945:3610134 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-02-08 09:30:38 8945:3610134 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-02-08 09:30:38 8945:3610134 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
2024-02-08 09:30:40,055 INFO : Stopping deployment for model with id 'sentence-transformers__all-minilm-l6-v2'
2024-02-08 09:30:40,176 INFO : Deleting model with id 'sentence-transformers__all-minilm-l6-v2'
2024-02-08 09:30:40,400 INFO : Creating model with id 'sentence-transformers__all-minilm-l6-v2'
2024-02-08 09:30:40,571 INFO : Uploading model definition
100%|███████████████████████████████████

# Chunk and Infer in pipeline

In [10]:
# Setup the pipeline
# Upper bound (in characters) for one passage; handed to the script as params.model_limit.
CHUNK_SIZE = 400
# Single source of truth for the pipeline id so the error records below cannot drift from it.
PIPELINE_ID = "chunk_text_to_passages"

client.ingest.put_pipeline(
  id=PIPELINE_ID,
  processors=[
    {
      # Step 1: split ctx['text'] into sentences and greedily pack consecutive
      # sentences into ctx['passages'] while the passage stays under model_limit.
      "script": {
        "description": "Chunk text into passages of whole sentences by looking for . ! or ? followed by a space",
        "lang": "painless",
        # Raw string: the regex backslashes (\. \! \?) reach Elasticsearch
        # verbatim without relying on Python's invalid-escape fallback.
        "source": r"""
          String[] envSplit = /((?<!M(r|s|rs)\.)(?<=\.) |(?<=\!) |(?<=\?) )/.split(ctx['text']);
          ctx['passages'] = new ArrayList();
          int i = 0;
          boolean remaining = true;
          if (envSplit.length == 0) {
            return
          } else if (envSplit.length == 1) {
            Map passage = ['text': envSplit[0]];ctx['passages'].add(passage)
          } else {
            while (remaining) {
              Map passage = ['text': envSplit[i++]];
              while (i < envSplit.length && passage.text.length() + envSplit[i].length() < params.model_limit) {passage.text = passage.text + ' ' + envSplit[i++]}
              if (i == envSplit.length) {remaining = false}
              ctx['passages'].add(passage)
            }
          }
          """,
        "params": {
          "model_limit": CHUNK_SIZE
        }
      }
    },
    {
      # Step 2: run the embedding model once per passage and store the result
      # next to the passage text under `vector`.
      "foreach": {
        "field": "passages",
        "processor": {
          "inference": {
            "field_map": {
              "_ingest._value.text": "text_field"
            },
            "model_id": MODEL_ID,
            "target_field": "_ingest._value.vector",
            # Record inference failures on the document instead of rejecting it.
            "on_failure": [
              {
                "append": {
                  "field": "_source._ingest.inference_errors",
                  "value": [
                    {
                      "message": "Processor 'inference' in pipeline '" + PIPELINE_ID + "' failed with message '{{ _ingest.on_failure_message }}'",
                      "pipeline": PIPELINE_ID,
                      "timestamp": "{{{ _ingest.timestamp }}}"
                    }
                  ]
                }
              }
            ]
          }
        }
      }
    }
  ]
)

ObjectApiResponse({'acknowledged': True})

# Set up the index

In [16]:
INDEX_NAME = "chunk_passages_example"

# Make the cell re-runnable: creating an index that already exists raises, so
# drop any copy left over from a previous run (no error if it is absent).
client.indices.delete(index=INDEX_NAME, ignore_unavailable=True)

# Setup the index
client.indices.create(
  index=INDEX_NAME,
  settings={
    "index": {
      # Every document indexed here is chunked and embedded by this ingest pipeline.
      "default_pipeline": "chunk_text_to_passages"
    }
  },
  mappings={
    "dynamic": "true",
    "properties": {
      # One nested object per passage so each passage vector is searchable on its own.
      "passages": {
        "type": "nested",
        "properties": {
          "vector": {
            "properties": {
              # Path written by the pipeline's inference processor
              # (target_field `vector` -> `predicted_value`).
              "predicted_value": {
                "type": "dense_vector",
                "index": True,
                # Must equal the embedding size of the deployed model.
                "dims": 384,
                # NOTE(review): dot_product expects unit-length vectors — confirm the model output is normalised.
                "similarity": "dot_product"
              }
            }
          }
        }
      }
    }
  }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'chunk_passages_example'})

# Add some Documents

In [17]:
import json
from elasticsearch import helpers

# Read the sample documents. The encoding is given explicitly so the file is
# decoded the same way on every platform instead of with the locale default.
with open('workplace-docs.json', encoding='utf-8') as f:
    docs = json.load(f)

# One bulk action per document; the list position doubles as the document id.
operations = [
    {
      "_index": INDEX_NAME,
      "_id": i,
      "text": doc["content"],
      "name": doc["name"]
    } for i, doc in enumerate(docs)
]

# Add the documents to the index directly
# refresh=True makes the documents searchable as soon as the call returns.
response = helpers.bulk(
  client,
  operations,
  refresh=True,
)

# Aside: Pretty printing Elasticsearch responses

In [18]:
def pretty_response(response):
    """Print each search hit with its id, title, matched passages and score.

    Args:
        response: A search response mapping with ``response['hits']['hits']``.
            Each hit must provide ``_id``, ``_score`` and ``_source['name']``.
            Matched passages are read from
            ``hit['inner_hits']['passages']['hits']['hits']``; a hit without
            that section is printed with an empty passage text instead of
            raising ``KeyError``.

    Returns:
        None. The formatted hits are written to standard output.
    """
    hits = response['hits']['hits']
    if not hits:
        print('Your search returned no results.')
        return

    for hit in hits:
        # `doc_id` rather than `id`, to avoid shadowing the builtin.
        doc_id = hit['_id']
        score = hit['_score']
        doc_title = hit['_source']['name']

        # inner_hits is only present when the query asked for it.
        passages = (
            hit.get('inner_hits', {})
            .get('passages', {})
            .get('hits', {})
            .get('hits', [])
        )
        # Each passage is followed by a blank line, exactly as before.
        passage_text = "".join(
            passage["fields"]["passages"][0]['text'][0] + "\n\n"
            for passage in passages
        )

        print(f"\nID: {doc_id}\nDoc Title: {doc_title}\nPassage Text:\n{passage_text}\nScore: {score}\n")
        print("---")

# Making queries

In [19]:
# Let Elasticsearch embed the question with the deployed model at query time,
# so no vector has to be computed in the notebook.
question_embedding = {
  "text_embedding": {
    "model_id": MODEL_ID,
    "model_text": "Whats the work from home policy?"
  }
}

# For every matching document return only its single best passage, and only
# the passage text (no _source).
best_passage_only = {
  "size": 1,
  "_source": False,
  "fields": ["passages.text"]
}

# kNN search over the nested passage vectors: top 3 results out of 100 candidates.
knn_query = {
  "inner_hits": best_passage_only,
  "field": "passages.vector.predicted_value",
  "k": 3,
  "num_candidates": 100,
  "query_vector_builder": question_embedding
}

response = client.search(index=INDEX_NAME, knn=knn_query)

pretty_response(response)


ID: 0
Doc Title: Work From Home Policy
Passage Text:
Effective: March 2020
Purpose

The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.
Scope

This policy applies to all employees who are eligible for remote work as determined by their role and responsibilities.


Score: 0.8549611

---

ID: 7
Doc Title: Intellectual Property Policy
Passage Text:
This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.

Scope
This policy applies to all employees, including full-time, part-time, temporary, and contract employees.

Definitions
a.


Score: 0.7664342

---

ID: 4
Doc Title: Company Vacation Policy
Passage Text:
Purpose

The purpose of this vacation policy is to outline the guidelines and procedures for requesting and taki