# Import packages and make a connection to Elasticsearch

In [1]:
from elasticsearch import Elasticsearch, helpers, exceptions
import json
import time,os
from dotenv import load_dotenv

# Load settings from a local .env file into the process environment so that
# no secret has to be written into the notebook itself.
load_dotenv()

openai_api_key = os.getenv('OPENAI_API_KEY')
elastic_user = os.getenv('ES_USER')
elastic_password = os.getenv('ES_PASSWORD')

# Fail early with a clear message: os.getenv returns None for an unset
# variable, which would otherwise only surface later as an opaque
# authentication or inference error.
_missing_settings = [
    name
    for name, value in (
        ('OPENAI_API_KEY', openai_api_key),
        ('ES_USER', elastic_user),
        ('ES_PASSWORD', elastic_password),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# Keep credentials out of the URL. Interpolating them into the URL breaks as
# soon as the password contains characters such as '@', ':' or '/', and it
# leaves the secret sitting in a plain string variable. `basic_auth` passes
# them to the client separately.
url = "https://localhost:9200"
client = Elasticsearch(
    url,
    basic_auth=(elastic_user, elastic_password),
    ca_certs="./http_ca.crt",  # CA certificate used to verify the server's TLS cert
    verify_certs=True,
)

# Round-trip to the cluster to confirm the connection works.
print(client.info())

{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'h2QwONxsT4Kt-lTRKmPrhg', 'version': {'number': '8.12.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '1665f706fd9354802c02146c1e6b5c0fbcddfbc9', 'build_date': '2024-01-11T10:05:27.953830042Z', 'build_snapshot': False, 'lucene_version': '9.9.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


# Create the inference task

In [2]:
# Configuration for a text-embedding inference model backed by the `openai`
# service; the API key comes from the environment-loaded `openai_api_key`.
openai_embedding_config = {
    "service": "openai",
    "service_settings": {"api_key": openai_api_key},
    "task_settings": {"model": "text-embedding-ada-002"},
}

# Register the model under the id that the ingest pipeline and the search
# query refer to.
client.inference.put_model(
    model_id="my_openai_embedding_model",
    task_type="text_embedding",
    body=openai_embedding_config,
)

ObjectApiResponse({'model_id': 'my_openai_embedding_model', 'task_type': 'text_embedding', 'service': 'openai', 'service_settings': {'similarity': 'dot_product', 'dimensions': 1536}, 'task_settings': {'model': 'text-embedding-ada-002'}})

# Create an ingest pipeline with an inference processor

In [3]:
# Single inference processor: takes each document's `plot` field as input,
# runs it through `my_openai_embedding_model`, and stores the result in
# `plot_embedding`.
embedding_processor = {
    "inference": {
        "model_id": "my_openai_embedding_model",
        "input_output": {
            "input_field": "plot",
            "output_field": "plot_embedding",
        },
    }
}

client.ingest.put_pipeline(
    id="openai_embeddings_pipeline",
    description="Ingest pipeline for OpenAI inference.",
    processors=[embedding_processor],
)

ObjectApiResponse({'acknowledged': True})

# Create index

In [4]:
movie_index = "openai-movie-embeddings"

# Drop any previous copy of the index first; `ignore_unavailable=True` is
# passed so the call is also made safely when the index does not exist yet.
client.indices.delete(index=movie_index, ignore_unavailable=True)

# Every document written to the index goes through the embeddings pipeline
# by default.
index_settings = {
    "index": {"default_pipeline": "openai_embeddings_pipeline"}
}

# `plot_embedding` holds the vector; 1536 dims / dot_product are the values
# set here for the dense_vector field. `plot` is the raw text.
index_mappings = {
    "properties": {
        "plot_embedding": {
            "type": "dense_vector",
            "dims": 1536,
            "similarity": "dot_product",
        },
        "plot": {"type": "text"},
    }
}

client.indices.create(
    index=movie_index,
    settings=index_settings,
    mappings=index_mappings,
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'openai-movie-embeddings'})

# Insert Documents

In [5]:
from elasticsearch import helpers
 
# Load the movie records. The file is opened explicitly as UTF-8 so that
# decoding does not depend on the platform's default text encoding.
with open('movies.json', encoding='utf-8') as f:
    data_json = json.load(f)

# Prepare the documents to be indexed: one bulk action per record, all
# targeting the embeddings index (whose default pipeline adds the vector).
documents = [
    {"_index": "openai-movie-embeddings", "_source": doc}
    for doc in data_json
]

# Use helpers.bulk to index
helpers.bulk(client, documents)

# Make the new documents visible to search right away. An explicit refresh
# is deterministic, unlike sleeping for a fixed number of seconds and hoping
# the periodic refresh has run.
client.indices.refresh(index="openai-movie-embeddings")

print("Done indexing documents into `openai-movie-embeddings` index!")

Done indexing documents into `openai-movie-embeddings` index!


# Semantic search

In [6]:
# kNN clause: the query vector is not supplied directly but built by
# `my_openai_embedding_model` from the query text, then compared against
# the `plot_embedding` field.
knn_query = {
    "field": "plot_embedding",
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "my_openai_embedding_model",
            "model_text": "Fighting movie",
        }
    },
    "k": 10,
    "num_candidates": 100,
}

# Only the top 3 hits are returned to the notebook (`size=3`).
response = client.search(
    index='openai-movie-embeddings',
    size=3,
    knn=knn_query,
)

# Print the relevance score, title and plot of every returned hit.
for hit in response['hits']['hits']:
    source = hit['_source']  # the stored document body
    score = hit['_score']
    title = source['title']
    plot = source['plot']
    print(f"Score: {score}\nTitle: {title}\nPlot: {plot}\n")

Score: 0.91674197
Title: Fight Club
Plot: An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.

Score: 0.9069591
Title: Pulp Fiction
Plot: The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.

Score: 0.8992071
Title: The Dark Knight
Plot: When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.

