In [None]:
!pip install -r requirements.txt

### Getting Started
For embedding, we'll be using a model from [SentenceTransformers](https://sbert.net/), which allows us to use a fast and light version of BERT without the heavy compute overhead.

In [22]:
import os
import logging
import json
from getpass import getpass

from elasticsearch import Elasticsearch, helpers
from sentence_transformers import SentenceTransformer

# Load the pre-trained SentenceTransformers model that later cells use to turn
# each book description into a vector.
model = SentenceTransformer(
    "sentence-transformers/msmarco-MiniLM-L-12-v3"
)

Make sure you have access to your Elastic Cloud deployment's Elasticsearch endpoint URL (available from the same deployment page as the [Elastic Cloud ID](https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id)) and an [Elastic API Key](https://www.elastic.co/guide/en/cloud/current/ec-api-authentication.html#ec-api-keys). The next code snippet will prompt you for both and connect to Elasticsearch.

In [None]:
# Name of the Elasticsearch index used by the rest of this notebook.
INDEX_NAME = "books-local"

# Prompt for the connection details; getpass keeps the typed values out of the
# notebook output.
cloud_endpoint = getpass("Elastic deployment Cloud Endpoint: ")
cloud_api_key = getpass("Elastic deployment API Key: ")

# Build the client from the endpoint and API key entered above.
es = Elasticsearch(hosts=cloud_endpoint, api_key=cloud_api_key)

## 1. Embedding text into vectors
Our book documents are located in `../data/books.json`. They do not currently have any vectors. The next section reads a small batch of 25 book objects from `../data/small_books.json` and creates a vector embedding for each `book_description`. If you would like to run this on all 10,909 book objects, change `file_path` to `../data/books.json` instead of `../data/small_books.json`.

In [24]:
# Load the small sample of book documents. The file is read explicitly as UTF-8
# so the parsed text does not depend on the platform's default encoding.
file_path = "../data/small_books.json"
with open(file_path, "r", encoding="utf-8") as file:
    books = json.load(file)

# Pull out every book's "book_description" text, in file order, ready for embedding.
book_descriptions = [book["book_description"] for book in books]

In [None]:
# Convert the list of "book_description" texts to vectors using a pool of
# worker processes.
pool = model.start_multi_process_pool()
try:
    embedded_books = model.encode_multi_process(book_descriptions, pool)
finally:
    # Always stop the pool, even when encoding fails, so that no worker
    # processes are left running behind the notebook.
    model.stop_multi_process_pool(pool)
print(f"{len(embedded_books)} vectors created!")

In [None]:
# Add the new vectors back into each book object under the new field
# "description_embedding".
# Guard against a silent mismatch: pairing by position is only meaningful when
# there is exactly one embedding per book.
if len(embedded_books) != len(books):
    raise ValueError(
        f"Expected one embedding per book, got {len(embedded_books)} embeddings "
        f"for {len(books)} books."
    )

for book, embedding in zip(books, embedded_books):
    # .tolist() yields a plain Python list, which the json module can serialize.
    book["description_embedding"] = embedding.tolist()

print(f"Embeddings added to {len(books)} books.")


# Write the embedded books to a new JSON array of documents. json.dump
# serializes the whole list in one call, producing the enclosing brackets and
# separators itself.
output_file = "../data/small_books_embedded.json"
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(books, file)
print(f"{len(books)} embedded books saved to file: {output_file}.")

We can inspect the file and observe that there are now vector embeddings added to each document. Let's take a look at the first book in the `small_books_embedded.json` file.

In [None]:
import pprint

# Read the freshly written file back in, replacing `books` with its contents.
with open(output_file, "r") as file:
    books = json.load(file)

# Pretty-print only the first document, using a 2-space indent.
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(books[0])

Now let's create an index for our books using the Elasticsearch client.

In [None]:
# Per-field type definitions for a book document. "description_embedding" is
# the dense vector field, declared with 384 dimensions.
book_properties = {
    "book_title": {"type": "text"},
    "author_name": {"type": "text"},
    "rating_score": {"type": "float"},
    "rating_votes": {"type": "integer"},
    "review_number": {"type": "integer"},
    "book_description": {"type": "text"},
    "genres": {"type": "keyword"},
    "year_published": {"type": "integer"},
    "url": {"type": "text"},
    "description_embedding": {"type": "dense_vector", "dims": 384},
}

# Request body passed to the index-creation call below.
mappings = {"mappings": {"properties": book_properties}}

# Start from a clean slate: drop any earlier copy of the index (a missing index
# is tolerated via ignore_unavailable), then create it with the mappings above.
es.indices.delete(index=INDEX_NAME, ignore_unavailable=True)
es.indices.create(index=INDEX_NAME, body=mappings)
print(f"Index '{INDEX_NAME}' created.")

Now that we have created an index in Elasticsearch, we can index our local book objects. The `helpers.bulk` call below sends the documents in batches, which makes indexing much faster than if we were to run an index function on each individual book.

In [None]:
# Read the embedded documents back from disk.
file_path = "../data/small_books_embedded.json"
with open(file_path, "r") as file:
    books = json.load(file)

# Build one bulk action per book. Each action targets INDEX_NAME, uses the
# book's "id" value as the document id (None when the key is absent), and
# carries the whole book as its source.
actions = [
    {"_index": INDEX_NAME, "_id": doc.get("id"), "_source": doc}
    for doc in books
]

# Send the actions in chunks of up to 1000; report a bulk failure rather than
# letting it propagate.
try:
    helpers.bulk(es, actions, chunk_size=1000)
except helpers.BulkIndexError as err:
    print(f"Error occurred while ingesting books: {err}")
else:
    print(f"Successfully added {len(actions)} books into the '{INDEX_NAME}' index.")

Let's also index a single book, since this is the operation you would typically use when adding new books to your vector database.

In [18]:
# Load a single book document (presumably already carrying its embedding, going
# by the file name). The file is read explicitly as UTF-8 so the parsed text
# does not depend on the platform's default encoding.
with open("../data/one_book_embedded.json", "r", encoding="utf-8") as file:
    book = json.load(file)

In [None]:
# Index the single book into INDEX_NAME, using its "id" value (None when the
# key is absent) as the document id. Any failure is printed instead of raised.
try:
    resp = es.index(index=INDEX_NAME, id=book.get("id"), body=book)
    print(f"Successfully indexed book! Result: {resp.get('result', None)}")
except Exception as err:
    print(f"Error occurred while indexing book: {err}")