In [None]:
!pip freeze

In [None]:
!pip install --upgrade elasticsearch tensorflow tensorflow-hub tensorflow-text urllib3

#### Imports

In [None]:
import os

import tensorflow_text as tf_text
import tensorflow_hub  as tf_hub

from elasticsearch import Elasticsearch
from datasets      import load_dataset
from IPython       import display

#### Constants

In [None]:
# Kaggle Models URL of the sentence encoder that this notebook loads through
# TensorFlow Hub (multilingual-large variation, version 2).
MODEL_MULTILINGUAL_SENTENCE_ENCODER: str = (
    "https://www.kaggle.com/models/google/universal-sentence-encoder"
    "/frameworks/TensorFlow2/variations/multilingual-large/versions/2"
)

#### Elasticsearch client

In [None]:
# Connection settings are read from environment variables so that credentials
# never have to be typed into (and saved with) the notebook. Each value falls
# back to an empty string when its variable is unset.
es_host:     str = os.environ.get("ES_HOST", "")
es_username: str = os.environ.get("ES_USERNAME", "")
es_password: str = os.environ.get("ES_PASSWORD", "")

In [None]:
# Build the Elasticsearch client from the host and basic-auth credentials defined earlier.
# NOTE(review): verify_certs=False disables TLS certificate verification; confirm this is
# only used against a trusted (e.g. local or self-signed) cluster.
es = Elasticsearch(hosts=es_host, basic_auth=(es_username, es_password), verify_certs=False)

In [None]:
# Connectivity check: ask the cluster for its basic information and display the response.
es.info()

#### Download the BBC News dataset

In [None]:
# Load the "SetFit/bbc-news" dataset with `datasets.load_dataset`; the indexing
# cell later iterates over this object key by key.
bbc_news_dataset = load_dataset("SetFit/bbc-news")

#### Download Multilingual Universal Sentence Encoder

In [None]:
# Load the encoder referenced by MODEL_MULTILINGUAL_SENTENCE_ENCODER through TensorFlow Hub.
model = tf_hub.load(MODEL_MULTILINGUAL_SENTENCE_ENCODER)

#### Test model

In [None]:
# Smoke test: embed one sentence and display the first element of the model output as a NumPy array.
model("Hello World, Machine Learning ElasticSearch!")[0].numpy()

#### Create the index for the dataset

In [None]:
# Name of the index that stores the BBC News documents and their embeddings.
bbc_news_index: str = "bbc_news"

# Create the index only when it is missing, so re-running this cell does not
# fail because the index already exists.
# NOTE(review): an existing index is left untouched, including its mapping —
# delete it manually if the mapping below has changed.
if not es.indices.exists(index = bbc_news_index):
    es.indices.create(
        index = bbc_news_index,
        settings = {
            "number_of_shards": 2,
            "number_of_replicas": 1
        },
        mappings = {
            "properties": {
                "text": {"type": "text"},
                "label": {"type": "integer"},
                "label_text": {"type": "text"},
                "dataset_type": {"type": "text"},
                # assumes the encoder produces 512-dimensional vectors — TODO confirm
                "text_embeddings": {"type": "dense_vector", "dims": 512}
            }
        }
    )

#### Index the BBC News dataset into Elasticsearch

In [None]:
# Index every record of every dataset split together with the embedding of its text.
for dataset_type in bbc_news_dataset:
    dataset = bbc_news_dataset[dataset_type]
    size    = len(dataset)

    for index, item in enumerate(dataset, start=1):
        # Overwrite the previous progress line instead of appending a new one.
        display.clear_output(wait=True)
        print(f"Indexing BBC NEws {dataset_type}, dataset: {index} / {size}")

        document: dict = {
            "text": item["text"],
            "label": item["label"],
            "label_text": item["label_text"],
            "dataset_type": dataset_type,
            # The key must be "text_embeddings": that is the dense_vector field declared
            # in the index mapping and read by the search script. The vector is sent as
            # a plain list of floats.
            "text_embeddings": model(item["text"])[0].numpy().tolist()
        }

        es.index(
            index = bbc_news_index,
            # Deterministic id: re-running this cell overwrites documents instead of duplicating them.
            id = f"{dataset_type}-{index}",
            document = document
        )

#### Create Search Function

In [None]:
def build_query(text: str) -> dict:
    """Build the Elasticsearch query that scores documents by similarity to ``text``.

    The text is embedded with the notebook-level ``model``; every document is then
    scored with the cosine similarity between that embedding and the document's
    ``text_embeddings`` field.

    Args:
        text: Free-form search text.

    Returns:
        A ``script_score`` query body suitable for the ``query`` argument of
        ``Elasticsearch.search``.
    """
    # Plain list of floats so the script parameter is ordinary JSON.
    query_vector = model(text)[0].numpy().tolist()

    query: dict = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                # + 1.0 shifts the cosine similarity so the resulting score is not negative.
                "source": "cosineSimilarity(params.query_vector, 'text_embeddings') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }

    return query

In [None]:
def semantic_search(text: str) -> None:
    """Print the best semantic match for ``text`` found in the BBC News index.

    Builds the query with ``build_query``, asks Elasticsearch for the single
    best-scoring document, and prints its score, label and text. Prints
    ``No results found...`` when the search returns no hits.

    Args:
        text: Free-form search text.
    """
    query = build_query(text)

    response = es.search(index=bbc_news_index, query=query, size=1)
    # The matching documents are nested under response["hits"]["hits"].
    hits = response["hits"]["hits"]

    if len(hits) == 0:
        print("No results found...")
        return

    best_hit = hits[0]

    # Single quotes inside the f-strings keep this valid on Python versions before 3.12.
    print(f"Score: {best_hit['_score']}")
    print(f"Label: {best_hit['_source']['label_text']}")
    print(f"Text:  {best_hit['_source']['text']}")

#### Create main function

In [None]:
def main() -> None:
    """Run the same semantic search phrased in English, Portuguese and Spanish."""
    sample_queries = (
        "economic growth",
        "crescimento econômico",
        "crecimiento económico",
    )

    for sample_query in sample_queries:
        semantic_search(sample_query)

In [None]:
# Entry point: run the demo searches when this code executes as the main module.
if __name__ == "__main__":
    main()