In [None]:
!pip freeze

In [None]:
%pip install --upgrade elasticsearch tensorflow tensorflow-hub tensorflow-text urllib3

#### Imports

In [2]:
import tensorflow_text as tf_text
import tensorflow_hub  as tf_hub

from elasticsearch import Elasticsearch
from datasets      import load_dataset
from IPython       import display
import urllib3

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
urllib3.disable_warnings()

#### Constants

In [4]:
MODEL_MULTILINGUAL_SENTENCE_ENCODER: str = "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/multilingual-large/versions/2"

#### Client Elasticsearch

In [5]:
es_host:     str   = "https://127.0.0.1:9200/"
es_username: str   = "elastic"
es_password: str   = ""

In [6]:
es = Elasticsearch(
    hosts        = es_host,
    basic_auth   = (es_username, es_password),
    verify_certs = False
)

  _transport = transport_class(


In [7]:
es.info()["tagline"]

'You Know, for Search'

#### Download dataset BBC News

In [8]:
bbc_news_dataset = load_dataset("SetFit/bbc-news")

Repo card metadata block was not found. Setting CardData to empty.


In [9]:
bbc_news_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1225
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1000
    })
})

#### Download Multilingual Universal Sentence Encoder

In [10]:
model = tf_hub.load(MODEL_MULTILINGUAL_SENTENCE_ENCODER)

#### Test model

In [11]:
model("Hello World, Machine Learning ElasticSearch!")[0].numpy()

array([ 0.01557496, -0.03167885,  0.03958492, -0.02724898, -0.05013365,
       -0.03052995,  0.08521674, -0.05683852, -0.01631888, -0.00668247,
       -0.07508523, -0.07110775,  0.05708567, -0.03136145,  0.00650919,
       -0.00293858,  0.04844021,  0.02195126,  0.04281716,  0.00250192,
        0.09268538, -0.05472551,  0.02698815,  0.02663275,  0.04330843,
        0.04074765,  0.02422643, -0.02287512,  0.01208858,  0.07331301,
        0.07293201, -0.07729743,  0.02792338, -0.01446387, -0.01396071,
        0.01847963,  0.04072797,  0.047226  , -0.00339188, -0.04884171,
        0.03322484,  0.00852153, -0.0442864 , -0.02766304,  0.07173017,
        0.02564158, -0.00111083,  0.01233467,  0.001432  , -0.05230822,
        0.06596167, -0.02677762, -0.09242167,  0.05081501,  0.02615864,
       -0.00569609, -0.05499454, -0.01306279,  0.00563923, -0.00710892,
        0.0486436 ,  0.02045606, -0.01976168, -0.0570779 ,  0.01903506,
       -0.09717009,  0.04094103,  0.0415053 ,  0.02314106,  0.06

#### Create index to dataset

In [14]:
bbc_news_index: str = "bbc_news"

es.indices.create(
    index = bbc_news_index,
    settings = {
        "number_of_shards": 2,
        "number_of_replicas": 1
    },
    mappings = {
        "properties": {
            "text": {"type": "text"},
            "label": {"type": "integer"},
            "label_text": {"type": "text"},
            "dataset_type": {"type": "text"},
            "text_embeddings": {"type": "dense_vector", "dims": 512}
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'bbc_news'})

In [13]:
# es.indices.delete(index="bbc_news")

ObjectApiResponse({'acknowledged': True})

#### Indexing dataset BBC News on Elasticsearch

In [15]:
for dataset_type in bbc_news_dataset:
    dataset = bbc_news_dataset[dataset_type]
    size    = len(dataset)

    for index, item in enumerate(dataset, start=1):
        display.clear_output(wait=True)
        print(f"Indexing BBC News {dataset_type}, dataset: {index} / {size}")

        document: dict = {
            "text": item["text"],
            "label": item["label"],
            "label_text": item["label_text"],
            "dataset_type": dataset_type,
            "text_embeddings": model(item["text"])[0].numpy()
        }

        es.index(
            index = bbc_news_index,
            document = document
        )

Indexing BBC News test, dataset: 1000 / 1000


In [16]:
es.count(index=bbc_news_index)

ObjectApiResponse({'count': 2225, '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0}})

#### Create Search Function

In [19]:
def build_query(text: str) -> dict:
    query: dict = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.text_embeddings, 'text_embeddings') + 1.0",
                "params": {"text_embeddings": model(text)[0].numpy()}
            }
        }
    }

    return query

In [20]:
def semantic_search(text: str) -> None:
    query = build_query(text)

    result = es.search(index=bbc_news_index, query=query, size=1)
    result = result["hits"]["hits"]

    if len(result) == 0:
        print("No results found...")
        return

    result = result[0]
    
    print(f"Score: ", result["_score"])
    print(f"Label: ", result["_source"]["label_text"])
    print(f"Text:  ", result["_source"]["text"])

#### Create main function

In [23]:
def main() -> None:
    semantic_search("economic growth")
    semantic_search("crescimento econômico")
    semantic_search("crecimiento económico")

In [24]:
if __name__ == "__main__":
    main()

Score:  1.3477879
Label:  business
Text:   us economy shows solid gdp growth the us economy has grown more than expected  expanding at an annual rate of 3.8% in the last quarter of 2004.  the gross domestic product figure was ahead of the 3.1% the government estimated a month ago. the rise reflects stronger spending by businesses on capital equipment and a smaller-than-expected trade deficit. gdp is a measure of a country s economic health  reflecting the value of the goods and services it produces.  the new gdp figure  announced by the commerce department on friday  also topped the 3.5% growth rate that economists had forecast ahead of friday s announcement. growth was at an annual rate of 4% in the third quarter of 2004 and for the year it came in at 4.4%  the best figure in five years. however  the positive economic climate may lead to a rise in interest rates  with many expecting us rates to rise on 22 march. in the january-to-march quarter  the economy is expected to grow at an an