In [1]:
import csv
import os

from elasticsearch import Elasticsearch, helpers

# --- Chunking / index / model configuration -------------------------------
CHUNK_SIZE = 400  # passed to the ingest script as `model_limit` (passage length bound, in characters)
INDEX_NAME = "es_tripadvisor_nyc_idx"
MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"  # Hugging Face hub id
MODEL_ID_ES = "sentence-transformers__all-minilm-l6-v2"  # id of the same model inside Elasticsearch
MODEL_DIM = 384
MODEL_SIMILARITY = "dot_product"

# --- Connection settings and secrets --------------------------------------
# Secrets are read from the environment instead of being hardcoded in the
# notebook, so they never end up in version control or in shared copies.
# Any credentials that were previously committed in this cell should be
# treated as compromised and rotated.
ES_HOST = os.environ.get("ES_HOST", "https://localhost:9200/")
ES_PASS = os.environ.get("ES_PASS", "")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", "")

# Surface missing secrets right away rather than as an opaque authentication
# error several cells further down.
_missing_secrets = [
    name
    for name, value in (("ES_PASS", ES_PASS), ("COHERE_API_KEY", COHERE_API_KEY))
    if not value
]
if _missing_secrets:
    print(
        "WARNING: environment variable(s) not set: "
        + ", ".join(_missing_secrets)
        + ". Export them before starting Jupyter."
    )

In [2]:
# Connect to the Elasticsearch node configured above, authenticating as the
# built-in `elastic` user. Certificate verification is turned off, which is
# only appropriate for local development.
client = Elasticsearch(
    hosts=[ES_HOST],
    basic_auth=("elastic", ES_PASS),
    verify_certs=False,
)

# Round-trip to the cluster to confirm that the connection works.
print(client.info())

{'name': 'es01', 'cluster_name': 'docker-cluster', 'cluster_uuid': '8KpPKPbgQKiA5VpXnNoX3Q', 'version': {'number': '8.13.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '09df99393193b2c53d92899662a8b8b3c55b45cd', 'build_date': '2024-03-22T03:35:46.757803203Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


  _transport = transport_class(


In [6]:
# Import the Hugging Face model named by MODEL_ID into the cluster as a
# text_embedding model using the eland CLI. IPython substitutes $ES_PASS,
# $ES_HOST and $MODEL_ID with the values of the notebook's Python variables.
# With --clear-previous an already-imported copy is stopped and deleted first
# (see the log output below); --start presumably deploys the model right after
# the upload.
# NOTE(review): -p places the password on the command line and --insecure
# presumably skips TLS verification -- acceptable for local development only.
!eland_import_hub_model \
    -u elastic -p $ES_PASS \
    --url $ES_HOST \
    --hub-model-id $MODEL_ID \
    --task-type text_embedding \
    --insecure \
    --clear-previous \
    --start

2024-05-02 08:04:09,968 INFO : Establishing connection to Elasticsearch
  _transport = transport_class(
2024-05-02 08:04:09,983 INFO : Connected to cluster named 'docker-cluster' (version: 8.13.0)
2024-05-02 08:04:09,983 INFO : Loading HuggingFace transformer tokenizer and model 'sentence-transformers/all-MiniLM-L6-v2'
INFO:2024-05-02 08:04:17 4527:4527 init.cpp:158] If you see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti
STAGE:2024-05-02 08:04:17 4527:4527 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-05-02 08:04:17 4527:4527 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-05-02 08:04:17 4527:4527 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
2024-05-02 08:04:19,232 INFO : Stopping deployment for model with id 'sentence-transformers__all-minilm-l6-v2'
2024-05-02 08:04:19,234 INFO : Deleting model with id 'sentence-transform

In [None]:
# Register the ingest pipeline. It runs two processors on every document:
#   1. a Painless script that splits the `text` field into sentences and packs
#      consecutive sentences into `passages` entries whose length stays below
#      CHUNK_SIZE characters (a single over-long sentence is kept whole);
#   2. a foreach/inference step that embeds each passage with MODEL_ID_ES and
#      stores the vector under `passages[].vector.predicted_value`.
PIPELINE_ID = "chunk_text_to_passages"

client.ingest.put_pipeline(
    id=PIPELINE_ID,
    processors=[
        {
            "script": {
                "description": "Chunk text into sentences by looking for . followed by a space",
                "lang": "painless",
                # Raw string: the regex contains \. \! and \? which are not valid
                # Python escape sequences; without the r-prefix they trigger a
                # DeprecationWarning/SyntaxWarning. The resulting text is unchanged.
                "source": r"""
          String[] envSplit = /((?<!M(r|s|rs)\.)(?<=\.) |(?<=\!) |(?<=\?) )/.split(ctx['text']);
          ctx['passages'] = new ArrayList();
          int i = 0;
          boolean remaining = true;
          if (envSplit.length == 0) {
            return
          } else if (envSplit.length == 1) {
            Map passage = ['text': envSplit[0]];ctx['passages'].add(passage)
          } else {
            while (remaining) {
              Map passage = ['text': envSplit[i++]];
              while (i < envSplit.length && passage.text.length() + envSplit[i].length() < params.model_limit) {passage.text = passage.text + ' ' + envSplit[i++]}
              if (i == envSplit.length) {remaining = false}
              ctx['passages'].add(passage)
            }
          }
          """,
                "params": {"model_limit": CHUNK_SIZE},
            }
        },
        {
            "foreach": {
                "field": "passages",
                "processor": {
                    "inference": {
                        "model_id": MODEL_ID_ES,
                        "input_output": [
                            {
                                "input_field": "_ingest._value.text",
                                "output_field": "_ingest._value.vector.predicted_value",
                            }
                        ],
                        # Record inference failures on the document instead of
                        # rejecting it. The recorded pipeline name now matches the
                        # id this pipeline is actually registered under.
                        "on_failure": [
                            {
                                "append": {
                                    "field": "_source._ingest.inference_errors",
                                    "value": [
                                        {
                                            "message": "Processor 'inference' in pipeline '"
                                            + PIPELINE_ID
                                            + "' failed with message '{{ _ingest.on_failure_message }}'",
                                            "pipeline": PIPELINE_ID,
                                            "timestamp": "{{{ _ingest.timestamp }}}",
                                        }
                                    ],
                                }
                            }
                        ],
                    }
                },
            }
        },
    ],
)



ObjectApiResponse({'acknowledged': True})

In [None]:
# Drop any earlier copy of the index; a missing index is not an error.
client.indices.delete(index=INDEX_NAME, ignore_unavailable=True)

# Every new document is routed through the chunking/embedding ingest pipeline.
index_settings = {"index": {"default_pipeline": "chunk_text_to_passages"}}

# Embedding field stored on each passage.
passage_vector_mapping = {
    "type": "dense_vector",
    "index": True,
    "dims": MODEL_DIM,
    "similarity": MODEL_SIMILARITY,
}

# `passages` is a nested field so that each passage carries its own vector;
# all other fields are mapped dynamically.
index_mappings = {
    "dynamic": "true",
    "properties": {
        "passages": {
            "type": "nested",
            "properties": {
                "vector": {"properties": {"predicted_value": passage_vector_mapping}},
            },
        }
    },
}

# Setup the index
client.indices.create(
    index=INDEX_NAME,
    settings=index_settings,
    mappings=index_mappings,
)



ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'es_tripadvisor_nyc_idx'})

In [None]:
# Location of the review dataset, relative to the notebook's working directory.
file = "csv/New_York_reviews.csv"

#Read CSV File
def read_CSV(csv_file, index_name=None, encoding="utf-8"):
    """Load a review CSV file as a list of Elasticsearch bulk actions.

    The first column is renamed to ``_id`` whatever its header is, the
    ``review_id`` column is renamed to ``name`` and ``review_full`` to
    ``text``. Every row also receives an ``_index`` key.

    Args:
        csv_file: Path of the CSV file to read.
        index_name: Target index written to each row's ``_index`` key.
            Defaults to the module-level ``INDEX_NAME``.
        encoding: Text encoding of the file. Defaults to UTF-8 so that the
            result does not depend on the platform's locale.

    Returns:
        A list with one dict per data row; an empty list if the file has no
        header row.

    Raises:
        KeyError: If the file lacks a ``review_id`` or ``review_full`` column.
    """
    target_index = INDEX_NAME if index_name is None else index_name
    csv_rows = []
    # newline="" hands line-ending handling to the csv module, which is
    # required for quoted fields that contain line breaks (multi-line reviews).
    with open(csv_file, newline="", encoding=encoding) as csvfile:
        reader = csv.DictReader(csvfile)
        fields = reader.fieldnames
        if not fields:
            # Empty file: no header, hence nothing to load.
            return csv_rows
        # Mutating the reader's own field list makes every subsequent row
        # expose its first column under the key '_id'.
        fields[0] = '_id'
        for row in reader:
            # Copy only the declared columns (drops overflow values that
            # DictReader files under its rest key).
            new_row = {field: row[field] for field in fields}
            new_row['_index'] = target_index
            new_row['name'] = new_row.pop('review_id')
            new_row['text'] = new_row.pop('review_full')
            csv_rows.append(new_row)
    return csv_rows

# Load every CSV row into memory as a list of bulk-action dicts.
docs = read_CSV(file)

# Sanity check: number of documents read from the file.
print(len(docs))

# Add the documents to the index directly.
# NOTE(review): the bulk call below is commented out, so running this cell
# does NOT index anything -- presumably it was disabled after an initial load.
# It has to be re-enabled whenever the index has been deleted and re-created.
# response = helpers.bulk(
#     client,
#     docs,
#     refresh=True,
#     request_timeout=60 * 10,
# )

510463


In [4]:
def pretty_response(response):
    """Print a readable summary of each hit in a search response.

    For every hit the document id, name, restaurant name, review title,
    matching passage text(s) taken from the ``passages`` inner hits, and the
    score are printed, followed by a ``---`` separator line. A short notice
    is printed instead when the response contains no hits.
    """
    hits = response["hits"]["hits"]
    if not hits:
        print("Your search returned no results.")
        return

    for hit in hits:
        doc_id = hit["_id"]
        score = hit["_score"]
        source = hit["_source"]
        doc_title = source["name"]
        restaurant_name = source["restaurant_name"]
        title_review = source["title_review"]

        # Each inner hit contributes its passage text followed by a blank line.
        inner_passages = hit["inner_hits"]["passages"]["hits"]["hits"]
        passage_text = "".join(
            inner["fields"]["passages"][0]["text"][0] + "\n\n"
            for inner in inner_passages
        )

        summary = (
            f"\nID: {doc_id}"
            f"\nDoc Title: {doc_title}"
            f"\nRestaurant Name: {restaurant_name}"
            f"\nTitle Review: {title_review}"
            f"\nPassage Text:\n{passage_text}"
            f"\nScore: {score}\n"
        )
        print(summary)
        print("---")

In [7]:
# Semantic passage search: the query text is embedded by the deployed model
# (MODEL_ID_ES) and matched against the per-passage vectors. For each returned
# document only the single best-matching passage's text is fetched as an
# inner hit.
query_text = "Best pasta in New York"

knn_query = {
    "inner_hits": {"size": 1, "_source": False, "fields": ["passages.text"]},
    "field": "passages.vector.predicted_value",
    "k": 3,
    "num_candidates": 100,
    "query_vector_builder": {
        "text_embedding": {"model_id": MODEL_ID_ES, "model_text": query_text},
    },
}

response = client.search(index=INDEX_NAME, knn=knn_query)

pretty_response(response)


ID: 32390
Doc Title: review_720045084
Restaurant Name: Tony_s_Di_Napoli_Midtown
Title Review: Fantastic food!
Passage Text:
The best pasta in New York! Great dessert and friendly staff. A bit noisy on a Sunday evening but a really nice evening close to Times square.


Score: 0.9367155

---

ID: 52686
Doc Title: review_651849097
Restaurant Name: Carmine_s_Italian_Restaurant_Times_Square
Title Review: Wonderful
Passage Text:
The best pasta in New York. The only problem is the size of the plates. They must do smaller plates. For one person for example.


Score: 0.90883017

---

ID: 73133
Doc Title: review_628690226
Restaurant Name: Il_Gattopardo
Title Review: Excellence
Passage Text:
Perhaps the best pasta in NY. They can deliver pasta al dente, as they have done that for us in the past.


Score: 0.89915013

---


