In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import json

In [2]:
# This is a new library compared to the previous modules. 
# Please perform "pip install sentence_transformers==2.7.0"
from sentence_transformers import SentenceTransformer

# if you get an error do the following:
# 1. Uninstall numpy 
# 2. Uninstall torch
# 3. pip install numpy==1.26.4
# 4. pip install torch
# run the above cell, it should work
model = SentenceTransformer("all-mpnet-base-v2")



In [2]:
# Initialize Elasticsearch client
es = Elasticsearch("http://localhost:9200")

# Define index settings and mappings
index_name = 'products'
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

In [3]:
es.indices.create(
    index=index_name,
    body={
        "mappings": {
            "properties": {
                "title": {"type": "text"},
                "description": {"type": "text"},
                "category": {"type": "keyword"},
                "sub_category": {"type": "keyword"},
                "brand": {"type": "keyword"},
                "product_details": {
                    "type": "nested",
                    "dynamic": "true"
                    # "properties": {
                    #      # // Define properties if you know some common fields
                    #      #  "common_field": {
                    #      # "type": "text"
                    #      #    }
                    # }
              },
                # "actual_price": {"type": "float"},
                "average_rating": {"type": "float"},
                # "discount": {"type": "text"},
                "url": {"type": "text"},
                "out_of_stock": {"type": "boolean"},
                "title_and_description_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'products'})

In [12]:
documents_filename = '../data/cleaned_data.json'


In [15]:
def index_documents(file_name, index_name, es_client):
    index = 0
    documents = []
    # # Open the file containing the JSON data to index.
    with open(file_name, "r") as json_file:
        json_data = json.load(json_file)
        for doc in json_data:
    #         new_doc ={}
    #         new_doc["title"] = doc['title']
    #         new_doc["description"] = doc['description']
    #         new_doc["category"] = doc['category']
    #         new_doc["sub_category"] = doc['sub_category']
    #         new_doc["brand"] = doc['brand']
    #         new_doc['doc_id'] = index
    #         # new_doc["product_details"] = {}
    #         # new_doc['product_details'] = doc['product_details']    
    #         # new_doc['selling_price'] = doc['selling_price']    
    #         # new_doc['actual_price'] = doc['actual_price'] 
    #         new_doc['average_rating'] = doc['average_rating'] 
    #         # new_doc['discount'] = doc['discount'] 
    #         new_doc['url'] = doc['url'] 
    #         new_doc['out_of_stock'] = doc['out_of_stock']
    #         title_and_description = doc['title'] + doc['description']
    #         new_doc["title_and_description_vector"] = model.encode(title_and_description).tolist()
            
    #         documents.append(new_doc)
    #         index = index + 1

    #         # if index == 106:
    #         #     break

    # for doc in documents:
    #     try:
    #         es_client.index(index=index_name, document=doc)
    #     except Exception as e:
    #         print(e)

    


   
            new_doc ={}
            new_doc["title"] = doc['title']
            new_doc["description"] = doc['description']
            new_doc["category"] = doc['category']
            new_doc["sub_category"] = doc['sub_category']
            new_doc["brand"] = doc['brand']
            new_doc['doc_id'] = index
            new_doc['product_details'] = doc['product_details'][0]    
            new_doc['average_rating'] = doc['average_rating'] 
            new_doc['url'] = doc['url'] 
            new_doc['out_of_stock'] = doc['out_of_stock']
            title_and_description = doc['title'] + doc['description']
            new_doc["title_and_description_vector"] = model.encode(title_and_description).tolist()
        
            documents.append(new_doc)
            index = index + 1

            print(f'Processed {index} documents')
    
            if index == 100:
                break

    for doc in documents:
        try:
            es_client.index(index=index_name, document=doc)
        except Exception as e:
            print(e)     
        
        


In [16]:
index_documents(documents_filename, index_name, es)

Processed 1 documents
Processed 2 documents
Processed 3 documents
Processed 4 documents
Processed 5 documents
Processed 6 documents
Processed 7 documents
Processed 8 documents
Processed 9 documents
Processed 10 documents
Processed 11 documents
Processed 12 documents
Processed 13 documents
Processed 14 documents
Processed 15 documents
Processed 16 documents
Processed 17 documents
Processed 18 documents
Processed 19 documents
Processed 20 documents
Processed 21 documents
Processed 22 documents
Processed 23 documents
Processed 24 documents
Processed 25 documents
Processed 26 documents
Processed 27 documents
Processed 28 documents
Processed 29 documents
Processed 30 documents
Processed 31 documents
Processed 32 documents
Processed 33 documents
Processed 34 documents
Processed 35 documents
Processed 36 documents
Processed 37 documents
Processed 38 documents
Processed 39 documents
Processed 40 documents
Processed 41 documents
Processed 42 documents
Processed 43 documents
Processed 44 documen

In [17]:
# Get the document count
response = es.count(index=index_name)
document_count = response['count']
print(f"Number of documents in the index '{index_name}': {document_count}")

Number of documents in the index 'products': 100


In [18]:
# Text search
def full_text_search(query):
    response = es.search(
        index=index_name,
        body={
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": ["title", "description", "product_details"]
                }
            }
        }
    )
    return response['hits']['hits']

query = "bag for women"
results = full_text_search(query)
for result in results:
    print(result['_source'])

{'title': 'nu-Lite Satin Tie & Cufflink\xa0\xa0(Grey)', 'description': "We NextEdgeRetails are a professional tie manufacturer. We focus on ties for many years and In our store, we have many other styles (neckties,bow ties,tie sets,etc.) for sell which are bound to meet your needs, you can click our brand name nu-Lite to choose. Our products are famous for their comfort, durability and design because of the reliable quality, our products have been highly favored of the majority of customers. Besides they also would be perfect gifts or excellent additions to your wardrobe as well as for all fashion forward men, this tie is going to be a perfect pick. Make your Monday meetings more fun with this tie and show the world your style. Wearing an elegant man tie is an effective way to make you look charming and energetic. Be stylish and make a new style statement with nu-Lite that includes a great range of formal/casual slim neckties with pocket square for men. These ties looks good with most 

In [19]:
# semantic search
query = "track pants"
query_vector = model.encode(query)

query = {
    "field": "title_and_description_vector",
    "query_vector": query_vector,
    "k": 5,
    "num_candidates": 1000, 
}

res = es.search(index=index_name, knn=query, source=["title", "description", "category", "sub_category", "brand", "doc_id"])
res["hits"]["hits"]

[{'_index': 'products',
  '_id': '2O79sJEBqWRnhF1bC8Zh',
  '_score': 0.8702997,
  '_source': {'sub_category': 'Bottomwear',
   'description': '',
   'title': 'Solid Men Multicolor Track Pants',
   'category': 'Clothing and Accessories',
   'brand': 'York',
   'doc_id': 11}},
 {'_index': 'products',
  '_id': '2e79sJEBqWRnhF1bC8Zq',
  '_score': 0.8702997,
  '_source': {'sub_category': 'Bottomwear',
   'description': '',
   'title': 'Solid Men Multicolor Track Pants',
   'category': 'Clothing and Accessories',
   'brand': 'York',
   'doc_id': 12}},
 {'_index': 'products',
  '_id': '3O79sJEBqWRnhF1bC8aC',
  '_score': 0.8702997,
  '_source': {'sub_category': 'Bottomwear',
   'description': '',
   'title': 'Solid Men Multicolor Track Pants',
   'category': 'Clothing and Accessories',
   'brand': 'York',
   'doc_id': 15}},
 {'_index': 'products',
  '_id': '7O79sJEBqWRnhF1bDMYH',
  '_score': 0.8640954,
  '_source': {'sub_category': 'Bottomwear',
   'description': '',
   'title': 'Solid Men Blu

In [20]:
# hybrid search

knn_query = {
    "field": "title_and_description_vector",
    "query_vector": query_vector,
    "k": 5,
    "num_candidates": 1000
}

response = es.search(
    index=index_name,
    query={
        "match": {"category": "Clothing and Accessories"},
    },
    knn=knn_query,
    size=5
)

response["hits"]["hits"]


[{'_index': 'products',
  '_id': '2O79sJEBqWRnhF1bC8Zh',
  '_score': 0.8752625,
  '_source': {'title': 'Solid Men Multicolor Track Pants',
   'description': '',
   'category': 'Clothing and Accessories',
   'sub_category': 'Bottomwear',
   'brand': 'York',
   'doc_id': 11,
   'product_details': {'Style Code': 'fully cotton'},
   'average_rating': '3.3',
   'url': 'https://www.flipkart.com/yorker-solid-men-multicolor-track-pants/p/itm6294ab9c88fa0?pid=TKPFZFSHHACG3FHC&lid=LSTTKPFZFSHHACG3FHC5MQRMV&marketplace=FLIPKART&srno=b_1_12&otracker=browse&fm=organic&iid=177a46eb-d053-4732-b3de-fcad6ff59cbd.TKPFZFSHHACG3FHC.SEARCH&ssid=utkd4t3gb40000001612415717799',
   'out_of_stock': False,
   'title_and_description_vector': [-0.033772412687540054,
    0.024880798533558846,
    -0.0008130788337439299,
    0.029053103178739548,
    -0.014377662912011147,
    -0.03907518461346626,
    0.03255363181233406,
    0.040364183485507965,
    -0.10658635944128036,
    -0.05419078469276428,
    0.016468197