In [105]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import json

In [106]:
# This is a new library compared to the previous modules. 
# Please perform "pip install sentence_transformers==2.7.0"
from sentence_transformers import SentenceTransformer

# if you get an error do the following:
# 1. Uninstall numpy 
# 2. Uninstall torch
# 3. pip install numpy==1.26.4
# 4. pip install torch
# run the above cell, it should work
model = SentenceTransformer("all-mpnet-base-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [133]:
# Initialize Elasticsearch client
es = Elasticsearch("http://localhost:9200")

# Define index settings and mappings
index_name = 'products'
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

In [134]:
es.indices.create(
    index=index_name,
    body={
        "mappings": {
            "properties": {
                "title": {"type": "text"},
                "description": {"type": "text"},
                "category": {"type": "keyword"},
                "sub_category": {"type": "keyword"},
                "brand": {"type": "keyword"},
                # "product_details": {"type": "nested", "properties": {"Style Code": {"type": "text"}, "Closure": {"type": "text"}, "Pockets": {"type": "text"}, "Fabric": {"type": "text"}, "Pattern": {"type": "text"}, "Color": {"type": "text"}}},
                # "selling_price": {"type": "float"},
                # "actual_price": {"type": "float"},
                "average_rating": {"type": "float"},
                # "discount": {"type": "text"},
                "url": {"type": "text"},
                "out_of_stock": {"type": "boolean"},
                "title_and_description_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'products'})

In [135]:
def index_documents(documents_filename, index_name, es_client):
    index = 0
    # Open the file containing the JSON data to index.
    with open(documents_filename, "r") as json_file:
        json_data = json.load(json_file)
        documents = []
        for doc in json_data:
            new_doc ={}
            new_doc["title"] = doc['title']
            new_doc["description"] = doc['description']
            new_doc["category"] = doc['category']
            new_doc["sub_category"] = doc['sub_category']
            new_doc["brand"] = doc['brand']
            # new_doc["product_details"] = {}
            # new_doc['product_details'] = doc['product_details']    
            # new_doc['selling_price'] = doc['selling_price']    
            # new_doc['actual_price'] = doc['actual_price'] 
            new_doc['average_rating'] = doc['average_rating'] 
            # new_doc['discount'] = doc['discount'] 
            new_doc['url'] = doc['url'] 
            new_doc['out_of_stock'] = doc['out_of_stock']
            title_and_description = doc['title'] + doc['description']
            new_doc["title_and_description_vector"] = model.encode(title_and_description).tolist()

            documents.append(new_doc)
            index = index + 1

            if index == 200:
                break

    for doc in documents:
        try:
            es_client.index(index=index_name, document=doc)
        except Exception as e:
            print(e)


In [136]:
index_documents('../data/flipkart_fashion_products_dataset.json', index_name, es)

In [137]:
# Get the document count
response = es.count(index=index_name)
document_count = response['count']
print(f"Number of documents in the index '{index_name}': {document_count}")

Number of documents in the index 'products': 200


In [138]:
# Text search
def full_text_search(query):
    response = es.search(
        index=index_name,
        body={
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": ["title", "description"]
                }
            }
        }
    )
    return response['hits']['hits']

query = "track pants"
results = full_text_search(query)
for result in results:
    print(result['_source'])

{'title': 'Solid Men Multicolor Track Pants', 'description': 'Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India', 'category': 'Clothing and Accessories', 'sub_category': 'Bottomwear', 'brand': 'York', 'average_rating': '3.9', 'url': 'https://www.flipkart.com/yorker-solid-men-multicolor-track-pants/p/itmd2c76aadce459?pid=TKPFCZ9EA7H5FYZH&lid=LSTTKPFCZ9EA7H5FYZHVYXWP0&marketplace=FLIPKART&srno=b_1_1&otracker=browse&fm=organic&iid=177a46eb-d053-4732-b3de-fcad6ff59cbd.TKPFCZ9EA7H5FYZH.SEARCH&ssid=utkd4t3gb40000001612415717799', 'out_of_stock': False, 'title_and_description_vector': [-0.04162813723087311, 0.003835424780845642, -0.0027257739566266537, 0.023723645135760307, -0.03532840684056282, -0.015687942504882812, -0.02404077723622322, 0.05117395520210266, -0.08992418646812439, -0.008086122572422028, 0.027893325313925743, 0.006087297108024359, -0.01451

In [139]:
# semantic search
query = "track pants"
query_vector = model.encode(query)

query = {
    "field": "title_and_description_vector",
    "query_vector": query_vector,
    "k": 5,
    "num_candidates": 1000, 
}

res = es.search(index=index_name, knn=query, source=["title", "description", "category", "sub_category", "brand"])
res["hits"]["hits"]

[{'_index': 'products',
  '_id': 'XU-blZEB_cruYIrbxgSo',
  '_score': 0.8702997,
  '_source': {'sub_category': 'Bottomwear',
   'description': '',
   'title': 'Solid Men Multicolor Track Pants',
   'category': 'Clothing and Accessories',
   'brand': 'York'}},
 {'_index': 'products',
  '_id': 'Xk-blZEB_cruYIrbxgSt',
  '_score': 0.8702997,
  '_source': {'sub_category': 'Bottomwear',
   'description': '',
   'title': 'Solid Men Multicolor Track Pants',
   'category': 'Clothing and Accessories',
   'brand': 'York'}},
 {'_index': 'products',
  '_id': 'YU-blZEB_cruYIrbxgTC',
  '_score': 0.8702997,
  '_source': {'sub_category': 'Bottomwear',
   'description': '',
   'title': 'Solid Men Multicolor Track Pants',
   'category': 'Clothing and Accessories',
   'brand': 'York'}},
 {'_index': 'products',
  '_id': 'cU-blZEB_cruYIrbxwQ3',
  '_score': 0.8640954,
  '_source': {'sub_category': 'Bottomwear',
   'description': '',
   'title': 'Solid Men Blue Track Pants',
   'category': 'Clothing and Access

In [141]:
# hybrid search

knn_query = {
    "field": "title_and_description_vector",
    "query_vector": query_vector,
    "k": 5,
    "num_candidates": 1000
}

response = es.search(
    index=index_name,
    query={
        "match": {"category": "Clothing and Accessories"},
    },
    knn=knn_query,
    size=5
)

response["hits"]["hits"]


[{'_index': 'products',
  '_id': 'XU-blZEB_cruYIrbxgSo',
  '_score': 0.87279034,
  '_source': {'title': 'Solid Men Multicolor Track Pants',
   'description': '',
   'category': 'Clothing and Accessories',
   'sub_category': 'Bottomwear',
   'brand': 'York',
   'average_rating': '3.3',
   'url': 'https://www.flipkart.com/yorker-solid-men-multicolor-track-pants/p/itm6294ab9c88fa0?pid=TKPFZFSHHACG3FHC&lid=LSTTKPFZFSHHACG3FHC5MQRMV&marketplace=FLIPKART&srno=b_1_12&otracker=browse&fm=organic&iid=177a46eb-d053-4732-b3de-fcad6ff59cbd.TKPFZFSHHACG3FHC.SEARCH&ssid=utkd4t3gb40000001612415717799',
   'out_of_stock': False,
   'title_and_description_vector': [-0.033772412687540054,
    0.024880798533558846,
    -0.0008130788337439299,
    0.029053103178739548,
    -0.014377662912011147,
    -0.03907518461346626,
    0.03255363181233406,
    0.040364183485507965,
    -0.10658635944128036,
    -0.05419078469276428,
    0.016468197107315063,
    -0.0047315992414951324,
    -0.015498240478336811,
    