In [1]:
import hopsworks

proj = hopsworks.login() 
fs = proj.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://35.204.36.223/p/121
Connected. Call `.close()` to terminate connection gracefully.


In [2]:
import pandas as pd

df = pd.read_csv("data/ecommerce/product.csv")
df

Unnamed: 0,product_id,product_category_name,product_name_len,product_description_len,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0
...,...,...,...,...,...,...,...,...,...
32335,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,45.0,67.0,2.0,12300.0,40.0,40.0,40.0
32336,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,41.0,971.0,1.0,1700.0,16.0,19.0,16.0
32337,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,50.0,799.0,1.0,1400.0,27.0,7.0,27.0
32338,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,60.0,156.0,2.0,700.0,31.0,13.0,20.0


In [12]:
product_fg = fs.create_feature_group(
    name="product_fg",
    primary_key=["product_id"],
    version=1,
    description="Product details",
    online_enabled=True
)

product_fg.insert(df)

Feature Group created successfully, explore it at 
https://35.204.36.223/p/121/fs/69/fg/17


Uploading Dataframe: 0.00% |          | Rows 0/32340 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: product_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://35.204.36.223/p/121/jobs/named/product_fg_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x7f4174a47790>, None)

In [19]:
product_fg = fs.get_feature_group("product_fg", version=1)
query = product_fg.select_all()

feature_view = fs.create_feature_view(
    name="products",
    description="Product features",
    query=query
)

Feature view created successfully, explore it at 
https://35.204.36.223/p/121/fs/69/fv/products/version/1


### Create product embeddings

In [3]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

2023-11-08 12:33:56,417 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-11-08 12:33:56,966 INFO: Use pytorch device: cuda


In [4]:
product_ids, products_info = [], []
for index, row in df.iterrows():
    product_ids.append(row["product_id"])
    products_info.append(row.to_string().replace("   ", "").replace("\n", ", "))

In [5]:
products_info[0]

'product_id  1e9e8ef04dbcff4541ed26657ea517e5, product_category_name perfumaria, product_name_len40.0, product_description_len 287.0, product_photos_qty  1.0, product_weight_g  225.0, product_length_cm  16.0, product_height_cm  10.0, product_width_cm14.0'

In [6]:
# create product embeddings
embeddings = model.encode(products_info)

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

### Create products index in OpenSearch

In [7]:
from opensearchpy import OpenSearch

opensearch_api = proj.get_opensearch_api()
client = OpenSearch(**opensearch_api.get_default_py_config())

In [8]:
index_name = opensearch_api.get_project_index("products_index")
emb_dim = 384

In [10]:
# To delete the indices
# response = client.indices.delete(
#     index = index_name
# )
# print(response)

2023-11-08 12:36:14,304 INFO: DELETE https://10.164.0.54:9200/ragllm_products_index [status:200 request:0.144s]
{'acknowledged': True}




In [11]:
# Dimensionality of candidate embeddings.

index_body = {
    "settings": {
        "knn": True,
        "knn.algo_param.ef_search": 100,
    },
    "mappings": {
        "properties": {
            "my_vector1": {
                "type": "knn_vector",
                "dimension": emb_dim,
                "method": {
                    "name": "hnsw",
                    "space_type": "innerproduct",
                    "engine": "faiss",
                    "parameters": {
                        "ef_construction": 256,
                        "m": 48
                    }
                }
            }
        }
    }
}

response = client.indices.create(index_name, body=index_body)
print(response)

2023-11-08 12:36:18,351 INFO: PUT https://10.164.0.54:9200/ragllm_products_index [status:200 request:0.080s]
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ragllm_products_index'}


### Insert product embeddings

In [12]:
from opensearchpy.helpers import bulk
import numpy as np

actions = []
for product_id, embedding in zip(product_ids, embeddings):    
    actions.append({
        "_index": index_name,
        "_id": product_id,
        "_source": {
            "my_vector1": embedding,
        }
    })

In [13]:
# Bulk insertion.
bulk(client, actions)

2023-11-08 12:36:19,405 INFO: POST https://10.164.0.54:9200/_bulk [status:200 request:0.269s]
2023-11-08 12:36:19,820 INFO: POST https://10.164.0.54:9200/_bulk [status:200 request:0.254s]
2023-11-08 12:36:20,248 INFO: POST https://10.164.0.54:9200/_bulk [status:200 request:0.263s]
2023-11-08 12:36:20,656 INFO: POST https://10.164.0.54:9200/_bulk [status:200 request:0.247s]
2023-11-08 12:36:21,075 INFO: POST https://10.164.0.54:9200/_bulk [status:200 request:0.257s]
2023-11-08 12:36:21,485 INFO: POST https://10.164.0.54:9200/_bulk [status:200 request:0.245s]
2023-11-08 12:36:21,907 INFO: POST https://10.164.0.54:9200/_bulk [status:200 request:0.253s]
2023-11-08 12:36:22,339 INFO: POST https://10.164.0.54:9200/_bulk [status:200 request:0.263s]
2023-11-08 12:36:22,754 INFO: POST https://10.164.0.54:9200/_bulk [status:200 request:0.253s]
2023-11-08 12:36:23,196 INFO: POST https://10.164.0.54:9200/_bulk [status:200 request:0.268s]
2023-11-08 12:36:23,602 INFO: POST https://10.164.0.54:9200/

(32340, [])

In [14]:
import pprint
import numpy as np

embedding = np.random.rand(emb_dim)

query = {
  "size": 10,
  "query": {
    "knn": {
      "my_vector1": {
        "vector": embedding,
        "k": 10
      }
    }
  }
}

response = client.search(
    body = query,
    index = index_name
)

pprint.pprint(response)

2023-11-08 12:36:47,588 INFO: POST https://10.164.0.54:9200/ragllm_products_index/_search [status:200 request:0.319s]
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '1613b819ab5dae53aead2dbb4ebdb378',
                    '_index': 'ragllm_products_index',
                    '_score': 1.0709013,
                    '_source': {'my_vector1': [-0.03800640255212784,
                                               0.021348556503653526,
                                               -0.053857214748859406,
                                               -0.005620013922452927,
                                               -0.008040418848395348,
                                               -0.02155541442334652,
                                               0.04243312403559685,
                                               0.05385737866163254,
                                               -0.04369688779115677,
                               