In [1]:
import pandas as pd
import numpy as np

## Fetch products

In [2]:
# Load the product catalogue; the file is tab-separated despite its .csv extension.
product_df = pd.read_csv("dataset/product.csv", sep='\t')
# Preview the first rows to check the columns that were parsed.
product_df.head()

Unnamed: 0,product_id,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
0,0,solid wood platform bed,Beds,Furniture / Bedroom Furniture / Beds & Headboa...,"good , deep sleep can be quite difficult to ha...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0
1,1,all-clad 7 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,"create delicious slow-cooked meals , from tend...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0
2,2,all-clad electrics 6.5 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,prepare home-cooked meals on any schedule with...,features : keep warm setting|capacityquarts:6....,208.0,3.0,181.0
3,3,all-clad all professional tools pizza cutter,"Slicers, Peelers And Graters",Browse By Brand / All-Clad,this original stainless tool was designed to c...,overallwidth-sidetoside:3.5|warrantylength : l...,69.0,4.5,42.0
4,4,baldwin prestige alcott passage knob with roun...,Door Knobs,Home Improvement / Doors & Door Hardware / Doo...,the hardware has a rich heritage of delivering...,compatibledoorthickness:1.375 '' |countryofori...,70.0,5.0,42.0


In [3]:
## Assemble the product text: use the product description as the text, unless there is no description, in which case use the product name.

In [78]:
# NOTE(review): the saved output of this cell already lists a `product_text`
# column, which is only created by the np.where cell further down, and the
# execution count is far out of sequence. The cell was run out of order, so
# after Restart & Run All this summary will show one column fewer.
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42994 entries, 0 to 42993
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   product_id           42994 non-null  int64  
 1   product_name         42994 non-null  object 
 2   product_class        40142 non-null  object 
 3   category hierarchy   41438 non-null  object 
 4   product_description  36986 non-null  object 
 5   product_features     42994 non-null  object 
 6   rating_count         33542 non-null  float64
 7   average_rating       33542 non-null  float64
 8   review_count         33542 non-null  float64
 9   product_text         42994 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 3.3+ MB


In [4]:
# Text to index: the description when present, otherwise the product name.
product_df['product_text'] = product_df['product_description'].fillna(
    product_df['product_name']
)

In [5]:
# Spot-check a row whose product_description is NaN: its product_text
# should have fallen back to the product_name.
product_df.iloc[42990]

product_id                                                         42990
product_name                       emmeline 5 piece breakfast dining set
product_class                                          Dining Table Sets
category hierarchy     Furniture / Kitchen & Dining Furniture / Dinin...
product_description                                                  NaN
product_features       basematerialdetails : steel| : gray wood|ofhar...
rating_count                                                      1314.0
average_rating                                                       4.5
review_count                                                       864.0
product_text                       emmeline 5 piece breakfast dining set
Name: 42990, dtype: object

In [6]:
from llama_index.core import Document


def _row_to_document(product):
    """Wrap one product row as a Document carrying its id and name as metadata."""
    product_meta = {
        'product_id': product['product_id'],
        'product_name': product['product_name'],
    }
    return Document(text=product['product_text'], metadata=product_meta)


# One Document per product, in DataFrame row order.
documents = [_row_to_document(product) for _, product in product_df.iterrows()]

In [7]:
print(documents[0])

Doc ID: 973ffb25-b9ec-4d78-b794-3193309d9e49
Text: good , deep sleep can be quite difficult to have in this busy
age . fortunately , there ’ s an antidote to such a problem : a nice ,
quality bed frame like the acacia kaylin . solidly constructed from
acacia wood , this bed frame will stand the test of time and is fit to
rest your shoulders on for years and years . its sleek , natural wood
grain...


In [8]:
from llama_index.core.schema import MetadataMode

In [9]:
# With MetadataMode.ALL the metadata key/value lines are rendered ahead of
# the document text, as the output of this cell shows.
print(documents[0].get_content(metadata_mode=MetadataMode.ALL))

product_id: 0
product_name: solid wood platform bed

good , deep sleep can be quite difficult to have in this busy age . fortunately , there ’ s an antidote to such a problem : a nice , quality bed frame like the acacia kaylin . solidly constructed from acacia wood , this bed frame will stand the test of time and is fit to rest your shoulders on for years and years . its sleek , natural wood grain appearance provides a pleasant aesthetic to adorn any bedroom , acting both as a decorative piece as well as a place to give comfort after a hard day of work . our bed frame is designed to give ample under-bed space for easy cleaning and other usages , with a headboard attached to further express the craftiness . it can be used with other accessories such as a nightstand or bookcase headboard and is compatible with many types of mattresses including memory foam , spring , or hybrid ones . there ’ s nowhere better to relax than your own home , and with this bed frame that feeling of homeliness

In [10]:
print(documents[0].metadata)

{'product_id': 0, 'product_name': 'solid wood platform bed'}


### Convert Product Info into Embeddings

In [2]:
import os
import openai
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv()) # read local .env file
# openai.api_key = os.getenv('OPENAI_API_KEY')

### Creating a Weaviate Client

In [3]:
import weaviate

In [4]:
import logging
import sys

# Send INFO-level log records to the notebook output.
# basicConfig already attaches a stdout handler to the root logger. The
# extra `addHandler(StreamHandler(...))` that used to follow made every
# record print twice (once formatted, once bare) and stacked one more
# handler each time the cell was re-run, so it has been dropped.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [7]:
# Connect to the Weaviate instance. All connection settings come from
# environment variables (loaded from the local .env file earlier).
# Note the two lookup styles below behave differently when a variable is
# missing: os.getenv(...) returns None, while os.environ[...] raises KeyError.
# NOTE(review): the saved output of this cell contains a "please consider
# upgrading" warning from the weaviate client library.
client = weaviate.Client(
    url=os.getenv("WEAVIATE_URL"), # Weaviate Cloud URL, read from the WEAVIATE_URL env var
    auth_client_secret=weaviate.auth.AuthApiKey(os.getenv("WEAVIATE_API_KEY")),  # instance API key, read from the WEAVIATE_API_KEY env var
    additional_headers={"X-Cohere-Api-Key": os.environ['COHERE_API_KEY'],}
)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [8]:
client.is_ready()

True

### Extract Nodes

In [9]:
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.node_parser import SentenceSplitter

In [38]:
# create the sentence window node parser w/ default settings
# Create the sentence-window node parser.
# Each node holds one sentence as its text (see the "Text" output below);
# the surrounding context is stored in the node metadata under the "window"
# key and the sentence itself under the "original_text" key.
# NOTE(review): window_size=3 presumably means up to 3 neighbouring sentences
# on each side — confirm against the SentenceWindowNodeParser docs.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

In [39]:
# Extract nodes from documents
nodes = node_parser.get_nodes_from_documents(documents)

In [40]:
print(f"Text: \n{nodes[0].text}")
print("------------------")
print(f"Window: \n{nodes[0].metadata['window']}")

Text: 
good , deep sleep can be quite difficult to have in this busy age . 
------------------
Window: 
good , deep sleep can be quite difficult to have in this busy age .  fortunately , there ’ s an antidote to such a problem : a nice , quality bed frame like the acacia kaylin .  solidly constructed from acacia wood , this bed frame will stand the test of time and is fit to rest your shoulders on for years and years .  its sleek , natural wood grain appearance provides a pleasant aesthetic to adorn any bedroom , acting both as a decorative piece as well as a place to give comfort after a hard day of work . 


In [41]:
len(nodes)

164163

In [13]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore

In [43]:
# construct vector store
vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name="LlamaIndex"
)

In [61]:
# Set up the storage for the embeddings
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [79]:
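# One-time index build, left commented out (presumably so that re-running the
# notebook does not re-embed every node; confirm before deleting this cell).
# If an index with the same index name already exists within Weaviate, delete it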
# if client.schema.exists("LlamaIndex"):
#     client.schema.delete_class("LlamaIndex")

# index = VectorStoreIndex(
#     nodes,
#     storage_context = storage_context,
# )

In [11]:
import json

# Ask Weaviate for the stored schema of the index class; uncomment the
# print below to inspect it as formatted JSON.
index_name="LlamaIndex"
response = client.schema.get(index_name)

# print(json.dumps(response, indent=2))

### Loading the index

In [14]:
# Point a vector store at the existing "LlamaIndex" class in Weaviate.
vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name="LlamaIndex"
)

# Build the index object on top of that store. No nodes are passed here, so
# this presumably reuses what is already stored rather than re-ingesting
# (confirm against the VectorStoreIndex.from_vector_store docs).
index = VectorStoreIndex.from_vector_store(vector_store)

### Query Index

In [20]:
query_engine = index.as_query_engine(similarity_top_k=2)
response = query_engine.query("wicker outdoor bar")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [21]:
from IPython.display import Markdown, display

In [22]:
display(Markdown(f"<b>{response}</b>"))

<b>The products mentioned are suitable for outdoor use and are designed for bar settings.</b>

### Query Index with Hybrid Search

In [26]:
from llama_index.core.response.notebook_utils import display_response

In [45]:
query_engine = index.as_query_engine(
    vector_store_query_mode="hybrid", similarity_top_k=2
)
response = query_engine.query(
    "salon chair",
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [46]:
display_response(response)

**`Final Response:`** The product_id 7465 is associated with the product_name "hair salon chair," which is designed to showcase the uniqueness of each salon.

In [65]:
# from llama_index.core.postprocessor import MetadataReplacementPostProcessor

# # The target key defaults to `window` to match the node_parser's default
# postproc = MetadataReplacementPostProcessor(
#     target_metadata_key="window"
# )

In [61]:
# Hybrid retrieval again, now returning the top 5 nodes and with alpha=0.0.
# NOTE(review): alpha=0.0 should weight the hybrid score entirely toward
# keyword (BM25) matching — confirm against the Weaviate hybrid-search docs.
# This cell does not change the log level; for more detailed output, set
# level=logging.DEBUG in the logging setup cell instead.
query_engine = index.as_query_engine(
    vector_store_query_mode="hybrid", similarity_top_k=5, alpha=0.0
)
response = query_engine.query(
    "salon chair",
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [62]:
display_response(response)

**`Final Response:`** The salon chair is designed to showcase the uniqueness of each salon and create a fashionable salon interior. It offers a wide selection of professional salon products including styling chair, salon hairdryer, salon equipment, etc. The salon chair is made with the highest quality parts to provide long-lasting performance and a comfortable experience for customers.

In [63]:
response.metadata

{'e05f0fc2-15d3-40e1-a17f-d4ba60f6199b': {'window': 'offers a wide selection of professional salon products including styling chair , salon hairdryer , salon equipment , etc .  to showcase the uniqueness of each salon',
  'original_text': 'to showcase the uniqueness of each salon',
  'product_id': 7465,
  'product_name': 'hair salon chair'},
 '81dead1c-a774-4be8-93bf-3ff79461f823': {'window': 'offers a wide selection of professional salon products including styling chair , salon hairdryer , salon equipment , etc .  to showcase the uniqueness of each salon',
  'original_text': 'offers a wide selection of professional salon products including styling chair , salon hairdryer , salon equipment , etc . ',
  'product_id': 7465,
  'product_name': 'hair salon chair'},
 'd3baf129-0497-47ad-a4ae-2bc95ea791d3': {'window': 'mercer41 beauty offers a wide selection professional beauty salon equipment including salon styling chair , salon hair dryer , barber pole light , etc .  to showcase the unique

In [39]:
# Load the manually labeled ground-truth labels (tab-separated file).
label_df = pd.read_csv("dataset/label.csv", sep='\t')

In [40]:
label_df[(label_df['query_id'] == 0) & (label_df['product_id'] == 7465)]

Unnamed: 0,id,query_id,product_id,label
80,80,0,7465,Exact


In [64]:
label_df[(label_df['query_id'] == 0) & (label_df['product_id'] == 7468)]

Unnamed: 0,id,query_id,product_id,label
104,104,0,7468,Exact


In [65]:
label_df[(label_df['query_id'] == 0) & (label_df['product_id'] == 33691)]

Unnamed: 0,id,query_id,product_id,label
98,98,0,33691,Partial


In [75]:
# Evaluate on the first `n` benchmark queries only.
n = 10
query_df = pd.read_csv("dataset/query.csv", sep='\t')
query = query_df.iloc[:n]

In [102]:
def process_row(row, engine=None):
    """Run one benchmark query and flatten its retrieved sources into records.

    Parameters
    ----------
    row : mapping-like
        One row of the query table; must provide ``'query_id'`` and
        ``'query'``.
    engine : optional
        Any object exposing ``query(text)`` whose result has a ``metadata``
        mapping of source id -> metadata dict. Defaults to the notebook-level
        ``query_engine``. Passing it explicitly avoids depending on whichever
        engine happened to be assigned last, since that name is rebound
        several times in this notebook.

    Returns
    -------
    list of dict
        One record per retrieved source with the keys ``query_id``,
        ``query``, ``product_id``, ``product_name`` and ``product_text``.
        Empty when the response carries no source metadata.

    Raises
    ------
    KeyError
        If a source's metadata lacks ``product_id``, ``product_name`` or
        ``original_text``.
    """
    if engine is None:
        # Backward-compatible default: the engine defined at notebook level.
        engine = query_engine

    query_id = row['query_id']
    query_text = row['query']
    raw_response = engine.query(query_text)

    # Guard against a response without metadata (None) instead of failing
    # with AttributeError on `.values()`.
    source_metadata = raw_response.metadata or {}

    return [
        {
            'query_id': query_id,
            'query': query_text,
            'product_id': meta['product_id'],
            'product_name': meta['product_name'],
            'product_text': meta['original_text'],
        }
        for meta in source_metadata.values()
    ]

In [103]:
# Flatten the per-query record lists into one list of dicts.
# The earlier `apply(...).explode().tolist()` route turned an empty result
# list into a NaN entry, which pd.DataFrame cannot combine with the dict
# records; iterating directly simply contributes nothing for such a query.
results = [
    record
    for _, query_row in query.iterrows()
    for record in process_row(query_row)
]
df = pd.DataFrame(results)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx

In [104]:
df.head()

Unnamed: 0,query_id,query,product_id,product_name,product_text
0,0,salon chair,7465,hair salon chair,to showcase the uniqueness of each salon
1,0,salon chair,24010,bar salon task chair,"it features a curved backrest design , an adju..."
2,1,smart coffee table,33698,smart coffee table with storage,the sobro coffee table is designed to be plug ...
3,1,smart coffee table,33698,smart coffee table with storage,never miss a goal or a moment of the conversat...
4,2,dinosaur,27515,stegosaurus canvas art,the dinosaur on this painting looks like he 's...


### Query With MetadataReplacementPostProcessor

In [92]:
from llama_index.core.postprocessor import MetadataReplacementPostProcessor

In [93]:
# The target key is `window`, matching the key the node parser was configured with.
window_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")

query_engine = index.as_query_engine(
    similarity_top_k=2,
    node_postprocessors=[window_postprocessor],
)

In [94]:
window_response = query_engine.query(
    "salon chair"
)
print(window_response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
The product referred to as a "salon chair" is a professional salon product that is part of a wide selection of salon equipment offered to showcase the uniqueness of each salon.


In [95]:
window_response.get_formatted_sources()

'> Source (Doc id: e05f0fc2-15d3-40e1-a17f-d4ba60f6199b): offers a wide selection of professional salon products including styling chair , salon hairdryer ...\n\n> Source (Doc id: 2c2fa1b0-e694-4eb6-a734-99f06ecb18b2): the office chair is ideal for any casual or professional working area .  it provides comfortable ...'

In [96]:
window_response.metadata

{'e05f0fc2-15d3-40e1-a17f-d4ba60f6199b': {'window': 'offers a wide selection of professional salon products including styling chair , salon hairdryer , salon equipment , etc .  to showcase the uniqueness of each salon',
  'original_text': 'to showcase the uniqueness of each salon',
  'product_id': 7465,
  'product_name': 'hair salon chair'},
 '2c2fa1b0-e694-4eb6-a734-99f06ecb18b2': {'window': 'the office chair is ideal for any casual or professional working area .  it provides comfortable support with a cozy and convenient design .  carefully crafted .  it features a curved backrest design , an adjustable lift seat , and a five-star base with casters for easy mobility .',
  'original_text': 'it features a curved backrest design , an adjustable lift seat , and a five-star base with casters for easy mobility .',
  'product_id': 24010,
  'product_name': 'bar salon task chair'}}

In [97]:
label_df[(label_df['query_id'] == 0) & (label_df['product_id'] == 7465)]

Unnamed: 0,id,query_id,product_id,label
80,80,0,7465,Exact


In [100]:
# Show the first source node's context window next to its original sentence.
first_source_meta = window_response.source_nodes[0].node.metadata
window, sentence = first_source_meta["window"], first_source_meta["original_text"]

print(
    f"Window: {window}",
    "------------------",
    f"Original Sentence: {sentence}",
    sep="\n",
)

Window: offers a wide selection of professional salon products including styling chair , salon hairdryer , salon equipment , etc .  to showcase the uniqueness of each salon
------------------
Original Sentence: to showcase the uniqueness of each salon


In [98]:
label_df[(label_df['query_id'] == 0) & (label_df['product_id'] == 24010)]

Unnamed: 0,id,query_id,product_id,label
23,23,0,24010,Partial


In [101]:
# Show the second source node's context window next to its original sentence.
second_source_meta = window_response.source_nodes[1].node.metadata
window, sentence = second_source_meta["window"], second_source_meta["original_text"]

print(
    f"Window: {window}",
    "------------------",
    f"Original Sentence: {sentence}",
    sep="\n",
)

Window: the office chair is ideal for any casual or professional working area .  it provides comfortable support with a cozy and convenient design .  carefully crafted .  it features a curved backrest design , an adjustable lift seat , and a five-star base with casters for easy mobility .
------------------
Original Sentence: it features a curved backrest design , an adjustable lift seat , and a five-star base with casters for easy mobility .


### Add Re-ranker