In [3]:
import pandas as pd

sales_df = pd.read_csv('../data/salesItem.csv', sep=';')
sales_df = sales_df[:1000]
sales_df.head()



Unnamed: 0,SalesItemId,SalesItemCode,SalesItemName,ActiveIngredientsName,Formularium,OrganizationId
0,19000000062,MCD00062,2 MTM BODY LOTION (DR MUTTY),KOSMETIK,NORMAL,19
1,100000000146092,KJD00002,2-4 (ZWAVEL ZALF) 30GRAM - N/A,"SULFUR, SALICYLIC ACID",NORMAL,331
2,100000000146091,KJD00001,2-4 ZALF /GRAM - N/A,"SULFUR, SALICYLIC ACID",NORMAL,3531
3,100000000142499,DT00146R,3TC 150MG TAB,LAMIVUDINE,NORMAL,317273243
4,117701,LVD00001,3TC-HBV 100MG TAB,LAMIVUDINE,NORMAL,234561014151719323739


In [2]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "intfloat/multilingual-e5-large",
    trust_remote_code=True
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
item_embeddings = model.encode(
    sales_df["SalesItemName"].tolist()
)

combined_texts = [
    f"{q}\n{a}" for q, a in zip(sales_df["SalesItemName"], sales_df["ActiveIngredientsName"])
]

combined_embeddings = model.encode(combined_texts)

item_contain_texts = [
    f"item {item} contain {active}"
    for item, active in zip(sales_df["SalesItemName"], sales_df["ActiveIngredientsName"])
]
item_contain_embeddings = model.encode(item_contain_texts)

print("Question embeddings shape:", item_embeddings.shape)
print("Combined embeddings shape:", combined_embeddings.shape)
print("Item SalesItemName Contain ActiveIngredientsName embeddings shape:", item_contain_embeddings.shape)


Question embeddings shape: (1000, 1024)
Combined embeddings shape: (1000, 1024)
Item SalesItemName Contain ActiveIngredientsName embeddings shape: (1000, 1024)


In [5]:
item_embeddings

array([[ 0.01636285,  0.02247839, -0.04719856, ...,  0.01911049,
        -0.02705739, -0.01888753],
       [ 0.01602141,  0.02800992, -0.0178182 , ..., -0.01012114,
        -0.0473279 ,  0.00737774],
       [ 0.03043888,  0.01910612, -0.01876098, ...,  0.00026839,
        -0.03432632,  0.00722637],
       ...,
       [ 0.02297836,  0.0040061 , -0.01395704, ..., -0.00854408,
        -0.02498958,  0.00151528],
       [ 0.01183954,  0.00848651, -0.03614064, ..., -0.0005096 ,
        -0.02317958, -0.0076756 ],
       [ 0.00805693, -0.00040696, -0.02758144, ..., -0.0135575 ,
        -0.02165226, -0.0047709 ]], dtype=float32)

In [6]:
from qdrant_client import QdrantClient, models

dimensions = 1024
distance = models.Distance.COSINE

client = QdrantClient("http://localhost:6333")
# client.delete_collection("sales-item")
client.create_collection(
    collection_name="sales-item",
    vectors_config={
        "item": models.VectorParams(
            size=dimensions,
            distance=distance,
        ),
        "combined": models.VectorParams(
            size=dimensions,
            distance=distance,
        ),
        "item_contain": models.VectorParams(
            size=dimensions,
            distance=distance,
        ),
    },
    optimizers_config=models.OptimizersConfigDiff(
        default_segment_number=2,
        indexing_threshold=100,
    ),
)


True

In [7]:
client.upload_collection(
    collection_name="sales-item",
    vectors={
        "item": item_embeddings,
        "combined": combined_embeddings,
        "item_contain": item_contain_embeddings
    },
    payload=sales_df.to_dict(orient="records"),
    ids=sales_df.index.tolist(),
    batch_size=64,
)


In [8]:
client.count("sales-item")

CountResult(count=1000)

In [9]:
import time

time.sleep(1.0)
collection = client.get_collection("sales-item")
while collection.status != models.CollectionStatus.GREEN:
    time.sleep(1.0)
    collection = client.get_collection("sales-item")
    
collection


CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=3000, points_count=1000, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors={'combined': VectorParams(size=1024, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), 'item': VectorParams(size=1024, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), 'item_contain': VectorParams(size=1024, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None)}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000

In [21]:
import pandas as pd
from collections import defaultdict
from ranx import Qrels, Run, compare

# 1. Prepare Qrels (Ground Truth)
# -------------------------------
# Each question should retrieve its own FAQ entry
qrels_data = [{
    "query_id": f"q_{idx}",
    "doc_id": str(idx),
    "score": 10  # Exact match
} for idx in sales_df.index]

qrels = Qrels.from_df(
    pd.DataFrame(qrels_data),
    q_id_col="query_id",
    doc_id_col="doc_id",
    score_col="score",
)

# 2. Encode Questions
# -------------------
item_embeddings = model.encode(sales_df["SalesItemName"].tolist())

# 3. Create Search Runs
# ---------------------
item_run_dict = defaultdict(dict)
combined_run_dict = defaultdict(dict)
item_contain_run_dict = defaultdict(dict)

for idx, (q_emb, row) in enumerate(zip(item_embeddings, sales_df.iterrows())):
    query_id = f"q_{idx}"
    
    # Search against SalesItemName vectors
    item_results = client.search(
        collection_name="sales-item",
        query_vector=models.NamedVector(
            name="item",
            vector=q_emb.tolist()
        ),
        limit=20,
    )
    for hit in item_results:
        item_run_dict[query_id][str(hit.id)] = hit.score
    
    # Search against SalesItemName ActiveIngredientsName vectors
    combined_results = client.search(
        collection_name="sales-item",
        query_vector=models.NamedVector(
            name="combined",
            vector=q_emb.tolist()
        ),
        limit=20,
    )
    for hit in combined_results:
        combined_run_dict[query_id][str(hit.id)] = hit.score

    # Search against "item SalesItemName contain ActiveIngredientsName" vectors
    item_contain_results = client.search(
        collection_name="sales-item",
        query_vector=models.NamedVector(
            name="item_contain", 
            vector=q_emb.tolist()
        ),
        limit=20,
    )
    for hit in item_contain_results:
        item_contain_run_dict[query_id][str(hit.id)] = hit.score

# 4. Create Run Objects
# ---------------------
item_run = Run(item_run_dict, name="item_vectors")
combined_run_run = Run(combined_run_dict, name="combined_vectors")
item_contain_run = Run(item_contain_run_dict, name="item_contain_vector")

# 5. Compare at k=20
# ------------------
report = compare(
    qrels=qrels,
    runs=[item_run, combined_run_run, item_contain_run],
    metrics=["precision@20", "recall@20", "mrr@20", "ndcg@20"],
)

print(report)

  item_results = client.search(
  combined_results = client.search(
  item_contain_results = client.search(


#    Model                  P@20    Recall@20  MRR@20    NDCG@20
---  -------------------  ------  -----------  --------  ---------
a    item_vectors           0.05            1  1.000ᵇᶜ   1.000ᵇᶜ
b    combined_vectors       0.05            1  0.987     0.991
c    item_contain_vector    0.05            1  0.995ᵇ    0.996ᵇ
