### imports

In [22]:
import re
import json
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
import os
from fastembed import TextEmbedding,LateInteractionTextEmbedding,SparseTextEmbedding

## Load, Clean and Chunking of json

In [3]:
file_path = "camera-screen-guards.json"
base_name = os.path.splitext(os.path.basename(file_path))[0]


In [4]:
with open(file_path,'r',encoding='utf-8') as fd:
    data = json.load(fd)

In [6]:
type(data)

dict

In [9]:
item_list = data["mods"]["listItems"]

In [None]:
item_list

In [10]:
type(item_list)

list

In [11]:
type(item_list[0])

dict

In [19]:
def chunks_of_each_doc(doc,chunk_size=128):
    chunks = []
    doc = json.dumps(doc)
    start = 0
    tokens = re.findall(r'\w+|[{}[\]:,",]',doc)
    while start<=len(tokens):
        end = min(start+chunk_size,len(tokens))
        chunk = " ".join(tokens[start:end])
        chunks.append(chunk)
        start+=chunk_size
    return chunks

In [20]:
all_chunks = []
for doc in item_list:
    chunks = chunks_of_each_doc(doc)
    all_chunks.append(chunks)
    

In [21]:
all_chunks

[['{ " name " : " Redmi Note 14 5g Camera Protector Mettalic Ring Glass " , " nid " : " 361653530 " , " itemId " : " 361653530 " , " icons " : [ { " domClass " : " 150565 " , " type " : " img " , " group " : " 6 " , " showType " : " 0 " } , { " domClass " : " 175175 " , " type " : " img " , " group " : " 3 " , " showType " : " 0 " } ] , " image " : " https : static 01 daraz com np p 59a3b411d874aeeea2f115c998b18143 jpg " , " isSmartImage " :',
  'false , " utLogMap " : { " srp_name " : " LazadaMainSrp " , " x_object_type " : " item " , " src " : " organic " , " trafficType " : " organic " , " x_sku_ids " : " 1592835298 " , " x_item_ids " : " 361653530 " , " iconList " : " 150565 175175 " , " SN " : " feb7c324dba78761f11a6be643f46dc6 " , " current_price " : " 239 " , " x_object_id " : " 361653530 " } , " originalPriceShow " : " " , " priceShow " : " Rs 239 " , " ratingScore " : " 4 4 " , " review " : " 5 " , " location " : " Bagmati',
  'Province " , " description " : [ ] , " thumbs " :

## Qdrant Operations

In [25]:
collection_name = "Reranking_Hybrid_Search"
dense_encoder = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
sparse_encoder = SparseTextEmbedding("Qdrant/bm25")
late_colbert_embedder = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 5 files: 100%|██████████| 5/5 [00:41<00:00,  8.37s/it]


In [26]:
client = QdrantClient(url = "http://localhost:6333")

In [33]:
dense_encoder.embedding_size

384

In [74]:
late_colbert_embedder.embedding_size

128

In [98]:
if not client.collection_exists(collection_name = collection_name):
    client.create_collection(
        collection_name = collection_name,
        vectors_config= {
            "dense" : models.VectorParams(
                size = dense_encoder.embedding_size,
                distance= models.Distance.COSINE
            ),
            "lateInteraction" : models.VectorParams(
                size = late_colbert_embedder.embedding_size,
                distance = models.Distance.COSINE,
                multivector_config=models.MultiVectorConfig(
                    comparator=models.MultiVectorComparator.MAX_SIM
                ),
                hnsw_config=models.HnswConfigDiff(m=0)
            )
        },
        sparse_vectors_config={"sparse": models.SparseVectorParams(modifier = models.Modifier.IDF)}
        )
    

In [35]:
client.create_payload_index(
    collection_name=collection_name,
    field_name = "doc_id",
    field_schema = "integer"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [36]:
client.create_payload_index(
    collection_name=collection_name,
    field_name = "chunk_id",
    field_schema = "integer"
)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [37]:
client.create_payload_index(
    collection_name=collection_name,
    field_name = "file_name",
    field_schema = "keyword"
)

UpdateResult(operation_id=5, status=<UpdateStatus.COMPLETED: 'completed'>)

In [42]:
len(all_chunks)

40

In [43]:
all_chunks[0]

['{ " name " : " Redmi Note 14 5g Camera Protector Mettalic Ring Glass " , " nid " : " 361653530 " , " itemId " : " 361653530 " , " icons " : [ { " domClass " : " 150565 " , " type " : " img " , " group " : " 6 " , " showType " : " 0 " } , { " domClass " : " 175175 " , " type " : " img " , " group " : " 3 " , " showType " : " 0 " } ] , " image " : " https : static 01 daraz com np p 59a3b411d874aeeea2f115c998b18143 jpg " , " isSmartImage " :',
 'false , " utLogMap " : { " srp_name " : " LazadaMainSrp " , " x_object_type " : " item " , " src " : " organic " , " trafficType " : " organic " , " x_sku_ids " : " 1592835298 " , " x_item_ids " : " 361653530 " , " iconList " : " 150565 175175 " , " SN " : " feb7c324dba78761f11a6be643f46dc6 " , " current_price " : " 239 " , " x_object_id " : " 361653530 " } , " originalPriceShow " : " " , " priceShow " : " Rs 239 " , " ratingScore " : " 4 4 " , " review " : " 5 " , " location " : " Bagmati',
 'Province " , " description " : [ ] , " thumbs " : [ 

In [60]:
all_embeds = []
for i in range(len(all_chunks)):
    dense_embed = list(dense_encoder.embed(chunk for chunk in all_chunks[i]))
    all_embeds.append(dense_embed)

In [65]:
len(all_embeds[39][0])

384

In [66]:
all_sparse_embeds = []
for i in range(len(all_chunks)):
    sparse_embeds = list(sparse_encoder.embed(chunk for chunk in all_chunks[i]))
    all_sparse_embeds.append(sparse_embeds)

In [67]:
len(all_sparse_embeds)

40

In [88]:
all_sparse_embeds[0][0]

SparseEmbedding(values=array([1.51969779, 1.51969779, 1.51969779, 1.51969779, 1.51969779,
       1.51969779, 1.51969779, 1.51969779, 1.51969779, 1.51969779,
       1.51969779, 1.79763805, 1.51969779, 1.51969779, 1.79763805,
       1.51969779, 1.79763805, 1.79763805, 1.79763805, 1.51969779,
       1.79763805, 1.79763805, 1.51969779, 1.51969779, 1.51969779,
       1.51969779, 1.51969779, 1.51969779, 1.51969779, 1.51969779,
       1.51969779, 1.51969779, 1.51969779, 1.51969779, 1.51969779]), indices=array([ 609270800, 1778632963, 1062750627, 1749031367, 1059329584,
        841855028, 1456707543,   11975937, 1787999043, 1048581230,
       1857907475, 1016489826,  655156632,  784623807, 1805940582,
       1389699125, 2045916435, 1180690720, 1021187622,  670727360,
       2146037506,  764297089,  571576912,  264741300,  301030427,
        494071086, 1733861963, 1642882560, 1394524943, 1746685302,
       1962653560, 1737498726, 1740037055,  492009405,  171018802]))

In [68]:
all_colbert_embeds = []
for i in range(len(all_chunks)):
    colbert_embeds = list(late_colbert_embedder.embed(chunk for chunk in all_chunks[i]))
    all_colbert_embeds.append(colbert_embeds)

In [69]:
len(all_colbert_embeds)

40

In [70]:
len(all_colbert_embeds[0])

6

In [72]:
all_colbert_embeds[0][0]

array([[ 0.00251025, -0.01207562, -0.04687465, ...,  0.18435912,
         0.03935328,  0.14785266],
       [ 0.012442  , -0.02606458,  0.02447613, ...,  0.12791172,
         0.10145351,  0.1175167 ],
       [ 0.26488927, -0.02670211, -0.01263453, ...,  0.00032795,
         0.02624566, -0.04959461],
       ...,
       [ 0.01959956, -0.16498293, -0.05470958, ...,  0.20559762,
         0.00072175,  0.08538444],
       [ 0.00889958, -0.07320152, -0.10042106, ...,  0.17369933,
         0.10223102,  0.05024001],
       [-0.03807788, -0.05215594, -0.00461329, ...,  0.10503891,
         0.05580763,  0.08187871]], shape=(97, 128), dtype=float32)

In [99]:
#embeddings
offset = 0
doc_offset = 0
info = client.get_collection(collection_name=collection_name)

count = info.points_count
if(count!=0):
    res, _ = client.scroll(
        collection_name = collection_name,
        limit = 1,
        with_payload=True,
        with_vectors=False,
        order_by={
            "key" : "chunk_id",
            "direction" : "desc"
        }
    )
    if(res):
        doc_number = res[0].payload.get("doc_id")
        last_id = res[0].id
        offset = last_id + 1
        doc_offset = doc_number + 1
    else:
        offset = 0
        doc_offset = 0
else:
    offset = 0
    doc_offset = 0

for doc in range (len(all_chunks)):
    for idx in range(len(all_chunks[doc])):
        client.upsert(
            collection_name = collection_name,
            points = [
                models.PointStruct(
                    id = offset,
                    payload = {
                        "doc_id" : doc_offset,
                        "chunk_id" : offset,
                        "chunk": all_chunks[doc][idx],
                        "file_name" : base_name
                    },
                    vector = {
                        "dense" : all_embeds[doc][idx],
                        "sparse" : all_sparse_embeds[doc][idx].as_object(),
                        "lateInteraction" : all_colbert_embeds[doc][idx]
                    }
                )
            ]
        )
        offset+=1
    doc_offset+=1

## Retrieval


In [75]:
query = "Redmi Note 14 (5g) Camera Protector Mettalic Ring Glass"

In [None]:
#Conversion of query into dense,sparse and multi vector(late interaction) with colbert


In [85]:
dense_query = next(dense_encoder.query_embed(query))
sparse_query = next(sparse_encoder.query_embed(query))
late_query = next(late_colbert_embedder.query_embed(query))

In [79]:
len(dense_query)

384

In [82]:
len(sparse_query.as_object())

2

In [83]:
sparse_query

SparseEmbedding(values=array([1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32), indices=array([  11975937, 1778632963, 1062750627, 1787999043, 1749031367,
       1048581230, 1059329584,  841855028, 1456707543], dtype=int32))

In [86]:
late_query

array([[ 0.00888275,  0.07050345, -0.01304437, ...,  0.14082532,
        -0.00021858,  0.1043042 ],
       [ 0.03422824,  0.08853906,  0.08569521, ...,  0.04264636,
        -0.07035516,  0.05971263],
       [ 0.01190052,  0.06758404,  0.12808558, ...,  0.03874952,
        -0.07684994,  0.03781619],
       ...,
       [-0.09446603, -0.01095389,  0.0261399 , ..., -0.01319735,
        -0.02223087,  0.04420255],
       [-0.09299886, -0.1114521 , -0.15131466, ..., -0.14390084,
         0.00923404,  0.03162364],
       [-0.07768752, -0.10909093, -0.15863313, ..., -0.1447693 ,
         0.00255403,  0.01892057]], shape=(32, 128), dtype=float32)

In [91]:
prefetch = [
    models.Prefetch(query = dense_query,
    using = "dense",
    limit = 20,
    ),
    models.Prefetch(query = models.SparseVector(**sparse_query.as_object()),
                    using = "sparse",
                    limit = 20)
]

In [92]:
query_results = client.query_points(
    collection_name = collection_name,
    prefetch=prefetch,
    query = late_query,
    using = "lateInteraction",
    limit = 5,
    with_payload = True,
    with_vectors= False
)

In [100]:
query_results

QueryResponse(points=[ScoredPoint(id=0, version=6, score=27.49305, payload={'doc_id': [0, 0], 'chunk_id': 0, 'chunk': '{ " name " : " Redmi Note 14 5g Camera Protector Mettalic Ring Glass " , " nid " : " 361653530 " , " itemId " : " 361653530 " , " icons " : [ { " domClass " : " 150565 " , " type " : " img " , " group " : " 6 " , " showType " : " 0 " } , { " domClass " : " 175175 " , " type " : " img " , " group " : " 3 " , " showType " : " 0 " } ] , " image " : " https : static 01 daraz com np p 59a3b411d874aeeea2f115c998b18143 jpg " , " isSmartImage " :', 'file_name': 'camera-screen-guards'}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=19, version=25, score=26.960112, payload={'doc_id': [0, 3], 'chunk_id': 19, 'chunk': '{ " name " : " Redmi Note 14 Pro 5g Camera Protector Mettalic Ring Glass " , " nid " : " 361670427 " , " itemId " : " 361670427 " , " icons " : [ { " domClass " : " 150565 " , " type " : " img " , " group " : " 6 " , " showType " : " 0 " } , { " dom

In [94]:
query_results[0]

TypeError: 'QueryResponse' object is not subscriptable

In [93]:
for point in query_results:
    print("score = ",point.score, "Point_id=", point.id, "doc_id = ", point.payload.get("doc_id"), "file_name = ",point.payload.get("file_name"), "chunk_id = " , point.payload.get("chunk_id"))

AttributeError: 'tuple' object has no attribute 'score'

In [102]:
client.scroll(
    collection_name=collection_name,
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(key="doc_id", match=models.MatchValue(value=0)),
        ]
    ),
    limit=10,
    with_payload=True,
    with_vectors=False,
)

([Record(id=0, payload={'doc_id': 0, 'chunk_id': 0, 'chunk': '{ " name " : " Redmi Note 14 5g Camera Protector Mettalic Ring Glass " , " nid " : " 361653530 " , " itemId " : " 361653530 " , " icons " : [ { " domClass " : " 150565 " , " type " : " img " , " group " : " 6 " , " showType " : " 0 " } , { " domClass " : " 175175 " , " type " : " img " , " group " : " 3 " , " showType " : " 0 " } ] , " image " : " https : static 01 daraz com np p 59a3b411d874aeeea2f115c998b18143 jpg " , " isSmartImage " :', 'file_name': 'camera-screen-guards'}, vector=None, shard_key=None, order_value=None),
  Record(id=1, payload={'doc_id': 0, 'chunk_id': 1, 'chunk': 'false , " utLogMap " : { " srp_name " : " LazadaMainSrp " , " x_object_type " : " item " , " src " : " organic " , " trafficType " : " organic " , " x_sku_ids " : " 1592835298 " , " x_item_ids " : " 361653530 " , " iconList " : " 150565 175175 " , " SN " : " feb7c324dba78761f11a6be643f46dc6 " , " current_price " : " 239 " , " x_object_id " : "