### imports

In [22]:
import re
import json
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
import os
from fastembed import TextEmbedding,LateInteractionTextEmbedding,SparseTextEmbedding

## Load, Clean and Chunking of json

In [178]:
file_path = "sport-action-camera-mounts.json"
base_name = os.path.splitext(os.path.basename(file_path))[0]


In [179]:
with open(file_path,'r',encoding='utf-8') as fd:
    data = json.load(fd)

In [6]:
type(data)

dict

In [180]:
item_list = data["mods"]["listItems"]

In [181]:
item_list

[{'name': 'Camera Mounting Bracket Steel Video Surveillance Security Camera Mounts Wall Ceiling Mount Camera Support JIJIN',
  'nid': '407439706',
  'itemId': '407439706',
  'icons': [{'domClass': '150565',
    'type': 'img',
    'group': '6',
    'showType': '0'},
   {'domClass': '150318',
    'text': 'Gems save Rs. 146',
    'type': 'text',
    'group': '2',
    'showType': '0'}],
  'image': 'https://static-01.daraz.com.np/p/02223668a0aa904a21b38bfba8445f01.jpg',
  'isSmartImage': False,
  'utLogMap': {'srp_name': 'LazadaMainSrp',
   'x_object_type': 'item',
   'src': 'organic',
   'trafficType': 'organic',
   'x_sku_ids': '1745869525',
   'x_item_ids': '407439706',
   'iconList': '150565;150318',
   'SN': 'f96c003b2898b046e564c850a8f7bbdc',
   'current_price': '7278',
   'x_object_id': '407439706'},
  'originalPriceShow': '',
  'priceShow': 'Rs. 7,278',
  'ratingScore': '',
  'review': '',
  'location': 'Overseas',
  'description': ['1. High-hardness steel material: The monitoring b

In [10]:
type(item_list)

list

In [11]:
type(item_list[0])

dict

In [182]:
def chunks_of_each_doc(doc,chunk_size=128):
    chunks = []
    doc = json.dumps(doc)
    start = 0
    tokens = re.findall(r'\w+|[{}[\]:,",]',doc)
    while start<=len(tokens):
        end = min(start+chunk_size,len(tokens))
        chunk = " ".join(tokens[start:end])
        chunks.append(chunk)
        start+=chunk_size
    return chunks

In [183]:
all_chunks = []
for doc in item_list:
    chunks = chunks_of_each_doc(doc)
    all_chunks.append(chunks)
    

In [184]:
all_chunks

[['{ " name " : " Camera Mounting Bracket Steel Video Surveillance Security Camera Mounts Wall Ceiling Mount Camera Support JIJIN " , " nid " : " 407439706 " , " itemId " : " 407439706 " , " icons " : [ { " domClass " : " 150565 " , " type " : " img " , " group " : " 6 " , " showType " : " 0 " } , { " domClass " : " 150318 " , " text " : " Gems save Rs 146 " , " type " : " text " , " group " : " 2 " , " showType " : " 0 " } ] , " image " :',
  '" https : static 01 daraz com np p 02223668a0aa904a21b38bfba8445f01 jpg " , " isSmartImage " : false , " utLogMap " : { " srp_name " : " LazadaMainSrp " , " x_object_type " : " item " , " src " : " organic " , " trafficType " : " organic " , " x_sku_ids " : " 1745869525 " , " x_item_ids " : " 407439706 " , " iconList " : " 150565 150318 " , " SN " : " f96c003b2898b046e564c850a8f7bbdc " , " current_price " : " 7278 " , " x_object_id " : " 407439706 " } , " originalPriceShow " : " " , " priceShow " : " Rs 7 , 278 " , " ratingScore " :',
  '" " , "

## Qdrant Operations

In [25]:
collection_name = "Reranking_Hybrid_Search"
dense_encoder = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
sparse_encoder = SparseTextEmbedding("Qdrant/bm25")
late_colbert_embedder = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 5 files: 100%|██████████| 5/5 [00:41<00:00,  8.37s/it]


In [26]:
client = QdrantClient(url = "http://localhost:6333")

In [33]:
dense_encoder.embedding_size

384

In [74]:
late_colbert_embedder.embedding_size

128

In [98]:
if not client.collection_exists(collection_name = collection_name):
    client.create_collection(
        collection_name = collection_name,
        vectors_config= {
            "dense" : models.VectorParams(
                size = dense_encoder.embedding_size,
                distance= models.Distance.COSINE
            ),
            "lateInteraction" : models.VectorParams(
                size = late_colbert_embedder.embedding_size,
                distance = models.Distance.COSINE,
                multivector_config=models.MultiVectorConfig(
                    comparator=models.MultiVectorComparator.MAX_SIM
                ),
                hnsw_config=models.HnswConfigDiff(m=0)
            )
        },
        sparse_vectors_config={"sparse": models.SparseVectorParams(modifier = models.Modifier.IDF)}
        )
    

In [110]:
client.create_payload_index(
    collection_name=collection_name,
    field_name = "doc_id",
    field_schema = "integer"
)

UpdateResult(operation_id=263, status=<UpdateStatus.COMPLETED: 'completed'>)

In [111]:
client.create_payload_index(
    collection_name=collection_name,
    field_name = "chunk_id",
    field_schema = "integer"
)

UpdateResult(operation_id=265, status=<UpdateStatus.COMPLETED: 'completed'>)

In [112]:
client.create_payload_index(
    collection_name=collection_name,
    field_name = "file_name",
    field_schema = "keyword"
)

UpdateResult(operation_id=267, status=<UpdateStatus.COMPLETED: 'completed'>)

In [42]:
len(all_chunks)

40

In [185]:
all_chunks[0]

['{ " name " : " Camera Mounting Bracket Steel Video Surveillance Security Camera Mounts Wall Ceiling Mount Camera Support JIJIN " , " nid " : " 407439706 " , " itemId " : " 407439706 " , " icons " : [ { " domClass " : " 150565 " , " type " : " img " , " group " : " 6 " , " showType " : " 0 " } , { " domClass " : " 150318 " , " text " : " Gems save Rs 146 " , " type " : " text " , " group " : " 2 " , " showType " : " 0 " } ] , " image " :',
 '" https : static 01 daraz com np p 02223668a0aa904a21b38bfba8445f01 jpg " , " isSmartImage " : false , " utLogMap " : { " srp_name " : " LazadaMainSrp " , " x_object_type " : " item " , " src " : " organic " , " trafficType " : " organic " , " x_sku_ids " : " 1745869525 " , " x_item_ids " : " 407439706 " , " iconList " : " 150565 150318 " , " SN " : " f96c003b2898b046e564c850a8f7bbdc " , " current_price " : " 7278 " , " x_object_id " : " 407439706 " } , " originalPriceShow " : " " , " priceShow " : " Rs 7 , 278 " , " ratingScore " :',
 '" " , " re

In [186]:
all_embeds = []
for i in range(len(all_chunks)):
    dense_embed = list(dense_encoder.embed(chunk for chunk in all_chunks[i]))
    all_embeds.append(dense_embed)

In [156]:
len(all_embeds[39][0])

384

In [187]:
all_sparse_embeds = []
for i in range(len(all_chunks)):
    sparse_embeds = list(sparse_encoder.embed(chunk for chunk in all_chunks[i]))
    all_sparse_embeds.append(sparse_embeds)

In [67]:
len(all_sparse_embeds)

40

In [188]:
all_sparse_embeds[0][0]

SparseEmbedding(values=array([1.51601615, 1.91239389, 1.91239389, 1.51601615, 1.51601615,
       1.51601615, 1.51601615, 1.51601615, 1.51601615, 1.51601615,
       1.51601615, 1.51601615, 1.51601615, 1.79505976, 1.51601615,
       1.51601615, 1.79505976, 1.51601615, 1.79505976, 1.51601615,
       1.79505976, 1.51601615, 1.79505976, 1.79505976, 1.51601615,
       1.79505976, 1.51601615, 1.51601615, 1.51601615, 1.51601615,
       1.51601615, 1.51601615]), indices=array([ 609270800,  841855028, 1470026668, 1133660292,  778187652,
        639288724, 1651761900,   84077638, 1946547886, 2127882423,
       1100855371, 1760930035, 1857907475,  286560239,  655156632,
        784623807, 1805940582, 1389699125, 2045916435, 1180690720,
       1021187622,  670727360, 2146037506,  764297089,  120528566,
        970674652, 2096753742,  575623047, 1272204281,  365389727,
         19522071,  301030427]))

In [189]:
all_colbert_embeds = []
for i in range(len(all_chunks)):
    colbert_embeds = list(late_colbert_embedder.embed(chunk for chunk in all_chunks[i]))
    all_colbert_embeds.append(colbert_embeds)

In [175]:
len(all_colbert_embeds)

40

In [118]:
len(all_colbert_embeds[0])

5

In [190]:
all_colbert_embeds[0][0]

array([[ 0.01626336, -0.0484901 , -0.0169001 , ...,  0.16917203,
        -0.00306627,  0.08282709],
       [ 0.00811026, -0.06637967,  0.02650257, ...,  0.08633719,
         0.0118856 ,  0.00654055],
       [ 0.28889728, -0.05278518, -0.0204819 , ..., -0.029353  ,
         0.02332336, -0.07939331],
       ...,
       [ 0.18275379, -0.01257339, -0.03911347, ...,  0.03938904,
         0.07022741,  0.04409259],
       [ 0.09066415, -0.07528304,  0.06078962, ...,  0.03172921,
         0.1558329 ,  0.05830738],
       [-0.04202987, -0.08008897, -0.00846908, ...,  0.07373112,
         0.0381279 ,  0.08524797]], shape=(66, 128), dtype=float32)

In [191]:
#embeddings
offset = 0
doc_offset = 0
info = client.get_collection(collection_name=collection_name)

count = info.points_count
if(count!=0):
    res, _ = client.scroll(
        collection_name = collection_name,
        limit = 1,
        with_payload=True,
        with_vectors=False,
        order_by={
            "key" : "chunk_id",
            "direction" : "desc"
        }
    )
    if(res):
        doc_number = res[0].payload.get("doc_id")
        last_id = res[0].id
        offset = last_id + 1
        doc_offset = doc_number + 1
    else:
        offset = 0
        doc_offset = 0
else:
    offset = 0
    doc_offset = 0

for doc in range (len(all_chunks)):
    for idx in range(len(all_chunks[doc])):
        client.upsert(
            collection_name = collection_name,
            points = [
                models.PointStruct(
                    id = offset,
                    payload = {
                        "doc_id" : doc_offset,
                        "chunk_id" : offset,
                        "chunk": all_chunks[doc][idx],
                        "file_name" : base_name
                    },
                    vector = {
                        "dense" : all_embeds[doc][idx],
                        "sparse" : all_sparse_embeds[doc][idx].as_object(),
                        "lateInteraction" : all_colbert_embeds[doc][idx]
                    }
                )
            ]
        )
        offset+=1
    doc_offset+=1

## Retrieval


In [199]:
query = "Samsung Products"

In [None]:
#Conversion of query into dense,sparse and multi vector(late interaction) with colbert


In [200]:
dense_query = next(dense_encoder.query_embed(query))
sparse_query = next(sparse_encoder.query_embed(query))
late_query = next(late_colbert_embedder.query_embed(query))

In [79]:
len(dense_query)

384

In [82]:
len(sparse_query.as_object())

2

In [201]:
sparse_query

SparseEmbedding(values=array([1, 1], dtype=int32), indices=array([ 818319522, 2002612373], dtype=int32))

In [202]:
late_query

array([[ 0.02266615,  0.0887066 , -0.09786274, ...,  0.03623376,
        -0.04266724,  0.10006028],
       [-0.08162071,  0.14767797,  0.04735919, ..., -0.0572216 ,
         0.02271252,  0.10062034],
       [-0.04956964,  0.16446337,  0.07586619, ..., -0.10789849,
         0.02131145,  0.06859025],
       ...,
       [ 0.07434604,  0.01111738, -0.14294711, ..., -0.0062704 ,
        -0.06423527,  0.04614387],
       [ 0.0679182 ,  0.01358873, -0.14556439, ..., -0.00658098,
        -0.06776115,  0.05981359],
       [ 0.06943252,  0.01398829, -0.14055593, ..., -0.02898094,
        -0.07032734,  0.07387357]], shape=(32, 128), dtype=float32)

In [203]:
prefetch = [
    models.Prefetch(query = dense_query,
    using = "dense",
    limit = 20,
    ),
    models.Prefetch(query = models.SparseVector(**sparse_query.as_object()),
                    using = "sparse",
                    limit = 20)
]

In [204]:
query_results = client.query_points(
    collection_name = collection_name,
    prefetch=prefetch,
    query = late_query,
    using = "lateInteraction",
    limit = 5,
    with_payload = True,
    with_vectors= False
)

In [205]:
query_results

QueryResponse(points=[ScoredPoint(id=228, version=228, score=21.662779, payload={'doc_id': 34, 'chunk_id': 228, 'chunk': '" : " www daraz com np products samsung galaxy a26 samsung a36 samsung a56 camera glass screen protector i365900189 html " , " querystring " : " fs_ab 1 priceCompare skuId 3A1620626496 3Bsource 3Alazada search voucher 3Bsn 3Afeb7c324dba78761f11a6be643f46dc6 3BoriginPrice 3A49900 3BdisplayPrice 3A49900 3BsinglePromotionId 3A 1 3BsingleToolCode 3AmockedSalePrice 3BvoucherPricePlugin 3A0 3Btimestamp 3A1758872502749 c ratingscore 5 0 freeshipping 0 source search channelLpJumpArgs fuse_fs search 1 sale 4 price 499 review 1 location Bagmati 20Province stock 1 lang en request_id feb7c324dba78761f11a6be643f46dc6 clickTrackInfo query 253Acamera 253Bnid 253A365900189 253Bsrc 253ALazadaMainSrp 253Brn 253Afeb7c324dba78761f11a6be643f46dc6 253Bregion 253Anp 253Bsku 253A365900189_NP 253Bprice 253A499 253Bclient 253Adesktop 253Bsupplier_id 253A900152283013 253Bbiz_source 253Ah5_unk

In [94]:
query_results[0]

TypeError: 'QueryResponse' object is not subscriptable

In [93]:
for point in query_results:
    print("score = ",point.score, "Point_id=", point.id, "doc_id = ", point.payload.get("doc_id"), "file_name = ",point.payload.get("file_name"), "chunk_id = " , point.payload.get("chunk_id"))

AttributeError: 'tuple' object has no attribute 'score'

In [207]:
client.scroll(
    collection_name=collection_name,
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(key="doc_id", match=models.MatchValue(value=14)),
        ]
    ),
    limit=10,
    with_payload=True,
    with_vectors=False,
)

([Record(id=90, payload={'doc_id': 14, 'chunk_id': 90, 'chunk': '{ " name " : " For Samsung Galaxy A16 5G 9H Rear Camera Lens Aluminum Alloy Ring Tempered Glass Film Black " , " nid " : " 188477533 " , " itemId " : " 188477533 " , " icons " : [ { " domClass " : " 150564 " , " type " : " img " , " group " : " 6 " , " showType " : " 0 " } , { " domClass " : " 175175 " , " type " : " img " , " group " : " 3 " , " showType " : " 0 " } ] , " image " : " https : static 01 daraz com np p bc8d479cd058b669424939a4b3167e2a', 'file_name': 'camera-screen-guards'}, vector=None, shard_key=None, order_value=None),
  Record(id=91, payload={'doc_id': 14, 'chunk_id': 91, 'chunk': 'jpg " , " isSmartImage " : false , " utLogMap " : { " srp_name " : " LazadaMainSrp " , " x_object_type " : " item " , " src " : " organic " , " trafficType " : " organic " , " x_sku_ids " : " 1231381669 " , " x_item_ids " : " 188477533 " , " iconList " : " 150564 175175 " , " SN " : " feb7c324dba78761f11a6be643f46dc6 " , " cur

## Using Gemini API Calls

In [226]:
from google import genai
import os
from dotenv import load_dotenv
from google.genai import types

In [228]:
load_dotenv()
def chat(model_name:str, question:str):
    client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
    response = client.models.generate_content(
    model = model_name, contents=question,
    config=types.GenerateContentConfig(thinking_config = types.ThinkingConfig(thinkingBudget=0))
    )
    print(response)



In [229]:

question = input("Enter your question here: ")
model = input("Which model do you want to use: ")
chat(model,question)

sdk_http_response=HttpResponse(
  headers=<dict len=11>
) candidates=[Candidate(
  content=Content(
    parts=[
      Part(
        text="""Google does not publicly disclose the specific context window size for Gemini-2.5-flash-lite.

Here's why and what we can generally infer:

*   **Proprietary Information:** The exact context window (and other architectural details) for their models is usually considered proprietary and a competitive advantage.
*   **Varies by Task/API:** Sometimes, the effective context window can vary depending on the specific API call, the task, and the user's quota. They might offer different versions or configurations that allow for slightly different lengths.
*   **"Flash" Implies Efficiency:** The "flash" in its name suggests that it's optimized for speed and efficiency. This often comes with a trade-off, where the context window might be shorter than their "Pro" or "Ultra" models, but still substantial enough for many common applications.
*   **General Trend