# Retrieval of Document and Chunks from the JSON file

In [7]:
import os
import json
import re


In [2]:
from qdrant_client import QdrantClient,models
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
with open("camera-screen-guards.json",'r',encoding="utf-8") as f:
    data = json.load(f) #retrievs a dictionary from the json file

In [9]:
items_list = data["mods"]["listItems"]

In [10]:
type(items_list)

list

In [11]:
items_list[0]

{'name': 'Redmi Note 14 (5g) Camera Protector Mettalic Ring Glass',
 'nid': '361653530',
 'itemId': '361653530',
 'icons': [{'domClass': '150565',
   'type': 'img',
   'group': '6',
   'showType': '0'},
  {'domClass': '175175', 'type': 'img', 'group': '3', 'showType': '0'}],
 'image': 'https://static-01.daraz.com.np/p/59a3b411d874aeeea2f115c998b18143.jpg',
 'isSmartImage': False,
 'utLogMap': {'srp_name': 'LazadaMainSrp',
  'x_object_type': 'item',
  'src': 'organic',
  'trafficType': 'organic',
  'x_sku_ids': '1592835298',
  'x_item_ids': '361653530',
  'iconList': '150565;175175',
  'SN': 'feb7c324dba78761f11a6be643f46dc6',
  'current_price': '239',
  'x_object_id': '361653530'},
 'originalPriceShow': '',
 'priceShow': 'Rs. 239',
 'ratingScore': '4.4',
 'review': '5',
 'location': 'Bagmati Province',
 'description': [],
 'thumbs': [],
 'sellerName': 'Zap Home',
 'sellerId': '900224992449',
 'brandName': 'No Brand',
 'brandId': '39704',
 'cheapest_sku': '361653530_NP-1592835298',
 'sk

In [3]:
encoder = SentenceTransformer("all-MiniLM-L6-v2")

In [12]:
#configuration
json_file_path = "camera-screen-guards.json"
base_name = os.path.splitext(os.path.basename(json_file_path))[0]
chunk_size = 128


In [13]:
def chunking_each_doc(doc,chunk_size=chunk_size):
    start = 0
    doc = json.dumps(doc)
    tokens = re.findall(r'\w+|[{}[\]:,",]',doc)
    chunks = []
    while start<=len(tokens):
        end = min(start+chunk_size,len(tokens))
        chunk = " ".join(tokens[start:end])
        chunks.append(chunk)
        start+=chunk_size
    return chunks

In [14]:
all_chunks = []
for item in items_list:
    chunks_of_each_doc = chunking_each_doc(item,chunk_size=chunk_size)
    all_chunks.append(chunks_of_each_doc)

In [15]:
for item in items_list:
    print(item)

{'name': 'Redmi Note 14 (5g) Camera Protector Mettalic Ring Glass', 'nid': '361653530', 'itemId': '361653530', 'icons': [{'domClass': '150565', 'type': 'img', 'group': '6', 'showType': '0'}, {'domClass': '175175', 'type': 'img', 'group': '3', 'showType': '0'}], 'image': 'https://static-01.daraz.com.np/p/59a3b411d874aeeea2f115c998b18143.jpg', 'isSmartImage': False, 'utLogMap': {'srp_name': 'LazadaMainSrp', 'x_object_type': 'item', 'src': 'organic', 'trafficType': 'organic', 'x_sku_ids': '1592835298', 'x_item_ids': '361653530', 'iconList': '150565;175175', 'SN': 'feb7c324dba78761f11a6be643f46dc6', 'current_price': '239', 'x_object_id': '361653530'}, 'originalPriceShow': '', 'priceShow': 'Rs. 239', 'ratingScore': '4.4', 'review': '5', 'location': 'Bagmati Province', 'description': [], 'thumbs': [], 'sellerName': 'Zap Home', 'sellerId': '900224992449', 'brandName': 'No Brand', 'brandId': '39704', 'cheapest_sku': '361653530_NP-1592835298', 'skuId': '1592835298', 'sku': '361653530_NP', 'cate

In [16]:
len(all_chunks)

40

In [44]:
all_chunks[0][0]

'{ " name " : " Redmi Note 14 5g Camera Protector Mettalic Ring Glass " , " nid " : " 361653530 " , " itemId " : " 361653530 " , " icons " : [ { " domClass " : " 150565 " , " type " : " img " , " group " : " 6 " , " showType " : " 0 " } , { " domClass " : " 175175 " , " type " : " img " , " group " : " 3 " , " showType " : " 0 " } ] , " image " : " https : static 01 daraz com np p 59a3b411d874aeeea2f115c998b18143 jpg " , " isSmartImage " :'

In [4]:
client = QdrantClient(url = "http://localhost:6333")

In [52]:
client.create_collection(
    collection_name = "daraz_items_with_docs",
    vectors_config = models.VectorParams(size = encoder.get_sentence_embedding_dimension(),distance = models.Distance.COSINE)
)

True

In [17]:
final_embeddings_of_json = []
for i in range(len(all_chunks)):
    embeddings = encoder.encode(all_chunks[i],show_progress_bar = True,convert_to_numpy = True)
    final_embeddings_of_json.append(embeddings)


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.01it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.92it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.30it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.47it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.55it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.45it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.90it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.54it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.09it/s]
Batches: 1

In [18]:
final_embeddings_of_json

[array([[-0.01524544,  0.11217152,  0.01946851, ...,  0.02310828,
          0.03304934,  0.03856251],
        [-0.00372009,  0.02864468, -0.07242841, ..., -0.0312691 ,
          0.00417812,  0.07721543],
        [-0.0112862 , -0.00651247, -0.05137919, ..., -0.06196759,
         -0.10690705,  0.08370256],
        [-0.04091834,  0.08021604, -0.08623229, ..., -0.02984095,
         -0.03074864,  0.04868243],
        [-0.05845777,  0.02152266, -0.08699869, ..., -0.03636439,
         -0.01718228,  0.05314913],
        [ 0.0064926 , -0.03356766, -0.05606353, ..., -0.0600832 ,
         -0.03997165,  0.06155678]], shape=(6, 384), dtype=float32),
 array([[-0.03561339,  0.10013042,  0.00377591, ..., -0.00324291,
         -0.00762909,  0.06073818],
        [ 0.00279725,  0.06036287, -0.03809437, ..., -0.01679109,
          0.01709243,  0.02467602],
        [-0.16405948,  0.00689422,  0.02147788, ...,  0.01122284,
         -0.0270455 ,  0.01229977],
        ...,
        [-0.03826234,  0.07369687, -

In [19]:
len(final_embeddings_of_json)

40

In [20]:
offset = 0

In [56]:
offset = 0
info = client.get_collection(collection_name= "daraz_items")




for doc_number in range(len(final_embeddings_of_json)):
    if(info.points_count != 0):
            res,_ = client.scroll(
            collection_name = "daraz_items_with_docs",
            limit=1,
            with_payload=False,
            with_vectors=False,
            order_by = {
                        "key" : "chunk_id",
                        "direction" : "desc",
                        },
            )
            if(res):
                last_id = res[0].id
                offset = last_id+1
            else:
                offset = 0
    else:
        offset = 0
    for idx in range(len(final_embeddings_of_json[doc_number])):
        client.upsert(
            collection_name = "daraz_items_with_docs",
            points = [models.PointStruct(
                id = idx+offset,
                payload = {
                "doc_id":doc_number,
                "chunk_id":idx+offset,
                "chunk_desc" : all_chunks[doc_number][idx],
                "file_name" : base_name
            },
            vector = final_embeddings_of_json[doc_number][idx].tolist()
            )]
        
        )

In [53]:
client.create_payload_index(
    collection_name = "daraz_items_with_docs",
    field_name = "chunk_id",
    field_schema = "integer"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [54]:
client.create_payload_index(
    collection_name = "daraz_items_with_docs",
    field_name = "doc_id",
    field_schema = "integer"
)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [55]:
client.create_payload_index(
    collection_name = "daraz_items_with_docs",
    field_name = "file_name",
    field_schema="keyword"
)

UpdateResult(operation_id=5, status=<UpdateStatus.COMPLETED: 'completed'>)

In [26]:
len(final_embeddings_of_json)

40

In [30]:
for doc in range(len(final_embeddings_of_json)):
    print(len(final_embeddings_of_json[doc]))

6
7
6
6
6
6
7
7
6
6
7
7
7
6
6
6
7
7
6
6
6
6
6
7
6
9
7
7
7
6
7
6
7
8
6
6
6
7
7
7


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)