In [None]:
from qdrant_client import QdrantClient , models



In [None]:
client = QdrantClient(url="http://localhost:6333")

In [None]:
client.create_collection(
    collection_name="qdrant_basics",
    vectors_config = models.VectorParams(size=3,distance=models.Distance.COSINE)
)

In [None]:
client.create_collection(
    collection_name = "random",
    vectors_config=models.VectorParams(size=4,distance = models.Distance.COSINE)
)

In [None]:
client.upsert(
    collection_name="qdrant_basics",
    points=[
        models.PointStruct(
            id=1,
            payload={"color":"red"},
            vector=[0.1,0.2,0.3]
        ),
        models.PointStruct(
            id=2,
            payload={"color":"blue"},
            vector=[0.22,0.45,0.55]
        )

            ]
    
)

In [None]:
client.upsert(
    collection_name="qdrant_basics",
    points = [
        models.PointStruct(
            id=3,
            payload={"color":"red"},
            vector=[0.1,0.22,0.5]
        ),
        models.PointStruct(
            id=4,
            payload={"color":"red"},
            vector=[0.2,0.56,0.33]
        )

    ]
)

# Updating the Payloads

In [None]:
client.set_payload(
    collection_name="qdrant_basics",
    payload={"color":"green",
             "size":"medium"},
    points=[1,4]
)

In [None]:
client.set_payload(
    collection_name = "qdrant_basics",
    payload = {"color":"green","size":"medium"},

    points = models.Filter(
        must = [models.FieldCondition(
                key = "color",
                match = models.MatchValue(value = "red")

        ),
        models.FieldCondition(
            key="size",
            match = models.MatchValue(value="large")
        )
        ]
    )

)

In [None]:
#deleting the payloads

client.clear_payload(
    collection_name = "qdrant_basics",

    points_selector = [1]

)

In [None]:
#with filter

client.clear_payload(
    collection_name = "qdrant_basics",
    points_selector = models.Filter(
        must = [
            models.FieldCondition(
                key = "color",
                match = models.MatchValue(value = "red")
            ),
            models.FieldCondition(
                key = "size",
                match = models.MatchValue(value = "small")
            )
        ]
    )
)



In [None]:
#creating indexes according to the payload values

client.create_payload_index(
    collection_name = "qdrant_basics",
    field_name = "color",
    field_schema= "keyword" #this denotes that color is string type with categorical value
)

In [None]:
#Inserting ten datas with payloads size and color

client.upsert(
    collection_name = "qdrant_basics",
    points = [
        models.PointStruct(
            id=5,
            payload = {
                "color" : "red",
                "size" : "medium"
            },
            vector = [0.11,0.34,0.33]
        ),
        models.PointStruct(
            id=6,
            payload = {
                "color":"black",
                "size" : "small"
            },
            vector = [0.2,0.35,0.27]
        ),
        models.PointStruct(
            id=7,
            payload = {
                "color":"blue",
                "size" : "large"
            },
            vector = [0.21,0.32,0.29]
        ),
        models.PointStruct(
            id=8,
            payload = {
                "color":"green",
                "size" : "medium"
            },
            vector = [0.32,0.11,0.56]
        ),
        models.PointStruct(
            id=9,
            payload = {
                "color":"red",
                "size" : "small"
            },
            vector = [0.22,0.51,0.01]
        ),
        models.PointStruct(
            id=10,
            payload = {
                "color":"blue",
                "size" : "large"
            },
            vector = [0.2,0.35,0.27]
        ),
        models.PointStruct(
            id=11,
            payload = {
                "color":"red",
                "size" : "large"
            },
            vector = [0.11,0.63,0.92]
        ),
        models.PointStruct(
            id=12,
            payload = {
                "color":"green",
                "size" : "medium"
            },
            vector = [0.23,0.82,0.61]
        )
        
    ]
)

In [None]:
#setting payloads for points having no size payload

client.set_payload(
    collection_name = "qdrant_basics",
    payload = {"size":"small"},
    points = [2,3]
)

In [None]:
client.set_payload(
    collection_name= "qdrant_basics",
    payload = {
        "color" : "black",
        "size" : "large"
    },
    points = [1]
)

In [None]:
client.create_payload_index(
    collection_name="qdrant_basics",
    field_name = "size",
    field_schema = "keyword"
)

In [None]:
#Facet Counts is like the MySQL count aggregation with where and group by clause in vector database
client.facet(
    collection_name = "qdrant_basics",
    key = "color", #this is like the gorup by clause where the count is done grouped by color key(payload)
    facet_filter= models.Filter(must=[models.FieldCondition(
        key = "size",
        match = models.MatchValue(value="large")
    )])
    #this is like the where clause where the count is 
    #only done on the basis of the points where size = large
    #this is optional field 
)

In [None]:
client.facet(
    collection_name = "qdrant_basics",
    key = "color"
)

# Searching


In [None]:
client.query_points(
    collection_name = "qdrant_basics",
    query = [0.1,0.2,0.3]
)

In [None]:
#searching with Filtering, Limits and Params

client.query_points(
    collection_name = "qdrant_basics",
    query = [0.22,0.34,0.45],

    query_filter = models.Filter(
        must = [
            models.FieldCondition(
                key = "color",
                match = models.MatchValue(value = "red")
            )
        ]

    ),
    search_params = models.SearchParams(hnsw_ef = 128,exact = False), #here the search params are the custom parameters for search
    #hnsw = graph search algorithm and the ef paramater determines the number nearest neighbours to expand at once, 
    # exact determines whether or not to search for the point with the exact vector
    #indexed_only = determines whether or not to include the vectors that are indexed
    limit = 4 #how many points to return from the search

)

In [None]:
#searching with more params

client.query_points(
    collection_name = "qdrant_basics",
    query = [0.21,0.32,0.43],
    with_vectors = True,
    with_payload = models.PayloadSelectorExclude(
        exclude = ["color"]
    )
)

In [None]:
#batch search 
# so we create different single requests and batch them together in a single request 
filter_ = models.Filter(
    must = [
        models.FieldCondition(
            key = "color",
            match = models.MatchValue(value = "red")
        )
    ]
)
search_params = models.SearchParams(hnsw_ef = 128, exact = False )
search_queries = [
    models.QueryRequest(query=[0.1,0.2,0.3],filter=filter_,limit = 3),
    models.QueryRequest(query = [0.11,0.22,0.33],filter = filter_, limit = 5, params = search_params)
]

client.query_batch_points(collection_name = "qdrant_basics",requests = search_queries)   

In [None]:
#query by id
#using the id of a point which fetches the vector of that point and use it as query

client.query_points(
    collection_name = "qdrant_basics",
    query = 1 #point 1 is excluded
)

In [None]:
#search with an offset. So if the offset is set to 100 the search result skips all the 100 search result and starts
#after the first 100 results. This is known as pagination

client.query_points(
    collection_name = "qdrant_basics",
    query = [0.11,0.22,0.23],
    offset = 2,
    with_vectors = True,
    with_payload = True,
    limit = 4
)

In [None]:
# Group Searching

client.query_points_groups(
    collection_name = "qdrant_basics",
    group_by = "color",
    limit = 4,
    query = [0.1,0.2,0.3],
    group_size = 2
)

In [None]:
#Random Sampling
#this is used to use random sample just to retrieve the points
client.query_points(
    collection_name = "qdrant_basics",
    query = models.SampleQuery(sample = models.Sample.RANDOM)
)

In [None]:
#Parametarized Indexing of the payloads
client.create_payload_index(
    collection_name = "qdrant_basics",
    field_name = "age",
    field_schema = models.IntegerIndexParams(
        type = models.IntegerIndexType.INTEGER,
        lookup = False,
        range = True
    )
)

In [None]:
client.create_payload_index(
    collection_name = "qdrant_basics",
    field_name = "color",
    field_schema = models.KeywordIndexParams(
        type = models.KeywordIndexType.KEYWORD,
        on_disk=True
    )
)

In [None]:
client.set_payload(
    collection_name = "qdrant_basics",
    payload = {"tenant_id":"A"},
    points = [1,2,3,4,5,6]
)

In [None]:
client.set_payload(
    collection_name = "qdrant_basics",
    payload = {"tenant_id":"B"},
    points = [7,8,9,10,11,12]
)

In [None]:
client.create_payload_index(
    collection_name = "qdrant_basics",
    field_name = "tenant_id",
    field_schema=models.KeywordIndexParams(
        type = models.KeywordIndexType.KEYWORD,
        is_tenant = True
    )
    
)

In [None]:
client.set_payload(
    collection_name = "qdrant_basics",
    payload = {
        "age" :12
    },
    points= [1,2,3,4]
)

In [None]:
client.set_payload(
    collection_name = "qdrant_basics",
    payload = {
        "age" :13
    },
    points= [5,6,7,8]
)

In [None]:
client.set_payload(
    collection_name = "qdrant_basics",
    payload = {
        "age" :14
    },
    points= [9,10,11,12]
)

In [None]:
client.query_points(
    collection_name="qdrant_basics",
    query=[0.1,0.2,0.3],
    query_filter=models.Filter(
        must=[models.FieldCondition(
            key="age", 
            range=models.Range(gte=10, lte=13)
        )]
    )
)

# Named Vectors Collection

In [None]:
#creation of multiple vectors for a point named vectors

client.create_collection(
    collection_name = "named_vectored_collection",
    vectors_config = {
        "image" : models.VectorParams(size = 3, distance=models.Distance.COSINE),
        "text" : models.VectorParams(size = 4, distance=models.Distance.COSINE)
    }
)

In [None]:
#inserting 10 points in the named Vectors

client.upsert(
    collection_name = "named_vectored_collection",
    points = [
        models.PointStruct(
            id=1,
            payload = {"color":"red", "size" : "XXL" },
            vector = {
                "image" :[0.1,0.2,0.3],
                "text" : [0.11,0.21,0.31,0.2]
            },

        )
    ]
)

In [None]:
client.upsert(
    collection_name = "named_vectored_collection",
    points = [
        models.PointStruct(
            id=2,
            vector = {
                "image" :[0.23,0.44,0.33],
                "text" : [0.11,0.2,0.23,0.71]
            }
        ),
        models.PointStruct(
            id=3,
            vector = {
                "image" :[0.21,0.34,0.83],
                "text" : [0.71,0.87,0.91,0.21]
            }
        ),
        models.PointStruct(
            id=4,
            vector = {
                "image" :[0.12,0.73,0.93],
                "text" : [0.01,0.02,0.03,0.11]
            }
        )
    ]
)

In [None]:
client.set_payload(
    collection_name = "named_vectored_collection",
    payload = {
        "color":"black",
        "size":"XL"
    },
    points = [2,3,4]
)

In [None]:
client.create_payload_index(
    collection_name = "named_vectored_collection",
    field_name = "color",
    field_schema = "keyword"
)


In [None]:
client.create_payload_index(
    collection_name = "named_vectored_collection",
    field_name = "size",
    field_schema = "keyword"
)

In [None]:
#searching in the named vector

client.query_points(
    collection_name = "named_vectored_collection",
    query = [0.2,0.1,0.3],
    using = "image" # this specifies which vector among the text and image to use for the search 
)

# Snapshots

In [None]:
client.create_snapshot(
    collection_name = "qdrant_basics"
)

In [None]:
client.list_snapshots(
    collection_name = "qdrant_basics"
)

# RRF and Hybrid Queries


In [None]:
client.create_collection(
    collection_name="for_rrf",
    vectors_config={"dense":models.VectorParams(size=3, distance=models.Distance.COSINE)},
    sparse_vectors_config = {"sparse":models.SparseVectorParams()}
)

In [None]:
client.upsert(
    collection_name = "for_rrf",
    points= [
    models.PointStruct(
        id =2,
        payload = {"text":"Cats are lovely pets"},
        vector = {
            "dense":[0.11, 0.25, 0.37],
            "sparse":models.SparseVector(indices=[10, 42],values=[0.7, 0.3])
            
        },
        
    ),
    models.PointStruct(
        id =3,
        payload = {"text":"Dogs are loyal companion"},
        vector = {
            "dense":[0.15, 0.22, 0.39],
            "sparse":models.SparseVector(indices=[15, 42],values=[0.5, 0.4])
            
        },
        
    ),
    models.PointStruct(
        id =4,
        payload = {"text":"Cats and dogs often play together"},
        vector = {
            "dense":[0.14, 0.28, 0.36],
            "sparse":models.SparseVector(indices=[10, 15],values=[0.6, 0.5])
            
        },
        
    ),
    ]
)

In [None]:
client.query_points(
    collection_name = "for_rrf",
    prefetch = [models.Prefetch(
        query = [0.1,0.2,0.3],
        using = "dense"
    ),
    models.Prefetch(
        query = models.SparseVector(indices = [10,42],values=[0.61,0.43]),
        using = "sparse"
    ),

    ],
    query = models.FusionQuery(fusion = models.Fusion.RRF)
)

# Qdrant Basic for a Json File

In [1]:
from qdrant_client import QdrantClient, models


In [2]:
import json
import re
import os

In [3]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [88]:
#variables for Qdrant Database creation
json_file_path = "camera-screen-guards.json"
base_name = os.path.splitext(os.path.basename(json_file_path))[0]
chunk_size = 256
embedding_model_name = "all-MiniLM-L6-v2"

In [89]:
#chunking function

def chunk_text(text,chunk_size):
    start = 0
    tokens = re.findall(r'\w+|[{}[\]:,",]', text)
    chunks = []
    while start<=len(tokens):
        end = min(start+chunk_size,len(tokens))
        chunk = " ".join(tokens[start:end])
        chunks.append(chunk)
        start+=chunk_size

    return chunks

In [90]:
#loading the json file and dumping it into a string

with open(json_file_path,'r',encoding="utf-8") as f:
    data=json.load(f)

#dumping into a raw string

raw_text = json.dumps(data)

In [91]:
chunks = chunk_text(raw_text,chunk_size=256)

print(len(chunks))


135


In [8]:
#embedding and storing in a collection

encoder = SentenceTransformer("all-MiniLM-L6-v2")

In [9]:
client = QdrantClient(url="http://localhost:6333")

In [10]:
encoder.get_sentence_embedding_dimension()

384

In [12]:
client.create_collection(
    collection_name = "daraz_items",
    vectors_config = models.VectorParams(size = encoder.get_sentence_embedding_dimension(), distance = models.Distance.COSINE)
)

True

In [92]:
info = client.get_collection(collection_name= "daraz_items")

In [93]:
number = info.points_count
number

716

In [94]:
embeddings = encoder.encode(chunks,show_progress_bar = True, convert_to_numpy = True)

Batches: 100%|██████████| 5/5 [00:04<00:00,  1.09it/s]


In [95]:
embeddings

array([[ 0.00843654,  0.11279827,  0.01548663, ..., -0.00138694,
         0.04105727,  0.035195  ],
       [-0.01681552, -0.01440025, -0.09341506, ..., -0.04374988,
        -0.08263876,  0.06124899],
       [-0.05197982,  0.03899287, -0.07455286, ...,  0.0030145 ,
        -0.0104704 ,  0.05836742],
       ...,
       [-0.09090393,  0.07039408, -0.00985325, ..., -0.0489689 ,
         0.01599411,  0.07248552],
       [ 0.05407335,  0.09096282,  0.01182684, ..., -0.06614245,
        -0.02529055,  0.04595781],
       [-0.03317895,  0.04520515, -0.06388604, ..., -0.08347257,
         0.00757957,  0.07976647]], shape=(135, 384), dtype=float32)

# Storing in the database

In [19]:
offset = 0

In [96]:
offset = 0
res, _ = client.scroll(
    collection_name="daraz_items",
    limit=100,
    with_payload=False,
    with_vectors=False,
    order_by = {
        "key" : "chunk_id",
        "direction" : "desc"
    }
)
print(res)

if res:
    last_id = res[0].id
    offset = last_id + 1
else:
    offset = 0 

[Record(id=715, payload=None, vector=None, shard_key=None, order_value=715), Record(id=714, payload=None, vector=None, shard_key=None, order_value=714), Record(id=713, payload=None, vector=None, shard_key=None, order_value=713), Record(id=712, payload=None, vector=None, shard_key=None, order_value=712), Record(id=711, payload=None, vector=None, shard_key=None, order_value=711), Record(id=710, payload=None, vector=None, shard_key=None, order_value=710), Record(id=709, payload=None, vector=None, shard_key=None, order_value=709), Record(id=708, payload=None, vector=None, shard_key=None, order_value=708), Record(id=707, payload=None, vector=None, shard_key=None, order_value=707), Record(id=706, payload=None, vector=None, shard_key=None, order_value=706), Record(id=705, payload=None, vector=None, shard_key=None, order_value=705), Record(id=704, payload=None, vector=None, shard_key=None, order_value=704), Record(id=703, payload=None, vector=None, shard_key=None, order_value=703), Record(id=7

In [97]:
offset

716

In [98]:
base_name

'camera-screen-guards'

In [99]:
client.upsert(
    collection_name = "daraz_items",
    points = [
        models.PointStruct(
            id = idx + offset,
            vector = embeddings[idx].tolist(), payload = {
                "chunk_id" : idx + offset,
                "file_name":base_name,
                "chunk" : chunks[idx]
            })
            for idx in range(len(chunks))
    ]
)

UpdateResult(operation_id=16, status=<UpdateStatus.COMPLETED: 'completed'>)

In [23]:
client.create_payload_index(
    collection_name = "daraz_items",
    field_name= "chunk_id",
    field_schema="integer"
)

UpdateResult(operation_id=6, status=<UpdateStatus.COMPLETED: 'completed'>)

In [25]:
client.create_payload_index(
    collection_name = "daraz_items",
    field_name = "file_name",
    field_schema = models.KeywordIndexParams(type = models.KeywordIndexType.KEYWORD)
)

UpdateResult(operation_id=10, status=<UpdateStatus.COMPLETED: 'completed'>)

In [49]:
client.create_payload_index(
    collection_name = "daraz_items",
    field_name = "chunk",
    field_schema = models.KeywordIndexParams(type = models.KeywordIndexType.KEYWORD)
)

UpdateResult(operation_id=6, status=<UpdateStatus.COMPLETED: 'completed'>)

# Testing out the Vector Database


In [40]:
query = "MRK3C Adongruoan 256*192 Pixels Thermal Camera -20°C~550°C Android Type C Mobile Phone Infrared Thermal Imager For Electrical Repair"

In [41]:
hits = client.query_points(
    collection_name = "daraz_items",
    query = encoder.encode([query])[0].tolist(),
    limit = 3
    
).points

for hit in hits:
    print(hit.id,"score",hit.score,"payload",hit.payload.get("file_name"))

107 score 0.62338877 payload measuring-levelling
111 score 0.54712486 payload measuring-levelling
106 score 0.5195533 payload measuring-levelling


In [71]:
res = client.retrieve(
    collection_name="daraz_items",
    ids = [115]
)

In [72]:
res

[Record(id=115, payload={'id': 115, 'file_name': 'lens-caps', 'chunk': '5 6G VR 18 55mm Lens Cap Nikon 55mm Front For 18 55mm AF P18 55mm f 3 5 5 6G " , " nid " : " 433833617 " , " itemId " : " 433833617 " , " icons " : [ ] , " image " : " https : static 01 daraz com np p 78438e6822b546ba98f7e003f8a53721 jpg " , " isSmartImage " : false , " originalPriceShow " : " " , " priceShow " : " Rs 3 , 474 " , " ratingScore " : " " , " review " : " " , " location " : " Overseas " , " thumbs " : [ ] , " sellerName " : " 9366c14c " , " sellerId " : " 900274208781 " , " brandName " : " No Brand " , " brandId " : " 39704 " , " cheapest_sku " : " 433833617_NP 1867948234 " , " skuId " : " 1867948234 " , " sku " : " 433833617_NP " , " categories " : [ 240 , 8217 , 8220 , 8223 ] , " price " : " 3474 " , " inStock " : true , " originalPrice " : " 3543 " , " clickTrace " : " query : camera nid : 433833617 src : LazadaMainSrp rn : 3e3ad865f4f6894a6f06a36a8a95a2fe region : np sku : 433833617_NP price : 3474