In [1]:
import pandas as pd

df_sdv = pd.read_csv('/home/roman/PycharmProjects/personal/diploma/create_embeddings/data/2024-10-21_03-14_products_embeddings_STROYDVOR_title_MULTILINGUAL_E5_LARGE_INSTRUCT.csv')
df_obi = pd.read_csv('/home/roman/PycharmProjects/personal/diploma/create_embeddings/data/2024-10-21_08-35_products_embeddings_OBI_title_MULTILINGUAL_E5_LARGE_INSTRUCT.csv')

In [2]:
import json
import numpy as np

embeddings_sdv = np.array(df_sdv['embedding'].apply(json.loads).tolist())
embeddings_obi = np.array(df_obi['embedding'].apply(json.loads).tolist())

In [13]:
from qdrant_client import QdrantClient
client = QdrantClient(":memory:")

In [None]:
# client.close()

In [25]:
from qdrant_client.http.models import models
from qdrant_client.http.exceptions import UnexpectedResponse

BATCH_SIZE = 2500

def populate_embeddings(df, data, rows_uids, collection_suffix, final_volume=0):
    collection_name = f"len_{final_volume or len(rows_uids)}_{collection_suffix}"
    try:
        client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=len(data[0]), distance=models.Distance.COSINE
            ),
            hnsw_config=models.HnswConfigDiff(m=64, ef_construct=512),
            optimizers_config=models.OptimizersConfigDiff(indexing_threshold=0),
        )
        print(f'collection created {collection_name}')
    except (UnexpectedResponse, ValueError) as e:
        print(f'collection already exists {collection_name}')

    batch = []
    vectors_to_populate = data
    for i, uid in enumerate(rows_uids):
        index = df[df['uid'] == uid]['uid'].index
        batch.append(models.PointStruct(
            id=uid,
            vector=vectors_to_populate[index],
        ))

        if len(batch) == BATCH_SIZE:
            print(f'upload {i} points to collection {collection_name}')
            client.upload_points(
                collection_name=collection_name,
                points=batch,
            )
            batch = []

    print(f'upload {i} points to collection {collection_name}')
    client.upload_points(
        collection_name=collection_name,
        points=batch,
    )
    batch = []
    if final_volume > len(rows_uids):
        populate_embeddings_by_num(df, data, collection_name, additional_volume=final_volume-len(rows_uids))

    client.update_collection(
        collection_name=collection_name,
        optimizer_config=models.OptimizersConfigDiff(indexing_threshold=10_000),
    )

def populate_embeddings_by_num(df, data, collection_name, additional_volume, offset=5000):
    print(f'use collection for additional_volume {collection_name}')
    batch = []
    vectors_to_populate = data
    for i in range(offset, offset+additional_volume):
        batch.append(models.PointStruct(
            id=df['uid'][i],
            vector=vectors_to_populate[i],
        ))

        if len(batch) == BATCH_SIZE:
            print(f'upload {i} points to collection {collection_name}')
            client.upload_points(
                collection_name=collection_name,
                points=batch,
            )
            batch = []

    print(f'upload {i} points to collection {collection_name}')
    client.upload_points(
        collection_name=collection_name,
        points=batch,
    )
    batch = []

    client.update_collection(
        collection_name=collection_name,
        optimizer_config=models.OptimizersConfigDiff(indexing_threshold=10_000),
    )

In [17]:
from sqlalchemy import select
from db_populate.models import Match, Product
from sqlalchemy.orm import Session, aliased
from db_populate.session import db_session_as_kwarg


@db_session_as_kwarg
def get_mapped_products_uids(session: Session) -> list[tuple[str, str]]:
    product_1_alias = aliased(Product)
    product_2_alias = aliased(Product)

    query_result = session.execute(
        select(product_1_alias.uid, product_2_alias.uid)
        .select_from(Match)
        .join(product_1_alias, product_1_alias.id == Match.product_1_id)
        .join(product_2_alias, product_2_alias.id == Match.product_2_id)
    )

    return query_result.all()


products_uids = get_mapped_products_uids()

In [18]:
populate_embeddings(df_sdv, embeddings_sdv, np.array(products_uids[:100])[:, 0], "sdv")
populate_embeddings(df_obi, embeddings_obi, np.array(products_uids[:100])[:, 1], "obi")
populate_embeddings(df_obi, embeddings_obi, np.array(products_uids[:100])[:, 1], "obi", final_volume=1000)

collection already exists len_100_sdv
upload 99 points to collection len_100_sdv
collection already exists len_100_obi
upload 99 points to collection len_100_obi
collection created len_1000_obi
upload 99 points to collection len_1000_obi
use collection for additional_volume len_1000_obi
upload 5899 points to collection len_1000_obi


In [19]:
def drop_collections():
    for collection_name in [x.name for x in client.get_collections().collections]:
        client.delete_collection(collection_name)

In [23]:
drop_collections()

In [33]:
from qdrant_client.http.models import ScoredPoint, Record


def query_product_suggestions(uid: str, collection_name_l: str, collection_name_r: str, limit: int = 10) -> list[ScoredPoint]:
    vector_data_list_tiny_turbo = client.retrieve(
        collection_name=collection_name_l,
        ids=[uid],
        with_vectors=True
    )
    if len(vector_data_list_tiny_turbo) != 1:
        ValueError(f'Wrong number of vectors. Expected 1, got {len(vector_data_list_tiny_turbo)}')
    vector_data: Record = vector_data_list_tiny_turbo[0]
    hits: list[ScoredPoint] = client.query_points(
        collection_name=collection_name_r,
        query=vector_data.vector,
        limit=limit,
    ).points
    return hits

In [35]:
import time
from collections import defaultdict

MAX_HITS_TO_RETURN = 10
quantity = ((100, 100), (100, 1000), (1000, 1000))
result = defaultdict(lambda : 0)

for size_sdv, size_obi in quantity:
    print(f"\n\n############# {size_sdv} x {size_obi}")
    collection_name_l = f"len_{size_sdv}_sdv"
    collection_name_r = f"len_{size_obi}_obi"

    time_start_0 = time.time()

    populate_embeddings(df_sdv, embeddings_sdv, np.array(products_uids[:size_sdv])[:, 0], "sdv", final_volume=size_sdv)
    populate_embeddings(df_obi, embeddings_obi, np.array(products_uids[:size_sdv])[:, 1], "obi", final_volume=size_obi)

    time_start_1 = time.time()

    for product_1_uid, product_2_uid in products_uids[:size_sdv]:
        hits: list[ScoredPoint] = query_product_suggestions(product_1_uid, collection_name_l, collection_name_r, limit=MAX_HITS_TO_RETURN)
        hits_uids = [hit.id for hit in hits]
        if product_2_uid in hits_uids:
            result[f'{size_sdv} x {size_obi}'] += 1

    time_finish = time.time()

    print(f"finished: {result[f'{size_sdv} x {size_obi}']} in {time_finish - time_start_1} [with db populate {time_finish - time_start_0} Δ={time_start_1-time_start_0}]")
    drop_collections()



############# 100 x 100
collection created len_100_sdv
upload 99 points to collection len_100_sdv
collection created len_100_obi
upload 99 points to collection len_100_obi
finished: 100 in 0.06448197364807129 [with db populate 0.3677499294281006 Δ=0.3032679557800293]


############# 100 x 1000
collection created len_100_sdv
upload 99 points to collection len_100_sdv
collection created len_1000_obi
upload 99 points to collection len_1000_obi
use collection for additional_volume len_1000_obi
upload 5899 points to collection len_1000_obi
finished: 100 in 0.1770951747894287 [with db populate 1.2901403903961182 Δ=1.1130452156066895]


############# 1000 x 1000
collection created len_1000_sdv
upload 106 points to collection len_1000_sdv
use collection for additional_volume len_1000_sdv
upload 5892 points to collection len_1000_sdv
collection created len_1000_obi
upload 106 points to collection len_1000_obi
use collection for additional_volume len_1000_obi
upload 5892 points to collection l

In [36]:
print(0.064 / (100*100))
print(0.177 / (100*1000))
print(0.265 / (1000*1000))

6.4e-06
1.77e-06
2.6400000000000003e-07


In [37]:
result

defaultdict(<function __main__.<lambda>()>,
            {'100 x 100': 100, '100 x 1000': 100, '1000 x 1000': 106})

In [38]:
products_uids

[('cdee09a0-048a-4993-83fa-3671caf8f23e', 'a136378b-391f-4c45-ac3c-de3e511a66e5'),
 ('3813c723-075e-4031-ab1d-2b87a750539b', '00a124fe-f5eb-406a-8cca-983f9aace1cf'),
 ('be6ea243-36db-4e46-a2d9-c808e1798a70', 'a02af0c7-3cfa-480e-80a2-f71c24160a04'),
 ('91b92163-8053-47b5-8fc8-8b41ace4c11b', '55ad5aa6-cfcc-442b-aeb8-647d847caa2c'),
 ('7beaaee0-cabe-4d85-b104-06e564c9c3b8', 'd69fe655-8a7f-45e3-8837-17d4fe81e189'),
 ('f7afd5ac-61d4-4a58-896e-c6774c802726', '740e8bb8-4aef-4157-bc24-a2672b5c5db2'),
 ('00f17caa-e7f1-4d71-b3fe-b8af5ebcd12d', '20fc3cb5-0997-4742-a90c-fc80c6f9c343'),
 ('100e0711-0d4a-4330-bb21-fcc71dea663b', '266fb40f-d473-4351-90a2-7ef4d49e0e42'),
 ('7748ca37-e7de-45e4-add1-58ccd7eb1d0b', '84cb6e34-673a-43b9-9752-b1bcb88911ac'),
 ('f97ee8f7-8056-47b4-8034-4b566b2debe0', 'c75a2a44-a747-490d-a7c1-70c57ff24088'),
 ('bfe3a41e-8947-460f-94d9-38778601f7e8', '20fc3cb5-0997-4742-a90c-fc80c6f9c343'),
 ('0c7f3c0b-a8d5-412a-93c7-7acb562a6b6b', '68490869-03e9-49cc-a75b-15145fbe6eb7'),
 ('5