# Base part

In [1]:
import datetime
from pydantic import BaseModel
import abc
import logging

logger = logging.getLogger(__name__)


class DotModel(BaseModel):
    """A single point handled by a ``VectorClient``: an identifier plus its vector."""

    # Identifier of the point.
    uid: str
    # Components of the vector associated with ``uid``.
    vector: list[float]


class VectorClient(abc.ABC):
    """Abstract interface of a vector-store backend, plus push-timing helpers.

    Concrete clients implement collection creation, bulk insertion and lookup
    by vector. ``start_pushing`` / ``end_pushing`` bracket a bulk upload and
    log how long it took.
    """

    # Set by start_pushing(); None while no upload is being timed. Defined at
    # class level so end_pushing() can be called safely on any instance.
    _start_pushing_time = None

    @abc.abstractmethod
    def push_dots(self, dots: list[DotModel]):
        """Store the given points in the backend."""

    @abc.abstractmethod
    def get_dot(self, vector: list[float], n: int = 15) -> list[DotModel]:
        """Look up stored points for ``vector``.

        NOTE(review): ``n`` presumably caps the number of returned points;
        the abstract signature does not define it -- confirm in implementations.
        """

    @abc.abstractmethod
    def create_collection(self):
        """Create the backend collection that points are pushed into."""

    def start_pushing(self):
        """Remember the current time as the start of a bulk upload."""
        self._start_pushing_time = datetime.datetime.now()

    def end_pushing(self):
        """Log the time elapsed since the matching ``start_pushing`` call.

        If ``start_pushing`` was not called first, a warning is logged instead
        of failing with ``AttributeError``.
        """
        started_at = self._start_pushing_time
        if started_at is None:
            logger.warning(
                f'end pushing {type(self).__name__} called without start_pushing'
            )
            return

        # Reset so a stale start time is never reused by a later end_pushing().
        self._start_pushing_time = None
        delta = datetime.datetime.now() - started_at
        logger.info(f'end pushing {type(self).__name__} in {delta}')

In [3]:
from create_embeddings.utils import get_csv_files_info
get_csv_files_info()

[EmbeddingModelCsvFile(embedding_model_name=<ModelEnum.RUBERT_TINY2: 'cointegrated/rubert-tiny2'>, field=<EmbeddingsFieldsEnum.DESCRIPTION: 'description'>, embedding_size=312, file_path='/home/roman/PycharmProjects/personal/diploma/create_embeddings/data/2024-10-20_23-12_products_embeddings_STROYDVOR_description_RUBERT_TINY2.csv'),
 EmbeddingModelCsvFile(embedding_model_name=<ModelEnum.LABSE_RU_TURBO: 'sergeyzh/LaBSE-ru-turbo'>, field=<EmbeddingsFieldsEnum.DESCRIPTION: 'description'>, embedding_size=768, file_path='/home/roman/PycharmProjects/personal/diploma/create_embeddings/data/2024-10-20_23-34_products_embeddings_STROYDVOR_description_LABSE_RU_TURBO.csv'),
 EmbeddingModelCsvFile(embedding_model_name=<ModelEnum.LABSE_RU_TURBO: 'sergeyzh/LaBSE-ru-turbo'>, field=<EmbeddingsFieldsEnum.TITLE: 'title'>, embedding_size=768, file_path='/home/roman/PycharmProjects/personal/diploma/create_embeddings/data/2024-10-20_23-29_products_embeddings_STROYDVOR_title_LABSE_RU_TURBO.csv'),
 EmbeddingMo

In [None]:
from create_embeddings.schemas.embedding import ProductEmbedding, ModelEnum
from parsers.runnures.utils.csv import CsvReader

from create_embeddings.schemas.embedding_model import EmbeddingModelCsvFile
from create_embeddings.utils import get_csv_files_info

BATCH_SIZE = 100  # default number of points sent per push_dots() call


def populate_embeddings(client: VectorClient, batch_size: int = BATCH_SIZE):
    """Create the client's collection and upload embeddings from the CSV files.

    Only files produced by ``ModelEnum.MULTILINGUAL_E5_LARGE_INSTRUCT`` whose
    path contains ``"OBI"`` are uploaded. Rows are sent in batches of
    ``batch_size`` points, and the whole upload is bracketed by
    ``client.start_pushing()`` / ``client.end_pushing()``.

    Args:
        client: Vector-store client that receives the points.
        batch_size: Maximum number of points per ``push_dots`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size!r}')

    csv_files_info: list[EmbeddingModelCsvFile] = get_csv_files_info()

    client.create_collection()
    logger.info(f'collection created {type(client).__name__}')

    i = 0  # running count of points read across all files (used for logging)
    client.start_pushing()
    for csv_file_info in csv_files_info:
        if csv_file_info.embedding_model_name != ModelEnum.MULTILINGUAL_E5_LARGE_INSTRUCT or "OBI" not in csv_file_info.file_path:
            continue

        logger.info(f'start new file {csv_file_info.file_path.split("/")[-1]}')

        batch: list[DotModel] = []
        csv_reader: CsvReader[ProductEmbedding] = CsvReader(csv_file_info.file_path, ProductEmbedding)
        for uid_vector in csv_reader:
            i += 1
            # DotModel declares ``uid`` (not ``id``); the wrong keyword made
            # every row fail model validation.
            batch.append(DotModel(uid=uid_vector.uid, vector=uid_vector.embedding))
            if len(batch) >= batch_size:
                logger.info(f'uploading {i} points to {type(client).__name__}')
                client.push_dots(batch)
                batch = []

        # Flush the remainder, but do not send an empty request when the row
        # count is an exact multiple of the batch size (or the file is empty).
        if batch:
            client.push_dots(batch)
    client.end_pushing()


In [None]:
import time
import asyncio

TEST_DURATION = 60  # default length of a load test, in seconds


async def call_get_dot_with_rps(
    client: "VectorClient",
    rps: int,
    *,
    duration: float = TEST_DURATION,
    vectors=None,
) -> int:
    """Call ``client.get_dot`` at a target rate for a fixed amount of time.

    This is a coroutine: the body awaits ``asyncio.sleep`` between calls, which
    is only valid inside ``async def``. ``client.get_dot`` itself is called
    synchronously, so a slow client lowers the achieved rate.

    Args:
        client: Client under test; only its ``get_dot(vector)`` method is used.
        rps: Target number of ``get_dot`` calls per second; must be positive.
        duration: How long to keep issuing calls, in seconds.
        vectors: Iterable of query vectors, one consumed per call. Defaults to
            ``next_point()``. The run stops early if it is exhausted.

    Returns:
        The number of ``get_dot`` calls that were issued.

    Raises:
        ValueError: If ``rps`` is not positive.
    """
    if rps <= 0:
        raise ValueError(f'rps must be positive, got {rps!r}')

    if vectors is None:
        vectors = next_point()
    queries = iter(vectors)

    interval = 1 / rps
    # Monotonic clock: unaffected by wall-clock adjustments during the run.
    started = time.monotonic()
    deadline = started + duration
    next_call = started + interval
    calls = 0

    while True:
        now = time.monotonic()
        if now >= deadline:
            break
        if now < next_call:
            # Sleep only until the next scheduled call (or the deadline), so
            # time spent inside get_dot() does not push the schedule back.
            await asyncio.sleep(min(next_call, deadline) - now)
            continue

        try:
            vector = next(queries)
        except StopIteration:
            break  # no more query vectors to send

        client.get_dot(vector)
        calls += 1
        next_call += interval

    return calls


def next_point(include_random=True):
    """Yield embedding vectors read from the selected CSV files, one per row.

    Files are taken from ``get_csv_files_info()`` and filtered to those whose
    model is ``ModelEnum.MULTILINGUAL_E5_LARGE_INSTRUCT`` and whose path
    contains ``"OBI"``.

    NOTE(review): ``include_random`` is not read anywhere in the visible body;
    presumably random vectors were meant to be mixed in -- confirm whether the
    definition continues beyond what is shown here.
    """
    # NOTE(review): unused in the visible body; presumably the length of the
    # random vectors to generate -- confirm.
    vector_len: int = 1024

    csv_files_info: list[EmbeddingModelCsvFile] = get_csv_files_info()
    for csv_file_info in csv_files_info:
        # Skip every file that is not an E5-large-instruct embedding of "OBI" data.
        if csv_file_info.embedding_model_name != ModelEnum.MULTILINGUAL_E5_LARGE_INSTRUCT or "OBI" not in csv_file_info.file_path:
            continue

        csv_reader: CsvReader[ProductEmbedding] = CsvReader(csv_file_info.file_path, ProductEmbedding)
        for uid_vector in csv_reader:

            # Only the embedding is yielded; the row's uid is not exposed.
            yield uid_vector.embedding