In [1]:
from core.logging import service_log, add_file_log
import logging
import multiprocessing
import os

os.environ['NUMEXPR_NUM_THREADS'] = str(multiprocessing.cpu_count() - 2)

logger = logging.getLogger(__name__)
service_log()
add_file_log()

# Считаем embeddings

In [2]:
from create_embeddings.schemas.embedding import ModelEnum
from pydantic import BaseModel

class EbeddingModel(BaseModel):
  name: ModelEnum
  param_millions: int
  ebedding_size: int
  max_tokens: int
  rank: int
  title_batch_per_gb: int = 250
  description_batch_per_gb: int = 250

PARAMS_TO_MEMORY_GB_COFICIENT: float = 0.0037572219181414585

In [3]:
ebeddins_list = (
    EbeddingModel(name=ModelEnum.RUBERT_TINY_TURBO, param_millions=29, ebedding_size=312, max_tokens=2048, rank=12),
    EbeddingModel(name=ModelEnum.RUBERT_TINY2, param_millions=29, ebedding_size=2048, max_tokens=514, rank=16),
    EbeddingModel(name=ModelEnum.LABSE_RU_TURBO, param_millions=128, ebedding_size=768, max_tokens=512, rank=8),
    EbeddingModel(name=ModelEnum.MULTILINGUAL_E5_LARGE_INSTRUCT, param_millions=560, ebedding_size=1024, max_tokens=514, rank=2),
    EbeddingModel(name=ModelEnum.BGE_M3, param_millions=567, ebedding_size=1024,	max_tokens=8192, rank=5),
)

In [4]:

import torch

if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("Using CPU")
    device = torch.device("cpu")

Using CPU


In [5]:
from parsers.runnures.schemas.product import StoreEnum
from create_embeddings.schemas.embedding import EmbeddingsFieldsEnum

store_map = {
    StoreEnum.STROYDVOR: '/home/roman/PycharmProjects/personal/diploma/parsers/runnures/stroydvor/data/products.csv',
    StoreEnum.OBI: '/home/roman/PycharmProjects/personal/diploma/parsers/runnures/obi/data/products_merged.csv',
}
uid_column = 'uid'
fields_to_encode = [EmbeddingsFieldsEnum.TITLE, EmbeddingsFieldsEnum.DESCRIPTION]
BATCH_SIZE = 1000
# fields_to_encode = ['title', 'description', 'properties__as_text']

In [None]:
import datetime
from transformers import AutoTokenizer, AutoModel
from create_embeddings.schemas.embedding import ProductEmbedding
from parsers.runnures.utils.csv import CsvWriter
import pandas as pd


for ebedding in ebeddins_list:
    logger.info(f'ebedding: {ebedding.name}')
    tokenizer = AutoTokenizer.from_pretrained(ebedding.name)
    model = AutoModel.from_pretrained(ebedding.name)
    for store, csv_path in store_map.items():
        logger.info(f'store: {store}')
        df = pd.read_csv(csv_path)
        for field in fields_to_encode:
            logger.info(f'field: {field}')
            datetime_start_over_all = datetime.datetime.now()
            csv_writer: CsvWriter[ProductEmbedding] = CsvWriter(f'products_embeddings_{store}_{field}_{ebedding.name.name}.csv', ProductEmbedding, path=r'./data/')
            i = 0
            while True:
                datetime_start = datetime.datetime.now()
                batch_df = df[[uid_column, field]][BATCH_SIZE * i:BATCH_SIZE * (i+1)]
                if len(batch_df) == 0:
                    break
                batch_df.dropna(inplace=True)
                uid_data = batch_df[uid_column].tolist()
                data = batch_df[field].tolist()

                encoded_input = tokenizer(data, padding=True, truncation=True, return_tensors='pt')
                with torch.no_grad():
                    model_output = model(**encoded_input)
                    sentence_embeddings = model_output[0][:, 0]

                sentence_embeddings_list = sentence_embeddings.tolist()
                csv_data_to_write = []
                for write_index in range(len(uid_data)):
                    csv_data_to_write.append(ProductEmbedding(
                            uid=uid_data[write_index],
                            field=field,
                            store=store,
                            model=ebedding.name,
                            embedding=sentence_embeddings_list[write_index]
                    ))
                csv_writer.write_lines(csv_data_to_write)
                logger.info(f'written batch {i} | runtime {encoded_input["input_ids"].size()}: {datetime.datetime.now() - datetime_start}')

                i += 1
            del csv_writer
            logger.info(f'overall runtime: {datetime.datetime.now() - datetime_start_over_all} | {store} {field} {ebedding.name.name}')

2024-10-20 23:12:01,573 - numexpr.utils - INFO - Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-10-20 23:12:01,667 - __main__ - INFO - ebedding: cointegrated/rubert-tiny2
2024-10-20 23:12:02,800 - __main__ - INFO - store: STROYDVOR
2024-10-20 23:12:03,316 - __main__ - INFO - field: title
2024-10-20 23:12:04,850 - __main__ - INFO - written batch 0 | runtime 38: 0:00:01.533080
2024-10-20 23:12:06,055 - __main__ - INFO - written batch 1 | runtime 38: 0:00:01.203829
2024-10-20 23:12:07,229 - __main__ - INFO - written batch 2 | runtime 35: 0:00:01.173384
2024-10-20 23:12:08,398 - __main__ - INFO - written batch 3 | runtime 31: 0:00:01.167780
2024-10-20 23:12:09,501 - __main__ - INFO - written batch 4 | runtime 33: 0:00:01.102494
2024-10-20 23:12:10,579 - __main__ - INFO - written batch 5 | runtime 29: 0:00:01.077016
2024-10-20 23:12:11,781 - __main__ - INFO - written batch 6 | runtime 32: 0:00:01.201603
2024-10-20 23:12:12,957 - __main_

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/576k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/887 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/513M [00:00<?, ?B/s]

2024-10-20 23:29:56,247 - __main__ - INFO - store: STROYDVOR
2024-10-20 23:29:56,748 - __main__ - INFO - field: title
2024-10-20 23:30:19,201 - __main__ - INFO - written batch 0 | runtime 46: 0:00:22.449564
2024-10-20 23:30:38,894 - __main__ - INFO - written batch 1 | runtime 40: 0:00:19.691617
2024-10-20 23:30:58,223 - __main__ - INFO - written batch 2 | runtime 39: 0:00:19.328847
2024-10-20 23:31:15,263 - __main__ - INFO - written batch 3 | runtime 34: 0:00:17.038861
2024-10-20 23:31:31,140 - __main__ - INFO - written batch 4 | runtime 32: 0:00:15.876284
2024-10-20 23:31:46,242 - __main__ - INFO - written batch 5 | runtime 30: 0:00:15.100582
2024-10-20 23:32:03,167 - __main__ - INFO - written batch 6 | runtime 34: 0:00:16.923918
2024-10-20 23:32:20,195 - __main__ - INFO - written batch 7 | runtime 34: 0:00:17.026952
2024-10-20 23:32:43,226 - __main__ - INFO - written batch 8 | runtime 45: 0:00:23.030780
2024-10-20 23:33:04,425 - __main__ - INFO - written batch 9 | runtime 41: 0:00:21

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

2024-10-21 03:14:06,865 - __main__ - INFO - store: STROYDVOR
2024-10-21 03:14:07,440 - __main__ - INFO - field: title
2024-10-21 03:15:18,846 - __main__ - INFO - written batch 0 | runtime 42: 0:01:11.401169
2024-10-21 03:16:27,395 - __main__ - INFO - written batch 1 | runtime 40: 0:01:08.548412
2024-10-21 03:17:32,493 - __main__ - INFO - written batch 2 | runtime 38: 0:01:05.096960
2024-10-21 03:18:30,707 - __main__ - INFO - written batch 3 | runtime 34: 0:00:58.212576
2024-10-21 03:19:27,107 - __main__ - INFO - written batch 4 | runtime 33: 0:00:56.399483
2024-10-21 03:20:18,601 - __main__ - INFO - written batch 5 | runtime 30: 0:00:51.492441
2024-10-21 03:21:13,697 - __main__ - INFO - written batch 6 | runtime 32: 0:00:55.095760
2024-10-21 03:22:19,049 - __main__ - INFO - written batch 7 | runtime 38: 0:01:05.350852
2024-10-21 03:23:36,270 - __main__ - INFO - written batch 8 | runtime 45: 0:01:17.220331
2024-10-21 03:24:51,626 - __main__ - INFO - written batch 9 | runtime 44: 0:01:15

# R&D

In [3]:
import pandas as pd

df_a = pd.read_csv('/home/roman/PycharmProjects/personal/diploma/parsers/runnures/stroydvor/data/products.csv')
df_b = pd.read_csv('/home/roman/PycharmProjects/personal/diploma/parsers/runnures/obi/data/products_merged.csv')

  df_b = pd.read_csv('/home/roman/PycharmProjects/personal/diploma/parsers/runnures/obi/data/products_merged.csv')


In [4]:
df_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14103 entries, 0 to 14102
Data columns (total 50 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   uid                                       14103 non-null  object 
 1   store                                     14103 non-null  object 
 2   title                                     14103 non-null  object 
 3   url                                       14103 non-null  object 
 4   category                                  14103 non-null  object 
 5   description                               8916 non-null   object 
 6   images                                    14103 non-null  object 
 7   images__0                                 14095 non-null  object 
 8   images__1                                 8349 non-null   object 
 9   images__2                                 5485 non-null   object 
 10  images__3                         

In [5]:
df_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45571 entries, 0 to 45570
Data columns (total 50 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   uid                                       45571 non-null  object 
 1   store                                     45571 non-null  object 
 2   title                                     45571 non-null  object 
 3   url                                       45571 non-null  object 
 4   category                                  45571 non-null  object 
 5   description                               39457 non-null  object 
 6   images                                    45571 non-null  object 
 7   images__0                                 45571 non-null  object 
 8   images__1                                 21810 non-null  object 
 9   images__2                                 14436 non-null  object 
 10  images__3                         

In [None]:
raise Exception

In [9]:
sentence_test = df_a.iloc[5000:6000]
sentence_test_title = list(sentence_test['title'])
sentence_test_desc = list(filter(lambda x: not pd.isna(x), sentence_test['description']))

In [49]:
from transformers import AutoTokenizer, AutoModel
import datetime

current_embedding = ebeddins_list[0]
print("Use: ", current_embedding.name)

tokenizer = AutoTokenizer.from_pretrained(current_embedding.name)
model = AutoModel.from_pretrained(current_embedding.name)

print('title')
encoded_input = tokenizer(sentence_test_title, padding=True, truncation=True, return_tensors='pt')
print('token_size', encoded_input['input_ids'].size())

datetime_start = datetime.datetime.now()
with torch.no_grad():
    model_output = model(**encoded_input)
    sentence_embeddings = model_output[0][:, 0]
datetime_end = datetime.datetime.now()
print(f'runtime {len(sentence_test_title)}: {datetime_end - datetime_start}')


print('\ndescription')
encoded_input = tokenizer(sentence_test_desc, padding=True, truncation=True, return_tensors='pt')
print('token_size', encoded_input['input_ids'].size())

datetime_start = datetime.datetime.now()
with torch.no_grad():
    model_output = model(**encoded_input)
    sentence_embeddings = model_output[0][:, 0]
datetime_end = datetime.datetime.now()
print(f'runtime {len(sentence_test_desc)}: {datetime_end - datetime_start}')

# sentence_embeddings_norm = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

Use:  sergeyzh/rubert-tiny-turbo
title
token_size torch.Size([1000, 29])
runtime 1000: 0:00:00.665106

description
token_size torch.Size([547, 425])
runtime 547: 0:00:06.257252


In [30]:
single_encoded_input = tokenizer(sentence_test_title[0], padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    single_model_output = model(**encoded_input)
    single_sentence_embeddings = model_output[0][:, 0]

In [39]:
print(all(sentence_embeddings[0] == single_sentence_embeddings[0]))
sentence_embeddings_norm = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
single_sentence_embeddings_norm = torch.nn.functional.normalize(single_sentence_embeddings, p=2, dim=1)
print(all(sentence_embeddings_norm[0] == single_sentence_embeddings_norm[0]))
print(all(sentence_embeddings_norm[0] == sentence_embeddings[0]))

True
True
False


In [42]:
sentence_embeddings

tensor([[ 0.1968,  0.0805, -0.0681,  ...,  0.6348, -0.2396, -0.0982],
        [ 0.2096, -0.0101, -0.0631,  ...,  0.6291, -0.2470, -0.0917],
        [ 0.1825,  0.1019, -0.0212,  ...,  0.6207, -0.2527, -0.0990],
        ...,
        [ 0.3311,  0.2308, -0.0581,  ...,  0.6759,  0.0426, -0.1563],
        [ 0.2736,  0.2664, -0.0208,  ...,  0.6345,  0.0157, -0.0923],
        [ 0.3377,  0.2336, -0.0278,  ...,  0.6733,  0.0401, -0.1745]])

In [44]:
sentence_embeddings_list = sentence_embeddings.tolist()

In [46]:
len(sentence_embeddings_list[0])

312