In [1]:
from core.logging import service_log, add_file_log
import logging
import multiprocessing
import os

os.environ['NUMEXPR_NUM_THREADS'] = str(multiprocessing.cpu_count() - 2)
os.environ['NUMEXPR_MAX_THREADS'] = str(multiprocessing.cpu_count())

logger = logging.getLogger(__name__)
service_log()
add_file_log()

# Считаем embeddings

In [2]:
from create_embeddings.schemas.embedding import ModelEnum
from pydantic import BaseModel

class EbeddingModel(BaseModel):
  name: ModelEnum
  param_millions: int
  ebedding_size: int
  max_tokens: int
  rank: int
  title_batch_per_gb: int = 250
  description_batch_per_gb: int = 250

PARAMS_TO_MEMORY_GB_COFICIENT: float = 0.0037572219181414585

In [3]:
ebeddins_list = (
    EbeddingModel(name=ModelEnum.RUBERT_TINY_TURBO, param_millions=29, ebedding_size=312, max_tokens=2048, rank=12),
    EbeddingModel(name=ModelEnum.RUBERT_TINY2, param_millions=29, ebedding_size=312, max_tokens=514, rank=16),
    EbeddingModel(name=ModelEnum.LABSE_RU_TURBO, param_millions=128, ebedding_size=768, max_tokens=512, rank=8),
    EbeddingModel(name=ModelEnum.MULTILINGUAL_E5_LARGE_INSTRUCT, param_millions=560, ebedding_size=1024, max_tokens=514, rank=2),
    EbeddingModel(name=ModelEnum.BGE_M3, param_millions=567, ebedding_size=1024,	max_tokens=8192, rank=5),
)

In [4]:

import torch

if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("Using CPU")
    device = torch.device("cpu")

Using CPU


In [5]:
from parsers.runnures.schemas.product import StoreEnum
from create_embeddings.schemas.embedding import EmbeddingsFieldsEnum

store_map = {
    StoreEnum.STROYDVOR: '/home/roman/PycharmProjects/personal/diploma/parsers/runnures/stroydvor/data/products.csv',
    StoreEnum.OBI: '/home/roman/PycharmProjects/personal/diploma/parsers/runnures/obi/data/products_merged.csv',
}
uid_column = 'uid'
fields_to_encode = [EmbeddingsFieldsEnum.TITLE, EmbeddingsFieldsEnum.DESCRIPTION]
# BATCH_SIZE = 1000
BATCH_SIZE = 200
# fields_to_encode = ['title', 'description', 'properties__as_text']

In [6]:
start_from_model = ModelEnum.MULTILINGUAL_E5_LARGE_INSTRUCT
start_from_store = StoreEnum.OBI
start_from_field = EmbeddingsFieldsEnum.DESCRIPTION

In [None]:
import datetime
from transformers import AutoTokenizer, AutoModel
from create_embeddings.schemas.embedding import ProductEmbedding
from parsers.runnures.utils.csv import CsvWriter
import pandas as pd


for embedding in ebeddins_list:
    if start_from_model == embedding.name:
        start_from_model = None
    if start_from_model is not None:
        continue
    logger.info(f'ebedding: {embedding.name}')
    tokenizer = AutoTokenizer.from_pretrained(embedding.name)
    model = AutoModel.from_pretrained(embedding.name)
    for store, csv_path in store_map.items():
        if start_from_store == store:
            start_from_store = None
        if start_from_store is not None:
            continue
        logger.info(f'store: {store}')
        df = pd.read_csv(csv_path)
        for field in fields_to_encode:
            if start_from_field == field:
                start_from_field = None
            if start_from_field is not None:
                continue
            logger.info(f'field: {field}')
            datetime_start_over_all = datetime.datetime.now()
            csv_writer: CsvWriter[ProductEmbedding] = CsvWriter(f'products_embeddings_{store}_{field}_{embedding.name.name}.csv', ProductEmbedding, path=r'./data/')
            i = 0
            while True:
                datetime_start = datetime.datetime.now()
                batch_df = df[[uid_column, field]][BATCH_SIZE * i:BATCH_SIZE * (i+1)]
                if len(batch_df) == 0:
                    break
                batch_df.dropna(inplace=True)
                uid_data = batch_df[uid_column].tolist()
                data = batch_df[field].tolist()

                encoded_input = tokenizer(data, padding=True, truncation=True, return_tensors='pt')
                with torch.no_grad():
                    model_output = model(**encoded_input)
                    sentence_embeddings = model_output[0][:, 0]

                sentence_embeddings_list = sentence_embeddings.tolist()
                csv_data_to_write = []
                for write_index in range(len(uid_data)):
                    csv_data_to_write.append(ProductEmbedding(
                            uid=uid_data[write_index],
                            field=field,
                            store=store,
                            model=embedding.name,
                            embedding=sentence_embeddings_list[write_index]
                    ))
                csv_writer.write_lines(csv_data_to_write)
                logger.info(f'written batch {i} | runtime {encoded_input["input_ids"].size()}: {datetime.datetime.now() - datetime_start}')

                i += 1
            del csv_writer
            logger.info(f'overall runtime: {datetime.datetime.now() - datetime_start_over_all} | {store} {field} {embedding.name.name}')

2024-10-21 23:47:29,124 - numexpr.utils - INFO - Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-10-21 23:47:29,257 - __main__ - INFO - ebedding: intfloat/multilingual-e5-large-instruct
2024-10-21 23:47:31,657 - __main__ - INFO - store: OBI
  df = pd.read_csv(csv_path)
2024-10-21 23:47:33,518 - __main__ - INFO - field: description
2024-10-21 23:51:07,783 - __main__ - INFO - written batch 0 | runtime torch.Size([197, 512]): 0:03:34.263835
2024-10-21 23:54:41,356 - __main__ - INFO - written batch 1 | runtime torch.Size([195, 512]): 0:03:33.572465
2024-10-21 23:58:14,197 - __main__ - INFO - written batch 2 | runtime torch.Size([197, 503]): 0:03:32.840162
2024-10-22 00:01:46,390 - __main__ - INFO - written batch 3 | runtime torch.Size([193, 512]): 0:03:32.191817
2024-10-22 00:04:50,128 - __main__ - INFO - written batch 4 | runtime torch.Size([190, 457]): 0:03:03.737325
2024-10-22 00:07:17,636 - __main__ - INFO - written batch 5 | runtim

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs-us-1.hf.co/repos/23/2c/232ca60237b0bb19bb6c28c5a6c8af79f2e423333a9626aad445543b80fbf31e/b5e0ce3470abf5ef3831aa1bd5553b486803e83251590ab7ff35a117cf6aad38?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1729839616&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyOTgzOTYxNn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzIzLzJjLzIzMmNhNjAyMzdiMGJiMTliYjZjMjhjNWE2YzhhZjc5ZjJlNDIzMzMzYTk2MjZhYWQ0NDU1NDNiODBmYmYzMWUvYjVlMGNlMzQ3MGFiZjVlZjM4MzFhYTFiZDU1NTNiNDg2ODAzZTgzMjUxNTkwYWI3ZmYzNWExMTdjZjZhYWQzOD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=JMtGu8W-TrEGlgsxoTOx2cI6LG-yVX3zoO6ecpkDSALOxAW4Cd-asYc-HY666J-eizsT9H9d%7E3Gr6s%7EDKWfXsTUgWRRlY1fjgk19bIGEmJ-cETLKBQLYIKc1xsihKpZjY5ZeadaseCxFmV4pZ4jPbPw0ZRTEmSlolGzjZcOxTnBQYa-

pytorch_model.bin:  72%|#######1  | 1.63G/2.27G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs-us-1.hf.co/repos/23/2c/232ca60237b0bb19bb6c28c5a6c8af79f2e423333a9626aad445543b80fbf31e/b5e0ce3470abf5ef3831aa1bd5553b486803e83251590ab7ff35a117cf6aad38?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1729839616&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyOTgzOTYxNn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzIzLzJjLzIzMmNhNjAyMzdiMGJiMTliYjZjMjhjNWE2YzhhZjc5ZjJlNDIzMzMzYTk2MjZhYWQ0NDU1NDNiODBmYmYzMWUvYjVlMGNlMzQ3MGFiZjVlZjM4MzFhYTFiZDU1NTNiNDg2ODAzZTgzMjUxNTkwYWI3ZmYzNWExMTdjZjZhYWQzOD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=JMtGu8W-TrEGlgsxoTOx2cI6LG-yVX3zoO6ecpkDSALOxAW4Cd-asYc-HY666J-eizsT9H9d%7E3Gr6s%7EDKWfXsTUgWRRlY1fjgk19bIGEmJ-cETLKBQLYIKc1xsihKpZjY5ZeadaseCxFmV4pZ4jPbPw0ZRTEmSlolGzjZcOxTnBQYa-

pytorch_model.bin:  73%|#######3  | 1.67G/2.27G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs-us-1.hf.co/repos/23/2c/232ca60237b0bb19bb6c28c5a6c8af79f2e423333a9626aad445543b80fbf31e/b5e0ce3470abf5ef3831aa1bd5553b486803e83251590ab7ff35a117cf6aad38?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1729839616&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyOTgzOTYxNn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzIzLzJjLzIzMmNhNjAyMzdiMGJiMTliYjZjMjhjNWE2YzhhZjc5ZjJlNDIzMzMzYTk2MjZhYWQ0NDU1NDNiODBmYmYzMWUvYjVlMGNlMzQ3MGFiZjVlZjM4MzFhYTFiZDU1NTNiNDg2ODAzZTgzMjUxNTkwYWI3ZmYzNWExMTdjZjZhYWQzOD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=JMtGu8W-TrEGlgsxoTOx2cI6LG-yVX3zoO6ecpkDSALOxAW4Cd-asYc-HY666J-eizsT9H9d%7E3Gr6s%7EDKWfXsTUgWRRlY1fjgk19bIGEmJ-cETLKBQLYIKc1xsihKpZjY5ZeadaseCxFmV4pZ4jPbPw0ZRTEmSlolGzjZcOxTnBQYa-

pytorch_model.bin:  99%|#########8| 2.24G/2.27G [00:00<?, ?B/s]

2024-10-22 10:21:27,597 - __main__ - INFO - store: STROYDVOR
2024-10-22 10:21:28,151 - __main__ - INFO - field: title
2024-10-22 10:21:37,961 - __main__ - INFO - written batch 0 | runtime torch.Size([200, 29]): 0:00:09.806912
2024-10-22 10:21:50,335 - __main__ - INFO - written batch 1 | runtime torch.Size([200, 37]): 0:00:12.373159
2024-10-22 10:22:04,698 - __main__ - INFO - written batch 2 | runtime torch.Size([200, 42]): 0:00:14.362170
2024-10-22 10:22:18,238 - __main__ - INFO - written batch 3 | runtime torch.Size([200, 40]): 0:00:13.539601
2024-10-22 10:22:27,934 - __main__ - INFO - written batch 4 | runtime torch.Size([200, 29]): 0:00:09.695031
2024-10-22 10:22:38,818 - __main__ - INFO - written batch 5 | runtime torch.Size([200, 32]): 0:00:10.883639
2024-10-22 10:22:48,939 - __main__ - INFO - written batch 6 | runtime torch.Size([200, 30]): 0:00:10.119202
2024-10-22 10:22:58,797 - __main__ - INFO - written batch 7 | runtime torch.Size([200, 29]): 0:00:09.857441
2024-10-22 10:23:1

# R&D

In [3]:
import pandas as pd

df_a = pd.read_csv('/home/roman/PycharmProjects/personal/diploma/parsers/runnures/stroydvor/data/products.csv')
df_b = pd.read_csv('/home/roman/PycharmProjects/personal/diploma/parsers/runnures/obi/data/products_merged.csv')

  df_b = pd.read_csv('/home/roman/PycharmProjects/personal/diploma/parsers/runnures/obi/data/products_merged.csv')


In [4]:
df_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14103 entries, 0 to 14102
Data columns (total 50 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   uid                                       14103 non-null  object 
 1   store                                     14103 non-null  object 
 2   title                                     14103 non-null  object 
 3   url                                       14103 non-null  object 
 4   category                                  14103 non-null  object 
 5   description                               8916 non-null   object 
 6   images                                    14103 non-null  object 
 7   images__0                                 14095 non-null  object 
 8   images__1                                 8349 non-null   object 
 9   images__2                                 5485 non-null   object 
 10  images__3                         

In [5]:
df_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45571 entries, 0 to 45570
Data columns (total 50 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   uid                                       45571 non-null  object 
 1   store                                     45571 non-null  object 
 2   title                                     45571 non-null  object 
 3   url                                       45571 non-null  object 
 4   category                                  45571 non-null  object 
 5   description                               39457 non-null  object 
 6   images                                    45571 non-null  object 
 7   images__0                                 45571 non-null  object 
 8   images__1                                 21810 non-null  object 
 9   images__2                                 14436 non-null  object 
 10  images__3                         

In [None]:
raise Exception

In [9]:
sentence_test = df_a.iloc[5000:6000]
sentence_test_title = list(sentence_test['title'])
sentence_test_desc = list(filter(lambda x: not pd.isna(x), sentence_test['description']))

In [49]:
from transformers import AutoTokenizer, AutoModel
import datetime

current_embedding = ebeddins_list[0]
print("Use: ", current_embedding.name)

tokenizer = AutoTokenizer.from_pretrained(current_embedding.name)
model = AutoModel.from_pretrained(current_embedding.name)

print('title')
encoded_input = tokenizer(sentence_test_title, padding=True, truncation=True, return_tensors='pt')
print('token_size', encoded_input['input_ids'].size())

datetime_start = datetime.datetime.now()
with torch.no_grad():
    model_output = model(**encoded_input)
    sentence_embeddings = model_output[0][:, 0]
datetime_end = datetime.datetime.now()
print(f'runtime {len(sentence_test_title)}: {datetime_end - datetime_start}')


print('\ndescription')
encoded_input = tokenizer(sentence_test_desc, padding=True, truncation=True, return_tensors='pt')
print('token_size', encoded_input['input_ids'].size())

datetime_start = datetime.datetime.now()
with torch.no_grad():
    model_output = model(**encoded_input)
    sentence_embeddings = model_output[0][:, 0]
datetime_end = datetime.datetime.now()
print(f'runtime {len(sentence_test_desc)}: {datetime_end - datetime_start}')

# sentence_embeddings_norm = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

Use:  sergeyzh/rubert-tiny-turbo
title
token_size torch.Size([1000, 29])
runtime 1000: 0:00:00.665106

description
token_size torch.Size([547, 425])
runtime 547: 0:00:06.257252


In [30]:
single_encoded_input = tokenizer(sentence_test_title[0], padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    single_model_output = model(**encoded_input)
    single_sentence_embeddings = model_output[0][:, 0]

In [39]:
print(all(sentence_embeddings[0] == single_sentence_embeddings[0]))
sentence_embeddings_norm = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
single_sentence_embeddings_norm = torch.nn.functional.normalize(single_sentence_embeddings, p=2, dim=1)
print(all(sentence_embeddings_norm[0] == single_sentence_embeddings_norm[0]))
print(all(sentence_embeddings_norm[0] == sentence_embeddings[0]))

True
True
False


In [42]:
sentence_embeddings

tensor([[ 0.1968,  0.0805, -0.0681,  ...,  0.6348, -0.2396, -0.0982],
        [ 0.2096, -0.0101, -0.0631,  ...,  0.6291, -0.2470, -0.0917],
        [ 0.1825,  0.1019, -0.0212,  ...,  0.6207, -0.2527, -0.0990],
        ...,
        [ 0.3311,  0.2308, -0.0581,  ...,  0.6759,  0.0426, -0.1563],
        [ 0.2736,  0.2664, -0.0208,  ...,  0.6345,  0.0157, -0.0923],
        [ 0.3377,  0.2336, -0.0278,  ...,  0.6733,  0.0401, -0.1745]])

In [44]:
sentence_embeddings_list = sentence_embeddings.tolist()

In [46]:
len(sentence_embeddings_list[0])

312