In [1]:
from pathlib import Path

import duckdb
import numpy as np
import polars as pl
from autofaiss import build_index
from datasets import Dataset
from FlagEmbedding import BGEM3FlagModel, FlagLLMModel

from core_pro.ultilities import make_dir, make_sync_folder

In [2]:
# Locate the cleaned item table inside the synced project folder.
path = make_sync_folder('Item_Matching_Test')
file = path / 'clean.parquet'

# Read the whole parquet file through DuckDB and convert the result to polars.
query = f"select * from read_parquet('{file}')"
relation = duckdb.sql(query)
df = relation.pl()
df.head()

item_id,item_name,shop_id,shop_name,level1_global_be_category,description,images,image_url,item_name_clean,file_path,exists
i64,str,i64,str,str,str,str,str,str,str,bool
25823629171,"""Kẹp Tóc càng cua Choice Việt N…",851157471,"""Shopee Choice Việt Nam""","""Fashion Accessories""","""[{""t"":""✪ THÔNG TIN SẢN PHẨM \n…","""sg-11134301-7rd4i-lvolqk5ptysj…","""http://f.shopee.vn/file/sg-111…","""kẹp tóc càng cua choice việt n…","""/home/kevin/Downloads/Item_Mat…",True
29108145531,"""Áo dây CÚP ngực phối ren sexy …",1074316967,"""Honestss""","""Women Clothes""","""Áo dây CÚP ngực phối ren sexy …","""vn-11134207-7r98o-lz1svcz309xp…","""http://f.shopee.vn/file/vn-111…","""áo dây cúp ngực phối ren sexy …","""/home/kevin/Downloads/Item_Mat…",True
6092976691,"""Miếng Dán Ngực ❤️FREESHIP❤️ Hộ…",275954116,"""Dan Bikini""","""Women Clothes""","""MIẾNG DÁN NGỰC SILICON HÀN QUỐ…","""vn-11134201-7r98o-lyynyaibhgv5…","""http://f.shopee.vn/file/vn-111…","""miếng dán ngực freeship hộp 5 …","""/home/kevin/Downloads/Item_Mat…",True
23328371747,"""Găng tay phao nam chống lạnh s…",960970699,"""Winter Market""","""Fashion Accessories""","""Găng tay phao, bao tay phao đi…","""vn-11134211-7r98o-ln8wlsop9p7c…","""http://f.shopee.vn/file/vn-111…","""găng tay phao nam chống lạnh s…","""/home/kevin/Downloads/Item_Mat…",True
13599450536,"""Quần Dài Thể Thao Chống Nắng D…",704317817,"""KHOUSE-한국 여성 패션""","""Women Clothes""","""[{""t"":""Xuất xứ: Thâm Quyến\nTấ…","""sg-11134201-7qveg-lgomar6dayh8…","""http://f.shopee.vn/file/sg-111…","""quần dài thể thao chống nắng d…","""/home/kevin/Downloads/Item_Mat…",True


In [3]:
# Embed every cleaned item name once and cache the matrix on disk; later runs
# reload the cached array instead of re-running the model.
name = 'bge_gemma'
path_tmp_array = Path(path / f'tmp/array/{name}')
path_tmp_ds = Path(path / f'tmp/ds/{name}')
make_dir(path_tmp_ds)
make_dir(path_tmp_array)

file_embed = path_tmp_array / 'embed.npy'
if not file_embed.exists():
    # bge-multilingual-gemma2 is an LLM-based embedder; its model card loads it
    # through FlagLLMModel rather than BGEM3FlagModel, whose
    # dense/sparse/colbert outputs belong to bge-m3.
    # fp16 halves the weight memory compared with fp32; the earlier fp32 load
    # of this checkpoint ended in a CUDA out-of-memory error on a 7.78 GiB GPU.
    # NOTE(review): fp16 weights may still not fit on a GPU that small --
    # confirm the target hardware before relying on this cell.
    model = FlagLLMModel('BAAI/bge-multilingual-gemma2', use_fp16=True)
    # FlagLLMModel.encode returns the embedding matrix directly, so there is
    # no 'dense_vecs' key to unpack.
    embeddings = model.encode(
        df['item_name_clean'].to_list(),
        batch_size=1,
        max_length=80,
    )
    np.save(file_embed, embeddings)
else:
    embeddings = np.load(file_embed)
print(embeddings.shape)

Fetching 25 files:   0%|          | 0/25 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 196.00 MiB. GPU 0 has a total capacity of 7.78 GiB of which 114.19 MiB is free. Process 2235 has 132.68 MiB memory in use. Process 33171 has 19.46 MiB memory in use. Including non-PyTorch memory, this process has 7.05 GiB memory in use. Of the allocated memory 6.92 GiB is allocated by PyTorch, and 3.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [4]:
# Attach the embedding matrix to the frame as an 'embed' column.
embed_series = pl.Series(name='embed', values=embeddings)
df = df.with_columns(embed_series)

# Wrap the frame in a Hugging Face Dataset; 'embed' is formatted as numpy and
# the remaining columns are kept in the output (output_all_columns=True).
dataset = Dataset.from_polars(df)
dataset.set_format(type='numpy', columns=['embed'], output_all_columns=True)

In [5]:
# Build an inner-product index over the embeddings with autofaiss and save
# both the index and its metadata JSON under <path>/tmp/index.
path_index = Path(path / 'tmp/index')
index_file = path_index / 'ip.index'
index_infos_file = path_index / 'index.json'
build_index(
    embeddings=embeddings,
    index_path=str(index_file),
    index_infos_path=str(index_infos_file),
    metric_type='ip',
    save_on_disk=True,
    verbose=30,
)

100%|██████████| 1/1 [00:00<00:00, 2743.17it/s]
100%|██████████| 1/1 [00:00<00:00,  6.35it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


(<faiss.swigfaiss_avx2.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x7b8858053ea0> >,
 {'index_key': 'HNSW15',
  'index_param': 'efSearch=5226',
  'index_path': '/home/kevin/Downloads/Item_Matching_Test/tmp/index/ip.index',
  'size in bytes': 42050218,
  'avg_search_speed_ms': 9.97823628438878,
  '99p_search_speed_ms': 10.557658101897687,
  'reconstruction error %': 0.0,
  'nb vectors': 9936,
  'vectors dimension': 1024,
  'compression ratio': 0.9678393581693203})

In [6]:
# Attach the saved FAISS index file to the dataset under the index name 'embed'.
index_file = path_index / 'ip.index'
dataset.load_faiss_index('embed', index_file)

In [7]:
# Query the 'embed' index with every stored vector, asking for 5 neighbours each.
query_vectors = np.asarray(dataset['embed'])
score, result = dataset.get_nearest_examples_batch('embed', query_vectors, k=5)

# One list of scores per query row.
dict_ = {'score': list(map(list, score))}
df_score = pl.DataFrame(dict_)

# Neighbour fields without the vectors, every column renamed with a 'db_' prefix.
neighbour_frame = pl.DataFrame(result).drop(['embed'])
df_result = neighbour_frame.select(pl.all().name.prefix('db_'))

In [8]:
# Place each query row side by side with its neighbour columns and scores.
df_match = pl.concat([df, df_result, df_score], how='horizontal')
# Columns to explode together: the list-valued neighbour columns (they carry
# the 'db_' prefix) plus 'score'. The previous substring filter on 'notebooks'
# matched no column name, so only 'score' was ever selected.
col_explode = [col for col in df_match.columns if col.startswith('db_')] + ['score']

In [10]:
# Write the matched frame to <path>/text_match/<name>.parquet.
path_export = path / 'text_match'
make_dir(path_export)
export_file = path_export / f'{name}.parquet'
df_match.write_parquet(export_file)