In [1]:
from pathlib import Path
import polars as pl
from datasets import Dataset
import numpy as np
from autofaiss import build_index
from core_pro.ultilities import make_dir
from sentence_transformers import SentenceTransformer
import sys
# Extend the import search path with a local project checkout before the
# project-level import below (presumably where `notebooks.benchmark` lives — TODO confirm).
# NOTE(review): the location is derived from the current user's home directory,
# so this only works on machines that keep the checkout at the same place.
sys.path.extend([str(Path.home() / 'PycharmProjects/item_matching')])

from notebooks.benchmark.data_load import load

In [2]:
# Load the benchmark data through the project loader. It returns three values;
# only `df` is used by the cells visible in this notebook.
df, col, path = load()
# Preview the first rows as this cell's displayed value.
df.head()

Data Shape: (72110, 6)


id,q_item_id,q_level1_global_be_category,q_item_name,q_link_first_image,q_item_name_clean
u32,i64,str,str,str,str
0,25027814732,"""Men Shoes""","""Dép quai ngang slipper Xé dán …","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang slipper xé dán …"
1,898579880,"""Men Shoes""","""Xi Đánh Giày - Chuyên Dùng Đán…","""https://cf.shopee.sg/file/sg-1…","""xi đánh giày - chuyên dùng đán…"
2,14885441590,"""Men Shoes""","""Giày Thể Thao BigSize Nam Năng…","""https://cf.shopee.sg/file/1124…","""giày thể thao bigsize nam năng…"
3,25974450598,"""Men Shoes""","""Chai Xịt Khử Mùi Giày Dép NaNo…","""https://cf.shopee.sg/file/vn-1…","""chai xịt khử mùi giày dép nano…"
4,22289716536,"""Men Shoes""","""Giày sneaker vải dệt đẳng cấp …","""https://cf.shopee.sg/file/vn-1…","""giày sneaker vải dệt đẳng cấp …"


In [3]:
# Working directories for cached arrays and datasets.
path_tmp_array = Path('tmp/array')
path_tmp_ds = Path('tmp/ds')
make_dir(path_tmp_ds)
make_dir(path_tmp_array)

# Embeddings are expensive to compute, so they are cached on disk as a .npy file.
file_embed = path_tmp_array / 'jina_embed.npy'

# Reuse the cache only when it still lines up row-for-row with `df`; a file left
# over from a different dataset would otherwise be attached to the wrong items.
embeddings = np.load(file_embed) if file_embed.exists() else None
if embeddings is not None and embeddings.shape[0] != df.height:
    print(f'Cached embeddings have {embeddings.shape[0]} rows but df has {df.height}; recomputing.')
    embeddings = None

if embeddings is None:
    # NOTE(review): trust_remote_code=True runs model code fetched from the hub;
    # keep it only for model repositories you trust.
    model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
    task = 'text-matching'
    embeddings = model.encode(
        df['q_item_name_clean'].to_list(),
        task=task,
        prompt_name=task,
        show_progress_bar=True
    )
    np.save(file_embed, embeddings)
print(embeddings.shape)

flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

(72110, 1024)


In [4]:
# Attach the embedding matrix to the frame as an `embed` column.
df = df.with_columns(pl.Series('embed', embeddings))

# Wrap the frame as a Hugging Face dataset; `embed` is formatted as NumPy
# while all remaining columns are still returned alongside it.
dataset = Dataset.from_polars(df)
dataset.set_format('numpy', columns=['embed'], output_all_columns=True)

In [5]:
# Build an inner-product FAISS index from the embedding files and store it on disk.
path_index = Path('tmp/index')
# Make sure the output directory exists before the index and its metadata are written.
path_index.mkdir(parents=True, exist_ok=True)

build_index(
    # The whole directory is passed, not a single file — presumably every .npy
    # inside it is indexed, so it should contain only the current embeddings (TODO confirm).
    str(path_tmp_array),
    index_path=str(path_index / 'ip.index'),
    index_infos_path=str(path_index / 'index.json'),
    save_on_disk=True,
    metric_type='ip',
    verbose=30,
)

100%|██████████| 1/1 [00:00<00:00, 2345.81it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


(<faiss.swigfaiss_avx2.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x7423dbb6f270> >,
 {'index_key': 'HNSW32',
  'index_param': 'efSearch=526',
  'index_path': 'tmp/index/ip.index',
  'size in bytes': 314987402,
  'avg_search_speed_ms': 9.004521084566571,
  '99p_search_speed_ms': 14.145050499064382,
  'reconstruction error %': 0.0,
  'nb vectors': 72110,
  'vectors dimension': 1024,
  'compression ratio': 0.9376964225381941})

In [6]:
# Load the saved FAISS index from disk and register it on the dataset under the name 'embed'.
file_index = path_index / 'ip.index'
dataset.load_faiss_index('embed', file_index)

In [12]:
# Query the 'embed' index with every stored embedding, keeping the 5 closest hits per query.
score, result = dataset.get_nearest_examples_batch('embed', np.asarray(dataset['embed']), k=5)

# One list of scores per query row.
df_score = pl.DataFrame({'score': [list(row) for row in score]})

# Retrieved examples: drop the vector column, then prefix every remaining
# column with `db_` to tell database-side fields apart from query-side ones.
df_result = pl.DataFrame(result).drop(['embed']).select(pl.all().name.prefix('db_'))

In [13]:
# Place query rows, retrieved rows and scores side by side; row i of each frame
# belongs to the same query.
df_match = pl.concat([df.drop(['id']), df_result, df_score], how='horizontal')

# Columns to unnest: the `db_`-prefixed ones plus `score`.
# Match on the exact prefix instead of the substring 'db', so an unrelated
# query column (e.g. one containing "feedback") is never exploded by accident.
col_explode = [name for name in df_match.columns if name.startswith('db_')] + ['score']

# Unnest so that each output row is a single (query, neighbour, score) triple.
df_match = df_match.explode(col_explode)

In [14]:
# Display the exploded match table (one row per query/neighbour pair).
df_match

q_item_id,q_level1_global_be_category,q_item_name,q_link_first_image,q_item_name_clean,embed,db_id,db_q_item_id,db_q_level1_global_be_category,db_q_item_name,db_q_link_first_image,db_q_item_name_clean,score
i64,str,str,str,str,"array[f32, 1024]",i64,i64,str,str,str,str,f32
25027814732,"""Men Shoes""","""Dép quai ngang slipper Xé dán …","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang slipper xé dán …","[-0.086914, -0.095703, … 0.017212]",0,25027814732,"""Men Shoes""","""Dép quai ngang slipper Xé dán …","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang slipper xé dán …",0.995602
25027814732,"""Men Shoes""","""Dép quai ngang slipper Xé dán …","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang slipper xé dán …","[-0.086914, -0.095703, … 0.017212]",47558,23155111248,"""Men Shoes""","""Dép quai ngang slipper quai dá…","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang slipper quai dá…",0.912379
25027814732,"""Men Shoes""","""Dép quai ngang slipper Xé dán …","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang slipper xé dán …","[-0.086914, -0.095703, … 0.017212]",65399,24677825864,"""Men Shoes""","""Dép quai ngang slipper Xé dán …","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang slipper xé dán …",0.893187
25027814732,"""Men Shoes""","""Dép quai ngang slipper Xé dán …","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang slipper xé dán …","[-0.086914, -0.095703, … 0.017212]",16572,25477821246,"""Men Shoes""","""Dép quai ngang slipper Xé dán …","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang slipper xé dán …",0.88825
25027814732,"""Men Shoes""","""Dép quai ngang slipper Xé dán …","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang slipper xé dán …","[-0.086914, -0.095703, … 0.017212]",53621,24927821804,"""Men Shoes""","""Dép quai ngang slipper Xé dán …","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang slipper xé dán …",0.886416
…,…,…,…,…,…,…,…,…,…,…,…,…
24065151497,"""Men Shoes""","""Giày Slipon Amiri Nhung Vạch D…","""https://cf.shopee.sg/file/vn-1…","""giày slipon amiri nhung vạch d…","[-0.007812, -0.086426, … 0.028198]",43028,23859543435,"""Men Shoes""","""Giày Slipon Amiri Nhung Vạch D…","""https://cf.shopee.sg/file/vn-1…","""giày slipon amiri nhung vạch d…",1.005471
24065151497,"""Men Shoes""","""Giày Slipon Amiri Nhung Vạch D…","""https://cf.shopee.sg/file/vn-1…","""giày slipon amiri nhung vạch d…","[-0.007812, -0.086426, … 0.028198]",72109,24065151497,"""Men Shoes""","""Giày Slipon Amiri Nhung Vạch D…","""https://cf.shopee.sg/file/vn-1…","""giày slipon amiri nhung vạch d…",1.005471
24065151497,"""Men Shoes""","""Giày Slipon Amiri Nhung Vạch D…","""https://cf.shopee.sg/file/vn-1…","""giày slipon amiri nhung vạch d…","[-0.007812, -0.086426, … 0.028198]",65395,13312400800,"""Men Shoes""","""Giày Slipon Amiri Nhung Vạch D…","""https://cf.shopee.sg/file/8fcd…","""giày slipon amiri nhung vạch d…",0.910827
24065151497,"""Men Shoes""","""Giày Slipon Amiri Nhung Vạch D…","""https://cf.shopee.sg/file/vn-1…","""giày slipon amiri nhung vạch d…","[-0.007812, -0.086426, … 0.028198]",54588,20582182381,"""Men Shoes""","""⚡️[HÀNG CAO CẤP] - Giày Slip-o…","""https://cf.shopee.sg/file/vn-1…","""⚡️ - giày slip-on amiri xương …",0.815412
