In [1]:
from pathlib import Path
import polars as pl
from datasets import Dataset
import numpy as np
from autofaiss import build_index
from core_pro.ultilities import make_dir
from sentence_transformers import SentenceTransformer
import sys
sys.path.extend([str(Path.home() / 'PycharmProjects/item_matching')])

from notebooks.benchmark.data_load import load

In [2]:
# Load the benchmark data through the project helper. It returns the frame plus
# two further values bound to `col` and `path`; their meaning is defined in
# notebooks.benchmark.data_load (not visible here — TODO confirm).
df, col, path = load()
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Data Shape: (72110, 6)


id,q_item_id,q_level1_global_be_category,q_item_name,q_link_first_image,q_item_name_clean
u32,i64,str,str,str,str
0,25622409846,"""Men Shoes""","""Dép da nam changfa hàng moi v…","""https://cf.shopee.sg/file/vn-1…","""dép da nam changfa hàng moi v…"
1,7669916630,"""Men Shoes""","""Giày Sneaker Nam Đế Cao Su Non…","""https://cf.shopee.sg/file/d771…","""giày sneaker nam đế cao su non…"
2,25827115226,"""Men Shoes""","""Giày_Nike Air Force 1 Louis Vu…","""https://cf.shopee.sg/file/sg-1…","""giày_nike air force 1 louis vu…"
3,23231741326,"""Men Shoes""","""Dép sandal nam Quai Mảnh Fashi…","""https://cf.shopee.sg/file/vn-1…","""dép sandal nam quai mảnh fashi…"
4,22885058948,"""Men Shoes""","""Dép Quai Ngang M.L.B Mound Mic…","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang m.l.b mound mic…"


In [7]:
# df['q_item_name_clean'].to_list()

In [8]:
# Working directories for cached arrays and dataset artifacts.
path_tmp_array = Path('tmp/array')
path_tmp_ds = Path('tmp/ds')
make_dir(path_tmp_ds)
make_dir(path_tmp_array)

# The embedding matrix is cached on disk: reuse the saved array when it exists,
# otherwise encode the cleaned item names and persist the result.
file_embed = path_tmp_array / 'jina_embed.npy'
if file_embed.exists():
    embeddings = np.load(file_embed)
else:
    # The same task string is passed as both `task` and `prompt_name`.
    task = 'text-matching'
    # NOTE(review): trust_remote_code=True executes code shipped with the model repo.
    model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
    embeddings = model.encode(
        df['q_item_name_clean'].to_list(),
        show_progress_bar=True,
        task=task,
        prompt_name=task,
    )
    np.save(file_embed, embeddings)
print(embeddings.shape)

Batches:   0%|          | 0/2254 [00:00<?, ?it/s]

Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention implementation does not support kwargs: prompt_length
Flash attention impl

(72110, 1024)


In [4]:
# Attach the embedding matrix to the frame as an 'embed' column, one vector per row.
embed_series = pl.Series(name='embed', values=embeddings)
df = df.with_columns(embed_series)

# Wrap the frame in a Hugging Face Dataset; 'embed' is returned as numpy while
# the remaining columns stay available (output_all_columns=True).
dataset = Dataset.from_polars(df)
dataset.set_format(
    type='numpy',
    columns=['embed'],
    output_all_columns=True,
)

In [5]:
# Build an inner-product index over the embeddings with autofaiss and write both
# the index file and its metadata JSON under tmp/index. No index key is passed,
# so the index type is left to autofaiss. The call's return value is the cell output.
path_index = Path('tmp/index')
build_index(
    embeddings=embeddings,
    metric_type='ip',
    save_on_disk=True,
    index_path=str(path_index.joinpath('ip.index')),
    index_infos_path=str(path_index.joinpath('index.json')),
    verbose=30,
)

100%|██████████| 1/1 [00:00<00:00, 24672.38it/s]
100%|██████████| 6/6 [00:03<00:00,  1.83it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


(<faiss.swigfaiss_avx2.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x742fdb92e670> >,
 {'index_key': 'HNSW32',
  'index_param': 'efSearch=814',
  'index_path': 'tmp/index/ip.index',
  'size in bytes': 314987402,
  'avg_search_speed_ms': 3.7758498293187586,
  '99p_search_speed_ms': 12.999637332832208,
  'reconstruction error %': 0.0,
  'nb vectors': 72110,
  'vectors dimension': 1024,
  'compression ratio': 0.9376964225381941})

In [6]:
# Register the on-disk index file with the dataset under the index name 'embed'.
dataset.load_faiss_index('embed', path_index.joinpath('ip.index'))

In [7]:
# Query the 'embed' index with every stored embedding, asking for 5 neighbours each.
score, result = dataset.get_nearest_examples_batch(
    'embed', np.asarray(dataset['embed']), k=5,
)

# One list of scores per query row.
dict_ = {'score': list(map(list, score))}
df_score = pl.DataFrame(dict_)

# Retrieved examples as a frame: drop the vector column and prefix every
# remaining column with 'db_' to tell it apart from the query-side columns.
df_result = pl.DataFrame(result).drop(['embed']).select(
    pl.all().name.prefix('db_')
)

In [8]:
# Place each query row (without its 'id') side by side with its retrieved
# neighbours and their scores.
df_match = pl.concat([df.drop(['id']), df_result, df_score], how='horizontal')

# 'score' holds one list per query row, and the neighbour columns (prefixed
# 'db_') presumably do as well — the rendered result shows them as scalars after
# this step. Explode them together so each (query, neighbour) pair is one row.
# The previous filter looked for 'notebooks' in the column name, which matches
# no column and left the 'db_' columns unexploded.
col_explode = [i for i in df_match.columns if i.startswith('db_')] + ['score']
df_match = df_match.explode(col_explode)

In [9]:
# Display the exploded match table (the rendered output is truncated to head and tail rows).
df_match

q_item_id,q_level1_global_be_category,q_item_name,q_link_first_image,q_item_name_clean,embed,db_id,db_q_item_id,db_q_level1_global_be_category,db_q_item_name,db_q_link_first_image,db_q_item_name_clean,score
i64,str,str,str,str,"array[f32, 1024]",i64,i64,str,str,str,str,f32
27804508088,"""Men Shoes""","""（100% Auth ）Giày Thể Thao Nam …","""https://cf.shopee.sg/file/sg-1…","""（100% auth ）giày thể thao nam …","[0.059814, -0.092285, … 0.034424]",0,27804508088,"""Men Shoes""","""（100% Auth ）Giày Thể Thao Nam …","""https://cf.shopee.sg/file/sg-1…","""（100% auth ）giày thể thao nam …",0.99385
27804508088,"""Men Shoes""","""（100% Auth ）Giày Thể Thao Nam …","""https://cf.shopee.sg/file/sg-1…","""（100% auth ）giày thể thao nam …","[0.059814, -0.092285, … 0.034424]",61563,28304511427,"""Men Shoes""","""（100% Auth ）Giày Thể Thao Nam …","""https://cf.shopee.sg/file/sg-1…","""（100% auth ）giày thể thao nam …",0.878783
27804508088,"""Men Shoes""","""（100% Auth ）Giày Thể Thao Nam …","""https://cf.shopee.sg/file/sg-1…","""（100% auth ）giày thể thao nam …","[0.059814, -0.092285, … 0.034424]",33996,26754508885,"""Men Shoes""","""（100% Auth ）Giày Thể Thao Nam …","""https://cf.shopee.sg/file/sg-1…","""（100% auth ）giày thể thao nam …",0.876756
27804508088,"""Men Shoes""","""（100% Auth ）Giày Thể Thao Nam …","""https://cf.shopee.sg/file/sg-1…","""（100% auth ）giày thể thao nam …","[0.059814, -0.092285, … 0.034424]",58149,26804507877,"""Men Shoes""","""（100% Auth ）Giày Thể Thao Nam …","""https://cf.shopee.sg/file/sg-1…","""（100% auth ）giày thể thao nam …",0.8751
27804508088,"""Men Shoes""","""（100% Auth ）Giày Thể Thao Nam …","""https://cf.shopee.sg/file/sg-1…","""（100% auth ）giày thể thao nam …","[0.059814, -0.092285, … 0.034424]",40021,24883074142,"""Men Shoes""","""（100% Auth ）Giày Thể Thao Nam …","""https://cf.shopee.sg/file/sg-1…","""（100% auth ）giày thể thao nam …",0.867648
…,…,…,…,…,…,…,…,…,…,…,…,…
20288231623,"""Men Shoes""","""Dép quai ngang LV da cao cấp_D…","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang lv da cao cấp_d…","[-0.060791, -0.108398, … 0.024292]",72109,20288231623,"""Men Shoes""","""Dép quai ngang LV da cao cấp_D…","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang lv da cao cấp_d…",0.997246
20288231623,"""Men Shoes""","""Dép quai ngang LV da cao cấp_D…","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang lv da cao cấp_d…","[-0.060791, -0.108398, … 0.024292]",44482,22440321229,"""Men Shoes""","""Dép LV quai da nam, Dép quai …","""https://cf.shopee.sg/file/vn-1…","""dép lv quai da nam, dép quai …",0.91091
20288231623,"""Men Shoes""","""Dép quai ngang LV da cao cấp_D…","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang lv da cao cấp_d…","[-0.060791, -0.108398, … 0.024292]",68916,22887555749,"""Men Shoes""","""Dép nam quai ngang LV 2023 - d…","""https://cf.shopee.sg/file/vn-1…","""dép nam quai ngang lv 2023 - d…",0.875208
20288231623,"""Men Shoes""","""Dép quai ngang LV da cao cấp_D…","""https://cf.shopee.sg/file/vn-1…","""dép quai ngang lv da cao cấp_d…","[-0.060791, -0.108398, … 0.024292]",36779,26750145724,"""Men Shoes""","""Dép Da Nam Quai Ngang LV Cao C…","""https://cf.shopee.sg/file/vn-1…","""dép da nam quai ngang lv cao c…",0.874898


In [10]:
# Spot-check the neighbours retrieved for a single query item.
df_match.filter(pl.col('q_item_id') == 4992244030)

q_item_id,q_level1_global_be_category,q_item_name,q_link_first_image,q_item_name_clean,embed,db_id,db_q_item_id,db_q_level1_global_be_category,db_q_item_name,db_q_link_first_image,db_q_item_name_clean,score
i64,str,str,str,str,"array[f32, 1024]",i64,i64,str,str,str,str,f32
4992244030,"""Men Shoes""","""Hai Miếng Lót Đệm Giày Mềm Mã …","""https://cf.shopee.sg/file/bfe9…","""hai miếng lót đệm giày mềm mã …","[-0.091309, 0.057617, … 0.002396]",61973,4992244030,"""Men Shoes""","""Hai Miếng Lót Đệm Giày Mềm Mã …","""https://cf.shopee.sg/file/bfe9…","""hai miếng lót đệm giày mềm mã …",1.004506
4992244030,"""Men Shoes""","""Hai Miếng Lót Đệm Giày Mềm Mã …","""https://cf.shopee.sg/file/bfe9…","""hai miếng lót đệm giày mềm mã …","[-0.091309, 0.057617, … 0.002396]",31716,11812317752,"""Men Shoes""","""Cặp Lót Giày Mềm ( Chân Phải +…","""https://cf.shopee.sg/file/acd1…","""cặp lót giày mềm mã l03""",0.825395
4992244030,"""Men Shoes""","""Hai Miếng Lót Đệm Giày Mềm Mã …","""https://cf.shopee.sg/file/bfe9…","""hai miếng lót đệm giày mềm mã …","[-0.091309, 0.057617, … 0.002396]",20819,17237055021,"""Men Shoes""","""Cặp Lót Giày Mềm ( Chân Phải +…","""https://cf.shopee.sg/file/vn-1…","""cặp lót giày mềm mềm mại""",0.791478
4992244030,"""Men Shoes""","""Hai Miếng Lót Đệm Giày Mềm Mã …","""https://cf.shopee.sg/file/bfe9…","""hai miếng lót đệm giày mềm mã …","[-0.091309, 0.057617, … 0.002396]",38434,3857128315,"""Men Shoes""","""[nguyên bàn] 02 Miếng Lót Giày…","""https://cf.shopee.sg/file/vn-1…","""02 miếng lót giày độn đế tăng …",0.778861
4992244030,"""Men Shoes""","""Hai Miếng Lót Đệm Giày Mềm Mã …","""https://cf.shopee.sg/file/bfe9…","""hai miếng lót đệm giày mềm mã …","[-0.091309, 0.057617, … 0.002396]",68604,28951461671,"""Men Shoes""","""Miếng Lót Giày Độn Đế Giày Tăn…","""https://cf.shopee.sg/file/vn-1…","""miếng lót giày độn đế giày tăn…",0.777886
