In [1]:
from pathlib import Path
import polars as pl
from datasets import Dataset, concatenate_datasets, load_from_disk
import numpy as np
from autofaiss import build_index
from src.item_matching.build_index.func import tfidf
from src.item_matching.build_index.model import Model

## 1) Data 

Prepare 2 datasets: Database and Query

In [2]:
path = Path('/home/kevin/Downloads/yang')
path_db = '/home/kevin/Downloads/yang/fss (query)/fss_itemid_Beauty.csv'

col = ['item_id', 'item_name']
df_db = (
    pl.read_csv(path_db)
    .sort('item_id')
    .select(pl.col(c).name.prefix('db_') for c in col)
    .head(10_000)
)

df_q = (
    pl.read_csv(path_db)
    .sort('item_id')
    .select(pl.col(c).name.prefix('q_') for c in col)
    .head(10_000)
)
df_db.head()

db_item_id,db_item_name
i64,str
137996,"""Sữa rửa mặt tá…"
168273,"""BB CREAM GARNI…"
168274,"""BB Cream Garni…"
168278,"""BỘ KEM DƯỠNG D…"
168279,"""NIVEA VITAL SO…"


Can skip this step if not using tfidf

In [3]:
items = df_db['db_item_name'].unique().to_list()
items[:3]

['Serum Trẻ Hoá Da Dành Cho Da Dầu - Face Rejuvenation Serum For Oily Skin 15ml',
 'Kẹp mi Shu uemura',
 'Phấn Tạo Khối Wet N Wild Megaglo Contouring']

## 2) Embeddings

Use datasets and tfidf to transform texts to vectors

In [4]:
vectorizer = tfidf(items, dim=512)

In [5]:
dataset = Dataset.from_pandas(df_db.to_pandas())
fn_kwargs = {'col': 'db_item_name', 'vectorizer': vectorizer}
dataset = dataset.map(Model().pp_sparse_tfidf, batched=True, batch_size=512, fn_kwargs=fn_kwargs)
dataset.set_format(type='numpy', columns=['embeddings'], output_all_columns=True)

# save to disk
path_tmp_array = Path('tmp/array')
path_tmp_ds = Path('tmp/ds')
np.save(path_tmp_array / 'array.npy', dataset['embeddings'])
dataset.save_to_disk(path_tmp_ds / 'ds')

[32m10:58:06[0m | [1mINFO[0m | [36m__init__[0m | [1m[Model] Run on: cuda[0m


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

## 3) Indexing

Build index to search items

In [6]:
path_index = Path('tmp/index')
build_index(
    str(path_tmp_array),
    index_path=str(path_index / f'ip.index'),
    index_infos_path=str(path_index / f'index.json'),
    save_on_disk=True,
    metric_type='ip',
    verbose=30,
)

100%|██████████| 1/1 [00:00<00:00, 29746.84it/s]
100%|██████████| 1/1 [00:00<00:00,  3.94it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


(<faiss.swigfaiss_avx2.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x7f9165549590> >,
 {'index_key': 'HNSW32',
  'index_param': 'efSearch=137',
  'index_path': 'tmp/index/ip.index',
  'size in bytes': 23197858,
  'avg_search_speed_ms': 0.38646493599344467,
  '99p_search_speed_ms': 15.412297680441043,
  'reconstruction error %': 0.0,
  'nb vectors': 10000,
  'vectors dimension': 512,
  'compression ratio': 0.8828401311879743})

Load index into datasets

In [7]:
dataset_db = concatenate_datasets([
    load_from_disk(str(f)) for f in sorted(path_tmp_ds.glob('*'))
])

# add index
dataset_db.load_faiss_index('embeddings', path_index / f'ip.index')

## 4) Retrieve

Batch search top-k from datasets

In [8]:
score, result = dataset_db.get_nearest_examples_batch(
    'embeddings',
    np.asarray(dataset_db['embeddings']),
    k=5
)

dict_ = {'score': [list(i) for i in score]}
df_score = pl.DataFrame(dict_)
df_result = pl.DataFrame(result).drop(['embeddings'])

## 5) Post process

In [9]:
df_match = pl.concat([df_q, df_result, df_score], how='horizontal')
col_explode = [i for i in df_match.columns if 'db' in i] + ['score']
df_match = df_match.explode(col_explode)

In [10]:
df_match

q_item_id,q_item_name,db_item_id,db_item_name,score
i64,str,i64,str,f32
137996,"""Sữa rửa mặt tá…",137996,"""Sữa rửa mặt tá…",1.0
137996,"""Sữa rửa mặt tá…",2990626,"""Sữa rửa mặt tr…",0.873776
137996,"""Sữa rửa mặt tá…",59721325,"""Sữa rửa mặt tr…",0.873776
137996,"""Sữa rửa mặt tá…",4398267,"""Sữa rửa mặt ch…",0.858371
137996,"""Sữa rửa mặt tá…",11125426,"""Sữa rửa mặt tá…",0.846729
…,…,…,…,…
63665777,"""Xà bông dầu dừ…",63665777,"""Xà bông dầu dừ…",1.0
63665777,"""Xà bông dầu dừ…",63659109,"""Xà bông dầu dừ…",0.919509
63665777,"""Xà bông dầu dừ…",63662419,"""Xà bông dầu dừ…",0.919509
63665777,"""Xà bông dầu dừ…",63662980,"""Xà bông dầu dừ…",0.919509
