In [1]:
from pathlib import Path
import polars as pl
from datasets import Dataset, concatenate_datasets, load_from_disk
import numpy as np
from autofaiss import build_index
from src.item_matching.build_index.model import Model
from core_pro.ultilities import make_dir

## 1) Data 

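Prepare two datasets: Database and Query. In this demo both are read from the same CSV file (sorted by `item_id`, first 10,000 rows), with columns prefixed `db_` and `q_` respectively.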

In [2]:
# Location of the source CSV (relative to the user's home directory).
path = Path.home() / 'Downloads/item_match'
path_db = path / 'ls fss plus - Nguyen Truong Son.csv'

# Columns kept from the CSV; each gets a 'db_' or 'q_' prefix below.
col = ['item_id', 'item_name']

# Read and sort the file once; both frames below are derived from it,
# instead of parsing and sorting the same CSV twice.
df_sorted = pl.read_csv(path_db).sort('item_id')

# Database side: the items that will be embedded and indexed.
df_db = (
    df_sorted
    .select([pl.col(c).name.prefix('db_') for c in col])
    .head(10_000)
)

# Query side: same rows as the database, only the column prefix differs.
df_q = (
    df_sorted
    .select([pl.col(c).name.prefix('q_') for c in col])
    .head(10_000)
)
df_db.head()

## 2) Embeddings

Use datasets and tfidf to transform texts to vectors

In [None]:
# Get the text model from the project's Model helper; the embedding cell passes it to `pp_dense` via fn_kwargs.
text_model = Model().get_text_model()

In [None]:
# Arguments forwarded to `Model().pp_dense` on every batch.
fn_kwargs = {'col': 'db_item_name', 'model': text_model}

# Convert the polars frame to a Hugging Face dataset and embed it in batches of 512.
dataset = Dataset.from_pandas(df_db.to_pandas()).map(
    Model().pp_dense,
    batched=True,
    batch_size=512,
    fn_kwargs=fn_kwargs,
)
# Return 'dense_embed' as numpy while still exposing the remaining columns.
dataset.set_format(type='numpy', columns=['dense_embed'], output_all_columns=True)

# Persist to disk: the raw array is read by the indexing step,
# the dataset is reloaded later and attached to the index.
path_tmp_array = Path('tmp/array')
path_tmp_ds = Path('tmp/ds')
for folder in (path_tmp_ds, path_tmp_array):
    make_dir(folder)

embed_file = path_tmp_array / 'array.npy'
np.save(embed_file, dataset['dense_embed'])

ds_folder = path_tmp_ds / 'ds'
dataset.save_to_disk(ds_folder)

## 3) Indexing

Build index to search items

In [None]:
# Folder that receives the FAISS index file and its metadata JSON.
path_index = Path('tmp/index')

# Build the index from the folder holding the saved embedding array (`path_tmp_array`).
# Plain string literals are used for the file names: the previous f-strings had no placeholders.
build_index(
    str(path_tmp_array),
    index_path=str(path_index / 'ip.index'),
    index_infos_path=str(path_index / 'index.json'),
    save_on_disk=True,
    metric_type='ip',  # inner-product similarity
    verbose=30,  # NOTE(review): presumably a logging level (30 == WARNING) — confirm against autofaiss docs
)

Load index into datasets

In [None]:
# Reload every dataset saved under `path_tmp_ds` and stack them into one.
# Only directories are considered: `load_from_disk` expects a dataset folder,
# so stray files (e.g. OS metadata such as .DS_Store) are skipped rather than crashing the cell.
dataset_db = concatenate_datasets([
    load_from_disk(str(f))
    for f in sorted(path_tmp_ds.glob('*'))
    if f.is_dir()
])

# Attach the FAISS index built in the indexing step to the 'dense_embed' column.
dataset_db.load_faiss_index('dense_embed', path_index / 'ip.index')

## 4) Retrieve

Batch search top-k from datasets

In [None]:
# Query vectors: the database embeddings themselves are searched against the index.
query_vectors = np.asarray(dataset_db['dense_embed'])
score, result = dataset_db.get_nearest_examples_batch('dense_embed', query_vectors, k=5)

# One row per query; the 'score' cell holds that query's scores as a list.
dict_ = {'score': list(map(list, score))}
df_score = pl.DataFrame(dict_)

# Retrieved examples, without the embedding column.
df_result = pl.DataFrame(result).drop('dense_embed')

## 5) Post process

In [None]:
# Put query rows, retrieved rows and scores side by side (one row per query).
df_joined = pl.concat([df_q, df_result, df_score], how='horizontal')

# Columns to explode: every column whose name contains 'db', plus the scores.
col_explode = [*(name for name in df_joined.columns if 'db' in name), 'score']

# Explode those columns together to get the final match table.
df_match = df_joined.explode(col_explode)

In [None]:
# Display the exploded match table produced by the post-processing cell.
df_match