In [1]:
from pathlib import Path
import polars as pl
from datasets import Dataset, concatenate_datasets, load_from_disk
import numpy as np
from autofaiss import build_index
from src.item_matching.build_index.model import Model

## 1) Data 

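Prepare two datasets: Database and Query. In this demo both are built from the first 10,000 rows (sorted by `item_id`) of the same CSV, so every query item is expected to retrieve itself as its top match.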

In [2]:
# Location of the source CSV, resolved relative to the current user's home directory.
path = Path.home() / 'Downloads/yang'
path_db = path / 'fss_itemid_Home & Living.csv'

col = ['item_id', 'item_name']

# Read, sort and truncate the CSV once. Both the database and the query frame
# are derived from this single frame, so the file is not parsed twice and the
# two frames are guaranteed to be row-aligned.
df_items = (
    pl.read_csv(path_db)
    .sort('item_id')
    .select(col)
    .head(10_000)
)

# Database side: same rows, columns renamed with a `db_` prefix.
df_db = df_items.select([pl.col(c).name.prefix('db_') for c in col])

# Query side: same rows, columns renamed with a `q_` prefix.
df_q = df_items.select([pl.col(c).name.prefix('q_') for c in col])
df_db.head()

db_item_id,db_item_name
i64,str
120347,"""Hộp đựng bông …"
120348,"""Khay để son 28…"
171553,"""Mullet ren hoa…"
246058,"""Drap giường gi…"
247718,"""Bộ drap bông h…"


## 2) Embeddings

Use `datasets` and a dense text-embedding model to transform texts into vectors

In [4]:
# Instantiate the project's Model wrapper, then fetch the text model it exposes.
model_wrapper = Model()
text_model = model_wrapper.get_text_model()

[32m14:20:18[0m | [1mINFO[0m | [36m__init__[0m | [1m[Model] Run on: mps[0m


Fetching 23 files:   0%|          | 0/23 [00:00<?, ?it/s]

loading existing colbert_linear and sparse_linear---------


In [5]:
# Wrap the database frame in a Hugging Face Dataset and add a `dense_embed`
# column by mapping the model's `pp_dense` over batches of 512 rows.
dataset = Dataset.from_pandas(df_db.to_pandas())
fn_kwargs = {'col': 'db_item_name', 'model': text_model}
dataset = dataset.map(Model().pp_dense, batched=True, batch_size=512, fn_kwargs=fn_kwargs)
# Return `dense_embed` as numpy while still exposing the remaining columns.
dataset.set_format(type='numpy', columns=['dense_embed'], output_all_columns=True)

# save to disk
path_tmp_array = Path('tmp/array')
path_tmp_ds = Path('tmp/ds')
# np.save does not create missing parent directories, so make sure the output
# folders exist; otherwise a fresh checkout fails with FileNotFoundError.
path_tmp_array.mkdir(parents=True, exist_ok=True)
path_tmp_ds.mkdir(parents=True, exist_ok=True)
np.save(path_tmp_array / 'array.npy', dataset['dense_embed'])
dataset.save_to_disk(path_tmp_ds / 'ds')

[32m14:20:24[0m | [1mINFO[0m | [36m__init__[0m | [1m[Model] Run on: mps[0m


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]


Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:04<00:00,  4.80s/it][A

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:03<00:00,  3.90s/it][A

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:05<00:00,  5.24s/it][A

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:03<00:00,  3.83s/it][A

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:04<00:00,  4.22s/it][A

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:04<00:00,  4.55s/it][A

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:04<00:00,  4.33s/it][A

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

## 3) Indexing

Build index to search items

In [6]:
# Build an inner-product index from the embeddings directory and persist both
# the index file and its JSON description under `path_index`.
path_index = Path('tmp/index')
index_file = path_index / 'ip.index'
index_infos_file = path_index / 'index.json'
build_index(
    str(path_tmp_array),
    index_path=str(index_file),
    index_infos_path=str(index_infos_file),
    save_on_disk=True,
    metric_type='ip',
    verbose=30,
)

100%|██████████| 1/1 [00:00<00:00, 21290.88it/s]
100%|██████████| 1/1 [00:00<00:00,  2.03it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


(<faiss.swigfaiss.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x2aa473d80> >,
 {'index_key': 'HNSW32',
  'index_param': 'efSearch=3092',
  'index_path': 'tmp/index/ip.index',
  'size in bytes': 43677858,
  'avg_search_speed_ms': 10.272223917739677,
  '99p_search_speed_ms': 15.392375045339557,
  'reconstruction error %': 0.0,
  'nb vectors': 10000,
  'vectors dimension': 1024,
  'compression ratio': 0.9377749247685178})

Load index into datasets

In [7]:
# Reload every entry saved under `path_tmp_ds` (in sorted path order) and
# concatenate them into a single dataset.
saved_paths = sorted(path_tmp_ds.glob('*'))
loaded_parts = [load_from_disk(str(p)) for p in saved_paths]
dataset_db = concatenate_datasets(loaded_parts)

# Attach the on-disk FAISS index to the dataset under the name 'dense_embed'.
dataset_db.load_faiss_index('dense_embed', path_index / 'ip.index')

## 4) Retrieve

Batch search top-k from datasets

In [8]:
# Use the stored embeddings themselves as the query batch and fetch the
# 5 nearest examples per row from the 'dense_embed' index.
query_vectors = np.asarray(dataset_db['dense_embed'])
score, result = dataset_db.get_nearest_examples_batch('dense_embed', query_vectors, k=5)

# One list of scores per query row, stored in a single 'score' column.
dict_ = {'score': list(map(list, score))}
df_score = pl.DataFrame(dict_)

# Retrieved examples as a frame, without the raw embedding column.
df_result = pl.DataFrame(result)
df_result = df_result.drop(['dense_embed'])

## 5) Post process

In [9]:
# Place query rows, retrieved rows and scores side by side (one row per query).
df_wide = pl.concat([df_q, df_result, df_score], how='horizontal')

# Explode only the retrieved-side columns plus the scores. Matching on the
# `db_` prefix (rather than the substring 'db') keeps a query column whose
# name merely contains "db" from being passed to `explode`.
col_explode = [c for c in df_wide.columns if c.startswith('db_')] + ['score']

# One row per (query item, retrieved item) pair.
df_match = df_wide.explode(col_explode)

In [10]:
# Display the exploded match table: one row per (query item, retrieved database item) pair.
df_match

q_item_id,q_item_name,db_item_id,db_item_name,score
i64,str,i64,str,f32
120347,"""Hộp đựng bông …",120347,"""Hộp đựng bông …",0.999996
120347,"""Hộp đựng bông …",87320247,"""Hộp đựng bông …",0.83048
120347,"""Hộp đựng bông …",42204991,"""Túi đựng mỹ ph…",0.731331
120347,"""Hộp đựng bông …",52038487,"""Hộp đựng đồ tr…",0.688779
120347,"""Hộp đựng bông …",60057254,"""combo 2 giỏ đự…",0.685374
120348,"""Khay để son 28…",120348,"""Khay để son 28…",0.999821
120348,"""Khay để son 28…",77862423,"""khay đựng son …",0.780724
120348,"""Khay để son 28…",11854809,"""Khay 24 ô để s…",0.722672
120348,"""Khay để son 28…",87311846,"""Khay đựng son …",0.716589
120348,"""Khay để son 28…",33973471,"""Khay son 24 ô …",0.700865
