In [1]:
from pathlib import Path
import polars as pl
import duckdb
from datasets import Dataset, concatenate_datasets, load_from_disk
import numpy as np
from autofaiss import build_index
from src.item_matching.build_index.func import load_images
from src.item_matching.build_index.model import Model

## 1) Data 

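Prepare two datasets, **Database** (`db_*` columns) and **Query** (`q_*` columns). In this notebook both are built from the same parquet snapshot, so every query item also exists in the database.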

In [2]:
# Source data: one parquet snapshot of items plus a folder of downloaded images.
# NOTE(review): absolute, user-specific path — move to a configurable data dir before sharing.
path = Path('/home/kevin/Downloads/cb')
path_db = path / 'cb_2024-03-07.parquet'
path_img = path / 'img_cb_2024-03-07'
N_ITEMS = 10_000  # number of rows kept from the ordered query result

# Keep every column and derive `image_url` from the first entry of the
# comma-separated `images` field.
query = f"""
select *
,concat('http://f.shopee.vn/file/', UNNEST(array_slice(string_split(images, ','), 1, 1))) image_url
from parquet_scan('{str(path_db)}')
order by item_id, images
"""


def _prefix_and_attach_images(df_items: pl.DataFrame, df_img: pl.DataFrame, prefix: str) -> pl.DataFrame:
    """Prefix every item column with ``{prefix}_`` and attach the image table.

    Args:
        df_items: Raw query result; must still contain the ``images`` column,
            which is dropped here.
        df_img: Frame returned by ``load_images`` for the same prefix. It is
            joined on ``{prefix}_image_url`` and must provide a boolean
            ``{prefix}_exists`` column.
        prefix: Side label, ``'db'`` or ``'q'``.

    Returns:
        The prefixed frame restricted to rows whose ``{prefix}_exists`` is
        true. Rows without a match in ``df_img`` get a null flag from the left
        join and are therefore dropped by the filter as well.
    """
    return (
        df_items.drop(['images'])
        .select(pl.all().name.prefix(f'{prefix}_'))
        .join(df_img, on=f'{prefix}_image_url', how='left')
        .filter(pl.col(f'{prefix}_exists'))
    )


# Both sides use the same query, so run it once and share the (immutable) result.
df_items = duckdb.sql(query).pl().head(N_ITEMS)

df_img_db = load_images(path_img, 'db', 'image_url')
df_db = _prefix_and_attach_images(df_items, df_img_db, 'db')

df_img_q = load_images(path_img, 'q', 'image_url')
df_q = _prefix_and_attach_images(df_items, df_img_q, 'q')
df_q.head()

q_item_id,q_item_name,q_shop_id,q_level1_global_be_category,q_create_datetime,q_image_url,q_file_path,q_exists
i64,str,i64,str,date,str,str,bool
2998791564,"""🍎FREE SHIP🍎Lit…",619038499,"""Fashion Access…",2022-03-16,"""http://f.shope…","""/home/kevin/Do…",True
2999430969,"""Dụng cụ cắt ra…",619035621,"""Home & Living""",2022-05-26,"""http://f.shope…","""/home/kevin/Do…",True
2999568833,"""Bộ 50 Món Dụng…",779448044,"""Beauty""",2022-07-20,"""http://f.shope…","""/home/kevin/Do…",True
2999714346,"""[Hàng HOT] Giá…",619019560,"""Automobiles""",2022-09-27,"""http://f.shope…","""/home/kevin/Do…",True
2999787165,"""SUPERCUTE Túi …",881303265,"""Women Bags""",2022-11-18,"""http://f.shope…","""/home/kevin/Do…",True


## 2) Embeddings

Use Hugging Face `datasets` and a CLIP image model to turn each item image into an embedding vector.

In [3]:
# Load the CLIP image model together with its matching processor; both are
# passed to the embedding step below.
img_model, img_processor = Model().get_img(
    model_id='openai/clip-vit-base-patch32',
)

[32m16:38:32[0m | [1mINFO[0m | [36m__init__[0m | [1m[Model] Run on: cuda[0m
[32m16:38:36[0m | [1mINFO[0m | [36mget_img[0m | [1mImage model: openai/clip-vit-base-patch32[0m


In [4]:
# Convert the database frame to a Hugging Face Dataset and embed every image
# in batches. `pp_img` is expected to add the `img_embed` column used below
# (NOTE(review): confirm against src.item_matching.build_index.model).
dataset = Dataset.from_pandas(df_db.to_pandas())
fn_kwargs = {'col': 'db_file_path', 'processor': img_processor, 'model': img_model}
# NOTE(review): `Model()` is instantiated a second time here only to reach
# `pp_img`; the log shows it re-runs the device setup.
dataset = dataset.map(Model().pp_img, batched=True, batch_size=768, fn_kwargs=fn_kwargs)
# Return `img_embed` as numpy while still exposing the remaining columns.
dataset.set_format(type='numpy', columns=['img_embed'], output_all_columns=True)

# save to disk
path_tmp_array = Path('tmp/array')
path_tmp_ds = Path('tmp/ds')
# np.save does not create missing parent directories, so make sure the target
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
path_tmp_array.mkdir(parents=True, exist_ok=True)
np.save(path_tmp_array / 'array.npy', dataset['img_embed'])
dataset.save_to_disk(path_tmp_ds / 'ds')

[32m16:38:36[0m | [1mINFO[0m | [36m__init__[0m | [1m[Model] Run on: cuda[0m


Map:   0%|          | 0/10192 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10192 [00:00<?, ? examples/s]

## 3) Indexing

Build a FAISS index over the saved image embeddings so that items can be searched by similarity.

In [5]:
# Build an inner-product index from the embeddings folder written above and
# persist both the index file and its JSON description under tmp/index.
path_index = Path('tmp/index')
index_options = {
    'index_path': str(path_index / 'ip.index'),
    'index_infos_path': str(path_index / 'index.json'),
    'save_on_disk': True,
    'metric_type': 'ip',
    'verbose': 30,
}
# Kept as the last expression so the cell still displays the returned
# (index, infos) pair.
build_index(str(path_tmp_array), **index_options)

100%|██████████| 1/1 [00:00<00:00, 5356.71it/s]
100%|██████████| 1/1 [00:00<00:00,  5.54it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


(<faiss.swigfaiss_avx2.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x7f1268d06cd0> >,
 {'index_key': 'HNSW32',
  'index_param': 'efSearch=986',
  'index_path': 'tmp/index/ip.index',
  'size in bytes': 34080034,
  'avg_search_speed_ms': 6.132795656043631,
  '99p_search_speed_ms': 20.589535099952627,
  'reconstruction error %': 0.0,
  'nb vectors': 10192,
  'vectors dimension': 768,
  'compression ratio': 0.918714576399777})

Reload the saved dataset from disk and attach the FAISS index to its `img_embed` column.

In [6]:
# Every entry under tmp/ds is a dataset saved by `save_to_disk`; load them in
# sorted order and stitch them back into a single dataset.
saved_parts = sorted(path_tmp_ds.glob('*'))
dataset_db = concatenate_datasets([load_from_disk(str(part)) for part in saved_parts])

# Attach the on-disk index to the embedding column so it can be queried.
dataset_db.load_faiss_index('img_embed', path_index / 'ip.index')

## 4) Retrieve

Batch-search the top-k nearest database items for every query embedding.

In [7]:
TOP_K = 5  # number of nearest database items returned per query

# NOTE(review): the queries are the database's own embeddings — `df_q` is never
# embedded. That is only valid because `df_q` and `df_db` are built from the
# same rows; it also means the best hit of each query is the item itself.
# FAISS works on float32, C-contiguous matrices, so enforce that explicitly
# instead of relying on how the column happens to be stored.
query_vectors = np.ascontiguousarray(dataset_db['img_embed'], dtype=np.float32)

score, result = dataset_db.get_nearest_examples_batch(
    'img_embed',
    query_vectors,
    k=TOP_K,
)

# One row per query: `score_img` holds the list of TOP_K scores, and every
# column of `df_result` holds the list of TOP_K retrieved values.
df_score = pl.DataFrame({'score_img': [list(row) for row in score]})
df_result = pl.DataFrame(result).drop(['img_embed'])

## 5) Post process

In [8]:
# A horizontal concat aligns rows purely by position and pads shorter frames
# with nulls, so a height mismatch would silently pair queries with the wrong
# results. Fail loudly instead.
assert df_q.height == df_result.height == df_score.height, (
    f'row count mismatch: q={df_q.height}, result={df_result.height}, score={df_score.height}'
)
df_match = pl.concat([df_q, df_result, df_score], how='horizontal')

# Explode the list-valued retrieval columns together so each output row is one
# (query item, retrieved db item, score) triple. Select them by their `db_`
# prefix: a plain substring test would also pick up any query column whose
# name merely contains "db".
col_explode = [name for name in df_match.columns if name.startswith('db_')] + ['score_img']
df_match = df_match.explode(col_explode)

In [9]:
# Preview the exploded match table: one row per (query item, retrieved db item) pair with its score.
df_match

q_item_id,q_item_name,q_shop_id,q_level1_global_be_category,q_create_datetime,q_image_url,q_file_path,q_exists,db_item_id,db_item_name,db_shop_id,db_level1_global_be_category,db_create_datetime,db_image_url,db_file_path,db_exists,score_img
i64,str,i64,str,date,str,str,bool,i64,str,i64,str,datetime[μs],str,str,bool,f32
2998791564,"""🍎FREE SHIP🍎Lit…",619038499,"""Fashion Access…",2022-03-16,"""http://f.shope…","""/home/kevin/Do…",true,2998791564,"""🍎FREE SHIP🍎Lit…",619038499,"""Fashion Access…",2022-03-16 00:00:00,"""http://f.shope…","""/home/kevin/Do…",true,1.0
2998791564,"""🍎FREE SHIP🍎Lit…",619038499,"""Fashion Access…",2022-03-16,"""http://f.shope…","""/home/kevin/Do…",true,13793061194,"""[Hàng mới về] …",619038499,"""Beauty""",2022-05-27 00:00:00,"""http://f.shope…","""/home/kevin/Do…",true,0.674531
2998791564,"""🍎FREE SHIP🍎Lit…",619038499,"""Fashion Access…",2022-03-16,"""http://f.shope…","""/home/kevin/Do…",true,8599704681,"""Lithium067 2 c…",619038499,"""Stationery""",2022-09-26 00:00:00,"""http://f.shope…","""/home/kevin/Do…",true,0.580695
2998791564,"""🍎FREE SHIP🍎Lit…",619038499,"""Fashion Access…",2022-03-16,"""http://f.shope…","""/home/kevin/Do…",true,13194351208,"""Lithium067 Lit…",619038499,"""Beauty""",2022-06-21 00:00:00,"""http://f.shope…","""/home/kevin/Do…",true,0.569684
2998791564,"""🍎FREE SHIP🍎Lit…",619038499,"""Fashion Access…",2022-03-16,"""http://f.shope…","""/home/kevin/Do…",true,11596967860,"""Lithium067 Bệ …",619038499,"""Mom & Baby""",2022-07-04 00:00:00,"""http://f.shope…","""/home/kevin/Do…",true,0.568764
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
14298758417,"""WHYME Đồ chơi …",946581486,"""Mom & Baby""",2023-05-06,"""http://f.shope…","""/home/kevin/Do…",true,14298758417,"""WHYME Đồ chơi …",946581486,"""Mom & Baby""",2023-05-06 00:00:00,"""http://f.shope…","""/home/kevin/Do…",true,1.0
14298758417,"""WHYME Đồ chơi …",946581486,"""Mom & Baby""",2023-05-06,"""http://f.shope…","""/home/kevin/Do…",true,13799431108,"""WHYME Điện Dễ …",946581486,"""Baby & Kids Fa…",2023-05-06 00:00:00,"""http://f.shope…","""/home/kevin/Do…",true,0.675363
14298758417,"""WHYME Đồ chơi …",946581486,"""Mom & Baby""",2023-05-06,"""http://f.shope…","""/home/kevin/Do…",true,13599719843,"""[Hàng cao cấp]…",946581486,"""Mom & Baby""",2023-08-15 00:00:00,"""http://f.shope…","""/home/kevin/Do…",true,0.672375
14298758417,"""WHYME Đồ chơi …",946581486,"""Mom & Baby""",2023-05-06,"""http://f.shope…","""/home/kevin/Do…",true,14099225413,"""[Hàng cao cấp]…",946581486,"""Mom & Baby""",2023-08-02 00:00:00,"""http://f.shope…","""/home/kevin/Do…",true,0.671396


In [10]:
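# Optional export of the match table; left disabled so that running the notebook does not write a CSV.
# df_match.write_csv(path / 'match.csv')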