In [1]:
from pathlib import Path
import polars as pl
import duckdb
from datasets import Dataset, concatenate_datasets, load_from_disk
import numpy as np
from autofaiss import build_index
import sys
sys.path.extend([Path.home() / 'PycharmProjects/item_matching'])

# from src.item_matching.build_index.func_img import PipelineImage
from item_matching.model.model import Model
# from func import draw_images

## 1) Data 

Prepare 2 datasets: Database and Query

In [2]:
path = Path('/media/kevin/data_4t/item_match_datasets')
path_db = path / 'db_clean.parquet'

# db
query = f"""
select *
from parquet_scan('{str(path_db)}')
"""
df_db = (
    duckdb.sql(query).pl()
)

# q
df_q = df_db.clone()
df_q.columns = [f'q_{i.split('db_')[1]}' for i in df_db.columns]
df_q.head()

q_l0_category,q_model_id,q_item_name,q_model_name,q_model_price,q_image_show,q_item_url,q_image_url,q_item_name_clean,q_file_path,q_exists
str,i64,str,str,i64,str,str,str,str,str,bool
"""Lifestyle""",158663010693,"""Chăn Hè Cotton Đũi Choice Việt…","""Quả Táo""",102200,"""https://cf.shopee.sg/file/sg-1…","""https://shopee.vn/product/8511…","""https://cf.shopee.sg/file/sg-1…","""chăn hè cotton đũi choice việt…","""/media/kevin/data_4t/item_matc…",True
"""Fashion""",79755460523,"""Áo bra nữ Choice Việt Nam BRS0…","""Ghi""",16800,"""https://cf.shopee.sg/file/vn-1…","""https://shopee.vn/product/8511…","""https://cf.shopee.sg/file/vn-1…","""áo bra nữ choice việt nam brs0…","""/media/kevin/data_4t/item_matc…",True
"""Fashion""",256169922336,"""[COMBO 2] Quần gen nịt bụng dư…","""Đen,S""",42000,"""https://cf.shopee.sg/file/sg-1…","""https://shopee.vn/product/8511…","""https://cf.shopee.sg/file/sg-1…","""quần gen nịt bụng dưới choice …","""/media/kevin/data_4t/item_matc…",True
"""Fashion""",256169922336,"""[COMBO 2] Quần gen nịt bụng dư…","""Đen,S""",42000,"""https://cf.shopee.sg/file/sg-1…","""https://shopee.vn/product/8511…","""https://cf.shopee.sg/file/sg-1…","""quần gen nịt bụng dưới choice …","""/media/kevin/data_4t/item_matc…",True
"""Fashion""",256169922336,"""[COMBO 2] Quần gen nịt bụng dư…","""Đen,S""",42000,"""https://cf.shopee.sg/file/sg-1…","""https://shopee.vn/product/8511…","""https://cf.shopee.sg/file/sg-1…","""quần gen nịt bụng dưới choice …","""/media/kevin/data_4t/item_matc…",True


## 2) Embeddings

Use datasets and clip to transform images to vectors

In [3]:
model = Model()
model.get_img_model()

[1mINFO[0m | [36m__init__[0m | [1m[Model] Run on: cuda[0m


In [7]:
dataset = Dataset.from_polars(df_db)
dataset = dataset.map(
    model.process_image,
    batch_size=512,
    batched=True,
    fn_kwargs={'col': 'db_file_path'}
)

dataset.set_format(type='torch', columns=['image_embed'], output_all_columns=True)
dataset = dataset.map(model.pp_normalize, batched=True, fn_kwargs={'col': 'image_embed'})
dataset.set_format(type='numpy', columns=['image_embed'], output_all_columns=True)

# save to disk
path_tmp_array = Path('tmp/array')
path_tmp_ds = Path('tmp/ds')
np.save(path_tmp_array / 'array.npy', dataset['image_embed'])
dataset.save_to_disk(path_tmp_ds / 'ds')

Map:   0%|          | 0/145 [00:00<?, ? examples/s]

Map:   0%|          | 0/145 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/145 [00:00<?, ? examples/s]

## 3) Indexing

Build index to search items

In [8]:
path_index = Path('tmp/index')
build_index(
    str(path_tmp_array),
    index_path=str(path_index / f'ip.index'),
    index_infos_path=str(path_index / f'index.json'),
    save_on_disk=True,
    metric_type='ip',
    verbose=30,
)

100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2172.09it/s]
100%|█████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 172.64it/s]
  0%|                                                                              | 0/1 [00:00<?, ?it/s]


(<faiss.swigfaiss_avx2.IndexFlat; proxy of <Swig Object of type 'faiss::IndexFlat *' at 0x7c37e5fba670> >,
 {'index_key': 'Flat',
  'index_param': '',
  'index_path': 'tmp/index/ip.index',
  'size in bytes': 445485,
  'avg_search_speed_ms': 0.01423576567293342,
  '99p_search_speed_ms': 0.03772199779632523,
  'reconstruction error %': 0.0,
  'nb vectors': 145,
  'vectors dimension': 768,
  'compression ratio': 0.9998989864978619})

Load index into datasets

In [9]:
dataset_db = concatenate_datasets([
    load_from_disk(str(f)) for f in sorted(path_tmp_ds.glob('*'))
])

# add index
dataset_db.load_faiss_index('img_embed', path_index / f'ip.index')

## 4) Retrieve

Batch search top-k from datasets

In [10]:
score, result = dataset_db.get_nearest_examples_batch(
    'img_embed',
    np.asarray(dataset_db['img_embed']),
    k=5
)

dict_ = {'score_img': [list(i) for i in score]}
df_score = pl.DataFrame(dict_)
df_result = pl.DataFrame(result).drop(['img_embed'])

KeyError: "Column img_embed not in the dataset. Current columns in the dataset: ['db_l0_category', 'db_model_id', 'db_item_name', 'db_model_name', 'db_model_price', 'db_image_show', 'db_item_url', 'db_image_url', 'db_item_name_clean', 'db_file_path', 'db_exists', 'image_embed']"

## 5) Post process

In [8]:
df_match = pl.concat([df_q, df_result, df_score], how='horizontal')
col_explode = [i for i in df_match.columns if 'db' in i] + ['score_img']
df_match = df_match.explode(col_explode)

In [12]:
df_match.to_pandas()

In [17]:
draw_images(df_match, 2999787165)

In [16]:
draw_images(df_match, 3099789245)

In [14]:
draw_images(df_match, 2999838844)

In [15]:
draw_images(df_match, 3099458499)

In [11]:
# df_match.write_csv(path / 'match.csv')