In [1]:
import os
from pathlib import Path
import rich
import numpy as np
from dotenv import load_dotenv
from tqdm import tqdm
dotenv_path = Path('.env')

load_dotenv(dotenv_path=dotenv_path)

CLIP_PORT = os.environ.get("CLIP_PORT", None)
MILVUS_PORT = os.environ.get("MILVUS_PORT", None)

assert CLIP_PORT is not None, "CLIP_PORT is not set"
assert MILVUS_PORT is not None, "MILVUS_PORT is not set"

from pysearch.milvus import Milvus2Processor as MilvusProcessor

In [2]:
config = {
    # Global config
    "HOST": "0.0.0.0",
    "PORT": MILVUS_PORT,
    "INDEX": "vbs24vector",
    "RETURN_SIZE": 10,
    "CACHE_DIR": ".cache/",
    # Milvus config
    "DIMENSION": 768,
}

In [3]:
proc = MilvusProcessor(config, autoload_collection=False)
rich.print(proc.info())


Create connection...


In [4]:
proc.kill('vbs24vector')

In [5]:
root = Path('/home/lsc/2024/vbs24/datahub/')

In [6]:
!head -n 10 {root}

head: error reading '/home/lsc/2024/vbs24/datahub': Is a directory


In [7]:
with open(root / "filenames/V3C_image_names.txt") as f:
    lines = f.readlines()
    lines = [line.strip() for line in lines]

using_ids = set(lines)
using_ids = [Path(x).stem for x in using_ids]
len(using_ids)

2508110

In [8]:
data = np.load(root / "embedding_features/V3C_L14_336_features_128.pkl", allow_pickle=True)
# data = data.item()
len(data)

2508110

In [9]:
# print(list(data.items())[0])
# 'shot00001_1_RKF.png' : []

In [10]:
# Path('shot00001_1_RKF.png').stem
# 20190101_103717_000

In [11]:
data = {Path(k).stem: v for k, v in tqdm(data.items())} # remove extension, e.g. .jpg. From 000000.jpg to 000000

  0%|          | 0/2508110 [00:00<?, ?it/s]

100%|██████████| 2508110/2508110 [00:06<00:00, 372748.96it/s]


In [12]:
# data = {k: v for k, v in tqdm(data.items()) if k in using_ids}

In [13]:
features = np.array([x[1] for x in  data.items()])
ids = np.array([x[0] for x in data.items()])

In [14]:
# example of an id and a feature vector 
print(ids[0])
print(features[0].shape)

shot00001_1_RKF
(768,)


In [15]:
assert features.shape[0] == len(ids), "vectors and ids must have the same length"
print(len(ids))

2508110


In [16]:
proc.index_list_document(features, ids)

(insert count: 78, delete count: 0, upsert count: 0, timestamp: 452570097410375682, success count: 78, err count: 0): 100%|██████████| 257/257 [06:03<00:00,  1.41s/it]    


[Pysearch] Function update_list_document elapsed time: 0:06:03.419476


In [17]:
query = np.ones((1, 768)) * 7
results = proc.search(query, top_k=3)
print(results)

[Pysearch] Function search elapsed time: 0:00:01.240800
(['shot10862_141_RKF', 'shot07511_1_RKF', 'shot07511_72_RKF'], [37576.9921875, 37579.0078125, 37581.796875])


In [18]:
results = proc.search(query, top_k=10, filter=['shot07511_1_RKF', 'shot00770_24_RKF'])
results

[Pysearch] Function search elapsed time: 0:00:00.451108


(['shot07511_1_RKF', 'shot00770_24_RKF'], [37579.0078125, 37581.04296875])