In [33]:
import os
from pathlib import Path
import rich
import numpy as np
from dotenv import load_dotenv

# Locations: the .env file that holds the service ports, and the dataset root
# that every data path in this notebook is resolved against.
dotenv_path = Path("../../.env")
root = Path("/home/lsc/2024/")

# Populate os.environ from the .env file before the ports are read.
load_dotenv(dotenv_path=dotenv_path)

# os.environ.get() already yields None for a missing key, so both ports can be
# fetched in one pass and validated right after.
CLIP_PORT, MILVUS_PORT = (
    os.environ.get(name) for name in ("CLIP_PORT", "MILVUS_PORT")
)

# Fail early, with a clear message, if the .env file did not provide a port.
assert CLIP_PORT is not None, "CLIP_PORT is not set"
assert MILVUS_PORT is not None, "MILVUS_PORT is not set"

from pysearch.milvus import Milvus2Processor as MilvusProcessor

In [34]:
# Settings handed to the Milvus processor. Key order matches the original
# literal; PORT is the string read from the environment in the setup cell.
config = dict(
    # Global config
    HOST="0.0.0.0",
    PORT=MILVUS_PORT,
    INDEX="lsc24",
    RETURN_SIZE=10,
    CACHE_DIR=".cache/",
    # Milvus config
    DIMENSION=768,
)

In [38]:
# Build the Milvus processor from `config`. autoload_collection=False is passed
# explicitly (presumably so an existing collection is not loaded on connect —
# TODO confirm against pysearch).
proc = MilvusProcessor(config, autoload_collection=False)
# Pretty-print whatever status the processor reports through info().
rich.print(proc.info())

In [37]:
# Reset the index named in the config instead of repeating the literal name,
# so this cell cannot drift from the index the processor was configured with.
# NOTE(review): kill() presumably drops the whole collection — destructive;
# confirm before re-running against an index you want to keep.
proc.kill(config["INDEX"])

In [49]:
# Read the contracted image list: one image id per line.
with open(root / "datahub/metadata/image_list_contracted.txt") as f:
    # Strip surrounding whitespace and drop blank lines so an empty string can
    # never end up in the id set (e.g. from a trailing newline at end of file).
    lines = [stripped for stripped in map(str.strip, f) if stripped]

# Only use a subset of the ids: everything whose id starts with "2000" is left
# out. This cell deliberately does NOT assign `using_ids` any more — that name
# belongs to the full-list cell, and overwriting it here made its final value
# depend on which of the two cells happened to run last.
new_lines = [line for line in lines if not line.startswith("2000")]
using_ids1 = set(new_lines)

len(using_ids1)

296663

In [43]:
# Read the full image list: one image id per line.
with open(root / "datahub/metadata/raw/image_list_full.txt") as f:
    # Strip surrounding whitespace and drop blank lines so an empty string can
    # never end up in the id set (e.g. from a trailing newline at end of file).
    lines = [stripped for stripped in map(str.strip, f) if stripped]

# Only use a subset of the ids: everything whose id starts with "2000" is left
# out. The earlier unfiltered `using_ids = set(lines)` was a dead store that
# this assignment overwrote unconditionally, so it has been removed.
new_lines = [line for line in lines if not line.startswith("2000")]
using_ids = set(new_lines)
len(using_ids)

721823

In [18]:
# The .npy file stores a single pickled Python object; .item() unwraps it from
# the 0-d object array that np.load returns (later cells call data.items()).
# NOTE(security): allow_pickle=True executes pickle on load — only open
# embedding files that come from a trusted source.
data = np.load(
    root / "datahub/embeddings/lsc23_clip768_embeddings.npy",
    allow_pickle=True,
).item()
len(data)

725226

In [22]:
# print(list(data.items())[0])
# '201901/01/20190101_103717_000.jpg' : []

In [21]:
# Path('201901/01/20190101_103717_000.jpg').stem
# 20190101_103717_000

'20190101_103717_000'

In [50]:
# Re-key every embedding by the bare file stem (directory and extension
# removed, e.g. '201901/01/20190101_103717_000.jpg' -> '20190101_103717_000')
# and keep only the stems listed in `using_ids1`. Done in a single pass; the
# resulting dict has the same keys, values and order as re-keying first and
# filtering afterwards.
data = {
    stem: vector
    for stem, vector in ((Path(key).stem, value) for key, value in data.items())
    if stem in using_ids1
}

In [51]:
# Split the dict into two parallel arrays. keys() and values() iterate in the
# same (insertion) order, so ids[i] always belongs to features[i].
ids = np.array(list(data.keys()))
features = np.array(list(data.values()))

In [46]:
# Show one sample: the first id, then the shape of its feature vector,
# each on its own line.
print(ids[0], features[0].shape, sep="\n")

20190101_103717_000
(768,)


In [52]:
# Sanity check before indexing: there must be exactly one id per feature row.
assert len(ids) == features.shape[0], "vectors and ids must have the same length"
# Report how many items are about to be indexed.
print(len(ids))

296663


In [53]:
# Send every feature vector, paired with its id, to the index. The recorded
# progress bar shows the work going out in batches; batching itself happens
# inside pysearch. NOTE(review): confirm whether re-running this cell appends
# duplicates or replaces existing entries.
proc.index_list_document(features, ids)

(insert count: 215, delete count: 0, upsert count: 0, timestamp: 448493545498345476, success count: 215, err count: 0): 100%|██████████| 257/257 [00:28<00:00,  9.02it/s]  


Function update_list_document elapsed time: 0:00:28.505985


In [55]:
# Synthetic probe query: one row with every component set to 7.0. The width is
# taken from the configured embedding dimension instead of a repeated literal,
# so the probe always matches the dimension the index was configured with.
query = np.full((1, config["DIMENSION"]), 7.0)
# Fetch the three nearest neighbours. The recorded output shows the result as
# a pair: a list of ids and a list of matching distances.
results = proc.search(query, top_k=3)
print(results)

Function search elapsed time: 0:00:00.398008
(['20190421_105512_000', '20191009_064951_000', '20190416_090317_000'], [37583.96875, 37584.62890625, 37584.86328125])


In [56]:
# Regression check: the constant probe query must return exactly these ids, in
# this order, when the index holds the expected CLIP embeddings.
expected_top3 = [
    '20190421_105512_000',
    '20191009_064951_000',
    '20190416_090317_000',
]
assert results[0] == expected_top3, "Results are not as expected, index not belong to the CLIP embeddings"

In [57]:
# Same probe query, but restricted to an explicit allow-list of ids through the
# `filter` argument. The recorded output returns only the two listed ids even
# though top_k is 10.
results = proc.search(
    query,
    top_k=10,
    filter=["20190421_105512_000", "20190416_090317_000"],
)
results

Function search elapsed time: 0:00:00.270500


(['20190421_105512_000', '20190416_090317_000'], [37583.96875, 37584.86328125])