In [None]:
!pip install faiss-cpu

In [1]:
import os
import glob
import faiss
import numpy as np
from tqdm import tqdm

# Clip (open_ai)

In [None]:
feature_shape = 512
features_dir = './CLIP_features'

index = faiss.IndexFlatIP(feature_shape)

for data_part in tqdm(sorted(os.listdir(features_dir))):
    for feature_path in tqdm(sorted(glob.glob(os.path.join(features_dir, data_part) +'/*.npy'))):
        feats = np.load(feature_path)
        for feat in feats:
            feat = feat.astype(np.float32).reshape(1,-1)
            index.add(feat)

faiss.write_index(index, f"faiss_clip.bin")

# Clip (open_clip)

In [5]:
feature_shape = 768
features_dir = './CLIPv2_features'

index = faiss.IndexFlatIP(feature_shape)

for data_part in tqdm(sorted(os.listdir(features_dir))):
    for feature_path in tqdm(sorted(glob.glob(os.path.join(features_dir, data_part) +'/*.npy'))):
        feats = np.load(feature_path)
        for feat in feats:
            feat = feat.astype(np.float32).reshape(1,-1)
            index.add(feat)

faiss.write_index(index, f"faiss_clipv2_cosine.bin")

In [None]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Connect to Milvus
connections.connect("default", host="localhost", port="19530")

# Define the schema for the collection
fields = [
    # FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=feature_shape)
]
schema = CollectionSchema(fields, "CLIPv2 feature vectors")

# Create the collection
collection = Collection("clip_v2_collection", schema)

# Load the data and insert into Milvus
for data_part in tqdm(sorted(os.listdir(features_dir))):
    for feature_path in tqdm(sorted(glob.glob(os.path.join(features_dir, data_part) +'/*.npy'))):
        feats = np.load(feature_path)
        feats = feats.astype(np.float32).tolist()
        # ids = [i for i in range(len(feats))]
        insert_result = collection.insert(feats)
        # Retrieve the automatically generated IDs
        generated_ids = insert_result.primary_keys
        # print("Generated IDs:", generated_ids)

# Flush to make sure data is persisted
collection.flush()

# 4.1. Set up the index parameters
index_params =  {
    # field_name="vector",
    "metric_type":"COSINE",
    "index_type":"IVF_FLAT",
    "params" : { "nlist": 128 }
}

# 4.2. Create the index on the "embedding" field
collection.create_index("embedding", index_params)

# Load the collection into memory
collection.load()



In [2]:
import faiss
import numpy as np
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Load the FAISS index from the binary file
index = faiss.read_index("/mnt/data/MLOps/VN_Multi_User_Video_Search/dict/faiss_clip.bin")

# Extract vectors and IDs from the FAISS index
vectors = index.reconstruct_n(0, index.ntotal)
ids = np.arange(index.ntotal)

# Validate the data
assert vectors.shape[0] == ids.shape[0], "Number of vectors and IDs must match"
assert vectors.dtype == np.float32, "Vectors must be of type float32"
assert ids.dtype == np.int64, "IDs must be of type int64"

print(vectors.shape[1])
# Connect to Milvus
connections.connect("default", host="localhost", port="19530")

# Drop the existing collection
collection_name = "clip"
existing_collection = Collection(collection_name)
existing_collection.drop()

# Define the schema for the collection
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=vectors.shape[1])
]
schema = CollectionSchema(fields, collection_name)

# Create a collection
collection = Collection(collection_name, schema)

# Insert vectors into the collection
data = [
    ids.tolist(),  # id field
    vectors.tolist()  # vector field
]


# Insert data in batches with progress tracking
batch_size = 100
num_vectors = vectors.shape[0]
for i in tqdm(range(0, num_vectors, batch_size), desc="Inserting data"):
    batch_vectors = vectors[i:i + batch_size].tolist()
    batch_ids = ids[i:i + batch_size].tolist()
    data_batch = [
        batch_ids,  # id field
        batch_vectors  # vector field
    ]
    collection.insert(data_batch)


# collection.insert(data)

# Create an index on the vector field
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "COSINE",
    "params": {"nlist": 128}
}
collection.create_index("vector", index_params)

# Load the collection into memory
collection.load()

# Perform a basic search to verify
query_vectors = np.random.random([1, vectors.shape[1]]).tolist()
search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}}
results = collection.search(query_vectors, "vector", search_params, limit=3)

# Print the search results
for result in results:
    print(result)

768


Inserting data: 100%|██████████| 17135/17135 [09:24<00:00, 30.34it/s]


['id: 1519346, distance: 0.12350457906723022, entity: {}', 'id: 1496630, distance: 0.11723673343658447, entity: {}', 'id: 1519347, distance: 0.11685427278280258, entity: {}']


In [4]:
from pymilvus import Collection,connections
import numpy as np
# Connect to Milvus
connections.connect("default", host="localhost", port="19530")
# Drop the existing collection
collection_name = "clipv2"
# Create a collection
collection = Collection(collection_name)
# Load the collection into memory
collection.load()
# Perform a basic search to verify
query_vectors = np.random.random([1, 768]).tolist()
search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}}
results = collection.search(query_vectors, "vector", search_params, limit=3)

# Print the search results
for result in results:
    print(result)

['id: 1009146, distance: 0.0900886133313179, entity: {}', 'id: 938028, distance: 0.08985694497823715, entity: {}', 'id: 1009147, distance: 0.08466795086860657, entity: {}']
