In [3]:
from pymilvus import MilvusClient, DataType
from dotenv import load_dotenv
import os

# Load variables from a .env file (if present) into the process environment
# so the connection settings below can be read without hardcoding secrets.
load_dotenv()

# Quickstart reference: https://milvus.io/docs/quickstart.md
milvus_uri = os.getenv("MILVUS_URI")
milvus_user = os.getenv("MILVUS_USER")
milvus_password = os.getenv("MILVUS_PWD")

# Each setting is None when its environment variable is not defined.
client = MilvusClient(uri=milvus_uri, user=milvus_user, password=milvus_password)

In [4]:
from scipy.sparse import csr_matrix

# Non-zero entries of a 3x3 matrix, written as (row, column, value) triplets.
entries = [
    (0, 0, 1),
    (0, 2, 2),
    (1, 2, 3),
    (2, 0, 4),
    (2, 1, 5),
    (2, 2, 6),
]

# Unzip the triplets into the parallel coordinate lists that csr_matrix expects.
row, col, data = (list(axis) for axis in zip(*entries))
sparse_matrix = csr_matrix((data, (row, col)), shape=(3, 3))

# A single row of the matrix (shape 1x3) serves as one sparse vector.
sparse_vector = sparse_matrix.getrow(0)

In [6]:
# create schema
# auto_id=True asks Milvus to generate the primary key, so inserted rows
# only need to carry the vector field.
schema = client.create_schema(
    auto_id=True,
    # Fixed spelling: this was `enbale_dynamic_fields`, which is not an option
    # pymilvus recognises, so dynamic fields were not actually being enabled.
    enable_dynamic_field=True,
)

# Primary key: string ids of up to 100 characters.
schema.add_field(
    field_name="pk",
    datatype=DataType.VARCHAR,
    is_primary=True,
    max_length=100,
)

# The sparse embedding itself; sparse vector fields take no `dim` argument here.
schema.add_field(
    field_name="sparse_vector",
    datatype=DataType.SPARSE_FLOAT_VECTOR,
)

# create index params
# metric_type: "IP" is used here; "BM25" is the alternative.
# inverted_index_algo: "DAAT_MAXSCORE" is used here; the alternatives are
# "DAAT_WAND" and "TAAT_NAIVE" (DAAT = Document at a time, TAAT = Term at a time).
sparse_index_settings = {
    "field_name": "sparse_vector",
    "index_name": "sparse_vector_index",
    "index_type": "SPARSE_INVERTED_INDEX",
    "metric_type": "IP",
    "params": {"inverted_index_algo": "DAAT_MAXSCORE"},
}
index_params = client.prepare_index_params()
index_params.add_index(**sparse_index_settings)

# create collection, supplying the schema together with the index definition
client.create_collection(
    collection_name="my_col_for_sparse_vector", schema=schema, index_params=index_params
)

# insert data
# Each embedding maps a dimension index to its non-zero weight.
raw_embeddings = [
    {1: 0.5, 100: 0.3, 500: 0.8},
    {10: 0.1, 200: 0.7, 1000: 0.9},
]

# Wrap every embedding in a row dict keyed by the vector field name.
sparse_vectors = [{"sparse_vector": embedding} for embedding in raw_embeddings]

# Left as the final bare expression so the notebook displays the insert result.
client.insert(collection_name="my_col_for_sparse_vector", data=sparse_vectors)

{'insert_count': 2, 'ids': ['458312171707658159', '458312171707658160'], 'cost': 1}

In [10]:
# perform similarity search
# A single sparse query, given as {dimension index: weight}.
query_vector = [{1: 0.2, 50: 0.4, 1000: 0.7}]

# NOTE(review): per the Milvus docs, drop_ratio_search is the share of the
# smallest query values ignored during search - confirm for the server version in use.
search_params = {"params": {"drop_ratio_search": 0.2}}

search_request = {
    "collection_name": "my_col_for_sparse_vector",
    "data": query_vector,
    "search_params": search_params,
    "limit": 3,  # return at most three hits
    "output_fields": ["pk"],
}
res = client.search(**search_request)

print(res)

data: [[{'pk': '458312171707658160', 'distance': 0.629931628704071, 'entity': {'pk': '458312171707658160'}}, {'pk': '458312171707658159', 'distance': 0.10000000149011612, 'entity': {'pk': '458312171707658159'}}]],{'cost': 6}
