# create schema

In [4]:
from pymilvus import MilvusClient, DataType, Function, FunctionType

base_url = "host.docker.internal"
URI = f"http://{base_url}:19530"
try:
    milvus_client = MilvusClient(
        uri=URI,
    )
    print(f"Milvus client initialized for endpoint: {URI}")
except Exception as e:
    print(f"Fatal Error: Could not initialize Milvus client: {e}")

Milvus client initialized for endpoint: http://host.docker.internal:19530


In [5]:
schema = MilvusClient.create_schema()

schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True, auto_id=True)
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=1000, enable_analyzer=True)
schema.add_field(field_name="sparse", datatype=DataType.SPARSE_FLOAT_VECTOR)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1000, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>}], 'enable_dynamic_field': False}

In [6]:
bm25_function = Function(
    name="text_bm25_emb", # Function name
    input_field_names=["text"], # Name of the VARCHAR field containing raw text data
    output_field_names=["sparse"], # Name of the SPARSE_FLOAT_VECTOR field reserved to store generated embeddings
    function_type=FunctionType.BM25, # Set to `BM25`
)

schema.add_function(bm25_function)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1000, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>, 'is_function_output': True}], 'enable_dynamic_field': False, 'functions': [{'name': 'text_bm25_emb', 'description': '', 'type': <FunctionType.BM25: 1>, 'input_field_names': ['text'], 'output_field_names': ['sparse'], 'params': {}}]}

# configure the index

In [7]:
index_params = MilvusClient.prepare_index_params()

index_params.add_index(
    field_name="sparse",

    index_type="SPARSE_INVERTED_INDEX",
    metric_type="BM25",
    params={
        "inverted_index_algo": "DAAT_MAXSCORE",
        "bm25_k1": 1.2,
        "bm25_b": 0.75
    }

)


# create collection

In [11]:
milvus_client.create_collection(
    collection_name='full_text_search_collection', 
    schema=schema, 
    index_params=index_params
)

# insert text data

In [None]:
milvus_client.insert('full_text_search_collection', [
    {'text': 'information retrieval is a field of study.'},
    {'text': 'information retrieval focuses on finding relevant information in large datasets.'},
    {'text': 'data mining and information retrieval overlap in research.'},
])

In [None]:
documents = [
    {'text': 'I had chocolate chip pancakes and scrambled eggs for breakfast this morning.'},
    {'text': 'The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'},
    {'text': 'Building an exciting new project with LangChain - come check it out!'},
    {'text': 'Robbers broke into the city bank and stole $1 million in cash.'},
    {'text': "Wow! That was an amazing movie. I can't wait to see it again."},
    {'text': 'Is the new iPhone worth the price? Read this review to find out.'},
    {'text': 'The top 10 soccer players in the world right now.'},
    {'text': 'LangGraph is the best framework for building stateful, agentic applications!'},
    {'text': 'The stock market is down 500 points today due to fears of a recession.'},
    {'text': 'I have a bad feeling I am going to get deleted :('}
]
milvus_client.insert('full_text_search_collection', documents)

{'insert_count': 10, 'ids': [458426901916294296, 458426901916294297, 458426901916294298, 458426901916294299, 458426901916294300, 458426901916294301, 458426901916294302, 458426901916294303, 458426901916294304, 458426901916294305], 'cost': 0}

# perform full text search

In [40]:
search_params = {
    'params': {'drop_ratio_search': 0.2},
}

milvus_client.search(
    collection_name='full_text_search_collection', 
    data=['whats the focus of information retrieval?'],
    anns_field='sparse',
    limit=3,
    search_params=search_params
)

data: [[{'id': 458426901916294292, 'distance': 3.793283462524414, 'entity': {}}, {'id': 458426901916294288, 'distance': 3.793283462524414, 'entity': {}}, {'id': 458426901916294289, 'distance': 2.3380560874938965, 'entity': {}}]]

In [41]:
search_params = {
    'params': {'drop_ratio_search': 0.3, "sort": True},
}

milvus_client.search(
    collection_name='full_text_search_collection', 
    data=['what is langgraph'],
    anns_field='sparse',
    limit=3,
    search_params=search_params
)

data: [[{'id': 458426901916294303, 'distance': 3.488731861114502, 'entity': {}}, {'id': 458426901916294288, 'distance': 1.1214488744735718, 'entity': {}}, {'id': 458426901916294292, 'distance': 1.1214488744735718, 'entity': {}}]]