# Embeddings playground

## Reload dependencies

In [3]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules (e.g. the local src.utils helpers) are re-imported automatically
# before each cell runs. %reload_ext is safe to re-run in a live kernel.
%reload_ext autoreload
%autoreload 2

## Init the imports and global vars

In [None]:
# Import the dependencies and setup the environment
import csv
import hashlib
import importlib
import json
import os
import time

from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

import sys
sys.path.insert(0, "..")
from src.utils.Utilities import *
from src.utils import io_utils
from src.utils.MilvusUtil import *
from src.utils import MilvusUtil

# Debug switch read by the insert cell: when True, a seeded random vector
# stands in for a real embedding call.
DEBUG = False

# Output format strings: a section banner and the search-latency report line.
fmt = "\n=== {:30} ===\n"
search_latency_fmt = "search latency = {:.4f}s"
# Port passed to initialize_server() in the "Connect to Milvus" cell.
milvus_port = 19530

# Name of the Milvus collection this notebook (re)creates, fills and searches.
collection_name = "character_info"
# File holding the id -> content-hash table used to detect new/changed entities.
# NOTE(review): get_path_from_project_root presumably resolves the path against
# the repository root — confirm in src.utils.Utilities.
entity_delta_table_path = get_path_from_project_root("storage/entity_hashtable.txt")
# Alternative input file, kept commented out for quick switching:
# entity_raw_data_path = get_path_from_project_root("entities/testing.json")
# JSON file with the raw entities read by the "Load data" cell.
entity_raw_data_path = get_path_from_project_root("entities/poc1/pat-brain.json")

## Options

In [5]:
# Embedding model handed to get_embedding() for both indexing and querying.
# `dim` is whatever get_dimensions_of_model() returns for that model; it is used
# as the dimension of the collection's FLOAT_VECTOR field.
embedding_model = text_embedding_3_large
dim = get_dimensions_of_model(embedding_model)

## Connect to Milvus

Add a new connection alias `default` for the Milvus server at `localhost:19530`.

First, check whether a server is already running. If not, start one.

In [7]:
# Initialize the Milvus client/server on `milvus_port`. Detecting an already
# running server (and connecting vs. starting one) happens inside
# initialize_server().
# NOTE(review): restart_server is passed through as the second argument —
# presumably True forces a restart of a running server; confirm in MilvusUtil.
restart_server = False
initialize_server(milvus_port, restart_server)

Checking for running processes listening on port 19530
Killing zombie process 68778
Zombie process 68778 is still alive
Starting Milvus server
Milvus server initialized on port 19530



## Create/recreate the table
1. Drop the collection if it already exists
2. Create the collection with the following schema

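|   |field name  |  field type   |other attributes                              |  field description                              |
|---|:----------:|:-------------:|:--------------------------------------------:|:-----------------------------------------------:|
|1  |    "id"    |    VARCHAR    |is_primary=True, auto_id=False, max_length=100|                  "id field"                     |
|2  |  "content" |    VARCHAR    |max_length=10000                              |              "textual content"                  |
|3  |   "from"   |    VARCHAR    |max_length=10000                              |        value of the entity's `from` key         |
|4  |    "to"    |ARRAY(VARCHAR) |max_capacity=1000, max_length=10000           |values of the entity's `to` key, used for filtering|
|5  |"embeddings"|  FloatVector  |dim = dimension of the selected embedding model|     float vector embedding of the content      |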

3. Create index
4. Load collection
5. Reset the delta table


In [8]:
# Rebuild the collection from scratch: drop any existing copy, define the
# schema, index the vector field, load the collection, and clear the delta table.

# Remove a previous copy of the collection, if there is one
has = utility.has_collection(collection_name)
if not has:
    print(f"Collection {collection_name} does not exist in Milvus. Creating it now.")
else:
    print(f"Collection {collection_name} found. Dropping it now.")
    utility.drop_collection(collection_name)

# Schema: string primary key, text content, "from"/"to" fields, and the vector
fields = [
    FieldSchema(
        name="id",
        dtype=DataType.VARCHAR,
        is_primary=True,
        auto_id=False,
        max_length=100,
    ),
    FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=10000),
    FieldSchema(name="from", dtype=DataType.VARCHAR, max_length=10000),
    FieldSchema(
        name="to",
        dtype=DataType.ARRAY,
        element_type=DataType.VARCHAR,
        max_capacity=1000,
        max_length=10000,
    ),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]

schema = CollectionSchema(fields, description="schema for vector embeddings")
character_info = Collection(name=collection_name, schema=schema, consistency_level="Strong")
print(f"Collection {collection_name} created.")

# Index the vector field (IVF_FLAT with cosine similarity)
index = dict(
    index_type="IVF_FLAT",
    metric_type="COSINE",
    params=dict(nlist=128),
)
character_info.create_index(field_name="embeddings", index_params=index)
print("Index created.")

# Load the collection; the progress message is printed without a newline so
# that "Done." lands on the same line
print("Loading collection... ", end="")
character_info.load()
print("Done.")

# A fresh collection holds no entities, so start again from an empty delta table
persist_hashtable_to_file(dict(), entity_delta_table_path)

print("Complete!")

Collection character_info found. Dropping it now.
Collection character_info created.
Index created.
Loading collection... Done.
Complete!


## Load data

In [None]:
# Load the raw entities from the JSON file configured in `entity_raw_data_path`.
# The insert cell iterates over `data` and reads the "content", "summary",
# "from" and "to" keys of each item.
data = io_utils.load_json_custom(entity_raw_data_path)

## Insert data

In [None]:
# Sync the raw entities into Milvus. An entity is embedded and written only when
# its id is missing from the delta hashtable (new) or its content fingerprint
# differs from the stored one (updated); unchanged entities are skipped.


def _content_fingerprint(text):
    """Return a signed 64-bit fingerprint of `text` that is stable across runs.

    The built-in hash() is salted per interpreter process for str values, so
    its results cannot be compared with hashes persisted by an earlier kernel;
    a SHA-256 prefix is used instead.
    """
    digest = hashlib.sha256(text.encode("utf-8")).digest()
    return int.from_bytes(digest[:8], "big", signed=True)


def _embed(text):
    """Embed `text`; when DEBUG is set, return a seeded random vector instead."""
    if DEBUG:
        return np.random.default_rng(seed=19530).random(dim)
    return get_embedding(text, embedding_model, dim)


# Batches filled by add_to_entities(); one inner list per value it is given
# (id, content, from, to, embedding).
new_entities = [[], [], [], [], []]
updated_entities = [[], [], [], [], []]

entity_delta_table = io_utils.load_hashtable_from_file(entity_delta_table_path, content_type=int)
# Snapshot taken once, before any mutation, so that a failed write rolls the
# in-memory table back to exactly what was loaded from disk.
entity_delta_table_backup = entity_delta_table.copy()

for i, d in enumerate(data):
    content = d["content"]
    summary = d["summary"]
    from_field = d.get("from", "")
    to_field = d.get("to", [])
    if not content:
        print(f"Skipping entity {i+1} as it has no content")
        continue
    entity_id = summary  # the summary doubles as the primary key

    content_hash = _content_fingerprint(content)
    found_content_hash = entity_delta_table.get(entity_id)

    if found_content_hash is None:
        # Unknown id: queue for insertion
        print(f"Entity \"{summary}\" does not exist in collection. Inserting now...")
        add_to_entities(new_entities, entity_id, content, from_field, to_field, _embed(content))
    elif found_content_hash != content_hash:
        # Known id whose content changed: queue for upsert
        print(f"Entity \"{summary}\" content is different. Updating now...")
        add_to_entities(updated_entities, entity_id, content, from_field, to_field, _embed(content))
    else:
        # Unchanged: nothing to embed or write
        continue
    entity_delta_table[entity_id] = content_hash

print()

# Insert new entities
if len(new_entities[0]) > 0:
    print(f"Inserting {len(new_entities[0])} new entities into Milvus")
    try:
        insert_result = character_info.insert(new_entities)
    except Exception:
        print(f"Failed to insert {len(new_entities[0])} new entities")
        entity_delta_table = entity_delta_table_backup
        raise
else:
    print("No new entities to insert")

# Upsert entities whose content changed
if len(updated_entities[0]) > 0:
    print(f"Updating {len(updated_entities[0])} entities in Milvus")
    try:
        update_result = character_info.upsert(updated_entities)
    except Exception:
        print(f"Failed to upsert {len(updated_entities[0])} new entities")
        entity_delta_table = entity_delta_table_backup
        raise
else:
    print("No existing entities to update")

# Persist the hashtable to file (only reached when both writes succeeded)
persist_hashtable_to_file(entity_delta_table, entity_delta_table_path)

print(f"Number of entities in {collection_name} collection after: {character_info.num_entities}")

Skipping entity 1 as it has no summary
Entity "Tom is the genius technical cofounder of Virtual Tech Solutions" does not exist in collection. Inserting now...
Entity "Tom's reluctance with Virtual Tech Solutions going into a b2b direction" does not exist in collection. Inserting now...
Entity "Tom's relationship with Synaptix" does not exist in collection. Inserting now...
Entity "Tom's depression and poor relationships" does not exist in collection. Inserting now...
Entity "Tom's best friend is his dog, Baxter" does not exist in collection. Inserting now...
Entity "Tom is working on his Matrix tool secretly" does not exist in collection. Inserting now...
Entity "Tom is stuck in a bugged version of his VR world" does not exist in collection. Inserting now...
Entity "Tom's past relationship with Bill" does not exist in collection. Inserting now...
Entity "Tom's current relationship with Bill" does not exist in collection. Inserting now...
Entity "Tom's relationship with Wendy" does not 

## Search, query, and hybrid search

Set the `search_query` variable in the cell below to the text you want to search for, then run the cell.

In [12]:
# Embed the query, run a filtered vector search against the collection, print
# the hits with the search latency, and save the hits to a CSV file.

# Alternative example queries:
# search_query = "You are pathetic and weak and will never be a big strong man"
# search_query = "Should I go into business with Tom?"
search_query = "Can I depend on Tom?"

vectors_to_search = [get_embedding(search_query, embedding_model, dim)]
search_params = {
    "metric_type": "COSINE",
    "params": {"nprobe": 10},
}

# Time only the search call; perf_counter() is the monotonic clock meant for
# measuring short intervals
start_time = time.perf_counter()
result = character_info.search(vectors_to_search, "embeddings", search_params, limit=10, output_fields=["id"], expr="ARRAY_CONTAINS(to, 'Tom')")
end_time = time.perf_counter()

for hits in result:
    for hit in hits:
        print(f"Hit: {hit.id}, similarity: {hit.distance}")
print(search_latency_fmt.format(end_time - start_time))

# Name the output file after the query, keeping only filename-safe characters
search_query_as_filename = "".join(c for c in search_query if c.isalnum() or c in " _-")
results_path = f"results/{search_query_as_filename}.csv"

# One row per hit: similarity first, then the hit id
results_table = [[hit.distance, hit.id] for hits in result for hit in hits]

# Write through the csv module so ids containing commas or quotes are quoted
# correctly, and create the output folder if it does not exist yet
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, "w", newline="", encoding="utf-8") as f:
    csv.writer(f, lineterminator="\n").writerows(results_table)

print(f"Results written to \"{results_path}\"")

Hit: Baxter's relationship with Tom, similarity: 0.41780126094818115
Hit: Wendy's relationship with Tom', similarity: 0.40880414843559265
Hit: Clark's past relationship with Tom, similarity: 0.3928787112236023
Hit: Karen's early relationship with Tom, similarity: 0.3814072012901306
Hit: Bill is jealous of Tom's technical genius, similarity: 0.37535765767097473
Hit: Beau's relationship with Tom, similarity: 0.36429229378700256
Hit: Bill is resentful of Tom's focus, similarity: 0.35804322361946106
Hit: Tom's depression and poor relationships, similarity: 0.3511824309825897
Hit: Clark's current relationship with Tom, similarity: 0.34763574600219727
Hit: Tom's reluctance with Virtual Tech Solutions going into a b2b direction, similarity: 0.3356180489063263
search latency = 0.2852s
Results written to "results/Can I depend on Tom.csv"
