# Embeddings playground

## Init the imports and global vars

In [6]:
# Reload the dependencies
%reload_ext autoreload
%autoreload 2

# Import the dependencies and setup the environment
import time
import json
import importlib
from typing import Tuple

from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

import sys
sys.path.insert(0, "../..")
from src.utils.Utilities import *
from src.utils.MilvusUtil import *
from src.utils import MilvusUtil

# Notebook-wide settings. Everything below is a module-level global that the
# later cells read directly.
DEBUG = False

# Output format strings: a banner with a 30-character-wide title field, and the
# search-latency line printed by the search cell.
fmt = "\n=== {:30} ===\n"
search_latency_fmt = "search latency = {:.4f}s"
# Port passed to initialize_server() below.
milvus_port = 19530

# Name of the Milvus collection that is dropped/recreated and then searched.
collection_name = "character_info"
# lookup_cache = "storage/entity_hashtable.txt"
# entity_raw_data_path = get_path_from_project_root("entities/testing.json")
# entity_raw_data_path = "entities/raj_info.json"
# JSON file with the raw entity entries loaded by the data-loading cell.
entity_raw_data_path = "entities/raj_info2.json"

# Options
# NOTE(review): text_embedding_3_large and get_dimensions_of_model are not
# defined in this notebook; presumably they come from the star imports above.
embedding_model = text_embedding_3_large
# Vector dimension used for both the collection schema and get_embedding().
dim = get_dimensions_of_model(embedding_model)

# Initialize Milvus client (check if it is already running. Connect if it is and initialize if not)
restart_server = False
initialize_server(milvus_port, restart_server) # Checks if the server is running and starts it if not

Checking for running processes listening on port 19530
Found running process milvus on port 19530
Found running process milvus on port 19530
Connecting to established Milvus server
Connected to Milvus server



## Create/recreate the table
1. Drop collection if exists already
2. Create collection

|   |field name  |field type |other attributes              |  field description      |
|---|:----------:|:---------:|:----------------------------:|:-----------------------:|
|1  |    "id"    |   INT64   |is_primary=True, auto_id=False|        "id field"       |
|2  |"embeddings"|FloatVector| dim=dim (from embedding model)|"float vector of size dim"|

3. Create index
4. Load collection
5. Reset the lookup cache (the in-memory set of ids already inserted)


In [7]:
# Recreate the collection from scratch so every run of this cell starts with
# an empty, freshly indexed and loaded collection.

# Drop collection if it already exists
has = utility.has_collection(collection_name)
if has:
    print(f"Collection {collection_name} found. Dropping it now.")
    utility.drop_collection(collection_name)
else:
    print(f"Collection {collection_name} does not exist in Milvus. Creating it now.")

# Create collection: a caller-supplied INT64 primary key (auto_id=False) and
# one float-vector field sized to the embedding model's output dimension.
# max_length is intentionally not set on "id": it only applies to VARCHAR fields.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]

schema = CollectionSchema(fields, "schema for vector embeddings")
character_info = Collection(collection_name, schema, consistency_level="Strong")
print(f"Collection {collection_name} created.")

# Create index on the vector field. The metric here must match the
# "metric_type" used in the search parameters of the search cell.
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "COSINE",
    "params": {"nlist": 128},
}
character_info.create_index("embeddings", index)
print("Index created.")

# Load the collection (print without a newline so "Done." lands on the same line)
print("Loading collection... ", end="")
character_info.load()
print("Done.")

# Reset the lookup cache: the collection is empty again, so no ids are known.
# persist_hashtable_to_file({}, lookup_cache)
lookup_cache: set[int] = set()

print("Complete!")

Collection character_info found. Dropping it now.
Collection character_info created.
Index created.
Loading collection... Done.
Complete!


## Load data, convert, and insert data

In [8]:
# Load the raw data as a list of dictionaries from the json file
data_raw = load_json_custom(entity_raw_data_path)

# key_hashes keeps the ids in first-seen order; key_content_map maps each id
# to its (search key, content) pair so search hits can be resolved back to text.
key_hashes: list[int] = []
key_content_map: dict[int, Tuple[str, str]] = {}
base: list[str] = []

# Iterate through the raw data and convert it to the format expected by Milvus
for item in data_raw:
    # If it is a base entry, add it to base (to be included in every prompt)
    tags = item.get("tags", [])
    if "base" in tags:
        base.append(item.get("id", ""))
        continue

    content = item.get("content", "")

    # The content itself is used as an additional search key. A new list is
    # built so the entry loaded into data_raw is not mutated.
    keys = [*item.get("keys", []), content]
    for key in keys:
        # NOTE(review): hash() of a str is salted per interpreter process, so
        # these ids are only stable within one kernel session. That is fine
        # while the collection is recreated on every run; switch to a stable
        # digest before persisting ids across sessions.
        key_hash = hash(key)
        # key_content_map holds exactly the ids seen so far, so membership is
        # checked there (O(1)) instead of scanning the key_hashes list.
        if key_hash in key_content_map:
            # TODO it should be able to handle the same key for different contents eventually
            print(f"Duplicate key found: {key}. Skipping this entry.")
            continue

        key_hashes.append(key_hash)  # Add the key to the list of keys
        key_content_map[key_hash] = (key, content)  # dedupe the content using a hash

# Iterate over the data and insert into entity table
new_entities = [[], []]
updated_entities = [[], []]

# entity_delta_table = load_hashtable_from_file(lookup_cache, content_type=int)

for key_hash in key_hashes:
    if key_hash not in key_content_map:
        raise ValueError(f"Key hash '{key_hash}' not found in key_content_map. This should not happen.")
    key = key_content_map[key_hash][0]

    # Check if the key already exists in the collection
    # query_expr = f"id == {key_hash}"
    # found_key = character_info.query(expr=query_expr, output_fields=["id"])

    # Check if the entity already exists. If not, insert it.
    if key_hash in lookup_cache:
        print(f"Entity \"{key}\" already exists in collection. Skipping insertion.")
    else:
        print(f"Entity \"{key}\" does not exist in collection. Adding to insert queue...")
        start_time = time.time()
        embedding = get_embedding(key, embedding_model, dim)
        end_time = time.time()
        print(f"Embedding latency = {end_time - start_time:.4f}s")
        add_to_entities(new_entities, key_hash, embedding)

# Insert new entities
if new_entities[0]:
    print(f"Inserting {len(new_entities[0])} new entities into Milvus")
    try:
        insert_result = character_info.insert(new_entities)
    except Exception:
        print(f"Failed to insert {len(new_entities[0])} new entities")
        raise  # re-raise with the original traceback
    else:
        # Only remember the ids once the insert has actually succeeded.
        lookup_cache.update(new_entities[0])
else:
    print("No new entities to insert")

# TODO Delete stale keys (present in lookup_cache but not in the key_hashes)
# stale_keys = lookup_cache - set(key_hashes)

print(f"Number of entities in {collection_name} collection after: {character_info.num_entities}")

Entity "Objectifies women — sees them as individuals, but also as a way to impress. Has a harmless puppy-dog facade." does not exist in collection. Adding to insert queue...
Embedding latency = 0.4476s
Entity "Seeks situations where he can impress women. Likes to talk about them like a pick up artist." does not exist in collection. Adding to insert queue...
Embedding latency = 1.2450s
Entity "Into fashion and wearing expensive outfits tailored to occasions" does not exist in collection. Adding to insert queue...
Embedding latency = 0.5987s
Entity "Physically scrawny; avoids situations that might emasculate him or involve overexertion. Acts mildly prima donna about his body, but not flamboyantly." does not exist in collection. Adding to insert queue...
Embedding latency = 0.2808s
Entity "Judges and avoids anything stereotypically “lame” or nerdy." does not exist in collection. Adding to insert queue...
Embedding latency = 2.2219s
Entity "Enjoys gossiping about people" does not exist in 

## Search, query, and hybrid search

Update the `search_query` variable to change what is searched for.

In [None]:
# search_query = "You are pathetic and weak and will never be a big strong man"
# search_query = "Should I go into business with Tom?"

search_query = "Wanna go to yoga with me? Look for some spiritual validation?"
# search_query = "where can I meet women"

# Embed the query with the same model/dimension used for the stored keys.
vectors_to_search = [get_embedding(search_query, embedding_model, dim)]
search_params = {
    "metric_type": "COSINE",
    "params": {"nprobe": 10},
}

start_time = time.time()
result = character_info.search(vectors_to_search, "embeddings", search_params, limit=200, output_fields=["id"])
end_time = time.time()

# Group the hits by content: content -> {key: similarity}.
# With the COSINE metric a larger hit.distance means a closer match, so the
# value is used directly as the similarity score (no conversion needed).
similarity_by_content: dict[str, dict[str, float]] = {}
for query_hits in result:
    for hit in query_hits:
        key_hash = hit.id
        if key_hash not in key_content_map:
            print(f"Key hash {key_hash} not found in key_content_map. This should not happen.")
            continue
        key, content = key_content_map[key_hash]

        # print(f"Similarity: {hit.distance:.4f}, Key: {key}, Content: \"{content}\"")

        similarity_by_content.setdefault(content, {})[key] = hit.distance

# For each content group, sort the keys by similarity score in descending order.
ranked_by_content = {
    content: sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    for content, scores in similarity_by_content.items()
}
# Then sort the content groups by their highest similarity score, descending.
# Every group holds at least one (key, similarity) pair, so max() is safe.
content_groups: dict[str, list[tuple[str, float]]] = dict(
    sorted(
        ranked_by_content.items(),
        key=lambda item: max(score for _, score in item[1]),
        reverse=True,
    )
)

# Print the search latency
print(search_latency_fmt.format(end_time - start_time))

print(f"Search results for query: \"{search_query}\"")
for content, ranked_keys in content_groups.items():
    for _key, similarity in ranked_keys:
        # Show the raw score plus the same value as a percentage
        sim_perc_str = f"{similarity:.2%}"
        print(f"{similarity:.4f} ({sim_perc_str}): {content}")

    # print(f"\nContent: \"{content}\"")
    # for key, similarity in ranked_keys:
    #     print(f"  {similarity:.4f}: {key}")



search latency = 0.4828s
Search results for query: "Wanna go to yoga with me? Look for some spiritual validation?"
0.2034: Seeks situations where he can impress women. Likes to talk about them like a pick up artist.
0.2031: As a part of his creative spritit and desire to make a name for himself, he has a desire to start businesses that seem cool and innovative. Usually in trendy spaces.
0.1928: He is currently invested in aesthetic and lifestly choices that would make him seem cool. He is also invested in a local night club.
0.1915: He is not into traditionally masculine things like sports, nor is athletic. He would try to pivot at the topic into something related but more trendy or cool.
0.1840: Physically scrawny; avoids situations that might emasculate him or involve overexertion. Acts mildly prima donna about his body, but not flamboyantly.
0.1798: Tries to cope with the letdown of his government job by trying to make it seem cool and impressive.
0.1786: Likes to network and make c