In [1]:
import numpy as np

# Toy fixture: five HPO terms, each with a 3-dimensional embedding.
# HP<i> maps to [i/10, (i+1)/10, (i+2)/10], i.e. HP1 -> [0.1, 0.2, 0.3] ... HP5 -> [0.5, 0.6, 0.7].
embeddings_dict = {
    f"HP{i}": np.array([i / 10, (i + 1) / 10, (i + 2) / 10])
    for i in range(1, 6)
}
# A single disease annotated with all five toy HPO terms.
disease_to_hpo = {"Disease A": [f"HP{i}" for i in range(1, 6)]}
def calculate_average_embedding(hpo_list, embeddings_dict):
    """Return the element-wise mean of the embeddings of the given HPO terms.

    Args:
        hpo_list: Iterable of HPO term identifiers.
        embeddings_dict: Mapping from HPO term identifier to its embedding vector.

    Returns:
        The mean vector over all terms of ``hpo_list`` that are present in
        ``embeddings_dict``. Terms without an embedding are ignored. If no
        term has an embedding, an empty array is returned.
    """
    matched = []
    for hp in hpo_list:
        if hp in embeddings_dict:
            matched.append(embeddings_dict[hp])
    if not matched:
        return np.array([])
    return np.mean(matched, axis=0)

# Average the toy embeddings of every HPO term annotated to Disease A.
disease_a_embedding = calculate_average_embedding(
    hpo_list=disease_to_hpo["Disease A"],
    embeddings_dict=embeddings_dict,
)


In [2]:
# Display the averaged vector for Disease A (mean of the HP1..HP5 toy embeddings).
disease_a_embedding

array([0.3, 0.4, 0.5])

In [3]:
from pheval_exomiser.prepare.core.chromadb_manager import ChromaDBManager
# from pheval_exomiser.prepare.utils.similarity_measures import SimilarityMeasures
# Manager object constructed with default arguments; the cells below use it to
# create and fetch collections.
manager = ChromaDBManager()

In [4]:
# Sanity check: show the manager's repr to confirm the instance was constructed.
manager

<pheval_exomiser.prepare.core.chromadb_manager.ChromaDBManager at 0x11c026150>

In [5]:
# Create the "test" collection through the manager and fetch a handle to it.
manager.create_collection("test")
diseaseAvgEmbedings = manager.get_collection("test")

# Store one averaged embedding per disease, keyed by the disease name.
for disease, hps in disease_to_hpo.items():
    average_embedding = calculate_average_embedding(hps, embeddings_dict)
    if average_embedding.size == 0:
        # calculate_average_embedding returns an empty array when none of the
        # disease's HPO terms has an embedding; a zero-length vector is not a
        # usable record, so skip the disease instead of upserting it.
        print(f"Skipping {disease}: no embeddings found for its HPO terms")
        continue
    diseaseAvgEmbedings.upsert(
        ids=[disease],
        embeddings=[average_embedding.tolist()],
        metadatas=[{"type": "disease"}],
    )

In [6]:
# Read back what was stored. Only embeddings are requested via `include`,
# so the metadatas field of the result is shown as None.
diseaseAvgEmbedings.get(include=['embeddings'])

{'ids': ['Disease A'],
 'embeddings': [[0.3, 0.4, 0.5]],
 'metadatas': None,
 'documents': None,
 'uris': None,
 'data': None}

In [8]:
# Query the "test" collection with Disease A's own averaged embedding.
# Up to 10 neighbours are requested although only one record is stored,
# so the library warns and lowers n_results to 1.
query_results = diseaseAvgEmbedings.query(
    query_embeddings=[disease_a_embedding.tolist()],
    n_results=10,
    include=["embeddings", "distances"],
)

# A single query embedding was sent, so take the first entry of each result
# field, falling back to an empty list when the field is missing or empty.
if 'ids' in query_results and query_results['ids']:
    disease_ids = query_results['ids'][0]
else:
    disease_ids = []

if 'distances' in query_results and query_results['distances']:
    distances = query_results['distances'][0]
else:
    distances = []

# Pair each disease id with its distance, smallest distance first.
sorted_results = sorted(zip(disease_ids, distances), key=lambda hit: hit[1])
print(distances)
print(disease_ids)
sorted_results

Number of requested results 10 is greater than number of elements in index 1, updating n_results = 1


[2.220446049250313e-16]
['Disease A']


[('Disease A', 2.220446049250313e-16)]

In [21]:
import numpy as np

# Step 1: Create new simulated embeddings with higher dimensionality (1536).
# The draws come from a seeded generator so that re-running the notebook
# reproduces the same vectors.
EMBEDDING_DIM = 1536
EMBEDDING_MEAN = -0.039941002130508423
# NOTE(review): the standard deviation equals |mean| -- confirm this is intended.
EMBEDDING_STD = 0.039941002130508423
EMBEDDING_SEED = 42
embedding_rng = np.random.default_rng(EMBEDDING_SEED)
new_embeddings_dict = {
    f"HP{i}": embedding_rng.normal(EMBEDDING_MEAN, EMBEDDING_STD, EMBEDDING_DIM)
    for i in range(1, 11)
}

# Same averaging helper as defined earlier in the notebook, re-declared in this cell.
def calculate_average_embedding(hpo_list, embeddings_dict):
    """Average the embeddings of the HPO terms that have one.

    Terms of ``hpo_list`` that are absent from ``embeddings_dict`` are
    skipped. Returns the element-wise mean of the remaining vectors, or an
    empty array when no term has an embedding.
    """
    found = [embeddings_dict[hp] for hp in hpo_list if hp in embeddings_dict]
    if len(found) == 0:
        return np.array([])
    return np.mean(found, axis=0)

# Step 2: Average the new high-dimensional embeddings for Disease A.
# Disease A is annotated with HP1..HP5 only, so HP6..HP10 do not contribute.
disease_a_new_embedding = calculate_average_embedding(
    hpo_list=disease_to_hpo["Disease A"],
    embeddings_dict=new_embeddings_dict,
)

In [23]:
# Open a separate collection ("crazy") for the high-dimensional experiment.
# NOTE(review): an earlier variant of this experiment drew the simulated
# embeddings from np.random.normal(-0.00877, 0.00001, 1536). The upsert and
# query steps that were drafted here are carried out in the next cell.
manager.create_collection("crazy")
newTEST = manager.get_collection("crazy")


In [24]:
# Step 3: Store Disease A's high-dimensional average embedding in the collection.
newTEST.upsert(
    ids=["Disease A"],
    embeddings=[disease_a_new_embedding.tolist()],
    metadatas=[{"type": "disease"}],
)

# Step 4: Query the collection with the very same embedding.
query_results = newTEST.query(
    query_embeddings=[disease_a_new_embedding.tolist()],
    n_results=10,
    include=["embeddings", "distances"],
)

# Extract the hits of the single query, falling back to empty lists when a
# result field is missing or empty.
if 'ids' in query_results and query_results['ids']:
    disease_ids = query_results['ids'][0]
else:
    disease_ids = []

if 'distances' in query_results and query_results['distances']:
    distances = query_results['distances'][0]
else:
    distances = []

# Pair each disease id with its distance, smallest distance first.
sorted_results = sorted(zip(disease_ids, distances), key=lambda hit: hit[1])

# Step 5: Print out the distances and check the results.
print(distances)
print(disease_ids)
print(sorted_results)


Number of requested results 10 is greater than number of elements in index 1, updating n_results = 1


[0.0]
['Disease A']
[('Disease A', 0.0)]


In [25]:
# Inspect the simulated embeddings; NumPy abbreviates each long array in the repr.
new_embeddings_dict

{'HP1': array([-0.03581619, -0.07619514, -0.05492479, ..., -0.06591951,
        -0.05394123, -0.07380856]),
 'HP2': array([ 0.00451391,  0.01096005, -0.00496753, ..., -0.05810581,
        -0.10208682, -0.08620999]),
 'HP3': array([-0.00180608, -0.0770883 , -0.04278196, ..., -0.02331939,
        -0.01675649, -0.01305129]),
 'HP4': array([ 0.0034397 ,  0.05602453, -0.07040899, ..., -0.04364077,
        -0.00717418, -0.04011242]),
 'HP5': array([-0.07470257,  0.01484858,  0.00946041, ..., -0.04499468,
        -0.04954425, -0.14558055]),
 'HP6': array([-0.02546018, -0.04406923, -0.07830516, ..., -0.05680108,
        -0.04194895, -0.00078835]),
 'HP7': array([-0.09715197,  0.00175014, -0.05199694, ..., -0.07017845,
        -0.0287933 , -0.12851831]),
 'HP8': array([ 0.06817379, -0.00313828, -0.00917981, ..., -0.14316451,
         0.00530304, -0.02747261]),
 'HP9': array([-0.07479182, -0.04127378, -0.02561885, ..., -0.00586313,
        -0.01688929, -0.05656104]),
 'HP10': array([ 0.00738873,