In [34]:
from pheval_exomiser.prepare.core.chromadb_manager import ChromaDBManager
from pheval_exomiser.prepare.utils.similarity_measures import SimilarityMeasures
manager = ChromaDBManager()

In [35]:
# manager.ont_hp
ont = manager.get_collection("ont_hp")

In [36]:

import json
from typing import Dict
from chromadb.types import Collection


def create_hpo_id_to_embedding(collection: Collection) -> Dict:
    hpo_id_to_data = {}
    results = collection.get(include=["metadatas", "embeddings"])
    for metadata, embedding in zip(results.get("metadatas", []), results.get("embeddings", []), strict=False):
        metadata_json = json.loads(metadata["_json"])
        hpo_id = metadata_json.get("original_id")
        if hpo_id:
            hpo_id_to_data[hpo_id] = {"embeddings": embedding}  # #{'HP:0005872': [1,2,3, ...]}
    return hpo_id_to_data

# In-memory lookup {hpo_id: {"embeddings": vector}} built from the `ont` collection.
cachedDict = create_hpo_id_to_embedding(ont)

manager.create_collection("HpEmbeddingsL2")
hpToEmbedding = manager.get_collection("HpEmbeddingsL2")

# Upsert in chunks instead of issuing one call per HPO term: the stored records
# are the same, but the number of round-trips to the database drops sharply.
# NOTE(review): 500 is assumed to be below the backend's maximum batch size — confirm.
UPSERT_BATCH_SIZE = 500
hp_ids = list(cachedDict)
for start in range(0, len(hp_ids), UPSERT_BATCH_SIZE):
    batch_ids = hp_ids[start:start + UPSERT_BATCH_SIZE]
    hpToEmbedding.upsert(
        ids=batch_ids,
        embeddings=[cachedDict[hp_id]['embeddings'] for hp_id in batch_ids],
        metadatas=[{"type": "HP"} for _ in batch_ids],
    )

In [37]:
from pheval_exomiser.prepare.core.OMIMHPOExtractor import OMIMHPOExtractor
from pheval_exomiser.prepare.core.data_processor import DataProcessor

data_processor = DataProcessor(db_manager=manager)
# NOTE(review): machine-specific absolute path; this cell only runs where the
# file exists at exactly this location.
file_path = "/Users/carlo/PycharmProjects/chroma_db_playground/phenotype.hpoa"
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(file_path, 'r', encoding="utf-8") as file:
    data = file.read()

# NOTE(review): the class itself (not an instance) is bound here, so
# extract_omim_hpo_mappings is presumably a static/class method — confirm.
extractor = OMIMHPOExtractor
omimToHPdict = extractor.extract_omim_hpo_mappings(data)

manager.create_collection("avgDiseaseEmbeddingsL2")
diseaseAvgEmbedings = manager.get_collection("avgDiseaseEmbeddingsL2")
# Store one averaged embedding per disease id, computed from the embeddings of
# its HPO terms looked up in cachedDict.
for disease, hps in omimToHPdict.items():
    average_embedding = data_processor.calculate_average_embedding(hps, cachedDict)
    diseaseAvgEmbedings.upsert(ids=[disease], embeddings=[average_embedding.tolist()],
                               metadatas=[{"type": "disease"}])

12468


In [7]:
# Display the HPO term list that omimToHPdict holds for OMIM:251280.
omimToHPdict["OMIM:251280"]

['HP:0001250',
 'HP:0002521',
 'HP:0002266',
 'HP:0001622',
 'HP:0012469',
 'HP:0032792',
 'HP:0008936',
 'HP:0000007',
 'HP:0000253',
 'HP:0000505',
 'HP:0002123',
 'HP:0001511',
 'HP:0011451',
 'HP:0003593',
 'HP:0001347',
 'HP:0030674',
 'HP:0002510',
 'HP:0002187',
 'HP:0001263',
 'HP:0001332']

In [27]:
# Display the HPO term list that omimToHPdict holds for OMIM:619340.
omimToHPdict['OMIM:619340']


['HP:0000006',
 'HP:0001522',
 'HP:0001789',
 'HP:0002643',
 'HP:0032792',
 'HP:0002187',
 'HP:0011097',
 'HP:0001518',
 'HP:0011451',
 'HP:0200134',
 'HP:0010851']

In [38]:
import numpy as np

# Hard-coded HPO term list; identical to the list displayed for
# omimToHPdict["OMIM:251280"].
disease = [
    'HP:0001250', 'HP:0002521', 'HP:0002266', 'HP:0001622', 'HP:0012469',
    'HP:0032792', 'HP:0008936', 'HP:0000007', 'HP:0000253', 'HP:0000505',
    'HP:0002123', 'HP:0001511', 'HP:0011451', 'HP:0003593', 'HP:0001347',
    'HP:0030674', 'HP:0002510', 'HP:0002187', 'HP:0001263', 'HP:0001332',
]

# Hard-coded HPO term list; identical to the list displayed for
# omimToHPdict['OMIM:619340'].
disease2 = [
    'HP:0000006', 'HP:0001522', 'HP:0001789', 'HP:0002643',
    'HP:0032792', 'HP:0002187', 'HP:0011097', 'HP:0001518',
    'HP:0011451', 'HP:0200134', 'HP:0010851',
]


# np.set_printoptions(precision=20)

def calculate_average_embedding(hpo_list, embeddings_dict):
    """Return the element-wise mean of the embeddings of the given HPO ids.

    Args:
        hpo_list: Iterable of HPO ids; ids absent from ``embeddings_dict`` are ignored.
        embeddings_dict: Mapping ``{hpo_id: {"embeddings": vector}}``.

    Returns:
        A float64 numpy array holding the mean over the matched vectors, or an
        empty array when none of the ids are present in ``embeddings_dict``.
    """
    vectors = []
    for term in hpo_list:
        if term in embeddings_dict:
            vectors.append(embeddings_dict[term]['embeddings'])

    if not vectors:
        return np.array([])
    return np.mean(vectors, axis=0, dtype=np.float64)



In [13]:
# Display the cached entry ({"embeddings": [...]}) stored for HP:0001263.
cachedDict['HP:0001263']

{'embeddings': [-0.033657487481832504,
  -0.008162522688508034,
  0.02748393639922142,
  -0.02208530530333519,
  -0.02391928993165493,
  0.01651877537369728,
  -0.013716137036681175,
  0.014284414239227772,
  -0.012870179489254951,
  -0.018998529762029648,
  0.012353564612567425,
  0.014155260287225246,
  -0.0018339843954890966,
  0.00641248794272542,
  -0.0046785976737737656,
  0.010351680219173431,
  0.03670552000403404,
  -0.01177237182855606,
  0.004898159299045801,
  -0.020703360438346863,
  -0.016079653054475784,
  0.011378453113138676,
  -0.01663501374423504,
  -0.03639554977416992,
  -0.01788780651986599,
  0.001861429656855762,
  -0.0022844085469841957,
  -0.02626989036798477,
  -0.0205096285790205,
  0.004846497438848019,
  0.0016354104736819863,
  -0.012618329375982285,
  0.01699664443731308,
  -0.02528832107782364,
  -0.004239474423229694,
  -0.018791882321238518,
  -0.007135749328881502,
  -0.02455214411020279,
  0.008046284317970276,
  -0.004216872621327639,
  0.023518912

In [39]:
# Average the embeddings of the disease2 HPO terms that are present in
# cachedDict, then print the resulting vector.
avg_embedding = calculate_average_embedding(disease2, cachedDict)
print(avg_embedding)

[-0.012174168305302208   0.01349125131541355    0.009389386427673426  ...
 -0.0032374684762378984 -0.011558321027471473  -0.020406166641888292 ]


In [40]:

# Ask the disease collection for the 10 stored embeddings nearest to avg_embedding.
query_results = diseaseAvgEmbedings.query(
    query_embeddings=[avg_embedding.tolist()],
    n_results=10,
    include=["embeddings", "distances"]
)

# Only one query vector was sent, so take the first (and only) result row;
# fall back to an empty list when a field is absent or empty.
disease_ids = []
if 'ids' in query_results and query_results['ids']:
    disease_ids = query_results['ids'][0]

distances = []
if 'distances' in query_results and query_results['distances']:
    distances = query_results['distances'][0]

# Pair each id with its distance and order the pairs by ascending distance.
id_distance_pairs = zip(disease_ids, distances)
sorted_results = sorted(id_distance_pairs, key=lambda pair: pair[1])
print(distances)
print(disease_ids)
sorted_results

[1.1920928955078125e-06, 0.012330412864685059, 0.013801753520965576, 0.014431536197662354, 0.014481723308563232, 0.01506197452545166, 0.01537412405014038, 0.01538097858428955, 0.015410244464874268, 0.015469074249267578]
['OMIM:619340', 'OMIM:251280', 'OMIM:266100', 'OMIM:620033', 'OMIM:617065', 'OMIM:617105', 'OMIM:612164', 'OMIM:609304', 'OMIM:619881', 'OMIM:300607']


[('OMIM:619340', 1.1920928955078125e-06),
 ('OMIM:251280', 0.012330412864685059),
 ('OMIM:266100', 0.013801753520965576),
 ('OMIM:620033', 0.014431536197662354),
 ('OMIM:617065', 0.014481723308563232),
 ('OMIM:617105', 0.01506197452545166),
 ('OMIM:612164', 0.01537412405014038),
 ('OMIM:609304', 0.01538097858428955),
 ('OMIM:619881', 0.015410244464874268),
 ('OMIM:300607', 0.015469074249267578)]