In [1]:
from pheval_exomiser.prepare.core.chromadb_manager import ChromaDBManager
from pheval_exomiser.prepare.utils.similarity_measures import SimilarityMeasures
from pheval_exomiser.prepare.core.OMIMHPOExtractor import OMIMHPOExtractor
from pheval_exomiser.prepare.core.data_processor import DataProcessor
manager = ChromaDBManager(similarity=SimilarityMeasures.COSINE)

In [2]:
manager.list_collections()

[Collection(name=avgDiseaseEmbeddings),
 Collection(name=DiseaseOrganEmbeddings),
 Collection(name=hpo),
 Collection(name=DiseaseNewAvgEmbeddingsNew),
 Collection(name=HpEmbeddings),
 Collection(name=DiseaseNewOrganEmbeddings),
 Collection(name=ont_hp),
 Collection(name=average)]

In [10]:
ont = manager.get_collection("ont_hp")
emb = ont.get(limit=1, include=['embeddings'])
len(emb['embeddings'][0])

print(emb)

{'ids': ['10MinuteAPGARScoreOf0'], 'embeddings': [[-0.022373054176568985, 0.0014565790770575404, 0.01586170494556427, -0.01423033606261015, -0.01827697642147541, 0.03268386796116829, -0.00418082345277071, 0.016525551676750183, -0.02268379181623459, -0.03533925488591194, 0.005889875814318657, 0.03025447018444538, 0.011850373819470406, -0.012726087123155594, 0.009484536945819855, 0.03104543685913086, 0.016525551676750183, -0.00810740701854229, 0.00361231598071754, 0.0023234642576426268, 0.01298032607883215, 0.03282511234283447, -0.030113227665424347, -0.001727591035887599, -0.013163943774998188, 0.004315005149692297, 0.002381727332249284, -0.013107446022331715, 0.0028301773127168417, -0.014032593928277493, 0.034774281084537506, -0.030056729912757874, -0.01613006740808487, -0.03050871007144451, 0.0019156221533194184, 0.006327732466161251, 0.0035593495704233646, 0.010339062660932541, 0.03903985023498535, -0.005480268504470587, 0.002042741747573018, -0.00011553319927770644, -0.0092161726206

In [5]:
ont = manager.get_collection("ont_hp")
data_processor = DataProcessor(db_manager=manager)

cachedDict = data_processor.create_hpo_id_to_embedding(ont)
manager.create_collection("newEmbeddingsFromHpOntology_phase2")
hpToEmbedding = manager.get_collection("newEmbeddingsFromHpOntology_phase2")
for hp, data in cachedDict.items():
    embedding_list = data['embeddings']
    hpToEmbedding.upsert(ids=[hp], embeddings=[embedding_list], metadatas=[{"type": "HP"}])

In [6]:

file_path = "/Users/carlo/PycharmProjects/chroma_db_playground/phenotype.hpoa"
with open(file_path, 'r') as file:
    data = file.read()

extractor = OMIMHPOExtractor
omimToHPdict = extractor.extract_omim_hpo_mappings(data)

manager.create_collection("avgDiseaseEmbeddings_phase2")
diseaseAvgEmbedings = manager.get_collection("avgDiseaseEmbeddings_phase2")
for disease, hps in omimToHPdict.items():
    average_embedding = data_processor.calculate_average_embedding(hps, cachedDict)
    diseaseAvgEmbedings.upsert(ids=[disease], embeddings=[average_embedding.tolist()],
                               metadatas=[{"type": "disease"}])

12468


In [7]:
OMIM619340 = ['HP:0010851', 'HP:0011097', 'HP:0002643', 'HP:0032792', 'HP:0011451', 'HP:0200134', 'HP:0001518', 'HP:0002187', 'HP:0000006', 'HP:0001522', 'HP:0001789']


diseaseAvgEmbedings = manager.get_collection("avgDiseaseEmbeddings_phase2")
# ont_hp = manager.get_collection("ont_hp")

# cachedDict = manager.create_hpo_id_to_embedding(ont_hp)
avg_embedding = data_processor.calculate_average_embedding(OMIM619340, cachedDict)

if avg_embedding is None:
    print("No valid embeddings found for provided HPO terms.")

query_results = diseaseAvgEmbedings.query(
    query_embeddings=[avg_embedding.tolist()],
    n_results=10,
    include=["embeddings", "distances"]
)

disease_ids = query_results['ids'][0] if 'ids' in query_results and query_results['ids'] else []
distances = query_results['distances'][0] if 'distances' in query_results and query_results['distances'] else []
sorted_results = sorted(zip(disease_ids, distances), key=lambda x: x[1])
# distances
sorted_results

[('OMIM:619340', 1.1920928955078125e-06),
 ('OMIM:251280', 0.012330412864685059),
 ('OMIM:266100', 0.013801753520965576),
 ('OMIM:620033', 0.014431536197662354),
 ('OMIM:617065', 0.014481723308563232),
 ('OMIM:617105', 0.01506197452545166),
 ('OMIM:612164', 0.01537412405014038),
 ('OMIM:609304', 0.01538097858428955),
 ('OMIM:619881', 0.015410244464874268),
 ('OMIM:300607', 0.015469074249267578)]

In [8]:
manager.list_collections()

[Collection(name=avgDiseaseEmbeddings),
 Collection(name=newEmbeddingsFromHpOntology_phase2),
 Collection(name=DiseaseOrganEmbeddings),
 Collection(name=avgDiseaseEmbeddings_phase2),
 Collection(name=hpo),
 Collection(name=DiseaseNewAvgEmbeddingsNew),
 Collection(name=HpEmbeddings),
 Collection(name=DiseaseNewOrganEmbeddings),
 Collection(name=ont_hp),
 Collection(name=average)]

In [12]:
cachedDict.get("HP:0010851")

{'embeddings': [-0.032377034425735474,
  0.019941002130508423,
  0.016879407688975334,
  -0.022094953805208206,
  -0.011067797429859638,
  0.01621561124920845,
  0.016500094905495644,
  0.0002233117847936228,
  -0.028583908453583717,
  -0.025671329349279404,
  -0.002524798968806863,
  0.04611356556415558,
  -0.0020709787495434284,
  0.004355320706963539,
  0.010695258155465126,
  0.011304867453873158,
  0.03617015853524208,
  -0.004277425818145275,
  0.002413037233054638,
  0.011772234924137592,
  0.0013521475484594703,
  0.043918970972299576,
  -0.024844970554113388,
  -0.013865227811038494,
  -0.0023486895952373743,
  -0.006969867739826441,
  0.0037220041267573833,
  -0.03684750199317932,
  -0.026633158326148987,
  0.025441031903028488,
  -0.00021198744070716202,
  -0.010058555752038956,
  0.0025789865758270025,
  0.0006739570526406169,
  -0.01261214166879654,
  -0.021119579672813416,
  0.006377191748470068,
  -0.00298708607442677,
  0.03977362811565399,
  0.019466860219836235,
  0.0

In [13]:
cachedDict["HP:0000478"].values()
0.000000000000000222
0.008773104287683964

dict_values([[-0.008773104287683964, -0.0014036967186257243, 0.027385327965021133, -0.014990421012043953, -0.01687084510922432, 0.03326496109366417, 0.016314662992954254, -0.018168602138757706, 0.008845937438309193, 0.002759389579296112, -3.3416421501897275e-05, -0.0051645440980792046, 0.008110983297228813, 0.007627634797245264, 0.005121506284922361, 0.02215457148849964, 0.03358278051018715, -0.008561225607991219, 0.01597036048769951, -0.008243407122790813, -0.023849600926041603, 0.04592471569776535, -0.0067172180861234665, -0.017678633332252502, -0.00632656691595912, 0.010150316171348095, -0.019002875313162804, -0.045209627598524094, -0.030934296548366547, -0.021081935614347458, 0.022273752838373184, -0.015546603128314018, 0.0006886058836244047, -0.01644708774983883, 0.003297362942248583, 0.009196861647069454, 0.002741181291639805, -0.02188972197473049, 0.010302604176104069, 0.001863870769739151, 0.010368815623223782, 0.007978558540344238, -0.024180661886930466, -0.0036218021996319294