# Explore the Embedding Space

In [1]:
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np

In [2]:
# Embedding vectors generated above.
# `KeyedVectors.load` unpickles whatever object was saved at this path: a full
# model exposes its vectors as `.wv`, whereas a bare KeyedVectors object *is*
# the vectors. Accept both so this cell does not fail with an AttributeError
# when only the vectors were saved.
model = KeyedVectors.load(
    "outputontology.embeddings",
    mmap="r",  # memory-map separately stored arrays read-only
)

wv = getattr(model, "wv", model)

In [3]:
# Look up the embedding (a NumPy array) stored for one HPO term, keyed by IRI.
hpo_iri = "http://purl.obolibrary.org/obo/HP_0000265"
vector = wv[hpo_iri]
print(vector)


[ 0.5098921  -0.23432972 -0.4938471   0.30849817  0.84349483 -0.13891587
  0.05551063  0.132816    0.06349483 -0.00534388 -0.33001605  0.03187589
 -0.18625315  0.37663913 -0.04901295 -0.01165972 -0.21994203 -0.11452126
  0.38071305 -0.01989286  0.10116046  0.6033055  -0.17155142 -0.17159711
 -0.23395267  0.64384884  0.37413383 -0.03009368  0.12952618 -0.56874317
 -0.02450729 -0.18908425 -0.7611072  -0.30064592  0.05655113 -0.07322795
 -0.6249055  -0.19374868 -0.09298488  0.09305706 -0.16089508  0.15391922
  0.14659187  0.38145867  0.032216    0.4716444  -0.52714366 -0.35038254
  0.1943305   0.2991756  -0.14908616  0.11279631 -0.24016656 -0.63401693
  0.12068636 -0.50801796  0.4261282  -0.3299459   0.34642035 -0.47858897
 -0.04633766  0.7455133  -0.3721883  -0.34393072 -0.07245614 -0.70287406
  0.31631348  0.37524715  0.24086258  0.18604428 -0.07871442 -0.81244195
 -0.38744488  0.10910387  0.28229252  0.611663   -0.10812523  0.0757219
  0.00429383  0.22146347 -0.2195343  -0.4750807  -0.

## Similarity Measures

In [4]:
# Nearest neighbours of HP_0000265, ranked with gensim's multiplicative
# combination objective ("cosmul"); the default number of hits is returned.
cosmul_query = ["http://purl.obolibrary.org/obo/HP_0000265"]
result = wv.most_similar_cosmul(positive=cosmul_query)
print(result)


[('http://purl.obolibrary.org/obo/HP_0011509', 0.9723530411720276), ('http://purl.obolibrary.org/obo/HP_0000031', 0.964691698551178), ('http://purl.obolibrary.org/obo/HP_0500006', 0.9616008400917053), ('http://purl.obolibrary.org/obo/HP_0000024', 0.9604008793830872), ('http://purl.obolibrary.org/obo/HP_0040031', 0.9597347378730774), ('http://purl.obolibrary.org/obo/HP_0003763', 0.9565582871437073), ('http://purl.obolibrary.org/obo/HP_0100816', 0.9558507800102234), ('http://purl.obolibrary.org/obo/HP_0011960', 0.955418586730957), ('http://purl.obolibrary.org/obo/HP_0010724', 0.9551156163215637), ('http://purl.obolibrary.org/obo/HP_0005906', 0.9547369480133057)]


These results aren't accurate: the returned HPO terms are not really related to the query term from a medical point of view.

In [5]:
# Nearest neighbours by cosine similarity. Unlike the other cells, the query
# key here is a label string ("Mastoiditis") rather than an HPO IRI.
result = wv.most_similar(
    positive=["Mastoiditis"],
)
print(result)

[('A developmental dysplasia of the dental enamel.', 0.9831352233886719), ('Osteoarthritis of the distal interphalangeal joint', 0.9830306172370911), ('Pseudoepiphysis of the proximal phalanx of the 2nd toe', 0.9829877018928528), ('Absent pigmentation of the limbs', 0.9828448295593262), ('Interphalangeal joint erosions', 0.9826673865318298), ('Pleural plaque', 0.9826033115386963), ('The presence of developmental dysplasia of the optic nerve.', 0.9825000762939453), ('Increased circulating copper concentration', 0.9824551343917847), ('Elongated radius', 0.9824470281600952), ('A soft tissue prominence of the ventral aspects of the fingertips. The term "persistent fetal fingertip pads" is often used as a synonym, but should better not be used because it implies knowledge of history of the patient which often does not exist.', 0.9824117422103882)]


In [6]:
# Cosine similarity between two specific HPO terms, each addressed by its IRI.
first_hpo_iri = "http://purl.obolibrary.org/obo/HP_0011509"
second_hpo_iri = "http://purl.obolibrary.org/obo/HP_0100816"
similarity = wv.similarity(first_hpo_iri, second_hpo_iri)
print(similarity)


0.92885876


These two terms shouldn't have such a high similarity score, which indicates that the model is not generating embeddings correctly.