In [1]:
import networkx
import obonet
import pandas as pd

In [2]:
#import os
#os.chdir('c:\\Users\\levrex\\Desktop\\DeepPhenotypingHPO')

## Convert OBO to rules for spacy matcher

In [4]:
%%time
url = 'phenopy_mod/.phenopy/data/hp.obo'
graph = obonet.read_obo(url)

id_to_name = {id_: data.get('name') for id_, data in graph.nodes(data=True)}
name_to_id = {data['name']: id_ for id_, data in graph.nodes(data=True) if 'name' in data}

Wall time: 2.79 s


In [19]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from scispacy.abbreviation import AbbreviationDetector

nlp = spacy.load("en_core_sci_sm")

doc = nlp("Hello, world! Hello world!")


In [38]:
id_to_name['HP:0001466']

'Contiguous gene syndrome'

## Expand list of phenotypes with OBO

In [26]:
f_syn = open('phenopy_mod/.phenopy/data/new_hpo_synonyms.txt', "r")
content = f_syn.read()
l_syn = content.split("\n")
f_syn.close()
print(len(l_syn))

49034


In [29]:
l_syn[1]

'HP:0003566\te2 increased prostaglandin serum'

In [31]:
l_syn[-4]

'HP:3000075\tAbnormality of lingual nerve'

In [22]:
f_syn = open("phenopy_mod/.phenopy/data/hpo_synonyms.txt", "r")
content = f_syn.read()
l_syn = content.split("\n")
f_syn.close()
print(len(l_syn))

29426


In [37]:
cnt = 0
for i in l_syn:
    line = i.split('\t')
    if len(line) < 2:
        print(line)
        print(line[0])
        print(cnt)
    cnt += 1

['']

29425


In [23]:
#graph
l_synonyms_new = []
for id_, data in graph.nodes(data=True):
    #print(name_to_id[data.get('name')])
    #hpo = 
    if 'synonym' in graph.nodes[id_]:
        for i in graph.nodes[id_]['synonym']:
            syn = i.split('"')[1]
            l_synonyms_new.append(str(name_to_id[data.get('name')]) + '\t' + syn)
print(len(l_synonyms_new))

19633


In [24]:
print(len(l_syn))
for syn in l_synonyms_new:
    if syn not in l_syn:
        l_syn.append(syn)
print(len(l_syn))

29426
49034


In [25]:
with open("phenopy_mod/.phenopy/data/new_hpo_synonyms.txt" , 'w') as f:
    f.write('\n'.join(l_syn))
    

In [7]:
import ast 

class AnnotateHPO(object):
    def __init__(self, matcher):
        self.matcher = matcher
        self.matched_sents = [] 
        self.matched_hpo = []
        self.matched_def = []
    
    def setMatcher(self, matcher):
        self.matcher = matcher
        
    def simpleCleaning(self, sentence): 
        """
        Remove special characters that are not relevant to 
        the interpretation of the text

        Input:
            sentence = free written text from EHR record
            lemma = lemmatize the text
        Output :
            processed sentence (lemmatized depending on preference)
        """
        sticky_chars = r'([!#,.:";@\-\+\\/&=$\]\[<>\'^\*`\(\)])'
        sentence = re.sub(sticky_chars, r' ', sentence)
        sentence = sentence.lower()
        return sentence
        
    def collect_sents(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        print(match_id, i)
        hpo_id = doc.vocab.strings[match_id]
        span = doc[start:end]
        sent = span.sent
        start = span.start_char - sent.start_char
        end = span.end_char - sent.start_char
        match_ents = [{
            "start": start,
            "end": end,
            "label": "HP",
        }]
        self.matched_sents.append({"text": sent.text, "ents": match_ents})
        self.matched_hpo.append(hpo_id)
        self.matched_def.append(sent.text[start:end])
    
    def addPatterns(self, graph):
        count = 0
        pat_list = []
        for id_, data in graph.nodes(data=True):
            count += 1
            name = data.get('name').split(' ')
            if len(name) < 3:
                pattern = [ast.literal_eval('{"ORTH": "' + str(i.lower()) +'"}') for i in name]
                self.matcher.add(name_to_id[data.get('name')], self.collect_sents, pattern)
                pat_list.append(pattern)
            else :
                for x in range(2,len(name)+1):
                    #name = data.get('name').split(' ')[]
                    pattern = [ast.literal_eval('{"ORTH": "' + str(i.lower()) +'"}') for i in name[:x]]
                    if pattern not in pat_list:
                        self.matcher.add(name_to_id[data.get('name')], self.collect_sents, pattern)
                        pat_list.append(pattern)
                    if data.get('name') == 'Focal white matter lesions':
                        print(pattern)
                for i in range(len(name)):
                    l_cond = name.copy()
                    l_cond.pop(i)
                    pattern = [ast.literal_eval('{"ORTH": "' + str(i.lower()) +'"}') for i in l_cond]
                    if pattern not in pat_list:
                        pat_list.append(pattern)
                        self.matcher.add(name_to_id[data.get('name')], self.collect_sents, pattern)
                    #if data.get('name') == 'Focal white matter lesions':
                    #    print(l_cond)
                    #    print(pattern)
            #if count > 5:
            #    break
            if 'synonym' in graph.nodes[id_]:
                for i in graph.nodes[id_]['synonym']: # append
                    pattern = [ast.literal_eval('{"ORTH": "' + str(i.lower()) +'"}') for i in i.split('"')[1].split(' ')]
                    self.matcher.add(name_to_id[data.get('name')], self.collect_sents, pattern)
                    syn = i.split('"')[1].split(' ')
                    if len(syn) > 2:
                        for i in range(len(syn)):
                            l_cond = syn.copy()
                            l_cond.pop(i)
                            pattern = [ast.literal_eval('{"ORTH": "' + str(i.lower()) +'"}') for i in l_cond]
                            if pattern not in pat_list:
                                pat_list.append(pattern)
                                self.matcher.add(name_to_id[data.get('name')], self.collect_sents, pattern)
                    
    def employ(self, text):
        self.matched_sents = [] 
        self.matched_hpo = []
        doc = nlp(text.lower())
        matches = self.matcher(doc)
        #print(self.matched_hpo)
        return matches
    
    def prune(self):
        remove_list = []
        #removed_ent = []
        x = 0
        for i in self.matched_sents: 
            i_start = i['ents'][0]['start']
            i_end = i['ents'][0]['end']
            i_length = i_end - i_start
            ix = 0
            for j in self.matched_sents:
                if i['text'] == j['text']:
                    j_start = j['ents'][0]['start']
                    j_end = j['ents'][0]['end']
                    j_length = j_end - j_start
                    if j not in remove_list and j_length < i_length: # don't do duplicates (not very clean)
                        if j_start <= i_start and j_end >= i_start:
                            remove_list.append(ix)
                            #removed_ent.append(j)
                        elif j_start <= i_end and j_end >= i_end:
                            remove_list.append(ix)
                            #removed_ent.append(j)
                        elif j_start >= i_start and j_end <= i_end:
                            remove_list.append(ix)
                            #removed_ent.append(j)
                    elif j == i and ix != x and ix not in remove_list and x not in remove_list:
                        #if j not in removed_ent and i not in removed_ent: # remove latter
                        remove_list.append(ix)
                        #removed_ent.append(j)
                ix += 1
            x += 1
        #print(remove_list)
        self.matched_sents = [i for j, i in enumerate(self.matched_sents) if j not in remove_list]
        self.matched_hpo = [i for j, i in enumerate(self.matched_hpo) if j not in remove_list]
        self.matched_def = [i for j, i in enumerate(self.matched_def) if j not in remove_list]
        return
        
matcher = Matcher(nlp.vocab)
hp = AnnotateHPO(matcher)
hp.addPatterns(graph)
print(len(hp.matcher))


NameError: name 'Matcher' is not defined

In [256]:
print(len(hp.matched_hpo), len(hp.matched_def))

0 0


In [257]:
for i in range(len(hp.matched_def)-1):  
    print(hp.matched_def[i], hp.matched_hpo[i])

In [56]:
hpo_to_umls['HP:0000431']

'C1839764'

In [9]:
def getUMLS(data):
    umls = 'NaN'
    for item in data:
        if ':' in item:
            key, val = item.split(":", 1)
            if key == 'UMLS': 
                umls = val
                break
    return umls

def HPO_to_UMLS(graph):
    hpo_to_umls = {}
    for hpo in graph.nodes.keys():
        try : 
            data = graph.nodes[hpo]['xref']
            
        except :
            data = 'UMLS: NaN'
        hpo_to_umls[hpo] = getUMLS(data)
        umls_to_hpo = {v: k for k, v in hpo_to_umls.items()}
    return hpo_to_umls, umls_to_hpo

hpo_to_umls, umls_to_hpo = HPO_to_UMLS(graph)

In [346]:
import ast

matched_sents = []  # Collect data of matched sentences to be visualized
matched_hpo = []

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    hpo_id = doc.vocab.strings[match_id]
    span = doc[start:end]
    sent = span.sent
    match_ents = [{
        "start": span.start_char - sent.start_char,
        "end": span.end_char - sent.start_char,
        "label": "HP",
    }]
    matched_sents.append({"text": sent.text, "ents": match_ents})
    matched_hpo.append(hpo_id)
    return 
    
count = 0

## ALl HPO terms
for id_, data in graph.nodes(data=True):
    count += 1
    pattern = [ast.literal_eval('{"ORTH": "' + str(i.lower()) +'"}') for i in data.get('name').split(' ')]
    matcher.add(name_to_id[data.get('name')], collect_sents, pattern)
    if 'synonym' in graph.nodes[id_]:
        for i in graph.nodes[id_]['synonym']: # append
            pattern = [ast.literal_eval('{"ORTH": "' + str(i.lower()) +'"}') for i in i.split('"')[1].split(' ')]
            matcher.add(name_to_id[data.get('name')], collect_sents, pattern)

# Low precision remove certain ambiguious synonyms like layman?
## All synonyms 


In [289]:
import pickle
pickle.dump(hp, open('phenopy_mod/.phenopy/Matcher/hpo_matcher.p', "wb" ) )

In [285]:
hp = pickle.load(open('phenopy_mod/.phenopy/Matcher/hpo_matcher.p', "rb" ))

In [287]:
#hp.matched_def

### Employ HPO on sample text

In [288]:
hp.matched_sents = [] 
hp.matched_hpo = []
hp.matched_def = []

In [290]:
import re

text = 'Intracranial bleeding High frequency hearing loss Short stature, wide nasal bridge,  microcephaly  hearing loss   ID Developmental delay (Global)  Motor Delay Speech delay  seizures  trigonocephaly  coloboma  hypertelorism  high arched eyebrows Hypertrichosis  congenital ptosis Epicanthus Telecanthus  pachygyria Other brain MRI abnormalities (i.e. brain atrophy, corpus callosum agenesis, hydrocephalus, white matter lesions, periventricular nodular heterotopias). Long eyelashes Deeply set eyes Broad/wide nasal bridge/ tip Bulbous nose Failure to thrive Feeding Difficulties Prominent eyes/proptosis Down slanting eyebrows Mid face hypoplasia Low set ears Dysplastic ears Small ears with overfolded helices/prominent helix Long philtrum Smooth philtrum Thin lips Pointed/small chin Pterygium colli/webbed neck Retrognathia/micrognathia Kypho/scoliotic posture Joint laxity Clinodactyly Tapering digits \xa0Hernia High palate Cleft lip Cleft palate Transverse right palmar crease/deep oakmar creases Pectus excavatum/pectus deformity Radial abnormalities (fingers/toes) Hypotonia Dystonia Frontal bossing Scaphocephaly Bitemporal narrowing Prominence of metopic suture/ridge Large anterior fontanelle High anterior hairline Low anterior hairline Low posterior hairline Sparse eyebrows Flaired and straight eyebrows Synophris Upslanting/long palpebral fissures Heart defects (Other then ASD, VSD or PDA) ASD VSD Cardiomegaly Persistent Embryonic Structures Large squared nose tip upturned nose Prominent nasal root on profile Thin upper lip Large/wide mouth tented upper lip with downturned corners Thick/prominent/everted lower lip Prominent cheeks Autism Spectrum disorder ADHD/ADD Other Psychiatric diagnosis (schizophrenia, depression, other, not autism, or ADHD/ADD) Behavioral abnormalities (tantrums, auto mutulation) Horseshoe kidney Renal Cysts Other Renal abnormality Cryptorchidism Recurrent infections Photosensitivity Polyarthralgia Hepatomegaly Hypothyroidism Leukocytosis Thrombocytopenia Asthma Strabismus Growth hormone deficiency'

# 
#doc = nlp(text.lower())
#matches = matcher(doc)
text = hp.simpleCleaning(text)
hp.employ(text.lower())

# Serve visualization of sentences containing match with displaCy
# set manual=True to make displaCy render straight from a dictionary
# (if you're not running the code within a Jupyer environment, you can
# use displacy.serve instead)
#displacy.render(matched_sents, style="ent", options=options, manual=True)

5471357220611380921 0
4581928552930701085 1
4581928552930701085 2
4581928552930701085 3
4628039181647357534 4
945515856306630882 5
4261437138745829640 6
16949990724671758082 7
16949990724671758082 8
16152484463225494763 9
3995578825471680814 10
4628039181647357534 11
98212013394016508 12
8723878000737723424 13
293286804905931729 14
694094419883236909 15
6201834650144109961 16
166996708915989520 17
11793750258293291190 18
2821148264986309643 19
10705396737648612811 20
10705396737648612811 21
14578597842821536426 22
7481548655740655293 23
10818950140088686257 24
4368486485002224386 25
12740591186763247024 26
17751985521904005566 27
18305696374267983467 28
13574998172678904960 29
13302105373301662617 30
16574072320878246487 31
11040745532205498480 32
13141466298718664497 33
11040745532205498480 34
18266677547006962631 35
6652361103653626040 36
12053815203365963164 37
13770341375261184734 38
13434604808808148181 39
11213222984457915974 40
9239268030539539711 41
16949990724671758082 42
1694

[(5471357220611380921, 3, 4),
 (4581928552930701085, 2, 5),
 (4581928552930701085, 2, 6),
 (4581928552930701085, 3, 6),
 (4628039181647357534, 4, 6),
 (945515856306630882, 6, 8),
 (4261437138745829640, 6, 8),
 (16949990724671758082, 9, 11),
 (16949990724671758082, 9, 12),
 (16152484463225494763, 10, 12),
 (3995578825471680814, 13, 14),
 (4628039181647357534, 15, 17),
 (98212013394016508, 20, 22),
 (8723878000737723424, 25, 27),
 (293286804905931729, 27, 29),
 (694094419883236909, 30, 31),
 (6201834650144109961, 32, 33),
 (166996708915989520, 34, 35),
 (11793750258293291190, 36, 37),
 (2821148264986309643, 38, 40),
 (10705396737648612811, 38, 41),
 (10705396737648612811, 39, 41),
 (14578597842821536426, 41, 42),
 (7481548655740655293, 43, 45),
 (10818950140088686257, 43, 45),
 (4368486485002224386, 44, 45),
 (12740591186763247024, 45, 46),
 (17751985521904005566, 46, 47),
 (18305696374267983467, 48, 49),
 (13574998172678904960, 57, 59),
 (13302105373301662617, 57, 59),
 (165740723208782

In [291]:
hp.matched_sents[1]

{'text': 'intracranial bleeding high frequency hearing loss short stature  wide nasal bridge   microcephaly  hearing loss   id developmental delay  global   motor delay speech delay  seizures  trigonocephaly  coloboma  hypertelorism  high arched eyebrows hypertrichosis  congenital ptosis epicanthus telecanthus  pachygyria other brain mri abnormalities  i e  brain atrophy  corpus callosum agenesis  hydrocephalus  white matter lesions  periventricular nodular heterotopias   long eyelashes deeply set eyes broad wide nasal bridge  tip bulbous nose failure to thrive feeding difficulties prominent eyes proptosis down slanting eyebrows mid face hypoplasia low set ears dysplastic ears small ears with overfolded helices prominent helix long philtrum smooth philtrum thin lips pointed small chin pterygium colli webbed neck retrognathia micrognathia kypho scoliotic posture joint laxity clinodactyly tapering digits \xa0hernia high palate cleft lip cleft palate transverse right palmar crease deep oa

### Filter overlapping HPO's 

In [292]:
len(hp.matched_def)

156

In [293]:
def prune(matched_sents, matched_hpo):
    remove_list = []
    for i in matched_sents: 
        i_start = i['ents'][0]['start']
        i_end = i['ents'][0]['end']
        i_length = i_end - i_start
        ix = 0
        for j in matched_sents:
            if i['text'] == j['text']:
                j_start = j['ents'][0]['start']
                j_end = j['ents'][0]['end']
                j_length = j_end - j_start
                if j not in remove_list and j_length < i_length: # don't do duplicates (not very clean)
                    if j_start <= i_start and j_end >= i_start:
                        remove_list.append(ix)
                    elif j_start <= i_end and j_end >= i_end:
                        remove_list.append(ix)
                    elif j_start >= i_start and j_end <= i_end:
                        remove_list.append(ix)
            ix += 1
        print(remove_list)
        matched_sents = [i for j, i in enumerate(matched_sents) if j not in remove_list]
        matched_hpo = [i for j, i in enumerate(matched_hpo) if j not in remove_list]
    return matched_sents, matched_hpo

#matched_sents, matched_hpo = prune(matched_sents, matched_hpo)
hp.prune()
colors = {"HP": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["HP"], "colors": colors}
#displacy.render(hp.matched_sents, style="ent", options=options, manual=True)
#displacy.serve(hp.matched_sents, style="ent", options=options) # , style="ent"

In [281]:
len(hp.matched_def)

97

In [282]:
hp.matched_sents[0]['ents']

[{'start': 22, 'end': 49, 'label': 'HP'}]

In [294]:
from IPython.display import display, HTML
def show_marked_text(matched_sents):
    matched_sents= matched_sents[::-1]
    doc = []
    current = matched_sents[0]['text']
    markup = current
    for i in range(len(matched_sents)):
        if matched_sents[i]['text'] != current:
            current = matched_sents[i]['text']
            doc.append(markup)
            markup = current
        else :
            end, start = matched_sents[i]['ents'][0]['end'], matched_sents[i]['ents'][0]['start']
            start_str = '<span style="color:red">'
            end_str = '</span>'
            markup = markup[:end] + end_str + markup[end:]
            markup = markup[:start] + start_str + markup[start:]
    return markup

markup = show_marked_text(hp.matched_sents)

#
display(HTML(markup)) # ,colors = 'r', default_color="black"

In [265]:
for ix in range(len(hp.matched_hpo)):
    print(hp.matched_def[ix], hp.matched_hpo[ix])

high frequency hearing loss HP:0005101
short stature HP:0003498
wide nasal bridge HP:0000431
microcephaly HP:0000252
hearing loss HP:0000365
developmental delay HP:0001263
motor delay HP:0001270
speech delay HP:0000750
seizures HP:0001250
trigonocephaly HP:0000243
coloboma HP:0000589
hypertelorism HP:0000316
high arched eyebrows HP:0002553
hypertrichosis HP:0000998
congenital ptosis HP:0007911
epicanthus HP:0000286
telecanthus HP:0000506
pachygyria HP:0001302
brain atrophy HP:0002283
corpus callosum agenesis HP:0001274
hydrocephalus HP:0000238
white matter lesions HP:0007042
periventricular nodular HP:0032388
heterotopias HP:0002282
long eyelashes HP:0000527
deeply set HP:0000490
wide nasal bridge HP:0000431
bulbous nose HP:0000414
failure to thrive HP:0001508
feeding difficulties HP:0008872
prominent eyes HP:0000520
proptosis HP:0000520
low set ears HP:0000369
dysplastic ears HP:0000377
small ears HP:0008551
overfolded helices HP:0000396
prominent helix HP:0009904
long philtrum HP:000

In [14]:
txt  = """high frequency hearing loss HP:0005101
short stature HP:0003498
wide nasal bridge HP:0000431
microcephaly HP:0000252
hearing loss HP:0000365
developmental delay HP:0001263
motor delay HP:0001270
speech delay HP:0000750
seizures HP:0001250
trigonocephaly HP:0000243
coloboma HP:0000589
hypertelorism HP:0000316
high arched eyebrows HP:0002553
hypertrichosis HP:0000998
congenital ptosis HP:0007911
epicanthus HP:0000286
telecanthus HP:0000506
pachygyria HP:0001302
brain atrophy HP:0002283
corpus callosum agenesis HP:0001274
hydrocephalus HP:0000238
white matter lesions HP:0007042
periventricular nodular HP:0032388
heterotopias HP:0002282
long eyelashes HP:0000527
deeply set HP:0000490
wide nasal bridge HP:0000431
bulbous nose HP:0000414
failure to thrive HP:0001508
feeding difficulties HP:0008872
prominent eyes HP:0000520
proptosis HP:0000520
low set ears HP:0000369
dysplastic ears HP:0000377
small ears HP:0008551
overfolded helices HP:0000396
prominent helix HP:0009904
long philtrum HP:0000343
smooth philtrum HP:0000319
thin lips HP:0000219
small chin HP:0000307
pterygium colli HP:0000465
webbed neck HP:0000465
retrognathia HP:0000278
micrognathia HP:0000347
joint laxity HP:0001388
clinodactyly HP:0030084
hernia HP:0100790
high palate HP:0000218
cleft lip HP:0000161
cleft palate HP:0000175
right HP:0012834
palmar crease HP:0000954
pectus excavatum HP:0000767
pectus deformity HP:0000766
hypotonia HP:0001290
dystonia HP:0001332
frontal bossing HP:0002007
scaphocephaly HP:0030799
bitemporal narrowing HP:0000341
of metopic suture HP:0005487
large anterior fontanelle HP:0000260
high anterior hairline HP:0009890
low anterior hairline HP:0000294
low posterior hairline HP:0002162
sparse eyebrows HP:0000535
straight eyebrows HP:0011228
synophris HP:0000664
long palpebral fissures HP:0000637
heart defects HP:0001627
asd HP:0000729
asd HP:0000729
cardiomegaly HP:0001640
upturned nose HP:0000455
prominent nasal root HP:0000426
thin upper lip HP:0000219
wide mouth HP:0000154
tented upper lip HP:0010804
downturned corners HP:0002714
everted lower lip HP:0000232
autism spectrum disorder HP:0000729
depression HP:0000716
autism HP:0000717
horseshoe kidney HP:0000085
renal cysts HP:0000107
renal abnormality HP:0012211
cryptorchidism HP:0000028
recurrent infections HP:0001581
photosensitivity HP:0000992
hepatomegaly HP:0002240
hypothyroidism HP:0000821
leukocytosis HP:0001974
thrombocytopenia HP:0001873
asthma HP:0002099
strabismus HP:0000486
growth hormone deficiency HP:0000824
"""
for x in txt.split('\n'):
    #print(x)
    #print(x.split('HP:')[0])
    print("'%s'" % x.split(' HP:')[0], ":", "'HP:%s'" % x.split(' HP:')[1])

'high frequency hearing loss' : 'HP:0005101'
'short stature' : 'HP:0003498'
'wide nasal bridge' : 'HP:0000431'
'microcephaly' : 'HP:0000252'
'hearing loss' : 'HP:0000365'
'developmental delay' : 'HP:0001263'
'motor delay' : 'HP:0001270'
'speech delay' : 'HP:0000750'
'seizures' : 'HP:0001250'
'trigonocephaly' : 'HP:0000243'
'coloboma' : 'HP:0000589'
'hypertelorism' : 'HP:0000316'
'high arched eyebrows' : 'HP:0002553'
'hypertrichosis' : 'HP:0000998'
'congenital ptosis' : 'HP:0007911'
'epicanthus' : 'HP:0000286'
'telecanthus' : 'HP:0000506'
'pachygyria' : 'HP:0001302'
'brain atrophy' : 'HP:0002283'
'corpus callosum agenesis' : 'HP:0001274'
'hydrocephalus' : 'HP:0000238'
'white matter lesions' : 'HP:0007042'
'periventricular nodular' : 'HP:0032388'
'heterotopias' : 'HP:0002282'
'long eyelashes' : 'HP:0000527'
'deeply set' : 'HP:0000490'
'wide nasal bridge' : 'HP:0000431'
'bulbous nose' : 'HP:0000414'
'failure to thrive' : 'HP:0001508'
'feeding difficulties' : 'HP:0008872'
'prominent eyes' 

IndexError: list index out of range

## Employ HPOtool on table

In [299]:
import pandas as pd
df_table = pd.read_csv(r'data/TableExtraction.csv')
df_table = df_table.fillna('')
df_table.head()

Unnamed: 0,Case ID,Inheritance,Gender,Age (Years),Prenatal and Neonatal History,PN Growth Retardation,Microcephaly,Motor Delay,Speech Delay,DD/ ID,"Behavioral, Psychiatric, and Neurological Features",Malformations and Physical Anomalies,Additional Comments
0,I,DN,M,4,SGA and feeding difficulties,Y,N,Y,Y,mod,possible absence and focal seizures,"VSD with tortuous aortic arch, horseshoe kidne...","early-onset hypothyroidism, limitation of join..."
1,II,DN,M,7,SGA and feeding difficulties,Y,N,Y,Y,mild,"sociable, empathetic, hand flapping tendency a...","Rt pelvic kidney, Rt inguinal hernia and scoli...","GOR, asthma and allergies."
2,III,DN,M,7,"SGA, polycythaemia, jaundice and hypoglycaemia...",Y,Y,N,Y,Mild,"attention deficit, echolalia and tantrums.","inguinal hernia, cryptorchidism, proximally pl...","perineal and scalp abscesses, recurrent chest ..."
3,IVa,Mat,F,32,SGA,Y,N,Y,Y,mod,empathetic personality.,scoliosis,"glaucoma, asthma, and eczema"
4,IVb,U,F,68,U,U,U,U,U,mild,U,horseshoe kidney with multiple cysts,hiatus hernia


In [397]:
def HPO_annotate(row):
    list_hpo = []
    for i in row:
        text = i
        hp.employ(text)
        hp.prune()
        list_hpo.extend(hp.matched_hpo)
    return list_hpo
    
df_table['HPO'] = df_table.apply(lambda x: HPO_annotate(x), 1)

0    [HP:0011968, HP:0007359, HP:0000085, HP:000002...
1    [HP:0011968, HP:0012825, HP:0007018, HP:000012...
2    [HP:0000952, HP:0001943, HP:0012825, HP:000701...
3     [HP:0002650, HP:0000501, HP:0002099, HP:0000964]
4                 [HP:0012825, HP:0000085, HP:0002036]
dtype: object

## ISSUE = Acronyms are not recognized

In [410]:
text = """'Category Id Protein High frequency hearing loss Short stature  microcephaly  hearing loss   ID Developmental delay (Global)  Motor Delay Speech delay  seizures  trigonocephaly  coloboma  hypertelorism  high arched eyebrows Hypertrichosis  congenital ptosis Epicanthus Telecanthus  pachygyria Other brain MRI abnormalities (i.e. brain atrophy, corpus callosum agenesis, hydrocephalus, white matter leasions, periventricular nodular heterotopias). Long eyelashes Deeply set eyes Broad/wide nasal bridge/ tip Bulbous nose Failure to thrive Feeding Difficulties Prominent eyes/proptosis Down slanting eyebrows Mid face hypoplasia Low set ears Dysplastic ears Small ears with overfolded helices/prominent helix Long philtrum Smooth philtrum Thin lips Pointed/small chin Pterygium colli/webbed neck Retrognatia/micrognathia Kypho/scoliotic posture Joint laxity Clinodactyly Tapering digits \xa0Hernia High palate Cleft lip Cleft palate Transverse right palmar crease/deep oakmar creases Pectus excavatum/pectus deformity Radial abnormalities (fingers/toes) Hypotonia Dystonia Frontal bossing Scaphocephaly Bitemporal narrowing Prominence of metopic suture/ridge Large anterior fontanelle High anterior hairline Low anterior hairline Low posterior hairline Sparse eyebrows Flaired and straight eyebrows Synophris Upslanting/long palpebral fissures Heart defects (Other then ASD, VSD or PDA) ASD VSD Cardiomegaly Persistent Embryonic Sructures Large squared nose tip upturned nose Prominent nasal root on profile Thin upper lip Large/wide mouth tented upper lip with downturned corners Thick/prominent/everted lower lip Prominent cheeks Autism Spectrum disorder ADHD/ADD Other Psychiatric diagnosis (shizofrenia, depression, other, not autism, or ADHD/ADD) Behavioral abnormalities (tantrums, auto mutulation) Horseshoe kidney Renal Cysts Other Renal abnormality Cryptorchidism Recurrent infections Photosensitivity Polyarthralgia Hepatomegaly Hypothyroidism Leukocytose Trombocytopenie Asthma Strabismus Growth hormone deficiency'"""

In [None]:
#text = df_table[df_table['Case ID'] == 'XIXa']['Malformations and Physical Anomalies'].iloc[0]
hp.employ(text)
hp.prune()

colors = {"HP": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["HP"], "colors": colors}
displacy.render(hp.matched_sents, style="ent", options=options, manual=True)

In [401]:
df_table[['Case ID', 'HPO']]

Unnamed: 0,Case ID,HPO
0,I,"[HP:0011968, HP:0007359, HP:0000085, HP:000002..."
1,II,"[HP:0011968, HP:0012825, HP:0007018, HP:000012..."
2,III,"[HP:0000952, HP:0001943, HP:0012825, HP:000701..."
3,IVa,"[HP:0002650, HP:0000501, HP:0002099, HP:0000964]"
4,IVb,"[HP:0012825, HP:0000085, HP:0002036]"
5,V,"[HP:0011968, HP:0012825, HP:0002119, HP:003065..."
6,VI,"[HP:0012825, HP:0010809, HP:0010055, HP:000988..."
7,VII,"[HP:0001290, HP:0011968, HP:0001290, HP:003073..."
8,VIII,"[HP:0001290, HP:0011968, HP:0012825, HP:000228..."
9,IX,"[HP:0001290, HP:0011968, HP:0001631, HP:0000023]"


In [None]:
import itertools, collections

tweets = ['I went to work but got delayed at other work and got stuck in a traffic and I went to drink some coffee but got no money and asked for money from work',
          'We went to get our car but the car was not ready. We tried to expedite our car but were told it is not ready']

words = set(word.lower() for tweet in tweets for word in tweet.split())
_pairs = list(itertools.permutations(words, 2))
# We need to clean up similar pairs: sort words in each pair and then convert
# them to tuple so we can convert whole list into set.
pairs = set(map(tuple, map(sorted, _pairs)))

c = collections.Counter()

for tweet in tweets:
    for pair in pairs:
        if pair[0] in tweet and pair[1] in tweet:
            c.update({pair: 1})

print c.most_common(10)

In [245]:
cond = 'Feeding difficulties'
graph.nodes[name_to_id[cond]]['name'], graph.nodes[name_to_id[cond]]['synonym']

('Feeding difficulties',
 ['"Feeding difficulties" EXACT layperson []',
  '"Feeding problems" EXACT layperson []',
  '"Poor feeding" EXACT layperson []'])

In [16]:
cond = 'Focal white matter lesions'
mixup = []
for i in range(len(graph.nodes[name_to_id[cond]]['name'].split(' '))):
    l_cond = graph.nodes[name_to_id[cond]]['name'].split(' ')
    print(l_cond)
    del l_cond[i]
    print(l_cond)
    mixup.append(' '.join(l_cond))
print(mixup)

['Focal', 'white', 'matter', 'lesions']
['white', 'matter', 'lesions']
['Focal', 'white', 'matter', 'lesions']
['Focal', 'matter', 'lesions']
['Focal', 'white', 'matter', 'lesions']
['Focal', 'white', 'lesions']
['Focal', 'white', 'matter', 'lesions']
['Focal', 'white', 'matter']
['white matter lesions', 'Focal matter lesions', 'Focal white lesions', 'Focal white matter']


In [None]:
Intellectual disability

## Find parent or child relationships

In [27]:
cond = 'Intellectual disability'
node = name_to_id[cond]
for child, parent, key in graph.out_edges(node, keys=True):
    print(f'• {id_to_name[child]} ⟶ {key} ⟶ {id_to_name[parent]}')

• Intellectual disability ⟶ is_a ⟶ Abnormality of higher mental function
• Intellectual disability ⟶ is_a ⟶ Neurodevelopmental abnormality


### Find all superterms of Disproportionate tall stature

In [30]:
sorted(id_to_name[superterm] for superterm in networkx.descendants(graph, 'HP:0001519'))

['Abnormality of body height',
 'All',
 'Growth abnormality',
 'Phenotypic abnormality',
 'Tall stature']

#### superterms of synovitus

In [38]:
sorted(id_to_name[superterm] for superterm in networkx.descendants(graph, 'HP:0100769'))

['Abnormal joint morphology',
 'Abnormality of skeletal morphology',
 'Abnormality of the skeletal system',
 'Abnormality of the synovia',
 'All',
 'Phenotypic abnormality']

### Find all subterms of abnormal pigmentation of the oral mucosa

In [41]:
sorted(id_to_name[subterm] for subterm in networkx.ancestors(graph, 'HP:0100669'))

['Intra-oral hyperpigmentation',
 'Oral melanoacanthoma',
 'Oral melanotic macule',
 'Reticulate pigmentation of oral mucosa',
 'White oral mucosal macule']

In [64]:

sorted(id_to_name[subterm] for subterm in networkx.descendants(graph, name_to_id['Abnormal erythrocyte morphology']))

['Abnormality of blood and blood-forming tissues',
 'All',
 'Phenotypic abnormality']

## Find all paths to the root

In [5]:
paths = networkx.all_simple_paths(
    graph,
    source=name_to_id['Reduced alpha/beta synthesis ratio'],
    target=name_to_id['All']
)
for path in paths:
    print('•', ' ⟶ '.join(id_to_name[node] for node in path))

• Reduced alpha/beta synthesis ratio ⟶ Imbalanced hemoglobin synthesis ⟶ Abnormal hemoglobin ⟶ Abnormal erythrocyte morphology ⟶ Abnormality of blood and blood-forming tissues ⟶ Phenotypic abnormality ⟶ All
