In [1]:
import pandas as pd
import numpy as np

### Read dictionary
The first 3 lines are comments
- Downloaded from: https://hpo.jax.org/app/download/annotation 
- http://compbio.charite.de/jenkins/job/hpo.annotations/lastStableBuild/

In [9]:
df_hpo2 = pd.read_table(r'hpo/phenotype.hpoa', sep='\t', header=4, encoding = "ISO-8859-1")
print('Nr of HPO codes:', len(df_hpo))
df_hpo2.head()

Nr of HPO codes: 188323


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#DatabaseID,DiseaseName,Qualifier,HPO_ID,Reference,Evidence,Onset,Frequency,Sex,Modifier,Aspect,Biocuration
0,OMIM:210100,"BETA-AMINOISOBUTYRIC ACID, URINARY EXCRETION OF",,HP:0000007,OMIM:210100,IEA,,,,,I,HPO:iea[2009-02-17]
1,OMIM:210100,"BETA-AMINOISOBUTYRIC ACID, URINARY EXCRETION OF",,HP:0003355,OMIM:210100,IEA,,,,,P,HPO:skoehler[2009-02-17]
2,OMIM:163600,NIPPLES INVERTED,,HP:0000006,OMIM:163600,IEA,,,,,I,HPO:iea[2009-02-17]
3,OMIM:163600,NIPPLES INVERTED,,HP:0003186,OMIM:163600,IEA,,,,,P,HPO:iea[2009-02-17]
4,OMIM:615763,"#615763 CORTICAL DYSPLASIA, COMPLEX, WITH OTHE...",,HP:0002365,OMIM:615763,TAS,,HP:0040283,,,P,HPO:skoehler[2014-08-24]


In [11]:
len(df_hpo2['DiseaseName'].unique())

16950

In [4]:
df_hpo = pd.read_table(r'hpo/misc/phenotype_annotation.tab', sep='\t')
df_hpo.head()

Unnamed: 0,#disease-db,disease-identifier,disease-name,negation,HPO-ID,reference,evidence-code,onset,frequencyHPO,modifier,sub-ontology,alt-names,curators,frequencyRaw,sex
0,DECIPHER,1,Wolf-Hirschhorn Syndrome,,HP:0000252,DECIPHER:1,IEA,,,,P,WOLF-HIRSCHHORN SYNDROME,HPO:skoehler[2013-05-29],,
1,DECIPHER,1,Wolf-Hirschhorn Syndrome,,HP:0001249,DECIPHER:1,IEA,,,,P,WOLF-HIRSCHHORN SYNDROME,HPO:skoehler[2013-05-29],,
2,DECIPHER,1,Wolf-Hirschhorn Syndrome,,HP:0001250,DECIPHER:1,IEA,,,,P,WOLF-HIRSCHHORN SYNDROME,HPO:skoehler[2013-05-29],,
3,DECIPHER,1,Wolf-Hirschhorn Syndrome,,HP:0001252,DECIPHER:1,IEA,,,,P,WOLF-HIRSCHHORN SYNDROME,HPO:skoehler[2013-05-29],,
4,DECIPHER,1,Wolf-Hirschhorn Syndrome,,HP:0001518,DECIPHER:1,IEA,,,,P,WOLF-HIRSCHHORN SYNDROME,HPO:skoehler[2013-05-29],,


In [12]:
len(df_hpo['disease-name'].unique())

11722

In [8]:
df_hpo['sub-ontology'].unique()


array(['P', 'I', 'M', 'C', nan], dtype=object)

## Extracting data from scientific article

In [2]:
import urllib.request
import nltk

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'


url = "https://www.cell.com/ajhg/fulltext/S0002-9297(17)30459-7"
#cookies = {'required_cookie': required_value}
headers={'User-Agent':user_agent,} 
cookieProcessor = urllib.request.HTTPCookieProcessor()
opener = urllib.request.build_opener(cookieProcessor)

request=urllib.request.Request(url,None, headers) #The assembled request
response = opener.open(request,timeout=100)
data = response.read() # The data u need

In [3]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(data)
raw = soup.get_text()


In [4]:
print('Main text spanning from:', raw.find('Main Text'), 'to', raw.find('Acknowledgments'))

Main text spanning from: 13350 to 57531


In [48]:
tokens = nltk.word_tokenize(raw)

In [5]:
raw.find('Table 1')

16489

In [22]:
import spacy
from spacy import displacy
import scispacy
from scispacy.abbreviation import AbbreviationDetector

nlp = spacy.load("en_core_sci_sm")

# Add the abbreviation pipe to the spacy pipeline.
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily.")

print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
    print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

Abbreviation 	 Definition
SBMA 	 (6, 7) Spinal and bulbar muscular atrophy
SBMA 	 (33, 34) Spinal and bulbar muscular atrophy
AR 	 (29, 30) androgen receptor


In [56]:
#nlp.remove_pipe('EntityLinker')
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x2208ccdcf60>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x220e512f948>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x2208f3f21c8>),
 ('AbbreviationDetector',
  <scispacy.abbreviation.AbbreviationDetector at 0x2208cd5a6a0>)]

In [62]:
linker.kb.cui_to_entity

{'C0000729': CUI: C0000729, Name: Abdominal cramps
 Definition: An involuntary muscular contraction involving a muscle of the abdomen or of a hollow organ within the abdomen.
 TUI(s): T184
 Aliases: (total: 0): 
 	 ,
 'C0000731': CUI: C0000731, Name: Distended abdomen
 Definition: Swelling of the abdomen resulting from excessive food intake, malnutrition, liver disease, primary abdominal tumors, and tumors metastatic to the abdominal cavity.
 TUI(s): T033
 Aliases: (total: 6): 
 	 Abdominal distention, Abdominal distension, Bloating, Abdominal bloating, Abdominal swelling, Belly bloating,
 'C0000734': CUI: C0000734, Name: Abdominal mass
 Definition: An abnormal growth in the abdomen.
 TUI(s): T033
 Aliases: (total: 0): 
 	 ,
 'C0000737': CUI: C0000737, Name: Abdominal pain
 Definition: Sensation of discomfort, distress, or agony in the abdominal region.
 TUI(s): T184
 Aliases: (total: 3): 
 	 Abdominal pain, Stomach pain, Pain in stomach,
 'C0000744': CUI: C0000744, Name: Abetalipoprot

In [59]:
# Influenza, commonly known as "the flu", is an infectious disease caused by an influenza virus. 
from scispacy.linking import EntityLinker


linker = EntityLinker(resolve_abbreviations=True, name="rxnorm")


#nlp.add_pipe(linker)

doc = nlp('Methotrexate Anvil, Symptoms can be mild to severe. The most common symptoms include: a high fever, runny nose, sore throat, muscle pains, headache, coughing, and feeling tired ... Acetylcysteine for reducing the oxygen transport and caffeine to stimulate ...')

# Let's look at a random entity!
#entity = doc.ents[1]

#print("Name: ", entity)

#for umls_ent in entity._.kb_ents:
#    print(linker.kb.cui_to_entity[umls_ent[0]])

KeyError: 'Methotrexate'

In [53]:
print(linker.kb.cui_to_entity['C0000729'])

CUI: C0000729, Name: Abdominal cramps
Definition: An involuntary muscular contraction involving a muscle of the abdomen or of a hollow organ within the abdomen.
TUI(s): T184
Aliases: (total: 0): 
	 


In [49]:
# document level
ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents]
print(ents)  # [('Ada Lovelace', 'PERSON', 'Q7259'), ('London', 'GPE', 'Q84')]

# token level
ent_ada_0 = [doc[0].text, doc[0].ent_type_, doc[0].ent_kb_id_]
ent_ada_1 = [doc[1].text, doc[1].ent_type_, doc[1].ent_kb_id_]
ent_london_5 = [doc[5].text, doc[5].ent_type_, doc[5].ent_kb_id_]
print(ent_ada_0)  # ['Ada', 'PERSON', 'Q7259']
print(ent_ada_1)  # ['Lovelace', 'PERSON', 'Q7259']
print(ent_london_5)  # ['London', 'GPE', 'Q84']

[('Influenza', 'ENTITY', ''), ('flu', 'ENTITY', ''), ('infectious disease', 'ENTITY', ''), ('influenza virus', 'ENTITY', ''), ('Methotrexate Anvil', 'ENTITY', ''), ('Symptoms', 'ENTITY', ''), ('severe', 'ENTITY', ''), ('symptoms', 'ENTITY', ''), ('high fever', 'ENTITY', ''), ('runny nose', 'ENTITY', ''), ('sore throat', 'ENTITY', ''), ('muscle pains', 'ENTITY', ''), ('headache', 'ENTITY', ''), ('coughing', 'ENTITY', ''), ('feeling', 'ENTITY', ''), ('tired', 'ENTITY', ''), ('Acetylcysteine', 'ENTITY', ''), ('reducing', 'ENTITY', ''), ('oxygen transport', 'ENTITY', ''), ('caffeine', 'ENTITY', ''), ('stimulate', 'ENTITY', '')]
['Influenza', 'ENTITY', '']
[',', '', '']
['"', '', '']


In [25]:
for umls_ent in entity._.hpo_ents:
        print(linker.umls.cui_to_entity[umls_ent[0]])

(Influenza,
 flu,
 infectious disease,
 influenza virus,
 Methotrexate Anvil,
 Symptoms,
 severe,
 symptoms,
 high fever,
 runny nose,
 sore throat,
 muscle pains,
 headache,
 coughing,
 feeling,
 tired,
 Acetylcysteine,
 reducing,
 oxygen transport,
 caffeine,
 stimulate)

In [10]:
doc.ents

(Influenza,
 flu,
 infectious disease,
 influenza virus,
 Methotrexate Anvil,
 Symptoms,
 severe,
 symptoms,
 high fever,
 runny nose,
 sore throat,
 muscle pains,
 headache,
 coughing,
 feeling,
 tired,
 Acetylcysteine,
 reducing,
 oxygen transport,
 caffeine,
 stimulate)

In [65]:
linker.kb.cui_to_entity

{'C0000729': CUI: C0000729, Name: Abdominal cramps
 Definition: An involuntary muscular contraction involving a muscle of the abdomen or of a hollow organ within the abdomen.
 TUI(s): T184
 Aliases: (total: 0): 
 	 ,
 'C0000731': CUI: C0000731, Name: Distended abdomen
 Definition: Swelling of the abdomen resulting from excessive food intake, malnutrition, liver disease, primary abdominal tumors, and tumors metastatic to the abdominal cavity.
 TUI(s): T033
 Aliases: (total: 6): 
 	 Abdominal distention, Abdominal distension, Bloating, Abdominal bloating, Abdominal swelling, Belly bloating,
 'C0000734': CUI: C0000734, Name: Abdominal mass
 Definition: An abnormal growth in the abdomen.
 TUI(s): T033
 Aliases: (total: 0): 
 	 ,
 'C0000737': CUI: C0000737, Name: Abdominal pain
 Definition: Sensation of discomfort, distress, or agony in the abdominal region.
 TUI(s): T184
 Aliases: (total: 3): 
 	 Abdominal pain, Stomach pain, Pain in stomach,
 'C0000744': CUI: C0000744, Name: Abetalipoprot

In [12]:
entity._.kb_ents

[]

In [11]:
entity = next(doc.sents)
for umls_ent in entity._.kb_ents:
    print(linker.kb.cui_to_entity[umls_ent[0]])

In [54]:
from spacy.pipeline import EntityLinker
entity_linker = EntityLinker(nlp.vocab)


AttributeError: 'EntityLinker' object has no attribute '_'

In [11]:
[(x.orth_,x.pos_, x.lemma_) for x in [y for y in nlp(str(sentences[20])) if not y.is_stop and y.pos_ != 'PUNCT']]

<generator at 0x295bc1a58b8>

In [14]:
#import scispacy
#import spacy

displacy.render(nlp(next(doc.sents)), jupyter=True, style='ent')

TypeError: Argument 'string' has incorrect type (expected str, got spacy.tokens.span.Span)

In [16]:
displacy.render(next(doc.sents), style='dep', jupyter=True)

In [22]:
list(doc.sents)[2]

The most common symptoms include: a high fever, runny nose, sore throat, muscle pains, headache, coughing, and feeling tired ...

In [23]:
displacy.render(list(doc.sents)[2], style='ent', jupyter=True)

In [None]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [None]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')