# Phenotype extraction

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle

# Extend the module search path with the local checkouts this notebook imports from
# (presumably `snorkel.*` lives under ../snorkel and `extractor.*` / `db.*` under ../src -- confirm).
# NOTE: the paths are relative, so they resolve against the kernel's working directory.
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# Directory of paper XML files; passed as `path` to the XML document parser below.
abstract_dir = '../data/db/papers'

# set up matplotlib
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

## Extract phenotype candidates from papers

### Load corpus

In [2]:
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLDocParser, GWASXMLDocParser

# Configure the document parser with XPath expressions that pick out the pieces
# of each paper: title, abstract paragraphs, first body paragraph and PubMed id.
xml_parser = GWASXMLDocParser(
    path=abstract_dir,
    # XPath selecting the document element(s) within each parsed file
    doc='./*',
    title='.//front//article-title//text()',
    abstract='.//abstract//p//text()',
    # text of the first <p> directly under <body>
    par1='.//body/p[1]//text()',
    # the <article-id> whose pub-id-type attribute is "pmid"
    id='.//article-id[@pub-id-type="pmid"]/text()',
    # NOTE(review): presumably retains the parsed XML tree on each document -- confirm in GWASXMLDocParser
    keep_xml_tree=True)

In [3]:
from snorkel.parser import HTMLParser
from snorkel.parser import SentenceParser
from snorkel.parser import CorpusParser

sent_parser = SentenceParser()
html_parser = HTMLParser(path='../data/db/papers/')

corpus_name = 'gwas-text-corpus.pkl'

try:
    with open(corpus_name,"r") as pkl:
        corpus = cPickle.load(pkl)
except:
    cp = CorpusParser(xml_parser, sent_parser, max_docs=100)
    %time corpus = cp.parse_corpus(name='GWAS Corpus')
    # pickling currently doesn't work...
#     with open(corpus_name,"w") as pkl:
#         corpus = cPickle.dump(corpus, pkl)

CPU times: user 2.38 s, sys: 201 ms, total: 2.58 s
Wall time: 24.7 s


### Extract candidates

In [13]:
from snorkel.candidates import Ngrams
from snorkel.matchers import DictionaryMatch
from snorkel.candidates import EntityExtractor

from extractor.util import change_name
from extractor.matcher import PhenotypeMatcher
from db.kb import KnowledgeBase

# Define a candidate space
ngrams = Ngrams(n_max=4)

# collect phenotype list
kb = KnowledgeBase()
phenotype_list = kb.get_phenotype_candidates()

# Define matchers
# phen_matcher = DictionaryMatch(d=phenotype_list, longest_match_only=True, ignore_case=True, stemmer='porter')
phen_matcher = PhenotypeMatcher(d=phenotype_list, ignore_case=True, mod_fn=change_name)

# Extractor
ngrams = Ngrams(n_max=7)
phen_extractor = EntityExtractor(ngrams, phen_matcher)

# collect candidates
%time phen_c = phen_extractor.extract(corpus.get_sentences(), name='all')
print len(phen_c), 'candidates extracted'

CPU times: user 23.9 s, sys: 189 ms, total: 24.1 s
Wall time: 24.1 s
16731 candidates extracted


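We would like to remove nested candidates, i.e. candidate spans that lie entirely inside a longer candidate span of the same context (we do this manually, for now).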

In [14]:
# Group the (char_start, char_end) offsets of all candidates by the string
# form of the context they were extracted from.
span_dict = {}
for cand in phen_c:
    offsets = (cand.char_start, cand.char_end)
    span_dict.setdefault(str(cand.context), []).append(offsets)

def nested(ivl1, ivl2):
    """Tell whether interval ``ivl1`` lies inside a different interval ``ivl2``.

    Both arguments are indexable ``(start, end)`` pairs. The result is True
    only when the two intervals are not equal, ``ivl1`` is well-formed
    (``start <= end``) and both of its endpoints fall within ``ivl2``
    (bounds inclusive). In every other case the result is False.
    """
    if ivl1 == ivl2:
        # an interval is never considered nested in itself
        return False
    return ivl2[0] <= ivl1[0] <= ivl1[1] <= ivl2[1]

new_phen_c = list()
for span in phen_c:
    span_ivl = span.char_start, span.char_end
    span_name = str(span.context)
    if all([not nested(span_ivl, other_ivl) for other_ivl in span_dict[span_name]]):
        new_phen_c.append(span)
        
print len(phen_c) - len(new_phen_c), 'candidates dropped, now we have', len(new_phen_c)
phen_c = new_phen_c

6932 candidates dropped, now we have 9799


### Create gold-truth set

In [20]:
from db.kb import KnowledgeBase
from nltk.stem import PorterStemmer
from extractor.util import change_name

kb = KnowledgeBase() # reload
gold_set_phens = frozenset \
([ 
    (doc.name, phen.ontology_ref) for doc in corpus.documents 
                                  for phen in kb.phen_by_pmid(doc.name, source='efo')
])

# map phenotype names to their id
# TODO: change this to be like in the version below! (i.e. using sets)
phen2id = \
{
    change_name(syn) : phen.ontology_ref for doc in corpus.documents
                                                  for phen in kb.phen_by_pmid(doc.name, source='efo')
                                                  for syn in [phen.name] + phen.synonyms.split('|')
}

id2phen = \
{
    phen.ontology_ref : phen for doc in corpus.documents
                             for phen in kb.phen_by_pmid(doc.name, source='efo')
}

print 'Found %d gold mentions, e.g.:' % len(gold_set_phens)
print list(gold_set_phens)[:5]
print len(phen2id), len(id2phen)

Found 322 gold mentions, e.g.:
[('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004764'), ('17903294', u'http://purl.obolibrary.org/obo/GO_0070527'), ('20038947', u'http://www.ebi.ac.uk/efo/EFO_0003761'), ('19197348', u'http://www.ebi.ac.uk/efo/EFO_0004748'), ('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004626')]
833 216


## Statistics

First, we need to understand whether the extraction or the classification approach will be better.

### Number of phenotypes per paper

#### Number of EFO phenotypes per paper

In [16]:
docid2efo = \
{
    doc.name : [ phen.name for phen in kb.phen_by_pmid(doc.name, source='efo') ] for doc in corpus.documents
}
print 'EFO phenotype numbers per paper:', sorted([len(v) for v in docid2efo.values()])

EFO phenotype numbers per paper: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7, 10, 11, 13, 17, 37, 66]


#### Number of GWAS catalog (i.e. aggregate) phenotypes per paper

In [17]:
docid2efo = \
{
    doc.name : [ phen.name for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog') ] for doc in corpus.documents
}
print 'GWAS catalog phenotype numbers per paper:', sorted([len(v) for d, v in docid2efo.items()])

GWAS catalog phenotype numbers per paper: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3]


### Candidate recall statistics

#### Over EFO phenotypes

In [18]:
from extractor.util import gold_phen_stats
# Compare the extracted candidates against the EFO gold set, using phen2id to
# map normalised phenotype names to ontology ids (the recorded output reports
# the number of gold annotations and candidates, plus candidate recall/precision).
gold_phen_stats(phen_c, gold_set_phens, phen2id)

# of gold annotations	= 322
# of candidates		= 9799
Candidate recall	= 0.419
Candidate precision	= 0.014


Why is the recall low?

In [22]:
from extractor.util import gold_phen_recall
kb = KnowledgeBase()
id2doc = {doc.name : doc for doc in corpus.documents}
gold_dict_phen = { doc_id : set() for doc_id, phen_id in gold_set_phens }
for doc_id, phen_id in gold_set_phens:
    gold_dict_phen[doc_id].add(phen_id)

phen_not_found = list(gold_phen_recall(phen_c, gold_set_phens, phen2id))
print len(phen_not_found)
for doc_id, phen_id in phen_not_found[:100]:
    if len(gold_dict_phen[doc_id]) > 3: continue
    print doc_id
    for phen_id2 in gold_dict_phen[doc_id]:
        print id2phen[phen_id2].name, phen_id2
#     print gold_dict_phen[doc_id]
    print id2phen[phen_id].name, id2phen[phen_id].ontology_ref
    print id2phen[phen_id].synonyms    
    print kb.title_by_pmid(doc_id)
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    print

187
19798445
blood metabolite measurement http://www.ebi.ac.uk/efo/EFO_0005664
sphingolipid measurement http://www.ebi.ac.uk/efo/EFO_0004622
blood metabolite measurement http://www.ebi.ac.uk/efo/EFO_0005664

Genetic determinants of circulating sphingolipid concentrations in European populations.
Genetic Determinants of Circulating Sphingolipid Concentrations in European Populations. Sphingolipids have essential roles as structural components of cell membranes and in cell signalling, and disruption of their metabolism causes several diseases, with diverse neurological, psychiatric, and metabolic consequences. Increasingly, variants within a few of the genes that encode enzymes involved in sphingolipid metabolism are being associated with complex disease phenotypes. Direct experimental evidence supports a role of specific sphingolipid species in several common complex chronic disease processes including atherosclerotic plaque formation, myocardial infarction (MI), cardiomyopathy, pancrea

#### Over aggregate phenotypes

We say that a mention of an aggregate (GWAS catalog, GWC) phenotype is correct if it matches the name of the GWC phenotype itself, or the name or a synonym of any of its equivalent EFO phenotypes.

In [38]:
from db.kb import KnowledgeBase
from nltk.stem import PorterStemmer
from extractor.util import change_name

kb = KnowledgeBase() # reload
gold_set_agg_phens = frozenset \
([ 
    (doc.name, phen.id) for doc in corpus.documents 
                        for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog')
])

# map phenotype names to their id
agg_phen2id = dict()
for doc in corpus.documents:
    for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog'):
        for eq_phen in phen.equivalents:
            for syn in [phen.name] + [eq_phen.name] + eq_phen.synonyms.split('|'):
                syn_name = change_name(syn)
                if syn_name not in agg_phen2id: agg_phen2id[syn_name] = set()
                agg_phen2id[syn_name].add(phen.id)
                
agg_id2phen = \
{
    phen.id : phen for doc in corpus.documents
                   for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog')
}

print 'Found %d gold mentions, e.g.:' % len(gold_set_phens)
print list(gold_set_phens)[:5]
print len(phen2id), len(id2phen)

from extractor.util import gold_phen_stats, gold_agg_phen_stats
gold_agg_phen_stats(phen_c, gold_set_agg_phens, agg_phen2id)

Found 122 gold mentions, e.g.:
[('17903295', 19116), ('17903300', 19103), ('19412175', 19249), ('18464913', 19158), ('20066028', 19672)]
869 101
Statistics over EFO phenotypes:
# of gold annotations	= 122
# of candidates		= 9799
Candidate recall	= 0.951
Candidate precision	= 0.012


Which aggregate phenotypes are still missed? (Recall is already high here, so only a handful remain.)

In [41]:
from extractor.util import gold_agg_phen_recall
kb = KnowledgeBase()
id2doc = {doc.name : doc for doc in corpus.documents}
gold_dict_phen = { doc_id : set() for doc_id, phen_id in gold_set_phens }
for doc_id, phen_id in gold_set_phens:
    gold_dict_phen[doc_id].add(phen_id)

phen_not_found = list(gold_agg_phen_recall(phen_c, gold_set_agg_phens, agg_phen2id))
print len(phen_not_found)
for doc_id, phen_id in phen_not_found[:100]:
    if len(gold_dict_phen[doc_id]) > 3: continue
    print doc_id
    for phen_id2 in gold_dict_phen[doc_id]:
        print id2phen[phen_id2].name, phen_id2
        for eq_phen in id2phen[phen_id2].equivalents:
            print eq_phen.name, eq_phen.ontology_ref
#     print gold_dict_phen[doc_id]
    print id2phen[phen_id].name
#     print id2phen[phen_id].synonyms    
#     print kb.title_by_pmid(doc_id)
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    print

6
19430483
systolic blood pressure 19252
systolic blood pressure http://www.ebi.ac.uk/efo/EFO_0006335
diastolic blood pressure 19253
diastolic blood pressure http://www.ebi.ac.uk/efo/EFO_0006336
systolic blood pressure
Eight blood pressure loci identified by genome-wide association study of 34,433 people of European ancestry. Elevated blood pressure is a common, heritable cause of cardiovascular disease worldwide. To date, identification of common genetic variants influencing blood pressure has proven challenging. We tested 2.5m genotyped and imputed SNPs for association with systolic and diastolic blood pressure in 34,433 subjects of European ancestry from the Global BPgen consortium and followed up findings with direct genotyping (N≤71,225 European ancestry, N=12,889 Indian Asian ancestry) and  in silico  comparison (CHARGE consortium, N=29,136). We identified association between systolic or diastolic blood pressure and common variants in 8 regions near the  CYP17A1  ( P =7×10 −24 ),

## Extraction performance

## Classification performance