# Phenotype extraction

In [42]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle

# Make the local snorkel checkout and the project sources under ../src
# (and ../src/crawler) importable from this notebook.
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# Directory handed to the XML document parser below as the location of the papers.
abstract_dir = '../data/db/papers'

# Render matplotlib figures inline and use a wide default figure size.
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Extract phenotype candidates from papers

### Load corpus

In [18]:
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLDocParser, GWASXMLDocParser

# Selector expressions for the document fields, passed to the parser as keyword arguments.
xpath_selectors = dict(
    doc='./*',
    title='.//front//article-title//text()',
    abstract='.//abstract//p//text()',
    par1='.//body/p[1]//text()',
    id='.//article-id[@pub-id-type="pmid"]/text()',
)

# Parser over the papers stored in abstract_dir.
xml_parser = GWASXMLDocParser(path=abstract_dir, keep_xml_tree=True, **xpath_selectors)

In [20]:
from snorkel.parser import HTMLParser
from snorkel.parser import SentenceParser
from snorkel.parser import CorpusParser

sent_parser = SentenceParser()
html_parser = HTMLParser(path='../data/db/papers/')

corpus_name = 'gwas-text-corpus.pkl'

try:
    with open(corpus_name,"r") as pkl:
        corpus = cPickle.load(pkl)
except:
    cp = CorpusParser(xml_parser, sent_parser, max_docs=100)
    %time corpus = cp.parse_corpus(name='GWAS Corpus')
    # pickling currently doesn't work...
#     with open(corpus_name,"w") as pkl:
#         corpus = cPickle.dump(corpus, pkl)

CPU times: user 2.54 s, sys: 155 ms, total: 2.7 s
Wall time: 14.9 s


### Extract candidates

In [35]:
from snorkel.candidates import Ngrams
from snorkel.matchers import DictionaryMatch
from snorkel.candidates import EntityExtractor

from extractor.util import change_name
from db.kb import KnowledgeBase

# Define a candidate space
ngrams = Ngrams(n_max=4)

# collect phenotype list
kb = KnowledgeBase()
phenotype_list = kb.get_phenotype_candidates(mod_fn=change_name)

# Define matchers
phen_matcher = DictionaryMatch(d=phenotype_list, longest_match_only=True, ignore_case=True, stemmer='porter')

# Extractor
ngrams = Ngrams(n_max=7)
phen_extractor = EntityExtractor(ngrams, phen_matcher)

# collect candidates
%time phen_c = phen_extractor.extract(corpus.get_sentences(), name='all')
print len(phen_c), 'candidates extracted'

CPU times: user 11 s, sys: 95 ms, total: 11.1 s
Wall time: 11.1 s
7359 candidates extracted


We would like to remove nested candidates, i.e. candidate spans that lie entirely inside another candidate span of the same context (done manually, for now).

In [36]:
# Group the (char_start, char_end) interval of every candidate by its context.
span_dict = {}
for cand in phen_c:
    context_key = str(cand.context)
    interval = (cand.char_start, cand.char_end)
    span_dict.setdefault(context_key, list()).append(interval)

def nested(ivl1, ivl2):
    """Return True if interval ivl1 is nested inside a different interval ivl2.

    Both arguments are indexable as (start, end). Identical intervals are not
    considered nested, and ivl1 must satisfy start <= end.
    """
    if ivl1 == ivl2:
        return False
    inner_start, inner_end = ivl1[0], ivl1[1]
    outer_start, outer_end = ivl2[0], ivl2[1]
    return outer_start <= inner_start <= inner_end <= outer_end

new_phen_c = list()
for span in phen_c:
    span_ivl = span.char_start, span.char_end
    span_name = str(span.context)
    if all([not nested(span_ivl, other_ivl) for other_ivl in span_dict[span_name]]):
        new_phen_c.append(span)
        
print len(phen_c) - len(new_phen_c), 'candidates dropped, now we have', len(new_phen_c)
phen_c = new_phen_c

0 candidates dropped, now we have 7359


### Create gold-truth set

In [37]:
from db.kb import KnowledgeBase
from nltk.stem import PorterStemmer
from extractor.util import change_name

kb = KnowledgeBase() # reload
gold_set_phens = frozenset \
([ 
    (doc.name, phen.ontology_ref) for doc in corpus.documents 
                                  for phen in kb.phen_by_pmid(doc.name)
])

# map phenotype names to their id
phen2id = \
{
    change_name(syn) : phen.ontology_ref for doc in corpus.documents
                                                  for phen in kb.phen_by_pmid(doc.name)
                                                  for syn in [phen.name] + phen.synonyms.split('|')
}

id2phen = \
{
    phen.ontology_ref : phen for doc in corpus.documents
                             for phen in kb.phen_by_pmid(doc.name)
}

print 'Found %d gold mentions, e.g.:' % len(gold_set_phens)
print list(gold_set_phens)[:5]
print len(phen2id), len(id2phen)

Found 322 gold mentions, e.g.:
[('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004764'), ('17903294', u'http://purl.obolibrary.org/obo/GO_0070527'), ('20038947', u'http://www.ebi.ac.uk/efo/EFO_0003761'), ('19197348', u'http://www.ebi.ac.uk/efo/EFO_0004748'), ('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004626')]
833 216


## Statistics

First, we need to understand whether the extraction or the classification approach will be better.

### Number of phenotypes per paper

#### Number of EFO phenotypes per paper

In [53]:
docid2efo = \
{
    doc.name : { phen.name for phen in kb.phen_by_pmid(doc.name, source='efo') } for doc in corpus.documents
}
print 'EFO phenotype numbers per paper:', sorted([len(v) for v in docid2efo.values()])

EFO phenotype numbers per paper: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7, 10, 11, 13, 17, 37, 66]


#### Number of GWAS catalog (i.e. aggregate) phenotypes per paper

In [54]:
docid2efo = \
{
    doc.name : { phen.name for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog') } for doc in corpus.documents
}
print 'GWAS catalog phenotype numbers per paper:', sorted([(len(v), d) for d, v in docid2efo.items()])

GWAS catalog phenotype numbers per paper: [(1, '17447842'), (1, '17658951'), (1, '17684544'), (1, '17903293'), (1, '17903308'), (1, '17997608'), (1, '18159244'), (1, '18262040'), (1, '18282107'), (1, '18369459'), (1, '18455228'), (1, '18464913'), (1, '18604267'), (1, '18776929'), (1, '18823527'), (1, '18840781'), (1, '18846228'), (1, '18941528'), (1, '19043545'), (1, '19056611'), (1, '19081515'), (1, '19096518'), (1, '19116933'), (1, '19122664'), (1, '19132087'), (1, '19169254'), (1, '19197348'), (1, '19219042'), (1, '19247474'), (1, '19300482'), (1, '19300499'), (1, '19300500'), (1, '19304780'), (1, '19305408'), (1, '19343178'), (1, '19359265'), (1, '19401414'), (1, '19412175'), (1, '19412176'), (1, '19421330'), (1, '19430480'), (1, '19448621'), (1, '19454037'), (1, '19503088'), (1, '19503597'), (1, '19557161'), (1, '19557197'), (1, '19570815'), (1, '19571809'), (1, '19578366'), (1, '19587794'), (1, '19597492'), (1, '19609347'), (1, '19651812'), (1, '19668339'), (1, '19714205'), (1, '

### Candidate recall statistics

#### Over EFO phenotypes

In [55]:
from extractor.util import gold_phen_stats
# Compare the extracted candidates against the gold (document, ontology ref)
# pairs; phen2id maps phenotype names to ontology references.
gold_phen_stats(phen_c, gold_set_phens, phen2id)

# of gold annotations	= 322
# of candidates		= 7359
Candidate recall	= 0.276
Candidate precision	= 0.012


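Why is the recall low? To find out, we list gold phenotypes that were not recovered by any candidate, restricted to papers with at most three gold phenotypes, together with the paper's title and text.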

In [59]:
from extractor.util import gold_phen_recall
kb = KnowledgeBase()
id2doc = {doc.name : doc for doc in corpus.documents}

phen_not_found = list(gold_phen_recall(phen_c, gold_set_phens, phen2id))
print len(phen_not_found)
for doc_id, phen_id in phen_not_found[:100]:
    if len(gold_dict_phen[doc_id]) > 3: continue
    print doc_id
    for phen_id2 in gold_dict_phen[doc_id]:
        print id2phen[phen_id2].name, phen_id2
#     print gold_dict_phen[doc_id]
    print id2phen[phen_id].name, id2phen[phen_id].ontology_ref
    print id2phen[phen_id].synonyms    
    print kb.title_by_pmid(doc_id)
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    print

233
19798445
blood metabolite measurement http://www.ebi.ac.uk/efo/EFO_0005664
sphingolipid measurement http://www.ebi.ac.uk/efo/EFO_0004622
blood metabolite measurement http://www.ebi.ac.uk/efo/EFO_0005664

Genetic determinants of circulating sphingolipid concentrations in European populations.
Genetic Determinants of Circulating Sphingolipid Concentrations in European Populations Sphingolipids have essential roles as structural components of cell membranes and in cell signalling, and disruption of their metabolism causes several diseases, with diverse neurological, psychiatric, and metabolic consequences. Increasingly, variants within a few of the genes that encode enzymes involved in sphingolipid metabolism are being associated with complex disease phenotypes. Direct experimental evidence supports a role of specific sphingolipid species in several common complex chronic disease processes including atherosclerotic plaque formation, myocardial infarction (MI), cardiomyopathy, pancreat

#### Over aggregate phenotypes

## Extraction performance

## Classification performance