# Phenotype extraction

In [6]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle

# Make the snorkel checkout and the project sources (gwasdb) importable.
# The paths are relative, so they resolve against the notebook's working directory.
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# Directory holding the paper XML files; passed as `path` to the abstract parser below.
abstract_dir = '../data/db/papers'

# Render matplotlib figures inline with a wide default figure size.
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Extract phenotype candidates from papers

### Load corpus

In [8]:
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLDocParser, GWASXMLAbstractParser

# Parser for the paper XML files under `abstract_dir`.
# The string arguments look like XPath selectors for the parts of each article
# that are extracted (title, abstract paragraphs, first body paragraph, PubMed id).
# NOTE(review): exact semantics of `doc` and `keep_xml_tree` live in
# extractor.parser.GWASXMLAbstractParser - confirm there.
xml_parser = GWASXMLAbstractParser(
    path=abstract_dir,
    doc='./*',
    title='.//front//article-title//text()',
    abstract='.//abstract//p//text()',
    par1='.//body/p[1]//text()',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

In [9]:
from snorkel.parser import HTMLParser
from snorkel.parser import SentenceParser
from snorkel.parser import CorpusParser

sent_parser = SentenceParser()
html_parser = HTMLParser(path='../data/db/papers/')

corpus_name = 'gwas-text-corpus.pkl'

try:
    with open(corpus_name,"r") as pkl:
        corpus = cPickle.load(pkl)
except:
    cp = CorpusParser(xml_parser, sent_parser, max_docs=100)
    %time corpus = cp.parse_corpus(name='GWAS Corpus')
    # pickling currently doesn't work...
#     with open(corpus_name,"w") as pkl:
#         corpus = cPickle.dump(corpus, pkl)

CPU times: user 2.29 s, sys: 282 ms, total: 2.58 s
Wall time: 14.2 s


### Extract candidates

In [657]:
from snorkel.candidates import Ngrams
from snorkel.matchers import DictionaryMatch, Union, RegexMatchSpan
from snorkel.candidates import EntityExtractor
from snorkel.utils import slice_into_ngrams

from extractor.util import change_name
from extractor.matcher import PhenotypeMatcher
from db.kb import KnowledgeBase

def make_ngrams(L, n_max=10, n_min=3, delim=' '):
    """Yield each phrase of L, followed by the n-grams sliced from its tokens.

    Every phrase is emitted unchanged first. It is then stripped, split on
    `delim`, and the tokens are handed to `slice_into_ngrams` together with
    `n_max`, `n_min` and `delim`; whatever that helper produces is yielded next.
    Duplicates are not removed.
    """
    for phrase in L:
        yield phrase
        words = phrase.strip().split(delim)
        sub_ngrams = slice_into_ngrams(words, n_max=n_max, n_min=n_min, delim=delim)
        for sub_ngram in sub_ngrams:
            yield sub_ngram

# Define a candidate space: n-grams built with n_max=7
ngrams = Ngrams(n_max=7)

# Collect the phenotype dictionaries from the knowledge base.
kb = KnowledgeBase()
# EFO phenotypes, expanded with the n-grams of each name
efo_phenotype_list0 = kb.get_phenotype_candidates(source='efo', peek=True) # TODO: remove peeking
efo_phenotype_list = list(make_ngrams(efo_phenotype_list0))
# MeSH diseases, expanded the same way
mesh_phenotype_list0 = kb.get_phenotype_candidates(source='mesh')
mesh_phenotype_list = list(make_ngrams(mesh_phenotype_list0))
# MeSH chemicals (used as-is, no n-gram expansion)
chem_phenotype_list = kb.get_phenotype_candidates(source='chemical')
# Regex matches: one word (letters, hyphen or en dash) followed by a generic
# phenotype noun, optionally plural.
# NOTE(review): 'response' appears twice in the alternation; the second entry is redundant.
rgx = u'[A-Za-z\u2013-]+ (disease|trait|phenotype|outcome|response|quantitative trait|measurement|response)s?'

# Define matchers: one per dictionary plus the regex, combined with Union.
efo_phen_matcher = PhenotypeMatcher(d=efo_phenotype_list, ignore_case=True, mod_fn=change_name)
mesh_phen_matcher = PhenotypeMatcher(d=mesh_phenotype_list, ignore_case=True, mod_fn=change_name)
chem_phen_matcher = DictionaryMatch(d=chem_phenotype_list, longest_match_only=True, ignore_case=True)
regex_phen_matcher = RegexMatchSpan(rgx=rgx)
phen_matcher = Union(efo_phen_matcher, mesh_phen_matcher, chem_phen_matcher, regex_phen_matcher)
# phen_matcher = PhenotypeMatcher(d=phenotype_list, ignore_case=True, mod_fn=change_name)

# Extractor: applies the combined matcher to the n-gram candidate space
phen_extractor = EntityExtractor(ngrams, phen_matcher)

# Collect candidates from every sentence of the corpus (timed; took about a minute in the recorded run)
%time phen_c = phen_extractor.extract(corpus.get_sentences(), name='all')
print len(phen_c), 'candidates extracted'

CPU times: user 54.3 s, sys: 360 ms, total: 54.7 s
Wall time: 54.6 s
5650 candidates extracted


In [658]:
# phen_c0 = phen_c
# phenotype_list0 = phenotype_list
# Scratch check: list every dictionary phenotype that mentions cholesterol.
# NOTE(review): `phenotype_list` is only assigned in later cells of this notebook,
# so this cell raises NameError on a fresh kernel run from top to bottom.
print [p for p in phenotype_list if 'cholesterol' in p]

[u'cholesterol ester storage disease', u'familial hypercholesterolemia', u'low density lipoprotein cholesterol measurement', u'nhdl cholesterol', u'cholesterol ester', u'total cholesterol measurement', u'lecithin-cholesterol acyltransferase deficiency', u'hypercholesterolemia', u'cholesterol esters', u'xy disorder of sex development due to cholesterol synthesis defect', u'epicholesterol', u'7-dehydrocholesterol reductase deficiency', u'hdl cholesterol', u'cholesterol', u'cholesterol homeostasis', u'reduce cholesterol levels', u'alpha lipoprotein cholesterol', u'high density lipoprotein cholesterol measurement', u'homozygous familial hypercholesterolemia', u'cholesterol embolism', u'non-high density lipoprotein cholesterol measurement', u'nhdl cholesterol measurement', u'hypocholesterolemia', u'hypercholesterolemia due to cholesterol 7alpha-hydroxylase deficiency', u'cholesterol-ester transfer protein deficiency', u'non-high density lipoprotein cholesterol', u'hdl(2) cholesterol', u'ldl

In [659]:
# NOTE(review): `tmp` is not assigned anywhere in the visible notebook; this looks
# like a leftover scratch cell that only ran thanks to earlier kernel state.
print len(tmp)

0


In [660]:
# Scratch check: expand the raw phenotype names into n-grams and compare the
# first entries of the expanded and the raw list.
# NOTE(review): `phenotype_list0` is only assigned in a commented-out line of an
# earlier cell and `stemmer` is created in a later cell, so this cell depends on
# out-of-order execution.
len(phenotype_list0)  # value is discarded: not the last expression of the cell
phenotype_list = list(make_ngrams(phenotype_list0))
len(phenotype_list)  # value is discarded as well
print phenotype_list[:10]
print phenotype_list0[:10]
# Shows how the stemmer treats a possessive (recorded output: "parkinson'")
print stemmer.stem("parkinson's")

[u'', u'ccl5 measurement', u'cholera infantum', u'chemokine ccl2 level', u'chemokine ccl2 level', u'cigarettes per day measurement', u'cigarettes per day', u'cigarettes per day measurement', u'per day measurement', u'inflammatory marker measurement']
[u'', u'ccl5 measurement', u'cholera infantum', u'chemokine ccl2 level', u'cigarettes per day measurement', u'inflammatory marker measurement', u'kidney diseases', u'white spot', u'response to sertraline', u'acylcarnitine measurement']
parkinson'


In [661]:
# NOTE(review): verbatim redefinition of the make_ngrams defined in the
# candidate-extraction cell above; it silently shadows that definition and can be
# deleted once the two are confirmed identical.
def make_ngrams(L, n_max=10, n_min=3, delim=' '):
    """Yield each string in L followed by the n-grams that slice_into_ngrams builds from its tokens."""
    for l in L:
        # the full phrase always comes first
        yield l
        tokens = l.strip().split(delim)
        for ngram in slice_into_ngrams(tokens, n_max=n_max, n_min=n_min, delim=delim):
            yield ngram
            
            
# Scratch check: n-gram expansion of every phenotype that mentions diabetes.
# `phenotype_list` was already expanded with make_ngrams in the previous cell, so
# this expands it a second time - which is why the recorded output repeats entries.
L=[p for p in phenotype_list if 'diabetes' in p]
print L[0]
print [n for n in make_ngrams(L)]

type i diabetes mellitus
[u'type i diabetes mellitus', u'type i diabetes', u'type i diabetes mellitus', u'i diabetes mellitus', u'type i diabetes', u'type i diabetes', u'type i diabetes mellitus', u'type i diabetes', u'type i diabetes mellitus', u'i diabetes mellitus', u'i diabetes mellitus', u'i diabetes mellitus', u'diabetes mellitus type 02', u'diabetes mellitus type', u'diabetes mellitus type 02', u'mellitus type 02', u'diabetes mellitus type', u'diabetes mellitus type', u'diabetes mellitus type 02', u'diabetes mellitus type', u'diabetes mellitus type 02', u'mellitus type 02', u'dm - diabetes mellitus', u'dm - diabetes', u'dm - diabetes mellitus', u'- diabetes mellitus', u'dm - diabetes', u'dm - diabetes', u'dm - diabetes mellitus', u'dm - diabetes', u'dm - diabetes mellitus', u'- diabetes mellitus', u'- diabetes mellitus', u'- diabetes mellitus', u'type ii diabetes mellitus', u'type ii diabetes', u'type ii diabetes mellitus', u'ii diabetes mellitus', u'type ii diabetes', u'type ii

We would like to remove nested candidates (manually, for now).

In [662]:
# Group the character interval of every candidate by the string form of its
# context, so nesting is only checked among spans that share a context.
span_dict = {}
for span in phen_c:
    span_dict.setdefault(str(span.context), []).append((span.char_start, span.char_end))

def nested(ivl1, ivl2):
    """Return True when ivl1 lies inside ivl2 and the two intervals differ.

    Both arguments are (start, end) pairs. Identical intervals are not nested,
    and an ivl1 whose start exceeds its end is never nested.
    """
    if ivl1 == ivl2:
        return False
    return ivl2[0] <= ivl1[0] <= ivl1[1] <= ivl2[1]

new_phen_c = list()
for span in phen_c:
    span_ivl = span.char_start, span.char_end
    span_name = str(span.context)
    if all([not nested(span_ivl, other_ivl) for other_ivl in span_dict[span_name]]):
        new_phen_c.append(span)
        
print len(phen_c) - len(new_phen_c), 'candidates dropped, now we have', len(new_phen_c)
phen_c = new_phen_c

2562 candidates dropped, now we have 3088


### Create gold-truth set

In [663]:
from db.kb import KnowledgeBase
from nltk.stem import PorterStemmer
from extractor.util import change_name

kb = KnowledgeBase() # reload
gold_set_phens = frozenset \
([ 
    (doc.name, phen.ontology_ref) for doc in corpus.documents 
                                  for phen in kb.phen_by_pmid(doc.name, source='efo')
])

# map phenotype names to their id
phen2id = \
{
    change_name(syn) : phen.ontology_ref for doc in corpus.documents
                                                  for phen in kb.phen_by_pmid(doc.name, source='efo')
                                                  for syn in [phen.name] + phen.synonyms.split('|')
}

# this is the more correct version (code below should be changed to use it)
# we collect a set of EFOs matching a string; we use this only at the very end for now
phen2idset = dict()
for doc in corpus.documents:
    for phen in kb.phen_by_pmid(doc.name, source='efo'):
        synonyms = [phen.name] + phen.synonyms.split('|')
        ngrams = make_ngrams(synonyms)
        for ngram in ngrams:
            syn_name = change_name(ngram)
            if syn_name not in phen2idset: phen2idset[syn_name] = set()
            phen2idset[syn_name].add(phen.id)

id2phen = \
{
    phen.ontology_ref : phen for doc in corpus.documents
                             for phen in kb.phen_by_pmid(doc.name, source='efo')
}

print 'Found %d gold mentions, e.g.:' % len(gold_set_phens)
print list(gold_set_phens)[:5]
print len(phen2id), len(id2phen)

Found 322 gold mentions, e.g.:
[('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004764'), ('17903294', u'http://purl.obolibrary.org/obo/GO_0070527'), ('20038947', u'http://www.ebi.ac.uk/efo/EFO_0003761'), ('19197348', u'http://www.ebi.ac.uk/efo/EFO_0004748'), ('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004626')]
441 216


## Statistics

First, we need to understand whether the extraction or the classification approach will be better.

### Number of phenotypes per paper

#### Number of EFO phenotypes per paper

In [664]:
docid2efo = \
{
    doc.name : [ phen.name for phen in kb.phen_by_pmid(doc.name, source='efo') ] for doc in corpus.documents
}
print 'EFO phenotype numbers per paper:', sorted([len(v) for v in docid2efo.values()])

EFO phenotype numbers per paper: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7, 10, 11, 13, 17, 37, 66]


#### Number of GWAS catalog (i.e. aggregate) phenotypes per paper

In [665]:
docid2efo = \
{
    doc.name : [ phen.name for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog') ] for doc in corpus.documents
}
print 'GWAS catalog phenotype numbers per paper:', sorted([len(v) for d, v in docid2efo.items()])

GWAS catalog phenotype numbers per paper: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 9, 9, 10, 11, 11, 11, 12, 12, 13, 13, 14, 14, 15, 16, 19, 20, 21, 21, 21, 22, 22, 24, 29, 33, 34, 37, 38, 40, 42, 49, 73]


### Candidate recall statistics

#### Over EFO phenotypes

In [666]:
from extractor.util import gold_phen_stats
# Print candidate recall / precision of the extracted spans against the EFO gold
# annotations, using phen2id to map span names to EFO ids.
gold_phen_stats(phen_c, gold_set_phens, phen2id)

Statistics over EFO phenotypes:
# of gold annotations	= 322
# of candidates		= 3088
Candidate recall	= 0.370
Candidate precision	= 0.039


Why is the recall low?

In [667]:
from extractor.util import gold_phen_recall
kb = KnowledgeBase()
# pmid -> document object, used to print the text of a paper
id2doc = {doc.name : doc for doc in corpus.documents}
# pmid -> set of gold EFO ids annotated for that paper
gold_dict_phen = { doc_id : set() for doc_id, phen_id in gold_set_phens }
for doc_id, phen_id in gold_set_phens:
    gold_dict_phen[doc_id].add(phen_id)

# presumably the gold (pmid, EFO id) pairs that no candidate matched - verify in extractor.util
phen_not_found = list(gold_phen_recall(phen_c, gold_set_phens, phen2id))
print len(phen_not_found)
for doc_id, phen_id in phen_not_found[:100]:
    # only inspect papers with at most three gold phenotypes to keep the output readable
    if len(gold_dict_phen[doc_id]) > 3: continue
    print doc_id
    # all gold phenotypes of the paper
    for phen_id2 in gold_dict_phen[doc_id]:
        print id2phen[phen_id2].name, phen_id2
#     print gold_dict_phen[doc_id]
    # the phenotype that was missed, with its synonyms
    print id2phen[phen_id].name, id2phen[phen_id].ontology_ref
    print id2phen[phen_id].synonyms    
    # title and full parsed text of the paper
    print kb.title_by_pmid(doc_id)
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    print

203
19798445
blood metabolite measurement http://www.ebi.ac.uk/efo/EFO_0005664
sphingolipid measurement http://www.ebi.ac.uk/efo/EFO_0004622
blood metabolite measurement http://www.ebi.ac.uk/efo/EFO_0005664

Genetic determinants of circulating sphingolipid concentrations in European populations.
Genetic Determinants of Circulating Sphingolipid Concentrations in European Populations. Sphingolipids have essential roles as structural components of cell membranes and in cell signalling, and disruption of their metabolism causes several diseases, with diverse neurological, psychiatric, and metabolic consequences. Increasingly, variants within a few of the genes that encode enzymes involved in sphingolipid metabolism are being associated with complex disease phenotypes. Direct experimental evidence supports a role of specific sphingolipid species in several common complex chronic disease processes including atherosclerotic plaque formation, myocardial infarction (MI), cardiomyopathy, pancrea

#### Over aggregate phenotypes

We say that a mention of an aggregate phenotype is correct if it corresponds to the name of the GWAS catalog (GWC) phenotype, or to the name or a synonym of any equivalent EFO phenotype.

In [668]:
from db.kb import KnowledgeBase
from nltk.stem import PorterStemmer
from extractor.util import change_name

kb = KnowledgeBase() # reload
gold_set_agg_phens = frozenset \
([ 
    (doc.name, phen.id) for doc in corpus.documents 
                        for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog')
])

# map phenotype names to their id (EFO syn -> GWC id)
agg_phen2id = dict()
for doc in corpus.documents:
    for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog'):
        for eq_phen in phen.equivalents:
            for syn in [phen.name] + [eq_phen.name] + eq_phen.synonyms.split('|'):
                syn_name = change_name(syn)
                if eq_phen.name == 'personality': print syn_name
                if syn_name not in agg_phen2id: agg_phen2id[syn_name] = set()
                agg_phen2id[syn_name].add(phen.id)

# map ids to phenotypes (GWC id -> GWC phen obj)                
agg_id2phen = \
{
    phen.id : phen for doc in corpus.documents
                   for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog')
}

print 'Found %d gold mentions, e.g.:' % len(gold_set_phens)
print list(gold_set_phens)[:5]
print len(phen2id), len(id2phen)

from extractor.util import gold_phen_stats, gold_agg_phen_stats
gold_agg_phen_stats(phen_c, gold_set_agg_phens, agg_phen2id)

Found 322 gold mentions, e.g.:
[('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004764'), ('17903294', u'http://purl.obolibrary.org/obo/GO_0070527'), ('20038947', u'http://www.ebi.ac.uk/efo/EFO_0003761'), ('19197348', u'http://www.ebi.ac.uk/efo/EFO_0004748'), ('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004626')]
441 216
Statistics over EFO phenotypes:
# of gold annotations	= 925
# of candidates		= 3088
# of correct candidates	= 802
Candidate recall	= 0.867
Candidate precision	= 0.260


Why is the recall low?

In [669]:
from extractor.util import gold_agg_phen_recall
kb = KnowledgeBase()
# pmid -> document object, used to print the text of a paper
id2doc = {doc.name : doc for doc in corpus.documents}
# pmid -> set of gold GWAS catalog phenotype ids annotated for that paper
gold_agg_dict_phen = { doc_id : set() for doc_id, phen_id in gold_set_agg_phens }
for doc_id, phen_id in gold_set_agg_phens:
    gold_agg_dict_phen[doc_id].add(phen_id)

# this contains (doc_id, gwc_id) pairs from gold_set_agg_phens that haven't been found
phen_not_found = list(gold_agg_phen_recall(phen_c, gold_set_agg_phens, agg_phen2id))
print len(phen_not_found)
# inspect at most the first 100 misses
for doc_id, phen_id in phen_not_found[:100]:
    if len(gold_agg_dict_phen[doc_id]) > 3: continue # skip if >3 gwc_phen_ids in doc
    print doc_id
    for phen_id2 in gold_agg_dict_phen[doc_id]: # iterate over gwc_id's in doc
        print agg_id2phen[phen_id2].name, phen_id2 # print its name and id
        for eq_phen in agg_id2phen[phen_id2].equivalents: # loop over equivalent phens
            print '\t', eq_phen.name, eq_phen.ontology_ref, eq_phen.synonyms # print name, ref, synonyms
    print agg_id2phen[phen_id].name # print the one that we haven't found
    # full parsed text of the paper, for manual comparison
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    print

123
19578366
glioma (high-grade) 20856
	central nervous system cancer http://www.ebi.ac.uk/efo/EFO_0000326 cns cancer|malignant central nervous system neoplasm|malignant tumor of cns|cancer of cns|malignant tumor of central nervous system|malignant cns neoplasms|malignant neoplasm of cns|malignant tumor of the central nervous system|malignant cns tumor|malignant central nervous system tumor|malignant neoplasm of the cns|malignant neoplasm of the central nervous system|malignant cns neoplasm|central nervous system neoplasms
glioma (high-grade) 20857
	central nervous system cancer http://www.ebi.ac.uk/efo/EFO_0000326 cns cancer|malignant central nervous system neoplasm|malignant tumor of cns|cancer of cns|malignant tumor of central nervous system|malignant cns neoplasms|malignant neoplasm of cns|malignant tumor of the central nervous system|malignant cns tumor|malignant central nervous system tumor|malignant neoplasm of the cns|malignant neoplasm of the central nervous system|malignant c

In [670]:
# FOR DEBUGGING WHY SPANS AREN'T MATCHED
# from extractor.util import change_name

# doc_id, phen_id = phen_not_found[3]
# print agg_id2phen[phen_id].name 
# print id2doc[doc_id].sentences[0]
# for span in ngrams.apply(id2doc[doc_id].sentences[0]):
#     print span.get_span()
#     if phen_matcher._f(span):    
#         phen_name = span.get_span()
#         print phen_name
#         print '...', change_name(phen_name) in phenotype_list, phen_name in phen_matcher.d, change_name(phen_name) in phen_matcher.d
#         phen_id = phen2id.get(change_name(phen_name), None)
#         print phen_id
#         if not phen_id or phen_id not in gold_dict_phen[span.context.document.name]:
#             print span.context.document.name, phen_id
#             print gold_dict_phen[span.context.document.name]
        
#         print

In [671]:
# query_word = 'personality'

# # phenotypes = db_session.query(Phenotype).filter(Phenotype.source=='efo').all()
# phenotypes == kb.get_phenotype_candidates_cheating()
# phenotype_names = set()
# for phenotype in phenotypes:
#     if phenotype.name:
#         phenotype_names.add((phenotype.name))
#         synonyms = [(syn) for syn in phenotype.synonyms.split('|')]
#         if query_word in synonyms or query_word == phenotype.name:
#             print phenotype.name, phenotype.ontology_ref
#         phenotype_names.update(synonyms)

In [672]:
# print len(phenotype_names)
# [(word, change_name(word)) for word in phenotype_list if change_name(word) == change_name('personalized')]

## Extraction performance

First, extract some features for each candidate mention.

In [673]:
import cPickle        
from snorkel.features import NgramFeaturizer

pkl_f = 'phenotype_feats.pkl'
try:
    with open(pkl_f, 'rb') as f:
        featurizer = cPickle.load(f)
except:
    featurizer = NgramFeaturizer()
    featurizer.fit_transform(phen_c)

print 'Example features:', [f for f in featurizer.get_features_by_candidate(phen_c[0])[:10]]

Building feature index...
Extracting features...
0/53842
5000/53842
10000/53842
15000/53842
20000/53842
25000/53842
30000/53842
35000/53842
40000/53842
45000/53842
50000/53842
Example features: [u'DDLIB_WORD_SEQ_[Crohn Disease]', u'DDLIB_LEMMA_SEQ_[crohn disease]', u'DDLIB_POS_SEQ_[NN NN]', u'DDLIB_DEP_SEQ_[compound compound]', u'DDLIB_W_LEFT_1_[novel]', u'DDLIB_W_LEFT_POS_1_[JJ]', u'DDLIB_W_LEFT_2_[. novel]', u'DDLIB_W_LEFT_POS_2_[. JJ]', u'DDLIB_W_LEFT_3_[ptger4 . novel]', u'DDLIB_W_LEFT_POS_3_[NN . JJ]']


In [674]:
# classify candidates as correct or not

# doc_id -> set of correct gwc_ids
gold_agg_dict_phen = {}
for doc_id, phen_id in gold_set_agg_phens:
    gold_agg_dict_phen.setdefault(doc_id, set()).add(phen_id)

def c2uid(candidate):
    """Return a hashable key for a candidate span.

    The key is the tuple (document name, context position, char_start, char_end).
    """
    ctx = candidate.context
    return (ctx.document.name, ctx.position, candidate.char_start, candidate.char_end)
    
gt_dict_pos = dict()
gt_dict_neg = dict()
for candidate in phen_c:
    doc_id = candidate.context.document.name
    agg_ids = agg_phen2id.get(change_name(candidate.get_span()), set())
    uid = c2uid(candidate)
    if agg_ids & gold_agg_dict_phen[doc_id]:
        gt_dict_pos[uid] = +1
    else:
        gt_dict_neg[uid] = -1

gt_dict = dict(gt_dict_pos.items() + gt_dict_neg.items())
print 'Defined %d positive and %d negative gold mentions' % (len(gt_dict_pos), len(gt_dict_neg))

Defined 547 positive and 2541 negative gold mentions


In [675]:
# let's look at a few examples: for the first ten papers print the gold
# phenotypes, the parsed text and every candidate with its gold label
for doc in corpus.documents[:10]:
    doc_id = doc.name
    print doc.name
    # NOTE(review): raises KeyError if a paper has no gold aggregate phenotype
    for phen_id2 in gold_agg_dict_phen[doc_id]: # iterate over gwc_id's in doc
        print agg_id2phen[phen_id2].name, phen_id2 # print its name and id
        for eq_phen in agg_id2phen[phen_id2].equivalents: # loop over equivalent phens
            print '\t', eq_phen.name, eq_phen.ontology_ref, eq_phen.synonyms # print name, ref, synonyms
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    # scan all candidates and keep those that belong to this paper
    for candidate in phen_c:
        if candidate.context.document != doc: continue
#         print gt_dict[candidate.uid], candidate.get_span(), [agg_id2phen[phen_id].name for phen_id in phen2id.get(change_name(candidate.get_span()),set())]
        # gold label and span text; the trailing comma keeps the line open
        print gt_dict[c2uid(candidate)], candidate.get_span(), 
        # append the EFO phenotype the span name maps to, if any
        phen_id = phen2id.get(change_name(candidate.get_span()), None)
        if phen_id: print '|', id2phen[phen_id].name, id2phen[phen_id].ontology_ref,
        print
    print

17447842
crohn's disease 19827
	crohn's disease http://www.ebi.ac.uk/efo/EFO_0000384 gastritis associated with crohn's disease|ileitis
crohn's disease 19333
	crohn's disease http://www.ebi.ac.uk/efo/EFO_0000384 gastritis associated with crohn's disease|ileitis
crohn's disease 19486
	crohn's disease http://www.ebi.ac.uk/efo/EFO_0000384 gastritis associated with crohn's disease|ileitis
Novel Crohn Disease Locus Identified by Genome-Wide Association Maps to a Gene Desert on 5p13.1 and Modulates Expression of  PTGER4. To identify novel susceptibility loci for Crohn disease (CD), we undertook a genome-wide association study with more than 300,000 SNPs characterized in 547 patients and 928 controls. We found three chromosome regions that provided evidence of disease association with  p -values between 10 −6  and 10 −9 . Two of these ( IL23R  on Chromosome 1 and  CARD15  on Chromosome 16) correspond to genes previously reported to be associated with CD. In addition, a 250-kb region of Chromos

Create training set

In [676]:
import numpy as np

# Split into train and test set
candidates = phen_c
training_candidates = []
gold_candidates     = []
gold_labels         = []
n_half = len(candidates)/2
for c in candidates[:n_half]:
    uid = c2uid(c)
    if uid in gt_dict:
        gold_candidates.append(c)
        gold_labels.append(gt_dict[uid])
    else:
        training_candidates.append(c)
training_candidates.extend(candidates[n_half:])
gold_labels = np.array(gold_labels)
print "Training set size: %s" % len(training_candidates)
print "Gold set size: %s" % len(gold_candidates)
print "Positive labels in training set: %s" % len([c for c in training_candidates if gt_dict.get(c2uid(c),0)==1])
print "Negative labels in training set: %s" % len([c for c in training_candidates if gt_dict.get(c2uid(c),0)==-1])
print "Positive labels in gold set: %s" % len([c for c in gold_candidates if gt_dict[c2uid(c)]==1])
print "Negative labels in gold set: %s" % len([c for c in gold_candidates if gt_dict[c2uid(c)]==-1])

Training set size: 1544
Gold set size: 1544
Positive labels in training set: 272
Negative labels in training set: 1272
Positive labels in gold set: 275
Negative labels in gold set: 1269


In [677]:
print phen_c[0]
print phen_c[0].post_window(d=4)
print phen_c[0].get_attrib_span('words')
print re.search(r'\([A-Z]{2,4}\)', 'Test [AVC)')

Span("Crohn Disease", context=None, chars=[6,18], words=[1,2])
[u'Disease', u'Locus', u'Identified', u'by']
Crohn Disease
None


In [723]:
from nltk.stem import PorterStemmer
import re
stemmer = PorterStemmer()

# load set of dictionary phenotypes (kept both as a list and as a set)
kb = KnowledgeBase()
phenotype_list = list(kb.get_phenotype_candidates()) # TODO: load disease names from NCBI
phenotype_set = set(phenotype_list)

# small helpers
def get_phenotype(entity, stem=False):
    """Return the lower-cased span text of `entity`.

    When `stem` is true the text is passed through the module-level `stemmer`
    before it is lower-cased.
    """
    text = entity.get_span()
    return (stemmer.stem(text) if stem else text).lower()

def stem_list(L):
    """Return a new list with the stem of every lower-cased string in L."""
    stems = []
    for item in L:
        stems.append(stemmer.stem(item.lower()))
    return stems

def LF_gt(m):
    """Return the gold label of `m` when it is a training candidate, else 0.

    `gt_dict` is keyed by the tuples produced by `c2uid`, so the lookup must use
    `c2uid(m)`; looking up `m.uid` could not find those keys.
    Candidates without a gold label also yield 0.
    """
    if m not in training_candidates:
        return 0
    return gt_dict.get(c2uid(m), 0)

# positive LFs
def LF_first_sentence(m):
    """Vote +10 for a mention whose context is at position 0, otherwise abstain (0)."""
    if m.context.position == 0:
        return +10
    return 0
def LF_from_regex(m):
    """Vote +5 for a position-0 mention that the regex matcher rejects and that
    LF_bad_words does not flag; otherwise abstain (0).

    NOTE(review): despite the name, the vote goes to mentions the regex matcher
    does NOT accept - confirm this is intended.
    """
    if m.context.position != 0:
        return 0
    if regex_phen_matcher._f(m):
        return 0
    if LF_bad_words(m):
        return 0
    return +5
def LF_with_acronym(m):
    """Vote +1 when a parenthesised 2-4 letter upper-case acronym follows the mention.

    The five-word post window is concatenated without separators before the
    pattern is searched.
    """
    following = m.post_window('words',d=5)
    has_acronym = re.search(r'\([A-Z]{2,4}\)', ''.join(following)) is not None
    return +1 if has_acronym else 0
def LF_associated(m):
    """Vote +1 when the stem 'associ' occurs among the five lemmas before or after the mention."""
    for window in (m.pre_window, m.post_window):
        if 'associ' in stem_list(window('lemmas', d=5)):
            return +1
    return 0
def LF_influencing(m):
    """Vote +1 when the stem 'influenc' occurs among the five lemmas before or after the mention."""
    for window in (m.pre_window, m.post_window):
        if 'influenc' in stem_list(window('lemmas', d=5)):
            return +1
    return 0
def LF_identifi(m):
    """Vote +1 when the stem 'identifi' occurs among the five lemmas before or after the mention."""
    for window in (m.pre_window, m.post_window):
        if 'identifi' in stem_list(window('lemmas', d=5)):
            return +1
    return 0

# Positive labeling functions actually used; LF_associated, LF_influencing and
# LF_identifi are defined above but not enabled.
LFs_pos = [LF_first_sentence, LF_with_acronym, LF_from_regex]

# negative LFs
def LF_bad_words(m):
    """Vote -100 when the lower-cased span starts with one of the blacklisted prefixes."""
    bad_words = ('disease', 'single', 'map', 'genetic variation')
    span_text = m.get_span().lower()
    if span_text.startswith(bad_words):
        return -100
    return 0
def LF_short(m):
    """Vote -50 when the 'words' attribute span of the mention has fewer than 5 characters.

    The second positional argument (3) is forwarded unchanged to
    `get_attrib_span`; its meaning is defined by the span class.
    """
    if len(m.get_attrib_span('words', 3)) < 5:
        return -50
    return 0
def LF_no_nouns(m):
    """Vote -5 when none of the mention's POS tags starts with 'NN'."""
    for tag in m.get_attrib_tokens('poses'):
        if tag.startswith('NN'):
            return 0
    return -5

def LF_previously(m):
    """Vote -1 when the lemma 'previously' is in the pre-window requested with size 8."""
    preceding = m.pre_window('lemmas', 8)
    if 'previously' in preceding:
        return -1
    return 0
def LF_further(m):
    """Vote -1 when the mention's lemmas contain 'further' or 'furthermore'."""
    span_lemmas = m.get_attrib_span('lemmas').split(' ')
    if any(lemma in ('further', 'furthermore') for lemma in span_lemmas):
        return -1
    return 0
def LF_also(m):
    """Vote -1 when the space-separated span text contains the word 'also'."""
    if m.get_span().split(' ').count('also'):
        return -1
    return 0
def LF_recently(m):
    """Vote -1 when the space-separated span text contains the word 'recently'."""
    if m.get_span().split(' ').count('recently'):
        return -1
    return 0
def LF_addit(m):
    """Vote -1 when a lemma of the mention stems to 'addit' (e.g. "addition", "additional").

    'addit' is a Porter stem, not a lemma, so the lemmas are stemmed with
    `stem_list` before the comparison - as the other stem-based labeling
    functions in this cell do. Comparing against the raw lemmas could never match.
    """
    lemmas = m.get_attrib_span('lemmas').split(' ')
    return -1 if 'addit' in stem_list(lemmas) else 0
def LF_may(m):
    """Vote -1 when the space-separated span text contains the word 'may'."""
    if m.get_span().split(' ').count('may'):
        return -1
    return 0
def LF_risk(m):
    """Vote +1 when the stem 'risk' occurs among the five lemmas before the mention.

    NOTE(review): this sits in the negative-LF section but votes +1 - confirm the
    intended sign.
    """
    preceding_stems = stem_list(m.pre_window('lemmas', d=5))
    if 'risk' in preceding_stems:
        return +1
    return 0

# Negative labeling functions actually used; the remaining LF_* helpers above are
# defined but not enabled.
LFs_neg = [LF_bad_words, LF_short, LF_no_nouns]

# Full list handed to the training set; positives first, so this order
# determines the column order of the per-LF votes.
LFs = LFs_pos + LFs_neg

In [724]:
from snorkel.snorkel import TrainingSet
from snorkel.features import NgramFeaturizer

# Apply the labeling functions in `LFs` to the training candidates and featurize them.
# NOTE(review): a fresh NgramFeaturizer is created here; the `featurizer` fitted on
# all of phen_c in an earlier cell is not reused - confirm this is intended.
training_set = TrainingSet(training_candidates, LFs, featurizer=NgramFeaturizer())

Applying LFs...
Featurizing...
Building feature index...
Extracting features...
0/28746
5000/28746
10000/28746
15000/28746
20000/28746
25000/28746
LF Summary Statistics: 6 LFs applied to 1544 candidates
------------------------------------------------------------
Coverage (candidates w/ > 0 labels):		51.62%
Overlap (candidates w/ > 1 labels):		49.03%
Conflict (candidates w/ conflicting labels):	2.01%


In [725]:
# Per-LF statistics (conflicts, coverage, overlaps); display the first five rows.
lf_stats = training_set.lf_stats()
lf_stats[:5]

Unnamed: 0,conflicts,coverage,j,overlaps
LF_first_sentence,0.168394,0.673575,0,0.673575
LF_with_acronym,0.003238,0.029145,1,0.003238
LF_from_regex,0.074482,0.317358,2,0.317358
LF_bad_words,0.388601,3.950777,3,3.950777
LF_short,0.615285,15.738342,4,15.738342


In [726]:
from snorkel.snorkel import Learner, PipelinedLearner
import snorkel.learning
from snorkel.learning import LogReg

# Logistic-regression learner trained on the LF-labelled training set.
learner = Learner(training_set, model=snorkel.learning.LogReg())

# Splitting into CV and test set: first half of the gold set is used for
# testing, second half for the grid search.
# Floor division keeps the slice index an int even under true division
# (`from __future__ import division` / Python 3).
n_half = len(gold_candidates) // 2
test_candidates = gold_candidates[:n_half]
test_labels     = gold_labels[:n_half]
cv_candidates   = gold_candidates[n_half:]
cv_labels       = gold_labels[n_half:]

In [727]:
from snorkel.learning_utils import GridSearch

# Grid-search `mu` over [1e-5, 1e-7] and `lf_w0` over [1.0, 2.0] (four
# combinations), fitting against the CV half of the gold set.
gs       = GridSearch(learner, ['mu', 'lf_w0'], [[1e-5, 1e-7],[1.0,2.0]])
gs_stats = gs.fit(cv_candidates, cv_labels)

Testing mu = 1.00e-05, lf_w0 = 1.00e+00
Begin training for rate=0.01, mu=1e-05
	Learning epoch = 0	Gradient mag. = 2.550242
	Learning epoch = 250	Gradient mag. = 1.378368
	Learning epoch = 500	Gradient mag. = 0.768510
	Learning epoch = 750	Gradient mag. = 0.490302
Final gradient magnitude for rate=0.01, mu=1e-05: 0.359
Applying LFs...
Featurizing...
Testing mu = 1.00e-05, lf_w0 = 2.00e+00
Begin training for rate=0.01, mu=1e-05
	Learning epoch = 0	Gradient mag. = 2.266565
	Learning epoch = 250	Gradient mag. = 1.224469
	Learning epoch = 500	Gradient mag. = 0.681877
	Learning epoch = 750	Gradient mag. = 0.434128
Final gradient magnitude for rate=0.01, mu=1e-05: 0.317
Testing mu = 1.00e-07, lf_w0 = 1.00e+00
Begin training for rate=0.01, mu=1e-07
	Learning epoch = 0	Gradient mag. = 2.550242
	Learning epoch = 250	Gradient mag. = 1.378356
	Learning epoch = 500	Gradient mag. = 0.768486
	Learning epoch = 750	Gradient mag. = 0.490273
Final gradient magnitude for rate=0.01, mu=1e-07: 0.359
Testin

In [728]:
# Display the grid-search results (precision / recall / F1 per parameter combination).
gs_stats

Unnamed: 0,mu,lf_w0,Prec.,Rec.,F1
0,1e-05,1,0.6,0.162162,0.255319
1,1e-05,2,0.609756,0.168919,0.26455
2,0.0,1,0.6,0.162162,0.255319
3,0.0,2,0.609756,0.168919,0.26455


In [729]:
# Evaluate on the held-out test half of the gold set; prints precision, recall,
# F1 and the confusion counts.
learner.test_wmv(test_candidates, test_labels)

Applying LFs...
Featurizing...
Test set size:	772
----------------------------------------
Precision:	0.396825396825
Recall:		0.568181818182
F1 Score:	0.467289719626
----------------------------------------
TP: 25 | FP: 38 | TN: 277 | FN: 19


In [730]:
# Predict a label for every candidate and collect the ones worth inspecting.
preds = learner.predict_wmv(phen_c)
# Each entry is (candidate, prediction, gold label or None).
# NOTE(review): the filter keeps every positive prediction (p == 1) in addition to
# predictions that disagree with the gold label, so despite its name the list
# also contains correctly labelled positives - confirm this is intended.
mislabeled_cand = [(c,p, gt_dict.get(c2uid(c), None)) for c, p in zip(phen_c, preds) if p == 1 or p != gt_dict.get(c2uid(c), p)]
for (c,p,g) in mislabeled_cand:
    # only show candidates whose context is at position 0
    if c.context.position != 0: continue
    print c.context.document.name, p, g
    print c.context    
    print c.get_span()
    # vote of every labeling function, in the order of `LFs`
    print [LF(c) for LF in LFs]
    print

Applying LFs...
Featurizing...
17447842 1.0 1
Sentence(Document('17447842', Corpus (GWAS Corpus)), 0, u'Novel Crohn Disease Locus Identified by Genome-Wide Association Maps to a Gene Desert on 5p13.1 and Modulates Expression of  PTGER4.')
Crohn Disease
[10, 0, 0, 0, 0, 0]

17658951 1.0 1
Sentence(Document('17658951', Corpus (GWAS Corpus)), 0, u'Genome-Wide Association Scan Shows Genetic Variants in the  FTO  Gene Are Associated with Obesity-Related Traits.')
Obesity-Related Traits
[10, 0, 0, 0, 0, 0]

17684544 1.0 -1
Sentence(Document('17684544', Corpus (GWAS Corpus)), 0, u'Systematic Association Mapping Identifies  NELL1  as a Novel IBD Disease Gene.')
IBD Disease
[10, 0, 0, 0, 0, 0]

17903292 1.0 -1
Sentence(Document('17903292', Corpus (GWAS Corpus)), 0, u"A genome-wide association for kidney function and endocrine-related traits in the NHLBI's Framingham Heart Study.")
endocrine-related traits
[10, 0, 0, 0, 0, 0]

17903293 1.0 -1
Sentence(Document('17903293', Corpus (GWAS Corpus)), 

## Save and analyze the results

### Analyze / Visualize

If a mention occurs in the title, it's probably correct, so we can take it.

Question: what papers did not have any disease mentions in the title?

In [731]:
preds = learner.predict_wmv(phen_c)
results = [c for p, c in zip(preds, phen_c) if p > 0 and c.context.position == 0]
doc_set = {c.context.document.name for c in results}
missing_docs = {doc.name for doc in corpus.documents} - doc_set
docs = sorted(list(missing_docs))
print len(docs)
for d in missing_docs:
    print d, kb.paper_by_pmid(d).title

Applying LFs...
Featurizing...
6
18262040 LDL-cholesterol concentrations: a genome-wide association study.
19305408 Common variants at ten loci influence QT interval duration in the QTGEN Study.
19587794 Common genetic variation near the phospholamban gene is associated with cardiac repolarisation: meta-analysis of three genome-wide association studies.
19197348 Genome-wide association studies in an isolated founder population from the Pacific Island of Kosrae.
17903296 Genome-wide association with bone mass and geometry in the Framingham Heart Study.
19343178 Meta-analysis of genome-wide scans for human adult stature identifies novel Loci and associations with measures of skeletal frame size.


Let's now visualize what we found.

In [734]:
# Score every candidate and group the (score, candidate) pairs by paper.
scores = learner.score_wmv(phen_c)
score_dict = { doc.name : list() for doc in corpus.documents }
for s, c in zip(scores, phen_c):
    score_dict[c.context.document.name].append((s,c))

# pmid -> highest-scoring candidate of that paper (papers without candidates are skipped)
results = dict()
# The loop variable is not called `preds`, so the prediction array computed
# earlier in the notebook is no longer overwritten.
for pmid, doc_scores in score_dict.items():
    if doc_scores:
        # Rank on the score alone. Sorting the raw tuples would break score ties
        # by comparing the Span objects, which is arbitrary; with `max` the
        # first candidate in extraction order wins a tie.
        best_score, best_c = max(doc_scores, key=lambda sc: sc[0])
        results[pmid] = best_c

Applying LFs...
Featurizing...


In [736]:
# doc_set = {c.context.document.name for c in results}
# missing_docs = {doc.name for doc in corpus.documents} - doc_set
# docs = sorted(list(missing_docs))
# print len(docs)
for d in corpus.documents:
    print d, kb.paper_by_pmid(d.name).title
    print results.get(d.name, None)
    print sorted(score_dict[d.name], reverse=True)[:5]
    print

Document('17447842', Corpus (GWAS Corpus)) Novel Crohn disease locus identified by genome-wide association maps to a gene desert on 5p13.1 and modulates expression of PTGER4.
Span("Crohn Disease", context=None, chars=[6,18], words=[1,2])
[(19.270490808768059, Span("Crohn Disease", context=None, chars=[6,18], words=[1,2])), (1.9959427016534608, Span("Crohn disease (", context=None, chars=[42,56], words=[6,8])), (0.0, Span("locus on Chromosome 5.", context=None, chars=[155,176], words=[27,31])), (0.0, Span("Crohn disease,", context=None, chars=[33,46], words=[6,8])), (0.0, Span("susceptibility to", context=None, chars=[151,167], words=[22,23]))]

Document('17658951', Corpus (GWAS Corpus)) Genome-wide association scan shows genetic variants in the FTO gene are associated with obesity-related traits.
Span("Obesity-Related Traits", context=None, chars=[90,111], words=[13,14])
[(19.270490808768059, Span("Obesity-Related Traits", context=None, chars=[90,111], words=[13,14])), (0.0, Span("BMI 

UnicodeEncodeError: 'ascii' codec can't encode character u'\u2019' in position 15: ordinal not in range(128)


### Save results

In [742]:
# Write one row per paper: pmid <TAB> phenotype(s) <TAB>
with open('phenotypes.extracted.tsv', 'w') as f:
    for d in corpus.documents:
        # pick the top two results (ranked on the score only; ties keep extraction order)
        ranked = sorted(score_dict[d.name], key=lambda sc: sc[0], reverse=True)
        top = [sc[1] for sc in ranked[:2]]
        # if both are in title, report both, otherwise report only the best one
        # (both candidates are checked now; only the second one used to be)
        if len(top) == 2 and all(c.context.position == 0 for c in top):
            phen = top[0].get_span() + '|' + top[1].get_span()
        elif top:
            phen = top[0].get_span()
        else:
            # A paper without candidates used to crash the tuple unpacking;
            # write an empty phenotype so every pmid keeps its row.
            phen = u''
        out_str = u'%s\t%s\t\n' % (d.name, phen)
        f.write(out_str.encode("UTF-8"))