# Phenotype extraction

In [6]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle

# Make the local snorkel checkout and this project's sources importable.
# The entries are relative paths, so they resolve against the kernel's working directory.
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# Directory that is handed to the XML abstract parser as its `path` argument.
abstract_dir = '../data/db/papers'

# set up matplotlib
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Extract phenotype candidates from papers

### Load corpus

In [8]:
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLDocParser, GWASXMLAbstractParser

# XPath-style selector strings, one per keyword argument of the parser.
xml_fields = dict(
    doc='./*',
    title='.//front//article-title//text()',
    abstract='.//abstract//p//text()',
    par1='.//body/p[1]//text()',
    id='.//article-id[@pub-id-type="pmid"]/text()',
)

# Parser over the papers stored under `abstract_dir`.
xml_parser = GWASXMLAbstractParser(path=abstract_dir, keep_xml_tree=True, **xml_fields)

In [9]:
from snorkel.parser import HTMLParser
from snorkel.parser import SentenceParser
from snorkel.parser import CorpusParser

sent_parser = SentenceParser()
html_parser = HTMLParser(path='../data/db/papers/')

corpus_name = 'gwas-text-corpus.pkl'

try:
    with open(corpus_name,"r") as pkl:
        corpus = cPickle.load(pkl)
except:
    cp = CorpusParser(xml_parser, sent_parser, max_docs=100)
    %time corpus = cp.parse_corpus(name='GWAS Corpus')
    # pickling currently doesn't work...
#     with open(corpus_name,"w") as pkl:
#         corpus = cPickle.dump(corpus, pkl)

CPU times: user 2.29 s, sys: 282 ms, total: 2.58 s
Wall time: 14.2 s


### Extract candidates

In [436]:
from snorkel.candidates import Ngrams
from snorkel.matchers import DictionaryMatch, Union
from snorkel.candidates import EntityExtractor
from snorkel.utils import slice_into_ngrams

from extractor.util import change_name
from extractor.matcher import PhenotypeMatcher
from db.kb import KnowledgeBase

def make_ngrams(L, n_max=10, n_min=3, delim=' '):
    """Yield every phrase of `L` followed by the n-grams sliced out of it.

    Each phrase is emitted unchanged first. It is then stripped, split on
    `delim`, and the token list is handed to `slice_into_ngrams` together
    with `n_max`, `n_min` and `delim`; everything that call produces is
    emitted next, in order.
    """
    for phrase in L:
        yield phrase
        words = phrase.strip().split(delim)
        sub_phrases = slice_into_ngrams(words, n_max=n_max, n_min=n_min, delim=delim)
        for sub_phrase in sub_phrases:
            yield sub_phrase

# Define a candidate space
ngrams = Ngrams(n_max=7)

# collect phenotype list
kb = KnowledgeBase()
# phenotype_list = kb.get_phenotype_candidates()
phenotype_list0 = kb.get_phenotype_candidates_cheating() # TODO: revert to correct one!
phenotype_list = list(make_ngrams(phenotype_list0))
# phenotype_list_snorkel = kb.get_snorkel_phenotype_candidates()

# Define matchers
# phen_matcher1 = PhenotypeMatcher(d=phenotype_list, ignore_case=True, mod_fn=change_name)
# phen_matcher2 = DictionaryMatch(d=phenotype_list_snorkel, longest_match_only=True, ignore_case=True)
# phen_matcher = Union(phen_matcher1, phen_matcher2)
# phen_matcher1 = PhenotypeMatcher(d=phenotype_list, ignore_case=True, mod_fn=change_name)

# Extractor
phen_extractor = EntityExtractor(ngrams, phen_matcher)

# collect candidates
%time phen_c = phen_extractor.extract(corpus.get_sentences(), name='all')
print len(phen_c), 'candidates extracted'

CPU times: user 27.4 s, sys: 203 ms, total: 27.6 s
Wall time: 27.5 s
7980 candidates extracted


In [437]:
# phen_c0 = phen_c
# phenotype_list0 = phenotype_list
print [p for p in phenotype_list if 'cholesterol' in p]

[u'cholesterol', u'low density lipoprotein cholesterol measurement', u'low density lipoprotein cholesterol', u'low density lipoprotein cholesterol measurement', u'density lipoprotein cholesterol', u'density lipoprotein cholesterol measurement', u'lipoprotein cholesterol measurement', u'total cholesterol measurement', u'total cholesterol measurement', u'high density lipoprotein cholesterol measurement', u'high density lipoprotein cholesterol', u'high density lipoprotein cholesterol measurement', u'density lipoprotein cholesterol', u'density lipoprotein cholesterol measurement', u'lipoprotein cholesterol measurement']


In [438]:
print len(tmp)

1


In [439]:
len(phenotype_list0)
phenotype_list = list(make_ngrams(phenotype_list0))
len(phenotype_list)
print phenotype_list[:10]
print phenotype_list0[:10]
print stemmer.stem("parkinson's")

[u'', u'ccl5 measurement', u'cholera infantum', u'chemokine ccl2 level', u'chemokine ccl2 level', u'cigarettes per day measurement', u'cigarettes per day', u'cigarettes per day measurement', u'per day measurement', u'inflammatory marker measurement']
[u'', u'ccl5 measurement', u'cholera infantum', u'chemokine ccl2 level', u'cigarettes per day measurement', u'inflammatory marker measurement', u'kidney diseases', u'white spot', u'response to sertraline', u'acylcarnitine measurement']
parkinson'


In [440]:
# NOTE(review): this re-declares make_ngrams with exactly the same body as the definition in the
# candidate-extraction cell earlier in the notebook; running it rebinds the name to an
# identical function. Keep a single definition.
def make_ngrams(L, n_max=10, n_min=3, delim=' '):
    """Yield each string in `L` and then the n-grams obtained from its tokens.

    A string is yielded as-is, then stripped and split on `delim`; the tokens
    go to `slice_into_ngrams` (with `n_max`, `n_min`, `delim`) and every value
    it produces is yielded as well.
    """
    for l in L:
        yield l
        tokens = l.strip().split(delim)
        for ngram in slice_into_ngrams(tokens, n_max=n_max, n_min=n_min, delim=delim):
            yield ngram
            
L=[p for p in phenotype_list if 'diabetes' in p]
print L[0]
print [n for n in make_ngrams(L)]

type i diabetes mellitus
[u'type i diabetes mellitus', u'type i diabetes', u'type i diabetes mellitus', u'i diabetes mellitus', u'type i diabetes', u'type i diabetes', u'type i diabetes mellitus', u'type i diabetes', u'type i diabetes mellitus', u'i diabetes mellitus', u'i diabetes mellitus', u'i diabetes mellitus', u'diabetes mellitus type 02', u'diabetes mellitus type', u'diabetes mellitus type 02', u'mellitus type 02', u'diabetes mellitus type', u'diabetes mellitus type', u'diabetes mellitus type 02', u'diabetes mellitus type', u'diabetes mellitus type 02', u'mellitus type 02', u'dm - diabetes mellitus', u'dm - diabetes', u'dm - diabetes mellitus', u'- diabetes mellitus', u'dm - diabetes', u'dm - diabetes', u'dm - diabetes mellitus', u'dm - diabetes', u'dm - diabetes mellitus', u'- diabetes mellitus', u'- diabetes mellitus', u'- diabetes mellitus', u'type ii diabetes mellitus', u'type ii diabetes', u'type ii diabetes mellitus', u'ii diabetes mellitus', u'type ii diabetes', u'type ii

We would like to remove nested candidates (manually, for now).

In [441]:
# Group the (char_start, char_end) interval of every candidate under the
# string form of the sentence it was found in.
span_dict = dict()
for span in phen_c:
    span_dict.setdefault(str(span.context), list()).append((span.char_start, span.char_end))

def nested(ivl1, ivl2):
    """Tell whether interval `ivl1` is contained in a different interval `ivl2`.

    Both arguments are indexable pairs (start, end). Equal intervals are not
    considered nested; otherwise the result is True exactly when
    ivl2[0] <= ivl1[0] <= ivl1[1] <= ivl2[1].
    """
    if ivl1 == ivl2:
        return False
    inner_start, inner_end = ivl1[0], ivl1[1]
    outer_start, outer_end = ivl2[0], ivl2[1]
    return outer_start <= inner_start and inner_start <= inner_end and inner_end <= outer_end

new_phen_c = list()
for span in phen_c:
    span_ivl = span.char_start, span.char_end
    span_name = str(span.context)
    if all([not nested(span_ivl, other_ivl) for other_ivl in span_dict[span_name]]):
        new_phen_c.append(span)
        
print len(phen_c) - len(new_phen_c), 'candidates dropped, now we have', len(new_phen_c)
phen_c = new_phen_c

2271 candidates dropped, now we have 5709


### Create gold-truth set

In [442]:
from db.kb import KnowledgeBase
from nltk.stem import PorterStemmer
from extractor.util import change_name

kb = KnowledgeBase() # reload

# Gold annotations: one (document name, EFO ontology reference) pair per phenotype
# that the knowledge base lists for a document of the corpus.
gold_pairs = list()
for doc in corpus.documents:
    for phen in kb.phen_by_pmid(doc.name, source='efo'):
        gold_pairs.append((doc.name, phen.ontology_ref))
gold_set_phens = frozenset(gold_pairs)

# Normalised phenotype name or synonym -> ontology reference.
# When two phenotypes share a normalised name, the one seen last wins.
phen2id = dict()
for doc in corpus.documents:
    for phen in kb.phen_by_pmid(doc.name, source='efo'):
        for syn in [phen.name] + phen.synonyms.split('|'):
            phen2id[change_name(syn)] = phen.ontology_ref

# this is the more correct version (code below should be changed to use it)
# we collect a set of EFOs matching a string; we use this only at the very end for now
# NOTE(review): the ids stored here are `phen.id`, while phen2id / id2phen use
# `phen.ontology_ref` -- confirm which key is intended before wiring this in.
phen2idset = dict()
for doc in corpus.documents:
    for phen in kb.phen_by_pmid(doc.name, source='efo'):
        synonyms = [phen.name] + phen.synonyms.split('|')
        # Do not bind the generator to `ngrams`: that global holds the Ngrams candidate
        # space created for the extractor and was being overwritten here.
        for syn_ngram in make_ngrams(synonyms):
            phen2idset.setdefault(change_name(syn_ngram), set()).add(phen.id)

id2phen = \
{
    phen.ontology_ref : phen for doc in corpus.documents
                             for phen in kb.phen_by_pmid(doc.name, source='efo')
}

print 'Found %d gold mentions, e.g.:' % len(gold_set_phens)
print list(gold_set_phens)[:5]
print len(phen2id), len(id2phen)

Found 322 gold mentions, e.g.:
[('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004764'), ('17903294', u'http://purl.obolibrary.org/obo/GO_0070527'), ('20038947', u'http://www.ebi.ac.uk/efo/EFO_0003761'), ('19197348', u'http://www.ebi.ac.uk/efo/EFO_0004748'), ('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004626')]
441 216


## Statistics

First, we need to understand whether the extraction or the classification approach will be better.

### Number of phenotypes per paper

#### Number of EFO phenotypes per paper

In [443]:
docid2efo = \
{
    doc.name : [ phen.name for phen in kb.phen_by_pmid(doc.name, source='efo') ] for doc in corpus.documents
}
print 'EFO phenotype numbers per paper:', sorted([len(v) for v in docid2efo.values()])

EFO phenotype numbers per paper: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7, 10, 11, 13, 17, 37, 66]


#### Number of GWAS catalog (i.e. aggregate) phenotypes per paper

In [444]:
docid2efo = \
{
    doc.name : [ phen.name for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog') ] for doc in corpus.documents
}
print 'GWAS catalog phenotype numbers per paper:', sorted([len(v) for d, v in docid2efo.items()])

GWAS catalog phenotype numbers per paper: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 9, 9, 10, 11, 11, 11, 12, 12, 13, 13, 14, 14, 15, 16, 19, 20, 21, 21, 21, 22, 22, 24, 29, 33, 34, 37, 38, 40, 42, 49, 73]


### Candidate recall statistics

#### Over EFO phenotypes

In [445]:
from extractor.util import gold_phen_stats
# Compare the extracted candidates with the EFO gold set, using phen2id to map
# a candidate's text to an ontology reference.
gold_phen_stats(phen_c, gold_set_phens, phen2id)

Statistics over EFO phenotypes:
# of gold annotations	= 322
# of candidates		= 5709
Candidate recall	= 0.376
Candidate precision	= 0.021


Why is the recall low?

In [446]:
from extractor.util import gold_phen_recall
kb = KnowledgeBase()
id2doc = {doc.name : doc for doc in corpus.documents}
gold_dict_phen = { doc_id : set() for doc_id, phen_id in gold_set_phens }
for doc_id, phen_id in gold_set_phens:
    gold_dict_phen[doc_id].add(phen_id)

phen_not_found = list(gold_phen_recall(phen_c, gold_set_phens, phen2id))
print len(phen_not_found)
for doc_id, phen_id in phen_not_found[:100]:
    if len(gold_dict_phen[doc_id]) > 3: continue
    print doc_id
    for phen_id2 in gold_dict_phen[doc_id]:
        print id2phen[phen_id2].name, phen_id2
#     print gold_dict_phen[doc_id]
    print id2phen[phen_id].name, id2phen[phen_id].ontology_ref
    print id2phen[phen_id].synonyms    
    print kb.title_by_pmid(doc_id)
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    print

201
19798445
blood metabolite measurement http://www.ebi.ac.uk/efo/EFO_0005664
sphingolipid measurement http://www.ebi.ac.uk/efo/EFO_0004622
blood metabolite measurement http://www.ebi.ac.uk/efo/EFO_0005664

Genetic determinants of circulating sphingolipid concentrations in European populations.
Genetic Determinants of Circulating Sphingolipid Concentrations in European Populations. Sphingolipids have essential roles as structural components of cell membranes and in cell signalling, and disruption of their metabolism causes several diseases, with diverse neurological, psychiatric, and metabolic consequences. Increasingly, variants within a few of the genes that encode enzymes involved in sphingolipid metabolism are being associated with complex disease phenotypes. Direct experimental evidence supports a role of specific sphingolipid species in several common complex chronic disease processes including atherosclerotic plaque formation, myocardial infarction (MI), cardiomyopathy, pancrea

#### Over aggregate phenotypes

We say that a mention of an aggregate phenotype is correct if it matches the name of the GWAS catalog (GWC) phenotype, or the name or a synonym of any equivalent EFO phenotype.

In [447]:
from db.kb import KnowledgeBase
from nltk.stem import PorterStemmer
from extractor.util import change_name

kb = KnowledgeBase() # reload

# Gold annotations for aggregate phenotypes: (document name, GWAS catalog phenotype id).
gold_agg_pairs = list()
for doc in corpus.documents:
    for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog'):
        gold_agg_pairs.append((doc.name, phen.id))
gold_set_agg_phens = frozenset(gold_agg_pairs)

# map phenotype names to their id (GWC name / EFO name / EFO synonym -> set of GWC ids)
agg_phen2id = dict()
for doc in corpus.documents:
    for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog'):
        # The catalog name itself always counts as a valid surface form. It used to be added
        # only inside the loop over equivalents, so a catalog phenotype without any
        # equivalent EFO phenotype could never be matched.
        surface_forms = [phen.name]
        for eq_phen in phen.equivalents:
            surface_forms.append(eq_phen.name)
            surface_forms.extend(eq_phen.synonyms.split('|'))
        for syn in surface_forms:
            # several catalog entries can share a surface form, hence a set of ids
            agg_phen2id.setdefault(change_name(syn), set()).add(phen.id)

# GWAS catalog phenotype id -> GWAS catalog phenotype object
agg_id2phen = dict()
for doc in corpus.documents:
    for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog'):
        agg_id2phen[phen.id] = phen

print 'Found %d gold mentions, e.g.:' % len(gold_set_phens)
print list(gold_set_phens)[:5]
print len(phen2id), len(id2phen)

from extractor.util import gold_phen_stats, gold_agg_phen_stats
gold_agg_phen_stats(phen_c, gold_set_agg_phens, agg_phen2id)

Found 322 gold mentions, e.g.:
[('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004764'), ('17903294', u'http://purl.obolibrary.org/obo/GO_0070527'), ('20038947', u'http://www.ebi.ac.uk/efo/EFO_0003761'), ('19197348', u'http://www.ebi.ac.uk/efo/EFO_0004748'), ('17658951', u'http://www.ebi.ac.uk/efo/EFO_0004626')]
441 216
Statistics over EFO phenotypes:
# of gold annotations	= 925
# of candidates		= 5709
# of correct candidates	= 834
Candidate recall	= 0.902
Candidate precision	= 0.146


Why is the recall low?

In [448]:
from extractor.util import gold_agg_phen_recall
kb = KnowledgeBase()
id2doc = {doc.name : doc for doc in corpus.documents}
gold_agg_dict_phen = { doc_id : set() for doc_id, phen_id in gold_set_agg_phens }
for doc_id, phen_id in gold_set_agg_phens:
    gold_agg_dict_phen[doc_id].add(phen_id)

# this contains (doc_id, gwc_id) pairs from gold_set_agg_phens that haven't been found
phen_not_found = list(gold_agg_phen_recall(phen_c, gold_set_agg_phens, agg_phen2id))
print len(phen_not_found)
for doc_id, phen_id in phen_not_found[:100]:
    if len(gold_agg_dict_phen[doc_id]) > 3: continue # skip if >3 gwc_phen_ids in doc
    print doc_id
    for phen_id2 in gold_agg_dict_phen[doc_id]: # iterate over gwc_id's in doc
        print agg_id2phen[phen_id2].name, phen_id2 # print its name and id
        for eq_phen in agg_id2phen[phen_id2].equivalents: # loop over equivalent phens
            print '\t', eq_phen.name, eq_phen.ontology_ref, eq_phen.synonyms # print name, ref, synonyms
    print agg_id2phen[phen_id].name # print the one that we haven't found
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    print

91
19578366
glioma (high-grade) 20856
	central nervous system cancer http://www.ebi.ac.uk/efo/EFO_0000326 cns cancer|malignant central nervous system neoplasm|malignant tumor of cns|cancer of cns|malignant tumor of central nervous system|malignant cns neoplasms|malignant neoplasm of cns|malignant tumor of the central nervous system|malignant cns tumor|malignant central nervous system tumor|malignant neoplasm of the cns|malignant neoplasm of the central nervous system|malignant cns neoplasm|central nervous system neoplasms
glioma (high-grade) 20857
	central nervous system cancer http://www.ebi.ac.uk/efo/EFO_0000326 cns cancer|malignant central nervous system neoplasm|malignant tumor of cns|cancer of cns|malignant tumor of central nervous system|malignant cns neoplasms|malignant neoplasm of cns|malignant tumor of the central nervous system|malignant cns tumor|malignant central nervous system tumor|malignant neoplasm of the cns|malignant neoplasm of the central nervous system|malignant cn

In [449]:
# FOR DEBUGGING WHY SPANS ARENT MATCHED
# from extractor.util import change_name

# doc_id, phen_id = phen_not_found[3]
# print agg_id2phen[phen_id].name 
# print id2doc[doc_id].sentences[0]
# for span in ngrams.apply(id2doc[doc_id].sentences[0]):
#     print span.get_span()
#     if phen_matcher._f(span):    
#         phen_name = span.get_span()
#         print phen_name
#         print '...', change_name(phen_name) in phenotype_list, phen_name in phen_matcher.d, change_name(phen_name) in phen_matcher.d
#         phen_id = phen2id.get(change_name(phen_name), None)
#         print phen_id
#         if not phen_id or phen_id not in gold_dict_phen[span.context.document.name]:
#             print span.context.document.name, phen_id
#             print gold_dict_phen[span.context.document.name]
        
#         print

In [450]:
# query_word = 'personality'

# # phenotypes = db_session.query(Phenotype).filter(Phenotype.source=='efo').all()
# phenotypes == kb.get_phenotype_candidates_cheating()
# phenotype_names = set()
# for phenotype in phenotypes:
#     if phenotype.name:
#         phenotype_names.add((phenotype.name))
#         synonyms = [(syn) for syn in phenotype.synonyms.split('|')]
#         if query_word in synonyms or query_word == phenotype.name:
#             print phenotype.name, phenotype.ontology_ref
#         phenotype_names.update(synonyms)

In [451]:
# print len(phenotype_names)
# [(word, change_name(word)) for word in phenotype_list if change_name(word) == change_name('personalized')]

## Extraction performance

First, extract some features for each candidate mention.

In [452]:
import cPickle        
from snorkel.features import NgramFeaturizer

pkl_f = 'phenotype_feats.pkl'
try:
    with open(pkl_f, 'rb') as f:
        featurizer = cPickle.load(f)
except:
    featurizer = NgramFeaturizer()
    featurizer.fit_transform(phen_c)

print 'Example features:', [f for f in featurizer.get_features_by_candidate(phen_c[0])[:10]]

Building feature index...
Extracting features...
0/90342
5000/90342
10000/90342
15000/90342
20000/90342
25000/90342
30000/90342
35000/90342
40000/90342
45000/90342
50000/90342
55000/90342
60000/90342
65000/90342
70000/90342
75000/90342
80000/90342
85000/90342
90000/90342
Example features: [u'DDLIB_WORD_SEQ_[Crohn Disease]', u'DDLIB_LEMMA_SEQ_[crohn disease]', u'DDLIB_POS_SEQ_[NN NN]', u'DDLIB_DEP_SEQ_[compound compound]', u'DDLIB_W_LEFT_1_[novel]', u'DDLIB_W_LEFT_POS_1_[JJ]', u'DDLIB_W_LEFT_2_[. novel]', u'DDLIB_W_LEFT_POS_2_[. JJ]', u'DDLIB_W_LEFT_3_[ptger4 . novel]', u'DDLIB_W_LEFT_POS_3_[NN . JJ]']


In [453]:
# classify candidates as correct or not

# doc_id -> set of correct gwc_ids
gold_agg_dict_phen = dict()
for doc_id, phen_id in gold_set_agg_phens:
    gold_agg_dict_phen.setdefault(doc_id, set()).add(phen_id)

def c2uid(candidate):
    """Build a tuple key for a candidate.

    The key is (document name, position of the context, char_start, char_end),
    all read from the candidate and its `context`.
    """
    sentence = candidate.context
    return (sentence.document.name, sentence.position, candidate.char_start, candidate.char_end)
    
gt_dict_pos = dict()
gt_dict_neg = dict()
for candidate in phen_c:
    doc_id = candidate.context.document.name
    agg_ids = agg_phen2id.get(change_name(candidate.get_span()), set())
    uid = c2uid(candidate)
    if agg_ids & gold_agg_dict_phen[doc_id]:
        gt_dict_pos[uid] = +1
    else:
        gt_dict_neg[uid] = -1

gt_dict = dict(gt_dict_pos.items() + gt_dict_neg.items())
print 'Defined %d positive and %d negative gold mentions' % (len(gt_dict_pos), len(gt_dict_neg))

Defined 641 positive and 5068 negative gold mentions


In [454]:
# let's look at a few examples
for doc in corpus.documents[:10]:
    doc_id = doc.name
    print doc.name
    for phen_id2 in gold_agg_dict_phen[doc_id]: # iterate over gwc_id's in doc
        print agg_id2phen[phen_id2].name, phen_id2 # print its name and id
        for eq_phen in agg_id2phen[phen_id2].equivalents: # loop over equivalent phens
            print '\t', eq_phen.name, eq_phen.ontology_ref, eq_phen.synonyms # print name, ref, synonyms
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    for candidate in phen_c:
        if candidate.context.document != doc: continue
#         print gt_dict[candidate.uid], candidate.get_span(), [agg_id2phen[phen_id].name for phen_id in phen2id.get(change_name(candidate.get_span()),set())]
        print gt_dict[c2uid(candidate)], candidate.get_span(), 
        phen_id = phen2id.get(change_name(candidate.get_span()), None)
        if phen_id: print '|', id2phen[phen_id].name, id2phen[phen_id].ontology_ref,
        print
    print

17447842
crohn's disease 19827
	crohn's disease http://www.ebi.ac.uk/efo/EFO_0000384 gastritis associated with crohn's disease|ileitis
crohn's disease 19333
	crohn's disease http://www.ebi.ac.uk/efo/EFO_0000384 gastritis associated with crohn's disease|ileitis
crohn's disease 19486
	crohn's disease http://www.ebi.ac.uk/efo/EFO_0000384 gastritis associated with crohn's disease|ileitis
Novel Crohn Disease Locus Identified by Genome-Wide Association Maps to a Gene Desert on 5p13.1 and Modulates Expression of  PTGER4. To identify novel susceptibility loci for Crohn disease (CD), we undertook a genome-wide association study with more than 300,000 SNPs characterized in 547 patients and 928 controls. We found three chromosome regions that provided evidence of disease association with  p -values between 10 −6  and 10 −9 . Two of these ( IL23R  on Chromosome 1 and  CARD15  on Chromosome 16) correspond to genes previously reported to be associated with CD. In addition, a 250-kb region of Chromos

Create training set

In [455]:
import numpy as np

# Split into train and test set
candidates = phen_c
training_candidates = []
gold_candidates     = []
gold_labels         = []
n_half = len(candidates)/2
for c in candidates[:n_half]:
    uid = c2uid(c)
    if uid in gt_dict:
        gold_candidates.append(c)
        gold_labels.append(gt_dict[uid])
    else:
        training_candidates.append(c)
training_candidates.extend(candidates[n_half:])
gold_labels = np.array(gold_labels)
print "Training set size: %s" % len(training_candidates)
print "Gold set size: %s" % len(gold_candidates)
print "Positive labels in training set: %s" % len([c for c in training_candidates if gt_dict.get(c2uid(c),0)==1])
print "Negative labels in training set: %s" % len([c for c in training_candidates if gt_dict.get(c2uid(c),0)==-1])
print "Positive labels in gold set: %s" % len([c for c in gold_candidates if gt_dict[c2uid(c)]==1])
print "Negative labels in gold set: %s" % len([c for c in gold_candidates if gt_dict[c2uid(c)]==-1])

Training set size: 2855
Gold set size: 2854
Positive labels in training set: 308
Negative labels in training set: 2547
Positive labels in gold set: 333
Negative labels in gold set: 2521


In [456]:
print phen_c[0]
print phen_c[0].post_window(d=4)
print phen_c[0].get_attrib_span('words')
print re.search(r'\([A-Z]{2,4}\)', 'Test [AVC)')

Span("Crohn Disease", context=None, chars=[6,18], words=[1,2])
[u'Disease', u'Locus', u'Identified', u'by']
Crohn Disease
None


In [457]:
from nltk.stem import PorterStemmer
import re
stemmer = PorterStemmer()

# load set of dictionary phenotypes
# (this rebinds `phenotype_list`, which previously held the n-gram expanded list)
kb = KnowledgeBase()
phenotype_list = list(kb.get_phenotype_candidates()) # TODO: load disease names from NCBI
phenotype_set = set(phenotype_list)

# small helpers
def get_phenotype(entity, stem=False):
    """Return the lower-cased span text of `entity`.

    With `stem=True` the text is passed through the module-level `stemmer`
    before it is lower-cased.
    """
    text = entity.get_span()
    return (stemmer.stem(text) if stem else text).lower()

def stem_list(L):
    """Lower-case every string in `L`, stem it with `stemmer`, and return the results as a list."""
    stems = []
    for item in L:
        stems.append(stemmer.stem(item.lower()))
    return stems

def LF_gt(m):
    """Return the gold label of `m` if it is a training candidate, else 0.

    `gt_dict` is keyed by the tuple produced by `c2uid`, so the lookup has to
    go through `c2uid(m)`; the former `m.uid` lookup did not use that key.
    Candidates without a gold label also yield 0.
    """
    return gt_dict.get(c2uid(m), 0) if m in training_candidates else 0

# positive LFs
def LF_first_sentence(m):
    """Vote +1 when the context of `m` is at position 0, otherwise abstain (0)."""
    if m.context.position == 0:
        return +1
    return 0
def LF_with_acronym(m):
    """Vote +1 when a parenthesised 2-4 letter upper-case acronym follows `m`.

    The words of the post window (d=5) are concatenated without a separator
    before the pattern is searched, so "(", "CD", ")" as separate tokens match.
    """
    following = ''.join(m.post_window('words',d=5))
    if re.search(r'\([A-Z]{2,4}\)', following):
        return +1
    return 0
def _stem_near_mention(m, stem, d=5):
    """True when `stem` is among the stemmed lemmas of the pre or post window (size `d`) of `m`."""
    return stem in stem_list(m.pre_window('lemmas', d=d)) \
        or stem in stem_list(m.post_window('lemmas', d=d))

# The three LFs below differ only in the stem they look for, so they share one helper.
# They stay separate functions: each one is a distinct entry of `LFs_pos`.
def LF_associated(m):
    """Vote +1 when a lemma stemming to 'associ' is in the window before or after `m`."""
    return +1 if _stem_near_mention(m, 'associ') else 0
def LF_influencing(m):
    """Vote +1 when a lemma stemming to 'influenc' is in the window before or after `m`."""
    return +1 if _stem_near_mention(m, 'influenc') else 0
def LF_identifi(m):
    """Vote +1 when a lemma stemming to 'identifi' is in the window before or after `m`."""
    return +1 if _stem_near_mention(m, 'identifi') else 0

LFs_pos = [LF_first_sentence, LF_with_acronym, LF_associated, LF_influencing, LF_identifi]

# negative LFs
def LF_bad_words(m):
    """Vote -10 when the lower-cased span starts with one of the listed prefixes, else 0."""
    bad_prefixes = ('disease', 'single', 'map', 'genetic variation')
    span_text = m.get_span().lower()
    if span_text.startswith(bad_prefixes):
        return -10
    return 0
def LF_previously(m):
    """Vote -1 when 'previously' is among the lemmas of the pre window (size 8), else 0."""
    preceding = m.pre_window('lemmas', 8)
    if 'previously' in preceding:
        return -1
    return 0
def LF_further(m):
    """Vote -1 when the span's lemmas contain 'further' or 'furthermore', else 0."""
    span_lemmas = set(m.get_attrib_span('lemmas').split(' '))
    if span_lemmas & set(['further', 'furthermore']):
        return -1
    return 0
def LF_also(m):
    """Vote -1 when 'also' is one of the space-separated words of the span, else 0."""
    if 'also' in m.get_span().split(' '):
        return -1
    return 0
def LF_recently(m):
    """Vote -1 when 'recently' is one of the space-separated words of the span, else 0."""
    span_words = m.get_span().split(' ')
    found = 'recently' in span_words
    return -1 if found else 0
def LF_addit(m):
    """Vote -1 when one of the span's lemmas stems to 'addit' (e.g. 'addition'), else 0.

    'addit' is a stem, not a lemma, so the span's lemmas are stemmed with
    `stem_list` before the comparison -- the same way the other stem-based LFs
    do it. Comparing against the raw lemmas could never match.
    """
    lemmas = m.get_attrib_span('lemmas').split(' ')
    return -1 if 'addit' in stem_list(lemmas) else 0
def LF_may(m):
    """Vote -1 when 'may' is one of the space-separated words of the span, else 0."""
    for word in m.get_span().split(' '):
        if word == 'may':
            return -1
    return 0
def LF_risk(m):
    """Vote +1 when a lemma stemming to 'risk' is in the pre window (d=5), else 0.

    Note that this function returns a positive vote although it is collected
    in `LFs_neg`.
    """
    preceding_stems = stem_list(m.pre_window('lemmas', d=5))
    if 'risk' in preceding_stems:
        return +1
    return 0
def LF_short(m):
    """Vote -5 when the text from m.get_attrib_span('words', 3) has fewer than 5 characters, else 0."""
    span_words = m.get_attrib_span('words', 3)
    if len(span_words) < 5:
        return -5
    return 0

# Second group of labeling functions (LF_risk in this group votes +1, the others vote <= 0).
LFs_neg = [
    LF_bad_words,
    LF_previously,
    LF_further,
    LF_also,
    LF_recently,
    LF_addit,
    LF_may,
    LF_risk,
    LF_short,
]

# All labeling functions: first group followed by the second.
LFs = list(LFs_pos)
LFs.extend(LFs_neg)

In [458]:
from snorkel.snorkel import TrainingSet
from snorkel.features import NgramFeaturizer

# Apply the labeling functions to the training candidates. A new NgramFeaturizer is
# passed here; the `featurizer` fitted on `phen_c` earlier is not reused.
training_set = TrainingSet(training_candidates, LFs, featurizer=NgramFeaturizer())

Applying LFs...
Featurizing...
Building feature index...
Extracting features...
0/48951
5000/48951
10000/48951
15000/48951
20000/48951
25000/48951
30000/48951
35000/48951
40000/48951
45000/48951
LF Summary Statistics: 14 LFs applied to 2855 candidates
------------------------------------------------------------
Coverage (candidates w/ > 0 labels):		63.85%
Overlap (candidates w/ > 1 labels):		49.11%
Conflict (candidates w/ conflicting labels):	13.17%


In [459]:
# Per-LF statistics; display only the first five rows.
lf_stats = training_set.lf_stats()
lf_stats[:5]

Unnamed: 0,conflicts,coverage,j,overlaps
LF_first_sentence,0.021016,0.076357,0,0.054291
LF_with_acronym,0.004203,0.035377,1,0.007706
LF_associated,0.083713,0.158669,2,0.110333
LF_influencing,0.009457,0.033275,3,0.018564
LF_identifi,0.032925,0.072504,4,0.05394


In [460]:
from snorkel.snorkel import Learner, PipelinedLearner
import snorkel.learning
from snorkel.learning import LogReg

learner = Learner(training_set, model=LogReg())

# Splitting into CV and test set: first half of the gold set is for testing,
# the second half for cross-validation.
n_half = len(gold_candidates)/2
test_candidates, cv_candidates = gold_candidates[:n_half], gold_candidates[n_half:]
test_labels, cv_labels         = gold_labels[:n_half], gold_labels[n_half:]

In [461]:
from snorkel.learning_utils import GridSearch

# Grid over two learner parameters: mu in {1e-5, 1e-7} and lf_w0 in {1.0, 2.0},
# fitted against the cross-validation half of the gold set.
gs       = GridSearch(learner, ['mu', 'lf_w0'], [[1e-5, 1e-7],[1.0,2.0]])
gs_stats = gs.fit(cv_candidates, cv_labels)

Testing mu = 1.00e-05, lf_w0 = 1.00e+00
Begin training for rate=0.01, mu=1e-05
	Learning epoch = 0	Gradient mag. = 0.201454
	Learning epoch = 250	Gradient mag. = 0.376714
	Learning epoch = 500	Gradient mag. = 0.434520
	Learning epoch = 750	Gradient mag. = 0.379716
Final gradient magnitude for rate=0.01, mu=1e-05: 0.333
Applying LFs...
Featurizing...
Testing mu = 1.00e-05, lf_w0 = 2.00e+00
Begin training for rate=0.01, mu=1e-05
	Learning epoch = 0	Gradient mag. = 0.324230
	Learning epoch = 250	Gradient mag. = 0.419418
	Learning epoch = 500	Gradient mag. = 0.366547
	Learning epoch = 750	Gradient mag. = 0.321261
Final gradient magnitude for rate=0.01, mu=1e-05: 0.283
Testing mu = 1.00e-07, lf_w0 = 1.00e+00
Begin training for rate=0.01, mu=1e-07
	Learning epoch = 0	Gradient mag. = 0.201454
	Learning epoch = 250	Gradient mag. = 0.376754
	Learning epoch = 500	Gradient mag. = 0.434512
	Learning epoch = 750	Gradient mag. = 0.379706
Final gradient magnitude for rate=0.01, mu=1e-07: 0.333
Testin

In [462]:
# display the grid-search results returned by gs.fit
gs_stats

Unnamed: 0,mu,lf_w0,Prec.,Rec.,F1
0,1e-05,1,0.301802,0.465278,0.36612
1,1e-05,2,0.27451,0.486111,0.350877
2,0.0,1,0.301802,0.465278,0.36612
3,0.0,2,0.27451,0.486111,0.350877


In [463]:
# evaluate on the held-out test half of the gold set
learner.test_wmv(test_candidates, test_labels)

Applying LFs...
Featurizing...
Test set size:	1427
----------------------------------------
Precision:	0.156996587031
Recall:		0.414414414414
F1 Score:	0.227722772277
----------------------------------------
TP: 46 | FP: 247 | TN: 475 | FN: 65


In [464]:
preds = learner.predict_wmv(phen_c)
mislabeled_cand = [(c,p, gt_dict.get(c2uid(c), None)) for c, p in zip(phen_c, preds)]# if p == 1 or p != gt_dict.get(c2uid(c), p)]
for (c,p,g) in mislabeled_cand:
    if c.context.position != 0: continue
    print c.context.document.name, p, g
    print c.context    
    print c.get_span()
    print [LF(c) for LF in LFs]
    print

Applying LFs...
Featurizing...
17447842 1.0 1
Sentence(Document('17447842', Corpus (GWAS Corpus)), 0, u'Novel Crohn Disease Locus Identified by Genome-Wide Association Maps to a Gene Desert on 5p13.1 and Modulates Expression of  PTGER4.')
Crohn Disease
[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

17447842 1.0 -1
Sentence(Document('17447842', Corpus (GWAS Corpus)), 0, u'Novel Crohn Disease Locus Identified by Genome-Wide Association Maps to a Gene Desert on 5p13.1 and Modulates Expression of  PTGER4.')
Association
[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

17447842 -1.0 -1
Sentence(Document('17447842', Corpus (GWAS Corpus)), 0, u'Novel Crohn Disease Locus Identified by Genome-Wide Association Maps to a Gene Desert on 5p13.1 and Modulates Expression of  PTGER4.')
Maps
[1, 0, 1, 0, 1, -10, 0, 0, 0, 0, 0, 0, 0, -5]

17447842 -1.0 -1
Sentence(Document('17447842', Corpus (GWAS Corpus)), 0, u'Novel Crohn Disease Locus Identified by Genome-Wide Association Maps to a Gene Desert on 5p13.1 and M

## Save and analyze the results

We store the phenotype mentions that occur in the first sentence of each paper.

In [465]:
preds = learner.predict_wmv(phen_c)

# keep the candidates with a positive prediction whose context is at position 0
results = []
for p, c in zip(preds, phen_c):
    if p > 0 and c.context.position == 0:
        results.append(c)

Applying LFs...
Featurizing...


In [466]:
# for c in results:
#     print c.context.document.name, c.get_span()

doc_set = {c.context.document.name for c in results}
missing_docs = {doc.name for doc in corpus.documents} - doc_set
docs = sorted(list(missing_docs))
print len(docs)
for d in docs:
    print d, kb.paper_by_pmid(d).title

0


In [467]:
# Debugging: all candidates of one specific paper, plus that paper's sentences.
# NOTE(review): cand[0] raises IndexError when the paper has no candidates.
cand = [c for c in phen_c if c.context.document.name == '19587794']
print cand
print cand[0].context.document.sentences[0].text
print cand[0].context.document.sentences

[Span("Genetic Variation", context=None, chars=[7,23], words=[1,2]), Span("Phospholamban", context=None, chars=[35,47], words=[5,5]), Span("Is", context=None, chars=[55,56], words=[7,7]), Span("Association", context=None, chars=[133,143], words=[17,17]), Span("QT interval,", context=None, chars=[52,63], words=[6,8]), Span("sudden cardiac death", context=None, chars=[153,172], words=[21,23]), Span("ventricular arrhythmias", context=None, chars=[125,147], words=[18,19]), Span("death,", context=None, chars=[168,173], words=[23,24]), Span("To", context=None, chars=[0,1], words=[0,0]), Span("a", context=None, chars=[65,65], words=[9,9]), Span("a", context=None, chars=[188,188], words=[27,27]), Span("association", context=None, chars=[225,235], words=[32,32]), Span("QT interval at", context=None, chars=[45,58], words=[6,8]), Span("1", context=None, chars=[62,62], words=[11,11]), Span("6", context=None, chars=[68,68], words=[15,15]), Span("To", context=None, chars=[0,1], words=[0,0]), Span("f

In [426]:
# Debugging: is the single letter 'a' an entry of the matcher's dictionary?
'a' in phen_matcher.d

True

In [427]:
# Debugging: check the type of the matcher that is currently bound.
isinstance(phen_matcher, DictionaryMatch)

True

In [428]:
# Debugging: is 'a' part of the snorkel phenotype list?
# NOTE(review): the only assignment of `phenotype_list_snorkel` in this notebook is
# commented out, so this cell depends on a value left over in the kernel.
'a' in phenotype_list_snorkel

True

In [434]:
from db import db_session
from db.schema import Phenotype
# Debugging: fetch the phenotype rows with source 'snorkel' whose name is exactly 'A'.
tmp = db_session.query(Phenotype).filter(Phenotype.source=='snorkel').filter(Phenotype.name=='A').all()

In [435]:
# show the rows returned by the query above
print tmp

[<db.schema.Phenotype object at 0x1167d6ed0>]
