# Phenotype extraction

In [12]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle

# import snorkel and gwasdb: put the snorkel checkout and the project sources on the import path.
# The paths are relative, so they are resolved against the kernel's working directory.
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# set up paths: directory handed to the XML parser below as the location of the papers
abstract_dir = '../data/db/papers'

# set up matplotlib: inline rendering with a wide (12 x 4) default figure size
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Extract phenotype candidates from papers

### Load corpus

In [13]:
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLDocParser, GWASXMLAbstractParser

# Parser for the paper XML files under `abstract_dir`.
# Every keyword except `path` and `keep_xml_tree` is an XPath expression; how each one is consumed
# is decided by GWASXMLAbstractParser (not shown in this notebook) -- presumably one per document field.
xml_parser = GWASXMLAbstractParser(
    path=abstract_dir,
    doc='./*',
    title='.//front//article-title//text()',
    abstract='.//abstract//p//text()',
    par1='.//body/p[1]//text()',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

In [14]:
from snorkel.parser import HTMLParser
from snorkel.parser import SentenceParser
from snorkel.parser import CorpusParser

sent_parser = SentenceParser()
html_parser = HTMLParser(path='../data/db/papers/')

corpus_name = 'gwas-text-corpus.pkl'

try:
    with open(corpus_name,"r") as pkl:
        corpus = cPickle.load(pkl)
except:
    cp = CorpusParser(xml_parser, sent_parser)
    %time corpus = cp.parse_corpus(name='GWAS Corpus')
    # pickling currently doesn't work...
#     with open(corpus_name,"w") as pkl:
#         corpus = cPickle.dump(corpus, pkl)

CPU times: user 13.5 s, sys: 994 ms, total: 14.5 s
Wall time: 1min 28s


### Extract candidates

In [None]:
from snorkel.candidates import Ngrams
from snorkel.matchers import DictionaryMatch, Union, RegexMatchSpan
from snorkel.candidates import EntityExtractor
from snorkel.utils import slice_into_ngrams

from extractor.util import change_name
from extractor.matcher import PhenotypeMatcher
from db.kb import KnowledgeBase

def make_ngrams(L, n_max=10, n_min=3, delim=' '):
    """Expand each phrase in L into itself plus the n-grams built from its tokens.

    For every string in L the string itself is yielded first; it is then stripped,
    split on `delim`, and everything `slice_into_ngrams` produces for those tokens
    (with the given n_max / n_min / delim) is yielded after it.
    """
    for phrase in L:
        yield phrase
        pieces = phrase.strip().split(delim)
        sub_phrases = slice_into_ngrams(pieces, n_max=n_max, n_min=n_min, delim=delim)
        for sub_phrase in sub_phrases:
            yield sub_phrase

# Define a candidate space
ngrams = Ngrams(n_max=7)

# collect phenotype list
kb = KnowledgeBase()
# efo phenotypes
efo_phenotype_list0 = kb.get_phenotype_candidates(source='efo-matching', peek=False) # TODO: remove peaking
efo_phenotype_list = list(make_ngrams(efo_phenotype_list0))
# snomed keywords
snomed_phenotype_list = kb.get_phenotype_candidates(source='snomed')
# mesh diseases
mesh_phenotype_list0 = kb.get_phenotype_candidates(source='mesh')
mesh_phenotype_list = list(make_ngrams(mesh_phenotype_list0))
# mesh chemicals
chem_phenotype_list = kb.get_phenotype_candidates(source='chemical')
# regex matches
rgx = u'[A-Za-z\u2013-]+ (disease|trait|phenotype|outcome|response|quantitative trait|measurement|response)s?'

# Define matchers
efo_phen_matcher = PhenotypeMatcher(d=efo_phenotype_list, ignore_case=True, mod_fn=change_name)
snom_phen_matcher = PhenotypeMatcher(d=snomed_phenotype_list, ignore_case=True, mod_fn=change_name)
mesh_phen_matcher = PhenotypeMatcher(d=mesh_phenotype_list, ignore_case=True, mod_fn=change_name)
chem_phen_matcher = DictionaryMatch(d=chem_phenotype_list, longest_match_only=True, ignore_case=True)
regex_phen_matcher = RegexMatchSpan(rgx=rgx)
phen_matcher = Union(efo_phen_matcher, snom_phen_matcher, mesh_phen_matcher, chem_phen_matcher, regex_phen_matcher)
# phen_matcher = PhenotypeMatcher(d=phenotype_list, ignore_case=True, mod_fn=change_name)

# Extractor
phen_extractor = EntityExtractor(ngrams, phen_matcher)

# collect candidates
%time phen_c = phen_extractor.extract(corpus.get_sentences(), name='all')
print len(phen_c), 'candidates extracted'

In [None]:
# NOTE(review): this is a verbatim copy of the make_ngrams defined in the candidate-extraction
# cell above; running this cell only rebinds the same name to an identical function.
def make_ngrams(L, n_max=10, n_min=3, delim=' '):
    """Yield each string in L, followed by the n-grams built from its tokens.

    Each string is stripped and split on `delim`; the tokens are passed to
    `slice_into_ngrams` together with n_max, n_min and delim, and everything it
    produces is yielded after the original string.
    """
    for l in L:
        yield l
        tokens = l.strip().split(delim)
        for ngram in slice_into_ngrams(tokens, n_max=n_max, n_min=n_min, delim=delim):
            yield ngram

We would like to remove nested candidates (manually, for now).

In [None]:
# group the (char_start, char_end) interval of every candidate by the string form of its context
span_dict = dict()
for span in phen_c:
    span_dict.setdefault(str(span.context), list()).append( (span.char_start, span.char_end) )

def nested(ivl1, ivl2):
    """Return True when interval ivl1 lies inside a different interval ivl2.

    Intervals are (start, end) pairs. Two equal intervals are never considered
    nested; otherwise ivl1 is nested when ivl2's start is <= ivl1's start and
    ivl1's end is <= ivl2's end.
    """
    if not (ivl1 != ivl2):
        return False
    return bool(ivl2[0] <= ivl1[0] <= ivl1[1] <= ivl2[1])

new_phen_c = list()
for span in phen_c:
    span_ivl = span.char_start, span.char_end
    span_name = str(span.context)
    if all([not nested(span_ivl, other_ivl) for other_ivl in span_dict[span_name]]):
        new_phen_c.append(span)
        
print len(phen_c) - len(new_phen_c), 'candidates dropped, now we have', len(new_phen_c)
phen_c = new_phen_c

### Create gold-truth set

In [None]:
from db.kb import KnowledgeBase
from nltk.stem import PorterStemmer
from extractor.util import change_name

kb = KnowledgeBase() # reload
gold_set_phens = frozenset \
([ 
    (doc.name, phen.ontology_ref) for doc in corpus.documents 
                                  for phen in kb.phen_by_pmid(doc.name, source='efo')
])

# map phenotype names to their id
phen2id = \
{
    change_name(syn) : phen.ontology_ref for doc in corpus.documents
    
                                                  for phen in kb.phen_by_pmid(doc.name, source='efo')
                                                  for syn in [phen.name] + phen.synonyms.split('|')
}

# this is the more correct version (code below should be changed to use it)
# we collect a set of EFOs matching a string; we use this only at the very end for now
phen2idset = dict()
for doc in corpus.documents:
    for phen in kb.phen_by_pmid(doc.name, source='efo'):
        synonyms = [phen.name] + phen.synonyms.split('|')
        ngrams = make_ngrams(synonyms)
        for ngram in ngrams:
            syn_name = change_name(ngram)
            if syn_name not in phen2idset: phen2idset[syn_name] = set()
            phen2idset[syn_name].add(phen.id)

id2phen = \
{
    phen.ontology_ref : phen for doc in corpus.documents
                             for phen in kb.phen_by_pmid(doc.name, source='efo')
}

print 'Found %d gold mentions, e.g.:' % len(gold_set_phens)
print list(gold_set_phens)[:5]
print len(phen2id), len(id2phen)

In [None]:
from db import db_session
from db.schema import *

# paper = db_session.query(Paper).filter(Paper.pubmed_id=='20195266').first()
# print [ (assoc.phenotype.name, [(p.name, p.source) for p in assoc.phenotype.equivalents]) for assoc in paper.associations ]
# print first_degree_phens

from db.kb import KnowledgeBase
# Spot check: how many EFO phenotypes the knowledge base returns for one hard-coded paper id
kb = KnowledgeBase() # reload
print len(kb.phen_by_pmid('20195266', source='efo'))

## Statistics

First, we need to understand whether the extraction or the classification approach will be better.

### Number of phenotypes per paper

#### Number of EFO phenotypes per paper

In [None]:
docid2efo = \
{
    doc.name : [ phen.name for phen in kb.phen_by_pmid(doc.name, source='efo') ] for doc in corpus.documents
}

#### Number of GWAS catalog (i.e. aggregate) phenotypes per paper

In [None]:
docid2efo = \
{
    doc.name : [ phen.name for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog') ] for doc in corpus.documents
}
print 'GWAS catalog phenotype numbers per paper:', sorted([len(v) for d, v in docid2efo.items()])

### Candidate recall statistics

#### Over EFO phenotypes

In [None]:
from extractor.util import gold_phen_stats
gold_phen_stats(phen_c, gold_set_phens, phen2id)

Why is the recall low?

In [15]:
from extractor.util import gold_phen_recall
kb = KnowledgeBase()
id2doc = {doc.name : doc for doc in corpus.documents}
gold_dict_phen = { doc_id : set() for doc_id, phen_id in gold_set_phens }
for doc_id, phen_id in gold_set_phens:
    gold_dict_phen[doc_id].add(phen_id)

phen_not_found = list(gold_phen_recall(phen_c, gold_set_phens, phen2id))
print len(phen_not_found)
for doc_id, phen_id in phen_not_found[:100]:
    if len(gold_dict_phen[doc_id]) > 3: continue
    print doc_id
    for phen_id2 in gold_dict_phen[doc_id]:
        print id2phen[phen_id2].name, phen_id2
#     print gold_dict_phen[doc_id]
    print id2phen[phen_id].name, id2phen[phen_id].ontology_ref
    print id2phen[phen_id].synonyms    
    print kb.title_by_pmid(doc_id)
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    print

KeyError: '17447842'

#### Over aggregate phenotypes

We say that a mention of an aggregate phenotype is correct if it matches the name of the GWAS catalog (GWC) phenotype, or the name or a synonym of any EFO phenotype equivalent to it.

In [39]:
from db.kb import KnowledgeBase
from nltk.stem import PorterStemmer
from extractor.util import change_name

kb = KnowledgeBase() # reload
gold_set_agg_phens = frozenset \
([ 
    (doc.name, phen.id) for doc in corpus.documents 
                        for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog')
])

# map phenotype names to their id (EFO syn -> GWC id)
agg_phen2id = dict()
for doc in corpus.documents:
    for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog'):
        for eq_phen in phen.equivalents:
            for syn in [phen.name] + [eq_phen.name] + eq_phen.synonyms.split('|'):
                syn_name = change_name(syn)
                if syn_name not in agg_phen2id: agg_phen2id[syn_name] = set()
                agg_phen2id[syn_name].add(phen.id)

# map ids to phenotypes (GWC id -> GWC phen obj)                
agg_id2phen = \
{
    phen.id : phen for doc in corpus.documents
                   for phen in kb.phen_by_pmid(doc.name, source='gwas_catalog')
}

print 'Found %d gold mentions, e.g.:' % len(gold_set_phens)
print list(gold_set_phens)[:5]
print len(phen2id), len(id2phen)

from extractor.util import gold_phen_stats, gold_agg_phen_stats
gold_agg_phen_stats(phen_c, gold_set_agg_phens, agg_phen2id)

Found 0 gold mentions, e.g.:
[]
0 0
Statistics over EFO phenotypes:
# of gold annotations	= 9747
# of candidates		= 49091
# of correct candidates	= 0
Candidate recall	= 0.000
Candidate precision	= 0.000


Why is the recall low?

In [15]:
from extractor.util import gold_agg_phen_recall
kb = KnowledgeBase()
id2doc = {doc.name : doc for doc in corpus.documents}
gold_agg_dict_phen = { doc_id : set() for doc_id, phen_id in gold_set_agg_phens }
for doc_id, phen_id in gold_set_agg_phens:
    gold_agg_dict_phen[doc_id].add(phen_id)

# this contains (doc_id, gwc_id) pairs from gold_set_agg_phens that haven't been found
phen_not_found = list(gold_agg_phen_recall(phen_c, gold_set_agg_phens, agg_phen2id))
print len(phen_not_found)
for doc_id, phen_id in phen_not_found[:100]:
    if len(gold_agg_dict_phen[doc_id]) > 3: continue # skip if >3 gwc_phen_ids in doc
    print doc_id
    for phen_id2 in gold_agg_dict_phen[doc_id]: # iterate over gwc_id's in doc
        print agg_id2phen[phen_id2].name, phen_id2 # print its name and id
        for eq_phen in agg_id2phen[phen_id2].equivalents: # loop over equivalent phens
            print '\t', eq_phen.name, eq_phen.ontology_ref, eq_phen.synonyms # print name, ref, synonyms
    print agg_id2phen[phen_id].name # print the one that we haven't found
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    print

1003
22383897
breast cancer 25571
	breast carcinoma http://www.ebi.ac.uk/efo/EFO_0000305 breast neoplasms|neoplasm
breast cancer 25572
	breast carcinoma http://www.ebi.ac.uk/efo/EFO_0000305 breast neoplasms|neoplasm
breast cancer 25573
	breast carcinoma http://www.ebi.ac.uk/efo/EFO_0000305 breast neoplasms|neoplasm
breast cancer
Genome-Wide Association Study in East Asians Identifies Novel Susceptibility Loci for Breast Cancer. Genetic factors play an important role in the etiology of both sporadic and familial breast cancer. We aimed to discover novel genetic susceptibility loci for breast cancer. We conducted a four-stage genome-wide association study (GWAS) in 19,091 cases and 20,606 controls of East-Asian descent including Chinese, Korean, and Japanese women. After analyzing 690,947 SNPs in 2,918 cases and 2,324 controls, we evaluated 5,365 SNPs for replication in 3,972 cases and 3,852 controls. Ninety-four SNPs were further evaluated in 5,203 cases and 5,138 controls, and finally 

In [85]:
# NOTE(review): `doc_id` is whatever value an earlier cell left behind (no assignment in this cell),
# so the sentence shown depends on execution order.
print id2doc[doc_id].sentences[0].words

[u'Genome-wide', u'association', u'study', u'identifies', u'multiple', u'susceptibility', u'loci', u'for', u'pulmonary', u'fibrosis', u'.']


In [89]:
# FOR DEBUGGING WHY SPANS ARENT MATCHED
# For every n-gram of the first sentence of one hard-coded paper: print the span and, when the
# combined matcher accepts it, print several dictionary-membership checks and the gold lookup.
# NOTE(review): `phenotype_list` is only assigned in the labeling-function setup cell further down,
# so on a fresh kernel this cell fails with a NameError unless that cell was run first.
from extractor.util import change_name

doc_id = '23583980'
# re-create the candidate space (the gold-set cell rebinds the name `ngrams` to a generator)
ngrams = Ngrams(n_max=7)
print id2doc[doc_id].sentences[0]
for span in ngrams.apply(id2doc[doc_id].sentences[0]):
    print span.get_span()
    if phen_matcher._f(span):    
        phen_name = span.get_span()
        print phen_name, change_name(phen_name)
        print '...', phen_name == 'fibrosis', phen_name in phenotype_list, change_name(phen_name) in phenotype_list, phen_name in efo_phen_matcher.d, change_name(phen_name) in efo_phen_matcher.d
        phen_id = phen2id.get(change_name(phen_name), None)
        print phen_id
        # show the gold ids of the paper whenever the span does not resolve to one of them
        if not phen_id or phen_id not in gold_dict_phen[span.context.document.name]:
            print span.context.document.name, phen_id
            print gold_dict_phen[span.context.document.name]
        
        print

Sentence(Document('23583980', Corpus (GWAS Corpus)), 0, u'Genome-wide association study identifies multiple susceptibility loci for pulmonary fibrosis.')
Genome-wide association study identifies multiple susceptibility loci
Genome-wide association study identifies multiple susceptibility loci genom wide associ studi identifi multipl suscept loci
... False False False False False
None
23583980 None
set([u'http://www.ebi.ac.uk/efo/EFO_0004244'])

association study identifies multiple susceptibility loci for
association study identifies multiple susceptibility loci for associ studi identifi multipl suscept loci for
... False False False False False
None
23583980 None
set([u'http://www.ebi.ac.uk/efo/EFO_0004244'])

study identifies multiple susceptibility loci for pulmonary
study identifies multiple susceptibility loci for pulmonary studi identifi multipl suscept loci for pulmonari
... False False False False False
None
23583980 None
set([u'http://www.ebi.ac.uk/efo/EFO_0004244'])

identifi

In [124]:
# Compare the entries containing 'alpha' in the raw EFO candidate list with those in the
# dictionary held by the EFO matcher (its `d` attribute).
print [ph for ph in efo_phenotype_list0 if 'alpha' in ph]
print [ph for ph in efo_phen_matcher.d if 'alpha' in ph]

[u'cd8_alpha-negative plasmactyoid dendritic cell', u'alpha-hydro-omega-hydroxypoly(oxyethylene)', u'ajmalan-17alpha', u'(-)-(s)-alpha-ethyl-2-oxo-1-pyrrolidineacetamide', u'alpha-l-fucosidase deficiency', u'branched chain alpha-ketoacid dehydrogenase complex deficiency', u'tumor necrosis factor-alpha', u'primary alpha-dystroglycanopathy', u'alpha-thalassemia - x-linked intellectual disability syndrome', u' alpha-beta intraepithelial t cell', u'alpha-n-acetylgalactosaminidase deficiency', u'(3alpha', u'cd8-alpha-beta-positive', u'l-alpha-acetamido-beta-mercaptopropionic acid', u'fc-epsilon rialpha-high basophil progenitor cell', u'cd8_alpha-positive cd11b-negative dendritic cell', u'alpha globulin measurement', u'hyperphenylalaninemia due to pterin-4-alpha-carbinolamine dehydratase deficiency', u'alpha-tocopherol metabolic process', u'(22s)-5alpha-campestane-3beta', u'(s)-(+)-alpha-amino-4-carboxy-2-methylbenzeneacetic', u't-b+ scid due to il-7ralpha deficiency', u'cd8alpha-negative th

In [128]:
query_word = 'alpha'
from db import db_session
from db.schema import *

phenotypes = db_session.query(Phenotype).filter(Phenotype.source=='efo').all()
# phenotypes == kb.get_phenotype_candidates_cheating()
phenotype_names = set()
for phenotype in phenotypes:
    if phenotype.name:
        phenotype_names.add((phenotype.name))
        synonyms = [(syn) for syn in phenotype.synonyms.split('|')]
        if query_word in synonyms or query_word == phenotype.name:
            print phenotype.name, phenotype.ontology_ref
        phenotype_names.update(synonyms)

polyethylene glycol http://purl.obolibrary.org/obo/CHEBI_46793
anastrozole http://purl.obolibrary.org/obo/CHEBI_2704
lysine http://purl.obolibrary.org/obo/CHEBI_25094
flutamide http://purl.obolibrary.org/obo/CHEBI_5132
dimercaprol http://purl.obolibrary.org/obo/CHEBI_64198
mating type alpha http://www.ebi.ac.uk/efo/EFO_0001270


In [18]:
# print len(phenotype_names)
# [(word, change_name(word)) for word in phenotype_list if change_name(word) == change_name('personalized')]

## Extraction performance

First, extract some features for each candidate mention.

In [None]:
import cPickle        
from snorkel.features import NgramFeaturizer

# pkl_f = 'phenotype_feats.pkl'
# try:
#     with open(pkl_f, 'rb') as f:
#         featurizer = cPickle.load(f)
# except:
featurizer = NgramFeaturizer()
featurizer.fit_transform(phen_c)

print 'Example features:', [f for f in featurizer.get_features_by_candidate(phen_c[0])[:10]]

In [None]:
# classify candidates as correct or not

# doc_id -> set of correct gwc_ids
gold_agg_dict_phen = dict()
for doc_id, phen_id in gold_set_agg_phens:
    gold_agg_dict_phen.setdefault(doc_id, set()).add(phen_id)

def c2uid(candidate):
    """Build a key for a candidate from where it sits in the corpus.

    Returns the tuple (document name, context position, char_start, char_end).
    """
    ctx = candidate.context
    return (ctx.document.name, ctx.position, candidate.char_start, candidate.char_end)
    
gt_dict_pos = dict()
gt_dict_neg = dict()
for candidate in phen_c:
    doc_id = candidate.context.document.name
    agg_ids = agg_phen2id.get(change_name(candidate.get_span()), set())
    uid = c2uid(candidate)
    if agg_ids & gold_agg_dict_phen[doc_id]:
        gt_dict_pos[uid] = +1
    else:
        gt_dict_neg[uid] = -1

gt_dict = dict(gt_dict_pos.items() + gt_dict_neg.items())
print 'Defined %d positive and %d negative gold mentions' % (len(gt_dict_pos), len(gt_dict_neg))

In [None]:
# let's look at a few examples
for doc in corpus.documents[:10]:
    doc_id = doc.name
    print doc.name
    for phen_id2 in gold_agg_dict_phen[doc_id]: # iterate over gwc_id's in doc
        print agg_id2phen[phen_id2].name, phen_id2 # print its name and id
        for eq_phen in agg_id2phen[phen_id2].equivalents: # loop over equivalent phens
            print '\t', eq_phen.name, eq_phen.ontology_ref, eq_phen.synonyms # print name, ref, synonyms
    print ' '.join(s.text for s in id2doc[doc_id].sentences)
    for candidate in phen_c:
        if candidate.context.document != doc: continue
#         print gt_dict[candidate.uid], candidate.get_span(), [agg_id2phen[phen_id].name for phen_id in phen2id.get(change_name(candidate.get_span()),set())]
        print gt_dict[c2uid(candidate)], candidate.get_span(), 
        phen_id = phen2id.get(change_name(candidate.get_span()), None)
        if phen_id: print '|', id2phen[phen_id].name, id2phen[phen_id].ontology_ref,
        print
    print

Create training set

In [None]:
import numpy as np

# Split into train and test set
candidates = phen_c
training_candidates = []
gold_candidates     = []
gold_labels         = []
n_half = len(candidates)/2
for c in candidates[:n_half]:
    uid = c2uid(c)
    if uid in gt_dict:
        gold_candidates.append(c)
        gold_labels.append(gt_dict[uid])
    else:
        training_candidates.append(c)
training_candidates.extend(candidates[n_half:])
gold_labels = np.array(gold_labels)
print "Training set size: %s" % len(training_candidates)
print "Gold set size: %s" % len(gold_candidates)
print "Positive labels in training set: %s" % len([c for c in training_candidates if gt_dict.get(c2uid(c),0)==1])
print "Negative labels in training set: %s" % len([c for c in training_candidates if gt_dict.get(c2uid(c),0)==-1])
print "Positive labels in gold set: %s" % len([c for c in gold_candidates if gt_dict[c2uid(c)]==1])
print "Negative labels in gold set: %s" % len([c for c in gold_candidates if gt_dict[c2uid(c)]==-1])

In [None]:
print phen_c[0]
print phen_c[0].post_window(d=4)
print phen_c[0].get_attrib_span('words')
print re.search(r'\([A-Z]{2,4}\)', 'Test [AVC)')

In [None]:
from nltk.stem import PorterStemmer
import re
stemmer = PorterStemmer()

# load set of dictionary phenotypes
kb = KnowledgeBase()
phenotype_list = list(kb.get_phenotype_candidates()) # TODO: load disease names from NCBI
phenotype_set = set(phenotype_list)

# load stopwords: manual list from disk + hand-picked generic terms + NLTK English stopwords
with open('../data/phenotypes/snorkel/dicts/manual_stopwords.txt') as f:
    stopwords = set(line.strip() for line in f)
stopwords.update(['analysis', 'age', 'drug', 'community', 'detect', 'activity', 'genome',
                  'genetic', 'phenotype', 'response', 'population', 'parameter', 'diagnosis',
                  'level', 'survival', 'maternal', 'paternal', 'clinical', 'joint', 'related',
                  'status', 'risk', 'protein', 'association', 'signal', 'pathway', 'genotype', 'scale'])
from nltk.corpus import stopwords as nltk_stopwords
stopwords.update(nltk_stopwords.words('english'))
# from here on the set holds only the stemmed form of every stopword
stopwords = set(stemmer.stem(word) for word in stopwords)

# small helpers
def get_phenotype(entity, stem=False):
    """Return the lower-cased span text of `entity`, stemmed first when `stem` is true."""
    text = entity.get_span()
    return (stemmer.stem(text) if stem else text).lower()

def stem_list(L):
    """Return a new list holding the stem of the lower-cased form of every item of L."""
    stems = []
    for item in L:
        stems.append(stemmer.stem(item.lower()))
    return stems

def LF_gt(m):
    """Return the gold label of `m` if it is a training candidate, otherwise 0.

    Candidates without an entry in gt_dict also get 0.
    """
    # gt_dict is keyed by the c2uid() tuple (see the cell that builds it), so the lookup has to
    # use c2uid(m); the previous `m.uid` key never matched those tuples.
    return gt_dict.get(c2uid(m), 0) if m in training_candidates else 0

# positive LFs
def LF_first_sentence(m):
    """+10 when the mention's context is at position 0 of its document, else 0."""
    if m.context.position == 0:
        return +10
    return 0

def LF_from_regex(m):
    """+5 for a position-0 mention that the regex matcher rejects and LF_bad_words does not flag."""
    # NOTE(review): the name suggests the vote is meant for mentions that DO match the regex,
    # but the condition requires the regex matcher to reject the span -- confirm the `not`.
    if m.context.position != 0:
        return 0
    if regex_phen_matcher._f(m):
        return 0
    if LF_bad_words(m):
        return 0
    return +5

def LF_with_acronym(m):
    """+1 when the five-word window after the mention contains a 2-4 capital-letter acronym in parentheses."""
    # the window tokens are concatenated without separator, so '(', 'ABC', ')' become '(ABC)'
    trailing = ''.join(m.post_window('words',d=5))
    if re.search(r'\([A-Z]{2,4}\)', trailing):
        return +1
    return 0

def LF_many_words(m):
    """+1 when the span has at least three whitespace-separated words, else 0."""
    n_words = len(m.get_span().split())
    return +1 if n_words >= 3 else 0

def LF_start_of_sentence(m):
    """+1 when the mention's word start index is at most 3, else 0."""
    if m.get_word_start() <= 3:
        return +1
    return 0

# NOTE(review): LF_start_of_sentence is defined but not part of this list -- confirm that is intended.
LFs_pos = [LF_first_sentence, LF_with_acronym, LF_from_regex, LF_many_words]

# negative LFs
def LF_bad_words(m):
    """-100 when the lower-cased span starts with one of a few hand-picked prefixes, else 0."""
    bad_words = ['disease', 'single', 'map', 'genetic variation', '( p <']
    for prefix in bad_words:
        if m.get_span().lower().startswith(prefix):
            return -100
    return 0

def LF_short(m):
    """-50 when the 'words' attribute span of the mention is shorter than five characters, else 0."""
    words_txt = m.get_attrib_span('words', 3)
    if len(words_txt) < 5:
        return -50
    return 0

def LF_no_nouns(m):
    """-5 when none of the mention's 'poses' tokens starts with 'NN', else 0."""
    for tag in m.get_attrib_tokens('poses'):
        if tag.startswith('NN'):
            return 0
    return -5

def LF_pvalue(m):
    """-100 when the lower-cased span contains 'p <' or 'p =', else 0."""
    txt = m.get_span().lower()
    if 'p <' in txt or 'p =' in txt:
        return -100
    return 0

def LF_stopwords(m):
    """-10 when every word of the span is a stopword, either as is, stemmed, or after change_name.

    The three forms are tested one after the other, each over all words; an empty
    span therefore also gets -10.
    """
    words = m.get_span().lower().split()
    only_stopwords = (
        all(word in stopwords for word in words) or
        all(stemmer.stem(word) in stopwords for word in words) or
        all(change_name(word) in stopwords for word in words)
    )
    return -10 if only_stopwords else 0


LFs_neg = [LF_bad_words, LF_short, LF_no_nouns, LF_pvalue, LF_stopwords]

# full list of labeling functions handed to the training set: positive ones first, then negative ones
LFs = LFs_pos + LFs_neg

In [None]:
from snorkel.snorkel import TrainingSet
from snorkel.features import NgramFeaturizer

# Build the training set from the training candidates and the labeling functions.
# Note that a fresh NgramFeaturizer is passed here, not the `featurizer` fitted on phen_c earlier.
training_set = TrainingSet(training_candidates, LFs, featurizer=NgramFeaturizer())

In [None]:
# per-labeling-function statistics reported by the training set; display the first five entries
lf_stats = training_set.lf_stats()
lf_stats[:5]

In [None]:
from snorkel.snorkel import Learner, PipelinedLearner
import snorkel.learning
from snorkel.learning import LogReg

# learner over the training set with a LogReg model
learner = Learner(training_set, model=LogReg())

# Splitting into CV and test set: first half of the gold set is for testing, second half for grid search
n_half = len(gold_candidates) // 2
test_candidates, cv_candidates = gold_candidates[:n_half], gold_candidates[n_half:]
test_labels, cv_labels         = gold_labels[:n_half], gold_labels[n_half:]

In [None]:
from snorkel.learning_utils import GridSearch

# grid search over two values each of the learner parameters 'mu' and 'lf_w0', fitted on the CV half of the gold set
gs       = GridSearch(learner, ['mu', 'lf_w0'], [[1e-5, 1e-7],[1.0,2.0]])
gs_stats = gs.fit(cv_candidates, cv_labels)

In [None]:
# display what the grid search fit returned
gs_stats

In [None]:
# evaluate the learner on the held-out test half of the gold set
learner.test_wmv(test_candidates, test_labels)

## Save and analyze the results

### Analyze / Visualize

If a mention occurs in the title, it's probably correct, so we can take it.

Question: what papers did not have any disease mentions in the title?

In [None]:
preds = learner.predict_wmv(phen_c)
results = [c for p, c in zip(preds, phen_c) if p > 0 and c.context.position == 0]
doc_set = {c.context.document.name for c in results}
missing_docs = {doc.name for doc in corpus.documents} - doc_set
docs = sorted(list(missing_docs))
print len(docs)
for d in missing_docs:
    print d, kb.paper_by_pmid(d).title

Let's now visualize what we found.

In [None]:
# document name -> list of (score, candidate) pairs, one per candidate of that document
scores = learner.score_wmv(phen_c)
score_dict = { doc.name : list() for doc in corpus.documents }
for s, c in zip(scores, phen_c):
    score_dict[c.context.document.name].append((s,c))

# document name -> candidate ranked first when the pairs are sorted in descending order
results = dict()
for pmid, preds in score_dict.items():
    if not preds:
        continue
    best_c = sorted(preds, reverse=True)[0][1]
    results[best_c.context.document.name] = best_c
    

In [None]:
# doc_set = {c.context.document.name for c in results}
# missing_docs = {doc.name for doc in corpus.documents} - doc_set
# docs = sorted(list(missing_docs))
# print len(docs)
for d in corpus.documents:
    print d, kb.paper_by_pmid(d.name).title
    print unicode(results.get(d.name, None)), LF_stopwords(results.get(d.name))
    try:
        print sorted(score_dict[d.name], reverse=True)[:5]
    except UnicodeEncodeError:
        print 'Unicode error'
    print

In [None]:
# print 'waist circumference' in phenotype_list
# [ph for ph in phenotype_list if 'sex' in ph]
# entries of the expanded EFO dictionary that contain 'waist'
[ph for ph in efo_phenotype_list if 'waist' in ph]


### Save results

In [285]:
# Write one "<document name>\t<phenotype(s)>\t" line per document.
with open('phenotypes.extracted.tsv', 'w') as f:
    for d in corpus.documents:
        # pick the top two results:
        best = sorted(score_dict[d.name], reverse=True)[:2]
        if not best:
            # no candidate was extracted for this document: keep its row, with an empty
            # phenotype column, instead of failing on best[0]
            phen = u''
        # if both are in title, report both, otherwise report only the best one
        # NOTE(review): only the runner-up's position and score (> 5) are checked here -- confirm
        # that this is enough to imply the top result is in the title as well.
        elif len(best) > 1 and best[1][1].context.position == 0 and best[1][0] > 5:
            (_, r1), (_, r2) = best
            phen = r1.get_span() + '|' + r2.get_span()
        else:
            phen = best[0][1].get_span()
        # a span may contain line breaks, which would break the one-line-per-document format
        phen = re.sub('\n', ' ', phen)
        out_str = u'%s\t%s\t\n' % (d.name, phen)        
        f.write(out_str.encode("UTF-8"))
        