# SNP/Phenotype detection from raw text

In [1]:
# Pick up edits to imported project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

import numpy as np
import cPickle, os, sys
# Seed NumPy's global RNG so any randomised step is repeatable.
np.random.seed(seed=1701)
import matplotlib
%matplotlib inline
# Default figure size (width, height) for every plot in this notebook.
matplotlib.rcParams['figure.figsize'] = (18,6)

In [2]:
# Make the snorkel checkout and the project sources importable.
for extra_path in ('../snorkel', '../src'):
    sys.path.append(extra_path)

# Directory holding the paper XML files.
abstract_dir = '../data/db/papers'

## Assemble a corpus

Collect all the tables in our dataset

In [3]:
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLTableDocParser

# Document parser over the XML files under `abstract_dir`. The XPath arguments
# select: `doc`  -> every child of the file's root element,
#         `text` -> every <table> element below a document,
#         `id`   -> the text of the article-id element tagged as a pmid.
# NOTE(review): how the parser consumes these arguments (and keep_xml_tree)
# is defined in extractor.parser, which is not visible here -- confirm there.
xml_parser = UnicodeXMLTableDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

In [4]:
import lxml.etree as et
with open('../data/db/papers/19305408.xml') as f:
    for i,doc in enumerate(et.parse(f).xpath('./*')):
        print doc
#         print et.tostring(doc.xpath('.//table')[0])
#         print doc.xpath('.//table')
#         print '\n'.join([et.tostring(elem) for elem in doc.xpath('.//table') if elem is not None])

<Element article at 0x10bfbf280>


In [5]:
from snorkel.parser import HTMLParser
from extractor.parser import UnicodeTableParser
from snorkel.parser import CorpusParser
import cPickle

table_parser = UnicodeTableParser()
html_parser = HTMLParser(path='../data/db/papers/')

corpus_name = 'gwas-table-corpus.pkl'

try:
    with open(corpus_name,"r") as pkl:
        corpus = cPickle.load(pkl)
except:
    cp = CorpusParser(xml_parser, table_parser, max_docs=50)
    %time corpus = cp.parse_corpus(name='GWAS Corpus')
    # pickling currently doesn't work...
#     with open(corpus_name,"w") as pkl:
#         corpus = cPickle.dump(corpus, pkl)

CPU times: user 2min 4s, sys: 6.6 s, total: 2min 10s
Wall time: 3min 6s


In [6]:
# pickling currently doesn't work...
# import cPickle
# with open(corpus_name,"w") as pkl:
#     corpus = cPickle.dump(corpus, pkl)

In [7]:
# Peek at the first parsed document to confirm the corpus is populated.
print corpus.documents[0]

Document('17447842', Corpus (GWAS Corpus))


In [8]:
# Debug scaffold: walk the cells of one table (hard-coded index 97).
# Uncomment the print to dump each cell's row, column and text.
for cell in corpus.get_tables()[97].cells:
#     print cell.row_num , cell.col_num, cell.text
    pass

In [9]:
# Each table keeps a reference to the document it came from.
print corpus.get_tables()[100].document

Document('18604267', Corpus (GWAS Corpus))


In [10]:
from snorkel.models import Context
# A freshly constructed Context has no id assigned (this prints None).
context = Context()
print context.id

None


## Try extracting rs-ids first

In [11]:
# add new paths
sys.path.append('../src/crawler')
sys.path.append('../src/crawler/db')

# import new libs
from kb import KnowledgeBase
from extractor.util import gold_rsid_stats, gold_rsid_precision

from snorkel.candidates import Ngrams
from snorkel.matchers import DictionaryMatch, RegexMatchSpan, Union
from snorkel.candidates import EntityExtractor

# from snorkel.candidates import Candidates

### Create a gold set

Get a list of all the rs-ids we know.

In [12]:
# Handle to the knowledge base; queried below for rsids, p-values and
# associations by PubMed id.
# NOTE(review): its backing store is configured in the kb module -- confirm there.
kb = KnowledgeBase()

In [13]:
# Candidate rsids from the knowledge base; used as the dictionary for the
# dictionary matcher below. Print one as a sanity check.
rs_ids = kb.get_rsid_candidates()
print rs_ids[0]

rs4950928


Get a gold set:

In [14]:
# Gold annotations: one (pmid, rsid) pair for every rsid the knowledge base
# lists for a document of the corpus (document names are PubMed ids).
gold_set = frozenset(
    (doc.name, rs_id)
    for doc in corpus.documents
    for rs_id in kb.rsids_by_pmid(int(doc.name))
)
# Just the rsid half of every gold pair (may contain repeats across papers).
gold_set_rsids = [pair[1] for pair in gold_set]

In [15]:
# Group the gold rsids by paper: pmid -> set of rsids.
gold_rsid_dict = {}
for doc_id, rs_id in gold_set:
    gold_rsid_dict.setdefault(doc_id, set()).add(rs_id)

In [16]:
# Number of distinct gold (pmid, rsid) pairs.
print len(gold_set)

437


### Extract candidates:

In [17]:
from snorkel.candidates import EntityExtractor
from snorkel.candidates import TableNgrams

# Define a candidate space
# (n_max=1 -- presumably single-token n-grams taken from table cells; confirm in snorkel)
ngrams = TableNgrams(n_max=1)

# Define matchers
#   dict_rsid_matcher: dictionary built from every rsid the knowledge base returned
#   gold_rsid_matcher: dictionary built from the gold rsids only (not used by the extractor below)
#   regx_rsid_matcher: the pattern "rs" followed by one or more digits
# The extractor uses the union of the dictionary and regex matchers.
dict_rsid_matcher = DictionaryMatch(d=rs_ids, longest_match_only=False)
gold_rsid_matcher = DictionaryMatch(d=gold_set_rsids, longest_match_only=False)
regx_rsid_matcher = RegexMatchSpan(rgx=r'rs\d+')
rsid_matcher = Union(dict_rsid_matcher, regx_rsid_matcher)

# Run the extractor over every table in the corpus (timed).
rsid_extractor = EntityExtractor(ngrams, rsid_matcher)
%time rs_candidates = rsid_extractor.extract(corpus.get_tables(), name='all')

# # collect candidates
# %time gold_dict_c = Candidates(ngrams, gold_rsid_matcher, corpus.get_contexts())
# %time rsid_c = Candidates(ngrams, rsid_matcher, corpus.get_contexts())

CPU times: user 2.45 s, sys: 42.6 ms, total: 2.49 s
Wall time: 2.49 s


In [18]:
# Show the first ten rsid candidates and the total count, then the phrase and
# table cell that the first candidate was extracted from.
for cand in rs_candidates[:10]: 
    print cand
print "%s candidates extracted" % len(rs_candidates)
print rs_candidates[0].context
print rs_candidates[0].context.cell

Span("rs2076756", context=None, chars=[0,8], words=[0,0])
Span("rs1992662", context=None, chars=[0,8], words=[0,0])
Span("rs1992660", context=None, chars=[0,8], words=[0,0])
Span("rs1793004", context=None, chars=[0,8], words=[0,0])
Span("rs10521209", context=None, chars=[0,9], words=[0,0])
Span("rs2631372", context=None, chars=[0,8], words=[0,0])
Span("rs2925757", context=None, chars=[0,8], words=[0,0])
Span("rs6947579", context=None, chars=[0,8], words=[0,0])
Span("rs1553575", context=None, chars=[0,8], words=[0,0])
Span("rs10484545", context=None, chars=[0,9], words=[0,0])
2933 candidates extracted
Phrase('17684544', 1, 20, 0, u'rs2076756')
Cell('17684544', 1, 20, u'rs2076756')


### Statistics

Statistics on all the rsid candidates:

In [19]:
# Prints candidate recall/precision of the rsid candidates against the gold set.
gold_rsid_stats(rs_candidates, gold_set)

# of gold annotations	= 437
# of candidates		= 2455
Candidate recall	= 0.854
Candidate precision	= 0.152


Interesting: some SNPs seem never to be mentioned at all (e.g. rs12122100), while others (e.g. rs727153) appear only in the text.

Sometimes, it's not picked up for a different, strange reason: see rs13314993.

In [20]:
from extractor.util import gold_rsid_recall

# NOTE(review): presumably the gold (pmid, rsid) pairs that no candidate
# matched, i.e. the recall misses -- confirm in extractor.util.
incorrect_rsids = list(gold_rsid_recall(rs_candidates, gold_set))
# Show the first ten.
for ngram in incorrect_rsids[:10]:
    print ngram

('19043545', 'rs6807064')
('18823527', 'rs727153')
('18464913', 'rs9461688')
('17997608', 'rs6855911')
('18464913', 'rs9635963')
('19043545', 'rs9924951')
('18464913', 'rs2729409')
('18464913', 'rs4889294')
('18464913', 'rs1285407')
('18282107', 'rs7341475')


In [21]:
from extractor.util import gold_rsid_precision

# NOTE(review): presumably the candidates that are not in the gold set, i.e.
# the precision misses -- confirm in extractor.util.
strange_ngrams = list(gold_rsid_precision(rs_candidates, gold_set))
# Print the source phrase of a slice of them for manual inspection.
for ngram in strange_ngrams[70:100]:
    print ngram.context

Phrase('17903292', 1, 371, 0, u'rs295136')
Phrase('17903292', 1, 389, 0, u'rs754958')
Phrase('17903292', 1, 396, 0, u'rs10495487')
Phrase('17903292', 1, 403, 0, u'rs10489578')
Phrase('17903292', 1, 410, 0, u'rs10515134')
Phrase('17903292', 1, 417, 0, u'rs10489578')
Phrase('17903292', 1, 424, 0, u'rs10484370')
Phrase('17903292', 1, 431, 0, u'rs10511176')
Phrase('17903292', 1, 438, 0, u'rs10502302')
Phrase('17903292', 2, 9, 0, u'rs2839235')
Phrase('17903292', 2, 16, 0, u'rs10512437')
Phrase('17903292', 2, 23, 0, u'rs2480555')
Phrase('17903292', 2, 30, 0, u'rs10486135')
Phrase('17903292', 2, 37, 0, u'rs727087')
Phrase('17903292', 2, 44, 0, u'rs1005066')
Phrase('17903292', 2, 51, 0, u'rs2885618')
Phrase('17903292', 2, 58, 0, u'rs10496887')
Phrase('17903292', 2, 72, 0, u'rs10485409')
Phrase('17903292', 2, 86, 0, u'rs10502192')
Phrase('17903292', 2, 93, 0, u'rs2077678')
Phrase('17903292', 2, 100, 0, u'rs723464')
Phrase('17903292', 2, 107, 0, u'rs9305355')
Phrase('17903292', 2, 114, 0, u'rs10

In [22]:
# Gold rsids for the paper whose candidates were listed in the previous cell.
gold_rsid_dict['17903292']

{'rs10499559', 'rs1158167', 'rs1712790', 'rs6977660', 'rs9305354', 'rs9322817'}

## Get candidate p-values

Need to use regular expressions for this.

In [23]:
from extractor.matcher import PvalMatcher
# Three textual forms of a p-value are matched. In the character classes,
# \xb7 is a middle dot (used as a decimal point), \xd7 is the multiplication
# sign, \u2212 is the minus sign and \u2013 is an en dash.
# rgx1: mantissa, a times sign or '*', then 10, a dash and the exponent, e.g. "1.2*10-7".
rgx1 = u'[1-9]\d?[\xb7\.]?\d*\s*[\xd7\*]\s*10\s*[-\u2212\u2013]\s*\d+'
pval_rgx_matcher1 = RegexMatchSpan(rgx=rgx1)
# rgx2: E-notation, e.g. "1.93E-13".
rgx2 = u'[1-9]\d?[\xb7\.]?\d*\s*[eE][-\u2212\u2013]\d+'
pval_rgx_matcher2 = RegexMatchSpan(rgx=rgx2)
# rgx3: plain decimals with at least four zeros after the point, e.g. "0.00001".
rgx3 = u'0\.0000+\d+'
pval_rgx_matcher3 = RegexMatchSpan(rgx=rgx3)
pval_matcher = Union(pval_rgx_matcher1, pval_rgx_matcher2, pval_rgx_matcher3)
# pval_matcher = PvalMatcher()
# Rebinds `ngrams` (previously n_max=1) to a wider candidate space for p-values;
# rsid_extractor was already built with the earlier object.
ngrams = TableNgrams(n_max=7)
pval_extractor = EntityExtractor(ngrams, pval_matcher)
%time pval_c = pval_extractor.extract(corpus.get_tables(), name='all')

CPU times: user 5.05 s, sys: 73.4 ms, total: 5.13 s
Wall time: 5.13 s


In [24]:
# Same candidate space as in the extraction cell; only needed by the
# commented-out debugging lines below.
ngrams = TableNgrams(n_max=7)

# Print the first ten p-value candidates with the phrase each came from.
print 'Got %d candidates, e.g.:' % len(pval_c)
for candidate in pval_c[:10]:
    print unicode(candidate)
    print candidate.context
#     for t in ngrams.apply(candidate.context):
#         print unicode(t.get_span())
#         print re.match(rgx, t.get_span())
#     print [unicode(t.get_span()) for t in ngrams.apply(candidate.context)]
#     print
#     print candidate.get_attrib_span('words')
    print

Got 3228 candidates, e.g.:
Span("1.93E-13", context=None, chars=[0,7], words=[0,0])
Phrase('17684544', 1, 26, 0, u'1.93E-13')

Span("2.04E-12", context=None, chars=[0,7], words=[0,0])
Phrase('17684544', 1, 27, 0, u'2.04E-12')

Span("6.80E-20", context=None, chars=[0,7], words=[0,0])
Phrase('17684544', 1, 31, 0, u'6.80E-20')

Span("1.39E-21", context=None, chars=[0,7], words=[0,0])
Phrase('17684544', 1, 32, 0, u'1.39E-21')

Span("5.90E-08", context=None, chars=[0,7], words=[0,0])
Phrase('17684544', 1, 34, 0, u'5.90E-08')

Span("7.59E-05", context=None, chars=[0,7], words=[0,0])
Phrase('17684544', 1, 47, 0, u'7.59E-05')

Span("4.53E-05", context=None, chars=[0,7], words=[0,0])
Phrase('17684544', 1, 63, 0, u'4.53E-05')

Span("8.65E-05", context=None, chars=[0,7], words=[0,0])
Phrase('17684544', 1, 95, 0, u'8.65E-05')

Span("8.68E-05", context=None, chars=[0,7], words=[0,0])
Phrase('17684544', 1, 96, 0, u'8.68E-05')

Span("1.68E-06", context=None, chars=[0,7], words=[0,0])
Phrase('17684544

In [25]:
for ngram in pval_c:
    if ngram.context.document.name == '19096518':
        print str(ngram.context.cell)

Cell('19096518', 0, 18, u'6.8E-10')
Cell('19096518', 0, 19, u'7.8E-05')
Cell('19096518', 0, 28, u'1.2E-08')
Cell('19096518', 0, 29, u'8.1E-04')
Cell('19096518', 0, 38, u'2.5E-07')
Cell('19096518', 0, 39, u'7.8E-03')
Cell('19096518', 0, 48, u'1.7E-07')
Cell('19096518', 0, 49, u'5.8E-03')
Cell('19096518', 0, 58, u'2.3E-08')
Cell('19096518', 0, 59, u'1.3E-03')
Cell('19096518', 0, 68, u'2.8E-12')
Cell('19096518', 0, 69, u'9.6E-07')
Cell('19096518', 0, 78, u'6.2E-12')
Cell('19096518', 0, 79, u'1.1E-06')
Cell('19096518', 0, 88, u'9.8E-08')
Cell('19096518', 0, 89, u'3.8E-03')
Cell('19096518', 0, 98, u'5.5E-08')
Cell('19096518', 0, 99, u'2.7E-03')
Cell('19096518', 0, 108, u'6.4E-09')
Cell('19096518', 0, 109, u'5.5E-04')
Cell('19096518', 1, 15, u'4.7E-10')
Cell('19096518', 1, 23, u'5.7E-12')
Cell('19096518', 1, 31, u'5.4E-08')
Cell('19096518', 1, 39, u'1.8E-25')
Cell('19096518', 1, 47, u'4.3E-19')
Cell('19096518', 2, 23, u'2.2E-07')
Cell('19096518', 2, 28, u'1.5E-26')
Cell('19096518', 4, 54, u'

Filter out nested p-value candidates, i.e. a match that lies inside a longer match in the same cell (e.g. `2*10^-7` inside `1.2*10^-7`).

In [26]:
# load existing candidates into a dict
span_dict = { str(span.context.cell) : list() for span in pval_c }
for span in pval_c:
    span_dict[str(span.context.cell)].append( (span.char_start, span.char_end) )

def nested(ivl1, ivl2):
    """Return True when interval ivl1 lies inside a *different* interval ivl2.

    Both arguments are (start, end) pairs; identical intervals do not count
    as nested.
    """
    return ivl1 != ivl2 and ivl2[0] <= ivl1[0] <= ivl1[1] <= ivl2[1]

new_pval_c = list()
for span in pval_c:
    span_ivl = span.char_start, span.char_end
    span_name = str(span.context.cell)
    if all([not nested(span_ivl, other_ivl) for other_ivl in span_dict[span_name]]):
        new_pval_c.append(span)
#     else:
#         print span_ivl, span_dict[span_name]
#         print unicode(span)
#         print unicode(span.context.cell.text)
#         print span.uid
#         print span.context.row_num, span.context.col_num
#         print unicode(span)
#         print
#         print [(cell, cell.row_num, cell.col_num) for cell in span.context.table.cells]
#         break

print len(new_pval_c), len(pval_c)
pval_c = new_pval_c

3181 3228


### Statistics

First, how many p-values that should be present do we extract?

In [41]:
# we need to define a p-value -> float converter
from extractor.util import pvalue_to_float, gold_pval_stats, gold_pval_precision

print pvalue_to_float(u"6.2×10 −5")
print pvalue_to_float(u"1.85×10 −20")

6.2e-05
1.85e-20


In [42]:
from kb import KnowledgeBase

kb = KnowledgeBase()

# Gold (pmid, p-value) pairs for every document in the corpus.
gold_set_pvals = frozenset(
    (doc.name, pval)
    for doc in corpus.documents
    for pval in kb.pvals_by_pmid(int(doc.name))
)
# pmid -> the knowledge base's associations for that paper.
gold_set_dict = dict(
    (doc.name, kb.assoc_by_pmid(int(doc.name)))
    for doc in corpus.documents
)

We will look at precision/recall only over p-values that are known to be associated with a SNP whose rsid occurs somewhere in the document (as determined above).

In [43]:
from extractor.util import gold_pval_stats, gold_pval_stats_limited

print 'Found %d gold mentions, e.g.:' % len(gold_set_pvals)
print list(gold_set_pvals)[:5]
print

print 'p-value candidate extraction statistics:'
# print gold_pval_stats(pval_c, gold_set_pvals, gold_set_dict)
print gold_pval_stats_limited(pval_c, gold_set_dict, rs_candidates)

Found 299 gold mentions, e.g.:
[('17903300', 1e-07), ('17903300', 2e-07), ('18604267', 3e-10), ('18464913', 6e-06), ('17903294', 2e-06)]

p-value candidate extraction statistics:
[('17684544', 1e-21), ('17684544', 4e-07), ('17684544', 3e-06), ('17903292', 9.000000000000001e-09), ('17903292', 2e-06), ('17903292', 4e-06), ('17903292', 7e-06), ('17903292', 8e-06), ('17903293', 1e-14), ('17903293', 4e-12)]
[('17684544', -21.0), ('17684544', -7.0), ('17684544', -6.0), ('17903292', -9.0), ('17903292', -6.0), ('17903293', -14.0), ('17903293', -12.0), ('17903293', -8.0), ('17903293', -7.0), ('17903293', -6.0)]
[('17684544', -49.0), ('17684544', -41.0), ('17684544', -34.0), ('17684544', -28.0), ('17684544', -22.0), ('17684544', -21.0), ('17684544', -20.0), ('17684544', -18.0), ('17684544', -17.0), ('17684544', -14.0)]
# of gold annotations	= 147
# of candidates		= 434
Candidate recall	= 0.932
Candidate precision	= 0.316
None


Some debugging: we print the papers together with the gold p-values that were not found in them. Below each one is the list of all SNP associations for that paper, so we can find the SNP that is missing.

In [44]:
from extractor.util import gold_pval_recall, gold_pval_precision
# NOTE(review): presumably the gold (pmid, p-value exponent) pairs that no
# candidate matched -- confirm in extractor.util.
missing = list(gold_pval_recall(pval_c, gold_set_dict, rs_candidates))
# print gold_pval_precision(pval_c, gold_set_pvals)

In [45]:
from math import log10, floor

for pmid, pval in missing[:5]:
    print pmid, pval
    print [(a.pvalue, a.phenotype.name, floor(log10(a.pvalue)), a.snp.rs_id) for a in gold_set_dict[pmid]]
    print

17903302 -9.0
[(2e-06, u'Blood pressure', -6.0, u'rs10493340'), (3e-06, u'Blood pressure', -6.0, u'rs1963982'), (3e-06, u'Blood pressure', -6.0, u'rs935334'), (2e-06, u'Tonometry', -6.0, u'rs6063312'), (3e-06, u'Tonometry', -6.0, u'rs770189'), (6e-06, u'Tonometry', -6.0, u'rs10514688'), (6e-06, u'Tonometry', -6.0, u'rs7042864'), (8e-06, u'Tonometry', -6.0, u'rs1322512'), (2e-07, u'Tonometry', -7.0, u'rs3773643'), (2e-06, u'Tonometry', -6.0, u'rs3793427'), (2e-06, u'Tonometry', -6.0, u'rs6492654'), (3e-06, u'Tonometry', -6.0, u'rs1367248'), (4e-06, u'Tonometry', -6.0, u'rs10521232'), (4e-06, u'Tonometry', -6.0, u'rs3766680'), (4e-06, u'Tonometry', -6.0, u'rs1371924'), (8e-06, u'Tonometry', -6.0, u'rs10488172'), (4e-06, u'Blood pressure', -6.0, u'rs4370013'), (4e-06, u'Blood pressure', -6.0, u'rs10491334'), (4.9999999999999996e-06, u'Blood pressure', -6.0, u'rs2121070'), (7e-06, u'Blood pressure', -6.0, u'rs2509458'), (3e-07, u'Blood pressure', -7.0, u'rs7591163'), (1e-09, u'Blood pressu

The SNPs above are either labeled incorrectly (most of the time) or do not occur in the tables.

## Extract candidate relations between SNPs and p-values

In [46]:
from snorkel.candidates import RelationExtractor
# Relation Extractor: pairs up the output of the rsid extractor and the
# p-value extractor.
# NOTE(review): it is given the extractors, not the filtered `pval_c` list, so
# the nested-candidate filtering done above presumably does not apply here.
relation_extractor = RelationExtractor(rsid_extractor, pval_extractor)

In [47]:
# Extract (rsid, p-value) candidate pairs from every table (slow: minutes).
%time candidates = relation_extractor.extract(corpus.get_tables(), name='all')

# Show the first ten pairs and the total count.
for cand in candidates[:10]: 
    print cand
print "%s relations extracted" % len(candidates)

CPU times: user 3min 8s, sys: 3.39 s, total: 3min 12s
Wall time: 3min 15s
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("1.93E-13", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("2.04E-12", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("6.80E-20", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("1.39E-21", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("5.90E-08", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("7.59E-05", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("4.53E-05", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, cha

In [48]:
# span1 is the p-value half of the pair; span0 is the rsid half.
candidates[0].span1

Span("1.93E-13", context=None, chars=[0,7], words=[0,0])

### Statistics

Let's consider the subset of gold SNPs that were matched somewhere in a table.

In [49]:
rs_subset = set([span.get_span().lower() for span in rs_candidates])
print list(rs_subset)[:5]

[u'rs10484246', u'rs7865184', u'rs1959289', u'rs2105819', u'rs1349721']


In [50]:
gold_relations = set([(assoc.snp.rs_id, assoc.pvalue) for doc in corpus.documents for assoc in kb.assoc_by_pmid(doc.name) if assoc.snp.rs_id.lower() in rs_subset])
print list(gold_relations)[:5]
print len(gold_relations)

[(u'rs1402837', 5e-10), (u'rs10811661', 7e-07), (u'rs7815788', 4.9999999999999996e-06), (u'rs4466137', 3e-06), (u'rs10489849', 1e-06)]
392


In [51]:
# Spot-check the p-value of one gold association.
kb.assoc_by_pmid(18483556)[10].pvalue

2e-24

In [52]:
from extractor.util import gold_rspval_stats

# Prints candidate recall/precision of the (rsid, p-value) pairs against the gold relations.
gold_rspval_stats(candidates, gold_relations)

# of gold annotations	= 387
# of candidates		= 12477
Candidate recall	= 0.951
Candidate precision	= 0.029


## Learning the rsid/pvalue relation

### Load the gold set

In [53]:
from extractor.util import get_exponent, pvalue_to_float

# Gold relations as (rsid, p-value exponent) pairs, restricted to SNPs whose
# rsid was matched in a table. Built straight from the knowledge base rather
# than from the previous value of `gold_relations`, so re-running this cell
# never applies get_exponent twice. The PubMed id is passed as an int, matching
# the rsids_by_pmid / pvals_by_pmid lookups earlier in the notebook.
gold_relations = set(
    (assoc.snp.rs_id, get_exponent(assoc.pvalue))
    for doc in corpus.documents
    for assoc in kb.assoc_by_pmid(int(doc.name))
    if assoc.snp.rs_id.lower() in rs_subset
)

In [54]:
gt_dict = dict()
n_pos = 0
n_neg = 0
for crel in candidates[:10000]:
    if (crel.span0.get_span(), get_exponent(pvalue_to_float(crel.span1.get_span()))) in gold_relations:
        gt_dict[crel.id] = +1
        n_pos += 1
    else:
        gt_dict[crel.id] = -1
        n_neg += 1

print n_pos, n_neg

745 9255


### Load the features

Extract features, and store them to a file.

In [None]:
from snorkel.features import TableNgramPairFeaturizer

# Cache file for the fitted featurizer. This must be the file the notebook
# dumps the featurizer to ('rsid_pval_feats.pkl'); loading a differently named
# pickle would either never hit the cache or pick up another task's features.
pkl_f = 'rsid_pval_feats.pkl'
# NOTE: unpickling runs code embedded in the file -- only load a cache that
# this notebook produced itself.
try:
    with open(pkl_f, 'rb') as f:
        featurizer = cPickle.load(f)
except Exception:
    # Best-effort fallback for a missing or unreadable cache: fit from scratch.
    # Catching Exception (not a bare `except:`) lets Ctrl-C through.
    featurizer = TableNgramPairFeaturizer()
    featurizer.fit_transform(candidates)

In [None]:
for f in featurizer.get_features_by_candidate(candidates[0])[:10]: print f

In [None]:
# Persist the fitted featurizer so later runs can skip feature extraction.
pkl_f = 'rsid_pval_feats.pkl'
# Binary write mode, to match the 'rb' mode the pickle is read back with
# (text mode can alter the bytes on platforms that translate line endings).
with open(pkl_f, 'wb') as f:
    cPickle.dump(featurizer, f)

Split into training and test sets

In [None]:
# Split into train and test set
training_candidates = []
gold_candidates     = []
gold_labels         = []
n_half = len(candidates)/2
for c in candidates[:n_half]:
    if c.uid in gt_dict:
        gold_candidates.append(c)
        gold_labels.append(gt_dict[c.uid])
    else:
        training_candidates.append(c)
training_candidates.extend(candidates[n_half:])
gold_labels = np.array(gold_labels)
print "Training set size: %s" % len(training_candidates)
print "Gold set size: %s" % len(gold_candidates)
print "Positive labels in gold set: %s" % len([c for c in gold_candidates if gt_dict[c.uid]==1])
print "Negative labels in gold set: %s" % len([c for c in gold_candidates if gt_dict[c.uid]==-1])

Create the model.

### Labeling functions

In [299]:
# Header keywords that hint at an rsid column / a p-value column.
rsid_keywords = ["id", "rsid", "snp"]
pval_keywords = ["value", "p-value", "p-val", "p_value", "pvalue"]

# positive LFs
# These are placeholders: each one abstains (returns 0) until implemented.
# They previously had no body at all, which made the whole cell a syntax error.
def LF_pval_header(m):
    """TODO: vote +1 if "pvalue" is mentioned in the first or last row."""
    return 0
def LF_pval_aligned(m):
    """TODO: vote +1 if "pvalue" is mentioned in aligned cells."""
    return 0
def LF_id_aligned(m):
    """TODO: vote +1 if "id" is mentioned in aligned cells."""
    return 0
# This was a second `def LF_id_aligned`, which silently replaced the one above;
# it is renamed after what its comment describes.
def LF_phenotype_aligned(m):
    """TODO: vote +1 if there is an aligned phenotype."""
    return 0

# negative LFs
def LF_dont_align(m):
    # NOTE(review): the name suggests "vote -1 when the two spans don't align
    # in the table", but the body does not look at alignment at all: it votes
    # -1 when 'previously' is in m.pre_window('lemmas', 8) and abstains (0)
    # otherwise. TODO: implement the alignment check or rename the function.
    return -1 if 'previously' in m.pre_window('lemmas', 8) else 0
def LF_obese(m):
    """Vote -1 when the mention is the single word 'obese'; otherwise abstain (0)."""
    mention_words = m.mention('words')
    if 'obese' in mention_words and len(mention_words) == 1:
        return -1
    return 0
def LF_further(m):
    """Vote -1 when 'further' or 'furthermore' is among m.lemmas; otherwise abstain (0)."""
    for marker in ('further', 'furthermore'):
        if marker in m.lemmas:
            return -1
    return 0
def LF_also(m):
    """Vote -1 when 'also' is among m.words; otherwise abstain (0)."""
    if 'also' in m.words:
        return -1
    return 0
def LF_age(m):
    """Vote -1 when the mention contains 'age', 'aging' or 'ages'; otherwise abstain (0)."""
    for age_word in ('age', 'aging', 'ages'):
        if age_word in m.mention('words'):
            return -1
    return 0
def LF_trait(m):
    """Vote -1 when 'trait' is among the mention's lemmas; otherwise abstain (0)."""
    if 'trait' in m.mention('lemmas'):
        return -1
    return 0
def LF_weight(m):
    """Vote -1 when 'weight' is among the mention's lemmas; otherwise abstain (0)."""
    if 'weight' in m.mention('lemmas'):
        return -1
    return 0
def LF_recently(m):
    """Vote -1 when 'recently' is among m.words; otherwise abstain (0)."""
    if 'recently' in m.words:
        return -1
    return 0
def LF_addit(m):
    """Vote -1 when the token 'addit' is among m.lemmas; otherwise abstain (0)."""
    if 'addit' in m.lemmas:
        return -1
    return 0
def LF_may(m):
    """Vote -1 when 'may' is among m.words; otherwise abstain (0)."""
    if 'may' in m.words:
        return -1
    return 0
def LF_short(m):
    """Vote -1 when the mention consists of a single lemma; otherwise abstain (0).

    The previous condition, `len(...) == 1 in m.lemmas`, is a chained
    comparison equal to `len(...) == 1 and 1 in m.lemmas`; it asked whether the
    integer 1 is one of the lemmas, so it could never fire on string lemmas.
    """
    return -1 if len(m.mention('lemmas')) == 1 else 0

# Labeling functions handed to the training set. Both lists are currently
# empty, so `LFs` is empty too and none of the functions defined in this cell
# is applied. TODO: register the labeling functions that should be used.
pos_LFs = []
neg_LFs = []
LFs = pos_LFs + neg_LFs

In [306]:
from snorkel.snorkel import TrainingSet
from snorkel.features import NgramFeaturizer

# Build the training set from the unlabeled candidates and the labeling functions.
# NOTE(review): this passes a brand-new TableNgramPairFeaturizer() rather than
# the `featurizer` object fitted/loaded earlier -- confirm that is intended.
training_set = TrainingSet(training_candidates, LFs, featurizer=TableNgramPairFeaturizer())

#### Results

In [None]:
# Per-labeling-function statistics; display the first five rows.
lf_stats = training_set.lf_stats()
lf_stats[:5]

In [None]:
# Histogram of the "coverage" column of the labeling-function statistics.
lf_stats.hist("coverage")

In [None]:
from snorkel.snorkel import Learner
from snorkel.learning import LogReg

# Learner over the training set, using a logistic-regression model with a bias term.
learner = Learner(training_set, model=LogReg(bias_term=True))

In [None]:
# Splitting into CV and test set
n_half = len(gold_candidates)/2
test_candidates = gold_candidates[:n_half]
test_labels     = gold_labels[:n_half]
cv_candidates   = gold_candidates[n_half:]
cv_labels       = gold_labels[n_half:]

In [None]:
from snorkel.learning_utils import GridSearch

# Grid search over two learner parameters, 'mu' in {1e-5, 1e-7} and
# 'lf_w0' in {1.0, 2.0}, fitted against the cross-validation half of the gold set.
gs       = GridSearch(learner, ['mu', 'lf_w0'], [[1e-5, 1e-7],[1.0,2.0]])
gs_stats = gs.fit(cv_candidates, cv_labels)

In [None]:
# Display the grid-search results.
gs_stats

In [None]:
# Evaluate the learner on the held-out test half of the gold set.
learner.test(test_candidates, test_labels)

In [None]:
# Feature statistics from the learner, limited via n_max=10.
learner.feature_stats(n_max=10)