# SNP/Phenotype detection from raw text

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import cPickle, os, sys
np.random.seed(seed=1701)
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (18,6)

In [2]:
# import snorkel and gwasdb
sys.path.append('../snorkel')
sys.path.append('../src')

# set up paths
abstract_dir = '../data/db/papers'

## Assemble a corpus

Collect all the tables in our dataset

In [3]:
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLTableDocParser

# Document-level parser over the paper directory configured above.
# NOTE(review): the doc/text/id arguments look like XPath selectors (one
# document per top-level element, <table> nodes as text, PMID as the id);
# their exact semantics live in UnicodeXMLTableDocParser -- confirm there.
xml_parser = UnicodeXMLTableDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

In [4]:
from snorkel.parser import HTMLParser
from extractor.parser import UnicodeTableParser
from snorkel.parser import CorpusParser
import cPickle

table_parser = UnicodeTableParser()
html_parser = HTMLParser(path='../data/db/papers/')

corpus_name = 'gwas-table-corpus.pkl'

try:
    with open(corpus_name,"r") as pkl:
        corpus = cPickle.load(pkl)
except:
    cp = CorpusParser(xml_parser, table_parser)
    %time corpus = cp.parse_corpus(name='GWAS Corpus')
    # pickling currently doesn't work...
#     with open(corpus_name,"w") as pkl:
#         corpus = cPickle.dump(corpus, pkl)

CPU times: user 16min 35s, sys: 49.8 s, total: 17min 25s
Wall time: 24min 36s


In [5]:
# pickling currently doesn't work...
# import cPickle
# with open(corpus_name,"w") as pkl:
#     corpus = cPickle.dump(corpus, pkl)

## Try extracting rs-ids first

In [21]:
# add new paths
sys.path.append('../src/crawler')
sys.path.append('../src/crawler/db')

# import new libs
from kb import KnowledgeBase
from extractor.util import gold_rsid_stats, gold_rsid_precision

from snorkel.candidates import Ngrams
from snorkel.matchers import DictionaryMatch, RegexMatchSpan, Union
from snorkel.candidates import EntityExtractor

# from snorkel.candidates import Candidates

### Create a gold set

Get a list of all the rs-ids we know.

In [7]:
kb = KnowledgeBase()

Get a gold set:

In [8]:
# Every (document name, rs-id) pair the knowledge base lists for a document
# in the corpus; the document name is converted to an int PMID for the lookup.
gold_set = frozenset(
    (doc.name, rs_id)
    for doc in corpus.documents
    for rs_id in kb.rsids_by_pmid(int(doc.name))
)
# Just the rs-id half of each pair (one entry per pair, so rs-ids shared by
# several documents repeat).
gold_set_rsids = [pair[1] for pair in gold_set]

In [9]:
# Group the gold rs-ids by document: {doc name -> set of rs-ids}.
gold_rsid_dict = {}
for docid, rsid in gold_set:
    gold_rsid_dict.setdefault(docid, set()).add(rsid)

In [10]:
print len(gold_set)

10073


### Extract candidates:

In [22]:
from snorkel.candidates import EntityExtractor
from snorkel.candidates import TableNgrams

# Define a candidate space
ngrams = TableNgrams(n_max=1)

# Define matchers
gold_rsid_matcher = DictionaryMatch(d=gold_set_rsids, longest_match_only=False)
regx_rsid_matcher = RegexMatchSpan(rgx=r'rs\d+(/[^s]+)?')
# dict_rsid_matcher = DictionaryMatch(d=rs_ids, longest_match_only=False)
# rsid_matcher = Union(dict_rsid_matcher, regx_rsid_matcher)

rsid_extractor = EntityExtractor(ngrams, regx_rsid_matcher)
# %time rs_candidates = rsid_extractor.extract(corpus.get_tables(), name='all')

In [37]:
for cand in rs_candidates[:10]: 
    print cand
print "%s candidates extracted" % len(rs_candidates)
print rs_candidates[0].context
print rs_candidates[0].context.cell

for cand in rs_candidates:
    if cand.context.document.name == '23455636':
        print cand.get_span(), cand.context.table.position

Span("rs2076756", context=None, chars=[0,8], words=[0,0])
Span("rs1992662", context=None, chars=[0,8], words=[0,0])
Span("rs1992660", context=None, chars=[0,8], words=[0,0])
Span("rs1793004", context=None, chars=[0,8], words=[0,0])
Span("rs10521209", context=None, chars=[0,9], words=[0,0])
Span("rs2631372", context=None, chars=[0,8], words=[0,0])
Span("rs2925757", context=None, chars=[0,8], words=[0,0])
Span("rs6947579", context=None, chars=[0,8], words=[0,0])
Span("rs1553575", context=None, chars=[0,8], words=[0,0])
Span("rs10484545", context=None, chars=[0,9], words=[0,0])
15330 candidates extracted
Phrase('17684544', 1, 20, 0, u'rs2076756')
Cell('17684544', 1, 20, u'rs2076756')
rs10490924/T 1
rs10737680/A 1
rs429608/G 1
rs2230199/C 1
rs5749482/G 1
rs4420638/A 1
rs1864163/G 1
rs943080/T 1
rs13278062/T 1
rs920915/C 1
rs4698775/G 1
rs3812111/T 1
rs13081855/T 1
rs3130783/A 1
rs8135665/T 1
rs334353/T 1
rs8017304/A 1
rs6795735/T 1
rs9542236/C 1


### Statistics

Statistics on all the rsid candidates:

In [15]:
gold_rsid_stats(rs_candidates, gold_set)

# of gold annotations	= 10537
# of candidates		= 12005
Candidate recall	= 0.427
Candidate precision	= 0.375


Interesting: some SNPs (e.g. rs12122100) never seem to be mentioned, while others (e.g. rs727153) appear only in the text.

Sometimes an rs-id is missed for a different, less obvious reason: see rs13314993.

In [16]:
from extractor.util import gold_rsid_recall

incorrect_rsids = list(gold_rsid_recall(rs_candidates, gold_set))
for ngram in incorrect_rsids[:10]:
    print ngram

('24945404', 'rs13204965')
('24386095', 'rs10158897')
('23251661', 'rs10514310')
('23382691', 'rs10822136')
('20686565', 'rs6065906')
('20395239', 'rs10483727')
('23251661', 'rs9997524')
('20858683', 'rs853789')
('22005930', 'rs9811423')
('22589738', 'rs1498095')


In [17]:
from extractor.util import gold_rsid_precision

strange_ngrams = list(gold_rsid_precision(rs_candidates, gold_set))
for ngram in strange_ngrams[70:100]:
    print ngram.context

Phrase('17903294', 6, 127, 0, u'rs10514919')
Phrase('17903294', 6, 136, 0, u'rs10514919')
Phrase('17903294', 6, 145, 0, u'rs2015729')
Phrase('17903294', 6, 154, 0, u'rs2015729')
Phrase('17903294', 6, 163, 0, u'rs2015729')
Phrase('17903294', 6, 199, 0, u'rs6956010')
Phrase('17903294', 6, 208, 0, u'rs6956010')
Phrase('17903294', 6, 217, 0, u'rs6956010')
Phrase('17903294', 6, 226, 0, u'rs917858')
Phrase('17903294', 6, 235, 0, u'rs917859')
Phrase('17903294', 6, 244, 0, u'rs2239138')
Phrase('17903294', 6, 253, 0, u'rs216901')
Phrase('17903294', 6, 262, 0, u'rs216903')
Phrase('17903294', 6, 271, 0, u'rs216904')
Phrase('17903294', 7, 29, 0, u'rs7741731')
Phrase('17903294', 7, 77, 0, u'rs4075265')
Phrase('17903294', 7, 89, 0, u'rs915171')
Phrase('17903294', 7, 113, 0, u'rs6938586')
Phrase('17903294', 7, 125, 0, u'rs3777442')
Phrase('17903294', 7, 185, 0, u'rs2105250')
Phrase('17903294', 7, 197, 0, u'rs9321263')
Phrase('17903294', 7, 209, 0, u'rs1811949')
Phrase('17903294', 7, 221, 0, u'rs93212

In [40]:
# store candidates that occur in sufficiently large tables:
rsid_by_table = dict()
for cand in rs_candidates:
    rsid = cand.get_span()
    key = cand.context.document.name, cand.context.table.position
    if key not in rsid_by_table: rsid_by_table[key] = set()
    rsid_by_table[key].add((rsid, cand.context.cell.row_num, cand.context.cell.col_num))
    
with open('rsids.singletons.all.tsv', 'w') as f:
    for (pmid, table_id), rsids in rsid_by_table.items():
        if len(rsids) < 10: continue
        for rsid, row_num, col_num in rsids:
            f.write('%s\t%s\t%s\t%s\t%s\n' % (pmid, table_id, row_num, col_num, rsid))

In [28]:
print rsid_by_table[('17903305', 4)]

set([u'rs6577648', u'rs254315', u'rs10520880', u'rs38276', u'rs986831', u'rs10520246', u'rs7610584', u'rs3751832', u'rs3794889', u'rs10492797', u'rs9327886', u'rs10505624', u'rs208354', u'rs4418248', u'rs10514443', u'rs10486031', u'rs10513681', u'rs216666', u'rs10515347', u'rs10487577', u'rs2253319', u'rs7989050', u'rs4801149', u'rs10512920', u'rs3017183', u'rs2371438', u'rs6555491', u'rs10520247', u'rs10256504', u'rs7329659', u'rs392715', u'rs2834645', u'rs10497958', u'rs1261256', u'rs6102912', u'rs4782742'])


## Get candidate p-values

Need to use regular expressions for this.

In [64]:
from extractor.matcher import PvalMatcher
from snorkel.candidates import EntityExtractor, TableNgrams
from snorkel.matchers import RegexMatchSpan, Union

# 1: p-value matcher

rgx1 = u'[1-9]\d?[\xb7\.]?\d*[\s\u2009]*[\xd7\*][\s\u2009]*10[\s\u2009]*[-\u2212\u2013\u2012][\s\u2009]*\d+'
pval_rgx_matcher1 = RegexMatchSpan(rgx=rgx1)
rgx2 = u'[1-9]\d?[\xb7\.]?\d*[\s\u2009]*[eE][\s\u2009]*[-\u2212\u2013\u2012][\s\u2009]*\d+'
pval_rgx_matcher2 = RegexMatchSpan(rgx=rgx2)
rgx3 = u'0\.0000+\d+'
pval_rgx_matcher3 = RegexMatchSpan(rgx=rgx3)
pval_rgx_matcher = Union(pval_rgx_matcher1, pval_rgx_matcher2, pval_rgx_matcher3)
# pval_matcher = PvalMatcher()
ngrams = TableNgrams(n_max=7)
pval_rgx_extractor = EntityExtractor(ngrams, pval_rgx_matcher)
# %time pval_c = pval_extractor.extract(corpus.get_tables(), name='all')

# 2: column-based matcher

from snorkel.candidates import CellSpace
from snorkel.matchers import CellNameRegexMatcher

cells = CellSpace()
pval_rgx = 'p\s?.?\s?value'
pval_rgxname_matcher = CellNameRegexMatcher(axis='col', rgx=pval_rgx, n_max=3, ignore_case=True, header_only=True, max_chars=20)
pval_rgxname_extractor = EntityExtractor(cells, pval_rgxname_matcher)

# %time pval_regex_candidates = pval_rgxname_extractor.extract(corpus.get_tables(), name='all')

# 3: combine the two

from snorkel.candidates import UnionExtractor
pval_extractor = UnionExtractor([pval_rgx_extractor, pval_rgxname_extractor])

CPU times: user 3min 7s, sys: 18.9 s, total: 3min 26s
Wall time: 4min 34s


In [61]:
special_doc = [doc for doc in corpus.documents if doc.name == '23509613'][0]
special_table = special_doc.tables[4]

pval_regex_candidates = pval_rgx_extractor.extract([special_table])
# pval_regex_canidates = pval_rgxname_extractor.extract(corpus.get_tables())
print len(pval_regex_candidates)
for c in pval_regex_candidates[:5]:
    print c.context.cell.row_num, c.context.cell.col_num, c.get_span(), #c.context.cell.head_cell('col').text

 28
2 8 1.44 E − 06 3 8 6.08 E − 06 4 8 1.09 E − 05 5 8 1.11 E − 05 6 8 2.61 E − 05


In [33]:
ngrams = TableNgrams(n_max=7)

print 'Got %d candidates, e.g.:' % len(pval_c)
for candidate in pval_c[:10]:
    print unicode(candidate)
    print candidate.context
#     for t in ngrams.apply(candidate.context):
#         print unicode(t.get_span())
#         print re.match(rgx, t.get_span())
#     print [unicode(t.get_span()) for t in ngrams.apply(candidate.context)]
#     print
#     print candidate.get_attrib_span('words')
    print

NameError: name 'pval_c' is not defined

Filter nested p-value candidates, e.g. when both `1.2*10^-7` and its substring `2*10^-7` are extracted from the same cell.

In [27]:
# load existing candidates into a dict
span_dict = { str(span.context.cell) : list() for span in pval_c }
for span in pval_c:
    span_dict[str(span.context.cell)].append( (span.char_start, span.char_end) )

def nested(ivl1, ivl2):
    """Tell whether interval `ivl1` sits inside a *different* interval `ivl2`.

    Both arguments are indexable pairs (first element, second element).
    Returns True when the two are not equal and
    ivl2[0] <= ivl1[0] <= ivl1[1] <= ivl2[1]; otherwise False.
    """
    return bool(ivl1 != ivl2 and ivl2[0] <= ivl1[0] <= ivl1[1] <= ivl2[1])

new_pval_c = list()
for span in pval_c:
    span_ivl = span.char_start, span.char_end
    span_name = str(span.context.cell)
    if all([not nested(span_ivl, other_ivl) for other_ivl in span_dict[span_name]]):
        new_pval_c.append(span)
#     else:
#         print span_ivl, span_dict[span_name]
#         print unicode(span)
#         print unicode(span.context.cell.text)
#         print span.uid
#         print span.context.row_num, span.context.col_num
#         print unicode(span)
#         print
#         print [(cell, cell.row_num, cell.col_num) for cell in span.context.table.cells]
#         break

print len(new_pval_c), len(pval_c)
pval_c = new_pval_c

1246 1246


### Statistics

First, how many p-values that should be present do we extract?

In [28]:
# we need to define a p-value -> float converter
from extractor.util import pvalue_to_float, gold_pval_stats, gold_pval_precision

print pvalue_to_float(u"6.2×10 −5")
print pvalue_to_float(u"1.85×10 −20")

6.2e-05
1.85e-20


In [29]:
from kb import KnowledgeBase
kb = KnowledgeBase()
# (document name, p-value) pairs known to the knowledge base for our corpus
gold_set_pvals = frozenset(
    (doc.name, pval)
    for doc in corpus.documents
    for pval in kb.pvals_by_pmid(int(doc.name))
)
# document name -> whatever kb.assoc_by_pmid returns for that PMID
gold_set_dict = dict(
    (doc.name, kb.assoc_by_pmid(int(doc.name)))
    for doc in corpus.documents
)

We will be looking at precision/recall over p-values that are known to be associated with a SNP, and the rsid of that SNP occurs somewhere in the document (as determined above).

In [30]:
from extractor.util import gold_pval_stats, gold_pval_stats_limited

print 'Found %d gold mentions, e.g.:' % len(gold_set_pvals)
print list(gold_set_pvals)[:5]
print

print 'p-value candidate extraction statistics:'
# print gold_pval_stats(pval_c, gold_set_pvals, gold_set_dict)
print gold_pval_stats_limited(pval_c, gold_set_dict, rs_candidates)

Found 107 gold mentions, e.g.:
[('17903300', 1e-07), ('17903300', 2e-07), ('17903294', 2e-06), ('17903296', 2e-07), ('17903303', 4.9999999999999996e-06)]

p-value candidate extraction statistics:
[('17684544', 1e-21), ('17684544', 4e-07), ('17684544', 3e-06), ('17903292', 9.000000000000001e-09), ('17903292', 2e-06), ('17903292', 4e-06), ('17903292', 7e-06), ('17903292', 8e-06), ('17903293', 1e-14), ('17903293', 4e-12)]
[('17684544', -21.0), ('17684544', -7.0), ('17684544', -6.0), ('17903292', -9.0), ('17903292', -6.0), ('17903293', -14.0), ('17903293', -12.0), ('17903293', -8.0), ('17903293', -7.0), ('17903293', -6.0)]
[('17684544', -49.0), ('17684544', -41.0), ('17684544', -34.0), ('17684544', -28.0), ('17684544', -22.0), ('17684544', -21.0), ('17684544', -20.0), ('17684544', -18.0), ('17684544', -17.0), ('17684544', -14.0)]
# of gold annotations	= 35
# of candidates		= 78
Candidate recall	= 0.886
Candidate precision	= 0.397
None


Some debugging: we print the papers whose gold p-values were not found among the candidates. Each is followed by the list of all associations recorded for that paper, so we can identify the SNP that is missing.

In [31]:
from extractor.util import gold_pval_recall, gold_pval_precision
missing = list(gold_pval_recall(pval_c, gold_set_dict, rs_candidates))
# print gold_pval_precision(pval_c, gold_set_pvals)

In [32]:
from math import log10, floor

for pmid, pval in missing[:5]:
    print pmid, pval
    print [(a.pvalue, a.phenotype.name, floor(log10(a.pvalue)), a.snp.rs_id) for a in gold_set_dict[pmid]]
    print

17903298 -8.0
[(2e-06, u'Diabetes related insulin traits', -6.0, u'rs2877832'), (3e-06, u'Diabetes related insulin traits', -6.0, u'rs2877832'), (2e-08, u'Fasting plasma glucose', -8.0, u'rs2722425'), (4.9999999999999996e-06, u'Fasting plasma glucose', -6.0, u'rs10510634'), (7e-07, u'Diabetes (incident)', -7.0, u'rs10497721'), (8e-06, u'Diabetes related insulin traits', -6.0, u'rs10486607'), (9e-06, u'Diabetes related insulin traits', -6.0, u'rs2066219'), (4.9999999999999996e-06, u'Fasting plasma glucose', -6.0, u'rs180730'), (6e-06, u'Fasting plasma glucose', -6.0, u'rs180730'), (9e-06, u'Fasting plasma glucose', -6.0, u'rs2722425'), (7e-06, u'Fasting plasma glucose', -6.0, u'rs7731657')]

17903302 -9.0
[(2e-06, u'Blood pressure', -6.0, u'rs10493340'), (3e-06, u'Blood pressure', -6.0, u'rs1963982'), (3e-06, u'Blood pressure', -6.0, u'rs935334'), (2e-06, u'Tonometry', -6.0, u'rs6063312'), (3e-06, u'Tonometry', -6.0, u'rs770189'), (6e-06, u'Tonometry', -6.0, u'rs10514688'), (6e-06, u'To

The SNPs above are either labeled incorrectly (most of the time) or may not occur in any table.

## Extract candidate relations between SNPs and p-values

In [67]:
from snorkel.candidates import AlignedTableRelationExtractor
# Relation Extractor:
relation_extractor = AlignedTableRelationExtractor(rsid_extractor, pval_rgx_extractor, axis='row')

In [68]:
%time candidates = relation_extractor.extract(corpus.get_tables(), name='all')

for cand in candidates[:10]: 
    print cand
print "%s relations extracted" % len(candidates)

CPU times: user 8min 13s, sys: 7min 49s, total: 16min 3s
Wall time: 49min 45s
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("1.93E-13", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("2.04E-12", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("6.80E-20", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("1.39E-21", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("5.90E-08", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs1992662", context=None, chars=[0,8], words=[0,0]), Span("7.59E-05", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs1992660", context=None, chars=[0,8], words=[0,0]), Span("4.53E-05", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs10521209", context=None

In [28]:
print candidates[1000].span0.context.cell.row_num
print candidates[1000].span1.context.cell.row_num

63
63


### Statistics

Let's consider the subset of gold SNPs that were matched somewhere within a table.

In [29]:
rs_subset = set([span_pair.span0.get_span().lower() for span_pair in candidates])
print list(rs_subset)[:5]

[u'rs9932186', u'rs6880595', u'rs12989701', u'rs6606686', u'rs12470505']


In [30]:
gold_relations = set([(assoc.snp.rs_id, assoc.pvalue) for doc in corpus.documents for assoc in kb.assoc_by_pmid(doc.name) if assoc.snp.rs_id.lower() in rs_subset])
print list(gold_relations)[:5]
print len(gold_relations)

[(u'rs1883025', 3e-27), (u'rs10811661', 7e-07), (u'rs1532085', 7e-47), (u'rs174546', 6e-07), (u'rs163879', 2e-11)]
5304


In [31]:
kb.assoc_by_pmid(18483556)[10].pvalue

2e-24

In [33]:
from extractor.util import gold_rspval_stats

gold_rspval_stats(candidates, gold_relations)

# of gold annotations	= 4853
# of candidates		= 16645
Candidate recall	= 0.792
Candidate precision	= 0.231


Why low precision?

In [34]:
from extractor.util import gold_rspval_precision

strange_rels = gold_rspval_precision(candidates, gold_relations)
for rel in strange_rels[:5]:
    print rel
    print rel.span0.context.table, rel.span0.context.cell.row_num, rel.span0.context.cell.col_num
    print

SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("1.93E-13", context=None, chars=[0,7], words=[0,0]))
Table('17684544', 1) 2 1

SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("2.04E-12", context=None, chars=[0,7], words=[0,0]))
Table('17684544', 1) 2 1

SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("6.80E-20", context=None, chars=[0,7], words=[0,0]))
Table('17684544', 1) 2 1

SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("5.90E-08", context=None, chars=[0,7], words=[0,0]))
Table('17684544', 1) 2 1

SpanPair(Span("rs1992662", context=None, chars=[0,8], words=[0,0]), Span("7.59E-05", context=None, chars=[0,7], words=[0,0]))
Table('17684544', 1) 3 1



It seems like there are many pvalues for the same span and only one of them is actually correct.

#### Save this for now

In [69]:
import re
from extractor.util import pvalue_to_float

def clean_rsid(rsid):
    """Drop a trailing "/..." part from an rs-id string.

    Every "/" that is followed by at least one character is removed together
    with the rest of its line (e.g. "rs10490924/T" -> "rs10490924"). A string
    without such a suffix is returned unchanged.
    """
    slash_suffix = re.compile('/.+')
    return slash_suffix.sub('', rsid)

# One TSV row per candidate relation:
#   pmid, rs-id (suffix stripped), table position, row, col, p-value
# Row/col are those of the cell holding the rs-id span.
with open('pval-rsid.raw.cols.tsv', 'w') as f:
    for rel in candidates:
        rsid_context = rel.span0.context
        pmid = rsid_context.document.name
        table_id = rsid_context.table.position
        row_id = rsid_context.cell.row_num
        col_id = rsid_context.cell.col_num
        rsid = rel.span0.get_span()
        # float() keeps the old behaviour of failing loudly on a non-numeric
        # conversion result.
        pval = float(pvalue_to_float(rel.span1.get_span()))

        # '%f' prints six fixed decimals, which turned every small p-value
        # (e.g. 1.93e-13) into "0.000000". '%r' writes the float's repr,
        # which keeps the magnitude and can be read back with float().
        out_str = '%s\t%s\t%d\t%d\t%d\t%r\n' % (pmid, clean_rsid(rsid), table_id, row_id, col_id, pval)
        f.write(out_str)

Store tables that have pvalue columns

In [79]:
import re
pval_rgx = 'p\s?.?\s?value'
lod_rgx = 'LOD'

# One TSV row per table: doc name, table position, and two 0/1 flags saying
# whether any cell mentions a p-value header (short cells only, case-insensitive)
# and whether any cell mentions "LOD" (case-sensitive).
with open('table-annotations.tsv', 'w') as f:
    for doc in corpus.documents:
        for table in doc.tables:
            pval_found, lod_found = 0, 0
            for cell in table.cells:
                text = cell.text
                if not pval_found:
                    if len(text) < 30 and re.search(pval_rgx, text, re.IGNORECASE):
                        pval_found = 1
                if not lod_found:
                    if re.search(lod_rgx, text):
                        lod_found = 1
                # nothing left to learn from this table once both are set
                if pval_found and lod_found:
                    break

            f.write('%s\t%s\t%s\t%s\n' % (doc.name, table.position, pval_found, lod_found))

## Learning the rsid/pvalue relation

### Load the gold set

In [37]:
from extractor.util import get_exponent, pvalue_to_float

# Gold (rs-id, p-value) pairs restricted to rs-ids seen in the candidates.
# The PMID is passed as an int, matching the other knowledge-base lookups in
# this notebook (this cell used to pass the raw string doc.name).
gold_relations = set(
    (assoc.snp.rs_id, assoc.pvalue)
    for doc in corpus.documents
    for assoc in kb.assoc_by_pmid(int(doc.name))
    if assoc.snp.rs_id.lower() in rs_subset
)
# Replace each p-value by get_exponent(p-value) so candidates are compared on
# that coarser key.
# NOTE(review): this rebinds `gold_relations` from float p-values to
# exponents; the earlier statistics cells expect the float form, so re-run
# them only after recomputing it.
gold_relations = set((rs_id, get_exponent(pval)) for rs_id, pval in gold_relations)

In [39]:
def spair2uid(span_pair):
    """Build a hashable identifier for a candidate span pair.

    Returns the 4-tuple
    (document name, str(table position), span0 text, span1 text),
    where document and table are taken from span0's context.
    """
    first_span = span_pair.span0
    first_context = first_span.context
    return (
        first_context.document.name,
        str(first_context.table.position),
        first_span.get_span(),
        span_pair.span1.get_span(),
    )

gt_dict_pos = dict()
gt_dict_neg = dict()
for crel in candidates:
    uid = spair2uid(crel)
    if (crel.span0.get_span(), get_exponent(pvalue_to_float(crel.span1.get_span()))) in gold_relations:
        gt_dict_pos[uid] = +1
    else:
        gt_dict_neg[uid] = -1

print len(gt_dict_pos), len(gt_dict_neg)
gt_dict = dict(gt_dict_pos.items() + gt_dict_neg.items())

4614 16280


In [40]:
len(gt_dict)

20894

### Load the features

Extract features, and store them to a file.

In [None]:
from snorkel.features import TableNgramPairFeaturizer

# Disabled cache: loading a previously pickled featurizer instead of
# recomputing it. Left commented out, so the features are always rebuilt.
# pkl_f = 'phenotype_feats.pkl'
# try:
#     with open(pkl_f, 'rb') as f:
#         featurizer = cPickle.load(f)
# except:
# Fit the pair featurizer on all candidate relations. The return value of
# fit_transform is not kept; later cells use the fitted `featurizer` object.
featurizer = TableNgramPairFeaturizer()
featurizer.fit_transform(candidates)

Building feature index...


In [None]:
for f in featurizer.get_features_by_candidate(candidates[0])[:10]: print f

In [None]:
# Persist the fitted featurizer so it does not have to be recomputed.
pkl_f = 'rsid_pval_feats.pkl'
# Pickle streams are binary data: write with 'wb' (the previous text mode 'w+'
# can corrupt the stream on platforms that translate newlines, and the read
# half of '+' was never used).
with open(pkl_f, 'wb') as f:
    cPickle.dump(featurizer, f)

Split into training and test sets

In [None]:
# Split into train and test set
training_candidates = []
gold_candidates     = []
gold_labels         = []
n_half = len(candidates)/2
for c in candidates[:n_half]:
    uid = spair2uid(c)
    if uid in gt_dict:
        gold_candidates.append(c)
        gold_labels.append(gt_dict[uid])
    else:
        training_candidates.append(c)
training_candidates.extend(candidates[n_half:])
gold_labels = np.array(gold_labels)
print "Training set size: %s" % len(training_candidates)
print "Gold set size: %s" % len(gold_candidates)
print "Positive labels in training set: %s" % len([c for c in training_candidates if gt_dict.get(spair2uid(c),0)==1])
print "Negative labels in training set: %s" % len([c for c in training_candidates if gt_dict.get(spair2uid(c),0)==-1])
print "Positive labels in gold set: %s" % len([c for c in gold_candidates if gt_dict[spair2uid(c)]==1])
print "Negative labels in gold set: %s" % len([c for c in gold_candidates if gt_dict[spair2uid(c)]==-1])

Create the model.

### Labeling functions

In [48]:
rsid_keywords = ["id", "rsid", "snp"]
pval_keywords = ["value", "p-value", "p-val", "p_value", "pvalue"]

# Labeling functions take a candidate pair `m` (span0 = rs-id, span1 = p-value)
# and return +1 (looks correct), -1 (looks wrong) or 0 (abstain).

# positive LFs
def LF_pval_header(m):
    """Not implemented yet: "pvalue" in the first or last row. Returns None."""
    return None

def LF_pval_aligned(m):
    """+1 if any p-value keyword is among span1's aligned word n-grams, else 0."""
    for kw in pval_keywords:
        if kw in m.span1.aligned_ngrams('words'):
            return +1
    return 0

def LF_id_aligned(m):
    """+1 if any rs-id keyword is among span0's aligned word n-grams, else 0."""
    for kw in rsid_keywords:
        if kw in m.span0.aligned_ngrams('words'):
            return +1
    return 0

def LF_phen_aligned(m):
    """Not implemented yet: an aligned phenotype. Returns None."""
    return None

# negative LFs
def LF_align(m):
    """-1 if the two spans share neither a row nor a column, else 0."""
    rows_differ = m.span0.context.row_num != m.span1.context.row_num
    if not rows_differ:
        return 0
    cols_differ = m.span0.context.col_num != m.span1.context.col_num
    if cols_differ:
        return -1
    return 0

def LF_diff_col(m):
    """-1 if the two spans are in different columns, else 0 (currently unused)."""
    if m.span0.context.col_num != m.span1.context.col_num:
        return -1
    return 0

# Only the implemented functions are registered.
pos_LFs = [LF_pval_aligned, LF_id_aligned]
neg_LFs = [LF_align]
LFs = pos_LFs + neg_LFs

In [50]:
from snorkel.snorkel import TrainingSet
from snorkel.features import NgramFeaturizer

# FIXME: the recorded run of this cell failed with
# "TypeError: __init__() got an unexpected keyword argument 'featurizer'",
# so `training_set` is never created and the cells below cannot run.
# NOTE(review): check the TrainingSet signature of the installed snorkel
# version for the correct way to pass a featurizer.
training_set = TrainingSet(training_candidates, LFs, featurizer=TableNgramPairFeaturizer())

TypeError: __init__() got an unexpected keyword argument 'featurizer'

#### Results

In [None]:
lf_stats = training_set.lf_stats()
lf_stats[:5]

In [None]:
lf_stats.hist("coverage")

In [None]:
from snorkel.snorkel import Learner
import snorkel.learning
from snorkel.learning import LogReg
# Train with the scikit-learn backed logistic regression model.
# (A leftover debugging statement that printed the whole
# snorkel.learning.__dict__ was removed: it flooded the output with the
# module namespace and had no effect on the learner.)
learner = Learner(training_set, model=snorkel.learning.LogRegSKLearn())

In [None]:
# Splitting into CV and test set
n_half = len(gold_candidates)/2
test_candidates = gold_candidates[:n_half]
test_labels     = gold_labels[:n_half]
cv_candidates   = gold_candidates[n_half:]
cv_labels       = gold_labels[n_half:]

In [None]:
from snorkel.learning_utils import GridSearch

gs       = GridSearch(learner, ['mu', 'lf_w0'], [[1e-5, 1e-7],[1.0,2.0]])
gs_stats = gs.fit(cv_candidates, cv_labels)

In [None]:
gs_stats

In [None]:
learner.test(test_candidates, test_labels)

In [None]:
learner.feature_stats(n_max=10)

In [None]:
mislabeled_cand = learner.mislabeled_test_candidates(test_candidates, test_labels)
for (c,p,g) in mislabeled_cand[50:500]:
    snp_name = c.span0.get_span()
    if snp_name not in gold_rel_dict: continue
    print c.span0.context.document.name
    print c.span0.context    
    print c.span1.context
    print

In [None]:
gold_rel_dict = dict(gold_relations)

In [None]:
gold_rel_dict