# Extracting SNP/p-value relations from tables

This module parses XML tables and extracts relations between SNPs and the p-values at which they are deemed to be significant.

## Preparations

We start by configuring Jupyter and setting up our environment.

In [1]:
# Environment setup for the notebook: IPython autoreload (so edits to the
# local sources are picked up without restarting the kernel), import paths,
# input directory, plotting defaults and the Snorkel session.
%load_ext autoreload
%autoreload 2

import sys
import cPickle
import numpy as np
import sqlalchemy

# set the paths to snorkel and gwasdb
# (these must be appended before any `snorkel` / `extractor` import below)
sys.path.append('../snorkel-tables')
sys.path.append('../src')
sys.path.append('../src/crawler')

# set up the directory with the input papers
abstract_dir = '../data/db/papers'

# set up matplotlib: render figures inline with a wide default figure size
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

# create a Snorkel session
# NOTE(review): presumably a SQLAlchemy session backed by the Snorkel
# database (it is used with .query/.add/.commit later) -- confirm.
from snorkel import SnorkelSession
session = SnorkelSession()

### Load corpus

We load our usual corpus of GWAS papers.

In [2]:
from snorkel.parser import XMLMultiDocParser
from extractor.parser import UnicodeXMLTableDocParser

# Options for the XML document parser. The values are XPath expressions read
# at face value; how XMLMultiDocParser applies each of them is presumably
# per input file / per document -- confirm against the snorkel-tables sources.
xml_parser_options = dict(
    path=abstract_dir,                                # directory with the input papers
    doc='./*',                                        # child elements of the root
    text='.//table',                                  # <table> elements at any depth
    id='.//article-id[@pub-id-type="pmid"]/text()',   # text of the PMID article-id
    keep_xml_tree=True,
)
xml_parser = XMLMultiDocParser(**xml_parser_options)

In [3]:
from snorkel.parser import CorpusParser, OmniParser
from snorkel.models import Corpus

# parses tables into rows, cols, cells...
table_parser = OmniParser(timeout=1000000)

try:
    corpus = session.query(Corpus).filter(Corpus.name == 'GWAS Table Corpus').one()
except:
    cp = CorpusParser(xml_parser, table_parser)
    %time corpus = cp.parse_corpus(name='GWAS Table Corpus', session=session)
    session.add(corpus)
    session.commit()

print 'Loaded corpus of %d documents' % len(corpus)

Loaded corpus of 589 documents


## Candidate Extraction

### Defining candidate matchers

We generate RSid candidates from all spans that match the following regular expression.

In [4]:
from snorkel.matchers import RegexMatchSpan
# RSid pattern: "rs" followed by digits, optionally followed by any number of
# "/"-separated allele suffixes of one or two nucleotide letters
# (e.g. "rs123", "rs123/A", "rs123/AT/G"); `$` anchors the match at the end.
# NOTE(review): whether RegexMatchSpan also anchors the start of the span is
# not visible here -- confirm against the matcher implementation.
rsid_matcher = RegexMatchSpan(rgx=r'rs\d+(/[ATCG]{1,2})*$')

Similarly, p-value candidates are all spans that match the following regular expression.

In [5]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import RegexMatchSpan, Union

# 1: p-value matcher

# rgx1 -- "mantissa x 10 -exponent" notation: a non-zero digit, an optional
# second digit, an optional decimal separator ('.' or middle dot U+00B7) and
# further digits; then a multiplication sign (U+00D7, U+00B7 or '*'), the
# literal "10", a minus sign (ASCII hyphen, U+2212, U+2013 or U+2012) and the
# exponent digits. Whitespace or thin spaces (U+2009) may separate the parts.
rgx1 = u'[1-9]\d?[\xb7\.]?\d*[\s\u2009]*[\xd7\xb7\*][\s\u2009]*10[\s\u2009]*[-\u2212\u2013\u2012][\s\u2009]*\d+'
pval_rgx_matcher1 = RegexMatchSpan(rgx=rgx1)
# rgx2 -- E notation: same mantissa, then 'e' or 'E', a minus sign (same
# variants as above) and the exponent digits, e.g. "1.95E-11".
rgx2 = u'[1-9]\d?[\xb7\.]?\d*[\s\u2009]*[eE][\s\u2009]*[-\u2212\u2013\u2012][\s\u2009]*\d+'
pval_rgx_matcher2 = RegexMatchSpan(rgx=rgx2)
# rgx3 -- plain decimals: "0." followed by at least four zeros and at least
# one further digit, e.g. "0.00005".
rgx3 = u'0\.0000+\d+'
pval_rgx_matcher3 = RegexMatchSpan(rgx=rgx3)
# A span is a p-value candidate if any of the three notations matches.
pval_rgx_matcher = Union(pval_rgx_matcher1, pval_rgx_matcher2, pval_rgx_matcher3)

# 2: column-based matcher (currently not used)

from snorkel.matchers import CellNameRegexMatcher

# Header pattern such as "p value", "p-value" or "pvalue" (case-insensitive
# via ignore_case=True below).
# NOTE(review): presumably selects cells whose column header matches the
# pattern -- confirm against CellNameRegexMatcher.
pval_rgx = 'p\s?.?\s?value'
pval_rgxname_matcher = CellNameRegexMatcher(axis='col', rgx=pval_rgx, n_max=3, ignore_case=True, header_only=True, max_chars=20)

# 3: combine the two

pval_matcher = Union(pval_rgx_matcher, pval_rgxname_matcher)

## Extract candidate relations between SNPs and p-values

In [6]:
# create a Snorkel class for the relation we will extract
# (the Python name says "Phen", but the candidate class is 'RsidPvalRel'
# with fields rsid and pval)
from snorkel.models import candidate_subclass
RsidPhenRel = candidate_subclass('RsidPvalRel', ['rsid','pval'])

# define our candidate spaces: single tokens for rsids, up to 7-grams for
# p-values
from snorkel.candidates import TableNgrams
unigrams = TableNgrams(n_max=1)
heptagrams = TableNgrams(n_max=7)

# we will be looking only at aligned cells
# NOTE(review): presumably keeps only pairs whose cells share a row -- confirm
from snorkel.throttlers import AlignmentThrottler
row_align_filter = AlignmentThrottler(axis='row', infer=False)

# the extractor pairs rsid spans (unigrams matched by rsid_matcher) with
# p-value spans (up to 7-grams matched by the regex-based pval_rgx_matcher);
# the column-name based matcher is not used here
from snorkel.candidates import CandidateExtractor
ce = CandidateExtractor(RsidPhenRel, [unigrams, heptagrams], [rsid_matcher, pval_rgx_matcher], throttler=row_align_filter)

# collect the tables that will be searched for candidates
tables = [table for doc in corpus.documents for table in doc.tables]

In [7]:
%time rels = ce.extract(tables, 'RsidPvalRel Relations', session)
print "%s relations extracted, e.g." % len(rels)
for cand in rels[:10]:
    print cand

CPU times: user 5min 55s, sys: 1min 13s, total: 7min 9s
Wall time: 8min 43s
22572 relations extracted, e.g.
RsidPvalRel(Span("rs2517646", parent=416004, chars=[0,8], words=[0,0]), Span("1.95E-11", parent=416012, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs12722489", parent=416148, chars=[0,9], words=[0,0]), Span("2.16E-07", parent=416156, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs3132671", parent=416052, chars=[0,8], words=[0,0]), Span("3.78E-09", parent=416060, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs757262", parent=416136, chars=[0,7], words=[0,0]), Span("1.75E-07", parent=416144, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs2857439", parent=416028, chars=[0,8], words=[0,0]), Span("1.86E-10", parent=416036, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs1736916", parent=416112, chars=[0,8], words=[0,0]), Span("1.22E-07", parent=416120, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs3905495", parent=416100, chars=[0,8], words=[0,0]), Span("4.76E-08", parent=416108, ch

In [13]:
for r in rels:
    if r[0].parent.document.name == '17903305':
#     if r[0].parent.document.name == '25133637':
        print unicode(r)

RsidPvalRel(Span("rs10487920", parent=308138, chars=[0,9], words=[0,0]), Span("3.9 × 10 -4", parent=308142, chars=[0,10], words=[0,3]))
RsidPvalRel(Span("rs1924587", parent=308149, chars=[0,8], words=[0,0]), Span("4.6 × 10 -4", parent=308153, chars=[0,10], words=[0,3]))
RsidPvalRel(Span("rs6479347", parent=308144, chars=[0,8], words=[0,0]), Span("4.5 × 10 -4", parent=308148, chars=[0,10], words=[0,3]))
RsidPvalRel(Span("rs2059273", parent=308155, chars=[0,8], words=[0,0]), Span("4.9 × 10 -4", parent=308159, chars=[0,10], words=[0,3]))
RsidPvalRel(Span("rs2822669", parent=308161, chars=[0,8], words=[0,0]), Span("5.7 × 10 -4", parent=308165, chars=[0,10], words=[0,3]))
RsidPvalRel(Span("rs9325782", parent=308297, chars=[0,8], words=[0,0]), Span("8.2 × 10 -4", parent=308300, chars=[0,10], words=[0,3]))
RsidPvalRel(Span("rs905883", parent=308276, chars=[0,7], words=[0,0]), Span("2.0 × 10 -4", parent=308279, chars=[0,10], words=[0,3]))
RsidPvalRel(Span("rs7001069", parent=308315, chars=[0,8

In [67]:
from snorkel.candidates import AlignedTableRelationExtractor
# Relation Extractor: pairs the output of two span extractors along table rows.
# NOTE(review): rsid_extractor and pval_rgx_extractor are not defined in the
# cells shown above; this cell appears to belong to an earlier run of the
# notebook -- confirm they exist before executing it.
relation_extractor = AlignedTableRelationExtractor(rsid_extractor, pval_rgx_extractor, axis='row')

In [68]:
%time candidates = relation_extractor.extract(corpus.get_tables(), name='all')

for cand in candidates[:10]: 
    print cand
print "%s relations extracted" % len(candidates)

CPU times: user 8min 13s, sys: 7min 49s, total: 16min 3s
Wall time: 49min 45s
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("1.93E-13", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("2.04E-12", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("6.80E-20", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("1.39E-21", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs2076756", context=None, chars=[0,8], words=[0,0]), Span("5.90E-08", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs1992662", context=None, chars=[0,8], words=[0,0]), Span("7.59E-05", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs1992660", context=None, chars=[0,8], words=[0,0]), Span("4.53E-05", context=None, chars=[0,7], words=[0,0]))
SpanPair(Span("rs10521209", context=None

Filter out nested p-value candidates, i.e. spans that lie entirely inside another matched span of the same cell (e.g. `2*10^-7` inside `1.2*10^-7`).

In [27]:
# Index the character intervals of all p-value spans by the cell they occur
# in: str(cell) -> list of (char_start, char_end), in the order of pval_c.
span_dict = {}
for span in pval_c:
    cell_key = str(span.context.cell)
    interval = (span.char_start, span.char_end)
    span_dict.setdefault(cell_key, []).append(interval)

def nested(ivl1, ivl2):
    """Tell whether interval `ivl1` lies inside a different interval `ivl2`.

    Both arguments are (start, end) pairs. The result is True only when the
    two intervals are not equal and start/end of `ivl1` fall within the
    bounds of `ivl2` (bounds included); an interval is never nested in itself.
    """
    if ivl1 == ivl2:
        return False
    return bool(ivl2[0] <= ivl1[0] <= ivl1[1] <= ivl2[1])

new_pval_c = list()
for span in pval_c:
    span_ivl = span.char_start, span.char_end
    span_name = str(span.context.cell)
    if all([not nested(span_ivl, other_ivl) for other_ivl in span_dict[span_name]]):
        new_pval_c.append(span)
#     else:
#         print span_ivl, span_dict[span_name]
#         print unicode(span)
#         print unicode(span.context.cell.text)
#         print span.uid
#         print span.context.row_num, span.context.col_num
#         print unicode(span)
#         print
#         print [(cell, cell.row_num, cell.col_num) for cell in span.context.table.cells]
#         break

print len(new_pval_c), len(pval_c)
pval_c = new_pval_c

1246 1246


In [28]:
print candidates[1000].span0.context.cell.row_num
print candidates[1000].span1.context.cell.row_num

63
63


#### Save this for now

In [12]:
import re
from extractor.util import pvalue_to_float

def clean_rsid(rsid):
    """Strip an allele suffix such as "/A" or "/A/G" from an rsid string.

    Removes a '/' together with the rest of the line that follows it. A
    slash with nothing after it is left in place, because the pattern
    requires at least one character after the slash.
    """
    allele_suffix = re.compile('/.+')
    return allele_suffix.sub('', rsid)

# NOTE: THIS IS ON UNPROCESSED RELS
with open('pval-rsid.raw.cols.tsv', 'w') as f:
    for rel in rels:
        pmid = rel[0].parent.document.name
        table_id = rel[0].parent.table.position
        row_id = rel[0].parent.cell.row.position
        col_id = rel[0].parent.cell.col.position
        rsid = rel[0].get_span()
        pval = pvalue_to_float(rel[1].get_span())
        
        try:
            out_str = '%s\t%s\t%d\t%d\t%d\t%f\n' % (pmid, clean_rsid(rsid), table_id, row_id, col_id, pval)
        except:
            print pmid, clean_rsid(rsid), table_id, row_id, col_id, pval
        f.write(out_str)

## Extracting singleton relations

In [40]:
# Group rsid mentions by table: (pmid, table position) -> set of
# (rsid, row number, column number). The table-size filter is applied later,
# when the groups are written out.
rsid_by_table = {}
for cand in rs_candidates:
    rsid = cand.get_span()
    table_key = (cand.context.document.name, cand.context.table.position)
    cell = cand.context.cell
    mention = (rsid, cell.row_num, cell.col_num)
    rsid_by_table.setdefault(table_key, set()).add(mention)
    
# Write the mentions of every table that holds at least 10 distinct
# (rsid, row, column) entries, one TSV row per mention.
with open('rsids.singletons.all.tsv', 'w') as f:
    for table_key, mentions in rsid_by_table.items():
        if len(mentions) >= 10:
            pmid, table_id = table_key
            for rsid, row_num, col_num in mentions:
                row = (pmid, table_id, row_num, col_num, rsid)
                f.write('%s\t%s\t%s\t%s\t%s\n' % row)

In [28]:
# Spot check: show what was collected for table position 4 of PMID 17903305.
print rsid_by_table[('17903305', 4)]

set([u'rs6577648', u'rs254315', u'rs10520880', u'rs38276', u'rs986831', u'rs10520246', u'rs7610584', u'rs3751832', u'rs3794889', u'rs10492797', u'rs9327886', u'rs10505624', u'rs208354', u'rs4418248', u'rs10514443', u'rs10486031', u'rs10513681', u'rs216666', u'rs10515347', u'rs10487577', u'rs2253319', u'rs7989050', u'rs4801149', u'rs10512920', u'rs3017183', u'rs2371438', u'rs6555491', u'rs10520247', u'rs10256504', u'rs7329659', u'rs392715', u'rs2834645', u'rs10497958', u'rs1261256', u'rs6102912', u'rs4782742'])


In [17]:
# TODO: use this to filter singletons initially
import re
pval_rgx = 'p\s?.?\s?value'
lod_rgx = 'LOD'

# For every table, record two 0/1 flags: whether some short cell (< 30
# characters) looks like a p-value header (matches pval_rgx ignoring case, or
# is exactly "p" ignoring case), and whether some cell contains "LOD".
with open('table-annotations.tsv', 'w') as f:
    for doc in corpus.documents:
        for table in doc.tables:
            pval_found = lod_found = 0
            for cell in table.cells:
                text = cell.text
                if not pval_found and len(text) < 30:
                    if re.search(pval_rgx, text, re.IGNORECASE) or text.lower() == 'p':
                        pval_found = 1
                if not lod_found and re.search(lod_rgx, text):
                    lod_found = 1
                # both flags are settled; no need to scan the remaining cells
                if pval_found and lod_found:
                    break

            annotation = (doc.name, table.position, pval_found, lod_found)
            f.write('%s\t%s\t%s\t%s\n' % annotation)

## Filtering relations

In [12]:
rels = []
loc2rsid = dict()
with open('results/pval-rsid.raw.cols.tsv') as f:
    for line in f:
        pmid, rsid, table_id, row_id, col_id, pval = line.strip().split('\t')
        loc = pmid, table_id, row_id
        rels.append((pmid, rsid, table_id, row_id, col_id, pval))
        if loc not in loc2rsid: loc2rsid[loc] = set()
        loc2rsid[loc].add(rsid)

n = 0
with open('results/pval-rsid.raw.cols.filtered.tsv', 'w') as f:
    for rel in rels:
        pmid, rsid, table_id, row_id, col_id, pval = rel
        loc = pmid, table_id, row_id
        if len(loc2rsid[loc]) > 1: continue
        
        out_str = '%s\t%s\t%s\t%s\t%s\t%s\n' % (pmid, rsid, table_id, row_id, col_id, pval)
        f.write(out_str)
        n += 1
        
print len(rels), n

22102 19848


In [7]:
# Peek at ten entries of loc2rsid (Python 2: items() returns a list, so it
# can be sliced).
# NOTE(review): the output shown for this cell has 4-element keys that
# include the rsid, whereas the filtering cell builds (pmid, table_id, row_id)
# keys -- the output looks like it is from an earlier run; confirm.
(loc2rsid.items()[:10])

[(('23704328', 'rs9316505', '0', '10'), {'rs9316505'}),
 (('21658281', 'rs2281680', '1', '9'), {'rs2281680'}),
 (('24709693', 'rs73185595', '2', '11'), {'rs73185595'}),
 (('22001756', 'rs75372', '0', '15'), {'rs75372'}),
 (('25340798', 'rs495366', '2', '17'), {'rs495366'}),
 (('19081515', 'rs6794719', '0', '5'), {'rs6794719'}),
 (('22666496', 'rs7521242', '2', '46'), {'rs7521242'}),
 (('22511988', 'rs887829', '1', '8'), {'rs887829'}),
 (('19801982', 'rs11898505', '0', '16'), {'rs11898505'}),
 (('25233373', 'rs9365619', '1', '12'), {'rs9365619'})]