# Extracting SNP/P-value relations from tables

This module parses XML tables and extracts relations between SNPs and the p-values at which they are deemed to be significant.

## Preparations

We start by configuring Jupyter and setting up our environment.

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle
import numpy as np
import sqlalchemy

# set the paths to snorkel and gwasdb
# (these are relative paths, so the notebook has to be started from the
# directory it lives in for the imports below to resolve)
sys.path.append('../snorkel-tables')
sys.path.append('../src')
sys.path.append('../src/crawler')

# set up the directory with the input papers
# NOTE: despite the "abstract" in its name, this is the directory of paper
# XML files that is handed to the XML parser as `path`
abstract_dir = '../data/db/papers'

# set up matplotlib
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

# create a Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

### Load corpus

We load our usual corpus of GWAS papers.

In [2]:
from snorkel.parser import XMLMultiDocParser
from extractor.parser import UnicodeXMLTableDocParser

# Build the document parser over the directory of input papers.
# Apart from `path`, the string arguments are XPath expressions
# (presumably evaluated relative to each input file / document node --
# confirm against XMLMultiDocParser):
#   doc  -- selects the nodes that are treated as documents
#   text -- selects the <table> elements to be parsed
#   id   -- selects the text of the article-id element whose pub-id-type is
#           "pmid", i.e. the PubMed ID
xml_parser = XMLMultiDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

In [3]:
from snorkel.parser import CorpusParser, OmniParser
from snorkel.models import Corpus

# parses tables into rows, cols, cells...
table_parser = OmniParser(timeout=1000000)

try:
    corpus = session.query(Corpus).filter(Corpus.name == 'GWAS Table Corpus').one()
except:
    cp = CorpusParser(xml_parser, table_parser)
    %time corpus = cp.parse_corpus(name='GWAS Table Corpus', session=session)
    session.add(corpus)
    session.commit()

print 'Loaded corpus of %d documents' % len(corpus)

Loaded corpus of 589 documents


## Candidate Extraction

### Defining candidate matchers

We generate RSid candidates from all spans that match the following regular expression.

In [4]:
from snorkel.matchers import RegexMatchSpan
# Pattern: "rs" followed by digits, then zero or more groups made of "/" and
# one or two nucleotide letters (e.g. rs123, rs123/A, rs123/A/G); the
# trailing "$" anchors the match at the end of the span.
rsid_matcher = RegexMatchSpan(rgx=r'rs\d+(/[ATCG]{1,2})*$')

Similarly, p-value candidates are all spans that match the following regular expression.

In [5]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import RegexMatchSpan, Union

# 1: p-value matcher
#
# Three regular expressions cover the ways p-values are written in tables.
# Code points used in the patterns:
#   \xb7   MIDDLE DOT (used as a decimal separator or multiplication sign)
#   \xd7   MULTIPLICATION SIGN
#   \u2009 THIN SPACE
#   \u2212 MINUS SIGN, \u2013 EN DASH, \u2012 FIGURE DASH (minus look-alikes)

# rgx1: "<mantissa> x 10 - <exponent>" notation. The mantissa is one or two
# digits (the first non-zero), an optional "." or middle dot, and further
# digits; the multiplication sign may be \xd7, a middle dot or "*"; the
# exponent must be preceded by a minus-like character. Optional whitespace
# (including thin spaces) is allowed between the parts.
rgx1 = u'[1-9]\d?[\xb7\.]?\d*[\s\u2009]*[\xd7\xb7\*][\s\u2009]*10[\s\u2009]*[-\u2212\u2013\u2012][\s\u2009]*\d+'
pval_rgx_matcher1 = RegexMatchSpan(rgx=rgx1)
# rgx2: E-notation with a negative exponent, same mantissa rules as rgx1
# (e.g. 2.16E-07).
rgx2 = u'[1-9]\d?[\xb7\.]?\d*[\s\u2009]*[eE][\s\u2009]*[-\u2212\u2013\u2012][\s\u2009]*\d+'
pval_rgx_matcher2 = RegexMatchSpan(rgx=rgx2)
# rgx3: plain decimals with at least four zeros after "0." followed by one
# or more digits (e.g. 0.00001).
rgx3 = u'0\.0000+\d+'
pval_rgx_matcher3 = RegexMatchSpan(rgx=rgx3)
# a span is a p-value candidate if any of the three patterns matches
pval_rgx_matcher = Union(pval_rgx_matcher1, pval_rgx_matcher2, pval_rgx_matcher3)

# 2: column-based matcher (currently not used)

from snorkel.matchers import CellNameRegexMatcher

# header pattern: "p", optional whitespace, at most one arbitrary character,
# optional whitespace, "value" (e.g. "p value", "p-value", "pvalue");
# matched case-insensitively via ignore_case=True
pval_rgx = 'p\s?.?\s?value'
pval_rgxname_matcher = CellNameRegexMatcher(axis='col', rgx=pval_rgx, n_max=3, ignore_case=True, header_only=True, max_chars=20)

# 3: combine the two

pval_matcher = Union(pval_rgx_matcher, pval_rgxname_matcher)

## Extract candidate relations between SNPs and p-values

In [6]:
# create a Snorkel class for the relation we will extract
# NOTE(review): the Python variable is called RsidPhenRel, but the candidate
# class it holds is the rsid/p-value relation 'RsidPvalRel'.
from snorkel.models import candidate_subclass
RsidPhenRel = candidate_subclass('RsidPvalRel', ['rsid','pval'])

# define our candidate spaces
from snorkel.candidates import TableNgrams
unigrams = TableNgrams(n_max=1)
heptagrams = TableNgrams(n_max=7)

# we will be looking only at aligned cells
from snorkel.throttlers import AlignmentThrottler
row_align_filter = AlignmentThrottler(axis='row', infer=False)

# the extractor builds (rsid, pval) candidates: the candidate spaces and the
# matchers are listed in the same order as the relation's fields, i.e.
# unigrams + rsid_matcher for the rsid and heptagrams + pval_rgx_matcher
# (the regex-only p-value matcher) for the p-value
from snorkel.candidates import CandidateExtractor
ce = CandidateExtractor(RsidPhenRel, [unigrams, heptagrams], [rsid_matcher, pval_rgx_matcher], throttler=row_align_filter)

# collect the tables that will be searched for candidates
tables = [table for doc in corpus.documents for table in doc.tables]

In [10]:
from snorkel.models import CandidateSet
    
try:
    rels = session.query(CandidateSet).filter(CandidateSet.name == 'RsidPvalRel Relations').one()
except:
    %time rels = ce.extract(tables, 'RsidPvalRel Relations', session)
    session.add(rels)
    session.commit()

print "%s relations extracted, e.g." % len(rels)
for cand in rels[:10]:
    print cand

22536 relations extracted, e.g.
RsidPvalRel(Span("rs12722489", parent=416148, chars=[0,9], words=[0,0]), Span("2.16E-07", parent=416156, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs1736916", parent=416112, chars=[0,8], words=[0,0]), Span("1.22E-07", parent=416120, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs2857439", parent=416028, chars=[0,8], words=[0,0]), Span("1.86E-10", parent=416036, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs17421624", parent=415992, chars=[0,9], words=[0,0]), Span("9.72E-15", parent=416000, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs3094157", parent=416124, chars=[0,8], words=[0,0]), Span("1.24E-07", parent=416132, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs7382297", parent=415980, chars=[0,8], words=[0,0]), Span("2.01E-15", parent=415988, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs2647046", parent=415968, chars=[0,8], words=[0,0]), Span("1.43E-17", parent=415976, chars=[0,7], words=[0,0]))
RsidPvalRel(Span("rs3905495", parent=416100, char

#### Save the raw relations to a TSV file for now

In [29]:
import re
from extractor.util import pvalue_to_float

# a "/" together with everything that follows it on the same line
_RSID_SUFFIX = re.compile('/.+')


def clean_rsid(rsid):
    """Return `rsid` with any "/..." suffix removed (e.g. "rs123/A" -> "rs123").

    A "/" that is not followed by at least one character is left in place.
    """
    return _RSID_SUFFIX.sub('', rsid)

with open('pval-rsid.raw.cols.tsv', 'w') as f:
    for rel in rels:
        pmid = rel[0].parent.document.name
        table_id = rel[0].parent.table.position
        row_id = rel[0].parent.cell.row.position
        col_id = rel[0].parent.cell.col.position
        rsid = rel[0].get_span()
        pval = pvalue_to_float(rel[1].get_span())
        
        try:
            out_str = '%s\t%s\t%d\t%d\t%d\t%f\n' % (pmid, clean_rsid(rsid), table_id, row_id, col_id, pval)
        except:
            print pmid, clean_rsid(rsid), table_id, row_id, col_id, pval
        f.write(out_str)

## Extracting singleton SNPs

There also exist many papers that don't report p-values. To handle these, we also report certain rsids that have *not* been associated with a p-value.

Here, we extract these entities. Later on, we will filter them.

In [48]:
# Define the extractor
from snorkel.models import candidate_subclass
from snorkel.matchers import RegexMatchSpan
from snorkel.candidates import CandidateExtractor

# candidate class with a single field: an rsid on its own, without a p-value
RSID = candidate_subclass('SnorkelRsid2', ['rsid'])

unigrams = TableNgrams(n_max=1)
# Pattern: "rs" followed by digits, optionally followed by "/" and one or
# more characters other than the letter "s".
# NOTE(review): [^s] excludes a literal "s"; if "non-whitespace" was
# intended, the class should be [^\s] -- confirm before changing.
rsid_singleton_matcher = RegexMatchSpan(rgx=r'rs\d+(/[^s]+)?')
rsid_singleton_extractor = CandidateExtractor(RSID, unigrams, rsid_singleton_matcher)

Perform the extraction process.

In [1]:
from snorkel.models import CandidateSet

try:
    rsid_c = session.query(CandidateSet).filter(CandidateSet.name == 'Rsid Candidates2').one()
except:
    tables = [table for doc in corpus.documents for table in doc.tables]
    print '%d tables loaded' % len(tables)
    %time rsid_c = rsid_extractor.rsid_singleton_extractor(tables, 'Rsid Candidates2', session)
    session.add(rsid_c)
    session.commit()

print '%d candidates extracted' % len(rsid_c)

We store candidates that occur in sufficiently large tables (tables with at least 10 distinct rsid occurrences):

In [41]:
# Group the singleton candidates by table: (document name, table position)
# maps to the set of (rsid text, row position, column position) triples.
rsid_by_table = {}
for cand in rsid_c:
    span = cand[0]
    context = span.parent
    table_key = (context.document.name, context.table.position)
    occurrence = (span.get_span(), context.cell.row.position, context.cell.col.position)
    rsid_by_table.setdefault(table_key, set()).add(occurrence)

# Write one row per occurrence, keeping only tables that hold at least 10
# distinct occurrences.
with open('rsids.singletons.all.tsv', 'w') as f:
    for (pmid, table_id), rsids in rsid_by_table.items():
        if len(rsids) >= 10:
            for rsid, row_num, col_num in rsids:
                row = (pmid, table_id, row_num, col_num, rsid)
                f.write('%s\t%s\t%s\t%s\t%s\n' % row)

Here, we store certain table features that will be used to select which singleton rsids to report.

In [43]:
import re
pval_rgx = 'p\s?.?\s?value'
lod_rgx = 'LOD'

# For every table, record two 0/1 flags:
#   - whether some cell shorter than 30 characters looks like a p-value
#     header (matches pval_rgx case-insensitively, or is exactly "p")
#   - whether some cell contains "LOD" (case-sensitive)
with open('table-annotations.tsv', 'w') as f:
    for doc in corpus.documents:
        for table in doc.tables:
            has_pval, has_lod = 0, 0
            for cell in table.cells:
                text = cell.text
                if not has_pval and len(text) < 30:
                    if re.search(pval_rgx, text, re.IGNORECASE) or text.lower() == 'p':
                        has_pval = 1
                if not has_lod and re.search(lod_rgx, text):
                    has_lod = 1
                # nothing left to find in this table
                if has_pval and has_lod:
                    break

            f.write('%s\t%s\t%s\t%s\n' % (doc.name, table.position, has_pval, has_lod))

## Filtering relations

Here, we perform a bit of filtering in post-processing: we drop every relation whose table row contains more than one distinct rsid.

In [44]:
rels = []
loc2rsid = dict()
with open('pval-rsid.raw.cols.tsv') as f:
    for line in f:
        pmid, rsid, table_id, row_id, col_id, pval = line.strip().split('\t')
        loc = pmid, table_id, row_id
        rels.append((pmid, rsid, table_id, row_id, col_id, pval))
        if loc not in loc2rsid: loc2rsid[loc] = set()
        loc2rsid[loc].add(rsid)

n = 0
with open('pval-rsid.raw.cols.filtered.tsv', 'w') as f:
    for rel in rels:
        pmid, rsid, table_id, row_id, col_id, pval = rel
        loc = pmid, table_id, row_id
        if len(loc2rsid[loc]) > 1: continue
        
        out_str = '%s\t%s\t%s\t%s\t%s\t%s\n' % (pmid, rsid, table_id, row_id, col_id, pval)
        f.write(out_str)
        n += 1
        
print len(rels), n

22536 20258
