# Phenotype/SNP relation extraction from tables

Here we will demo the module that parses tables in papers and extracts relations between SNPs and phenotypes (in cases in which the paper discusses multiple phenotypes).

## Preparations

We start by configuring Jupyter and setting up our environment.

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle
import numpy as np
import sqlalchemy

# set the paths to snorkel and gwasdb
sys.path.append('../snorkel-tables')
sys.path.append('../src')
sys.path.append('../src/crawler')

# set up the directory with the input papers
abstract_dir = '../data/db/papers'

# set up matplotlib
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

# create a Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

### Load corpus

We load our usual corpus of GWAS papers.

In [2]:
from extractor.parser import UnicodeXMLTableDocParser
from snorkel.parser import XMLMultiDocParser

xml_parser = XMLMultiDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

In [3]:
from snorkel.parser import CorpusParser, OmniParser
from snorkel.models import Corpus

# parses tables into rows, cols, cells...
table_parser = OmniParser(timeout=1000000)

try:
    corpus = session.query(Corpus).filter(Corpus.name == 'GWAS Table Corpus').one()
except:
    cp = CorpusParser(xml_parser, table_parser)
    %time corpus = cp.parse_corpus(name='GWAS Table Corpus', session=session)
    session.add(corpus)
    session.commit()

print 'Loaded corpus of %d documents' % len(corpus)

Loaded corpus of 589 documents


## Candidate extraction

### Define candidate matchers

#### RSid matcher

In [4]:
from snorkel.matchers import RegexMatchSpan
rsid_matcher = RegexMatchSpan(rgx=r'rs\d+(/[ATCG]{1,2})*$')

#### Phenotype matchers

The first matcher checks if we are in a column whose header labels it as a phenotype column.

In [5]:
from snorkel.matchers import CellNameDictionaryMatcher

phen_words = ['trait', 'phenotype', 'outcome'] # words that denote phenotypes
phen_matcher = CellNameDictionaryMatcher(axis='col', d=phen_words, n_max=3, ignore_case=True)

The next matcher will match phenotypes in cells that span an entire axis

In [7]:
from snorkel.matchers import DictionaryMatch
from db.kb import KnowledgeBase
from extractor.util import make_ngrams

# collect phenotype list
kb = KnowledgeBase()
# efo phenotypes
efo_phenotype_list0 = kb.get_phenotype_candidates(source='efo', peek=True) # TODO: remove peaking
efo_phenotype_list = list(make_ngrams(efo_phenotype_list0))
# mesh diseases
mesh_phenotype_list0 = kb.get_phenotype_candidates(source='mesh')
mesh_phenotype_list = list(make_ngrams(mesh_phenotype_list0))
# mesh chemicals
chem_phenotype_list = kb.get_phenotype_candidates(source='chemical')

phenotype_names = efo_phenotype_list + mesh_phenotype_list + chem_phenotype_list
phen_name_matcher = DictionaryMatch(d=phenotype_names, ignore_case=True, stemmer='porter')

### Relation extraction

In [None]:
from snorkel.candidates import CandidateExtractor
from snorkel.throttlers import AlignmentThrottler, SeparatingSpanThrottler, OrderingThrottler, CombinedThrottler

# create a Snorkel class for the relation we will extract
from snorkel.models import candidate_subclass
RsidPhenRel = candidate_subclass('RsidPhenRel', ['rsid','phen'])

# define our candidate spaces
from snorkel.candidates import TableNgrams, TableCells, SpanningTableCells
unigrams = TableNgrams(n_max=1)
cells = TableCells()
spanning_cells = SpanningTableCells(axis='row')

# we will be looking only at aligned cells
row_align_filter = AlignmentThrottler(axis='row', infer=True)

# and at cells where the phenotype is in a spanning header cell above the rsid cell
sep_span_filter = SeparatingSpanThrottler(align_axis='col') # rsid and phen are not separated by spanning cells
col_order_filter = OrderingThrottler(axis='col', first=1) # phen spanning cell comes first
header_filter = CombinedThrottler([sep_span_filter, col_order_filter]) # combine the two throttlers

# the first extractor looks at phenotype names in columns with a header indicating it's a phenotype
ce1 = CandidateExtractor(RsidPhenRel, [unigrams, cells], [rsid_matcher, phen_matcher], throttler=row_align_filter)

# the second extractor looks at phenotype names in columns with a header indicating it's a phenotype
ce2 = CandidateExtractor(RsidPhenRel, [unigrams, spanning_cells], [rsid_matcher, phen_name_matcher], throttler=header_filter, stop_on_duplicates=False)

# collect that cells that will be searched for candidates
tables = [table for doc in corpus.documents for table in doc.tables]

We are now ready to perform relation extraction.

In [None]:
from snorkel.models import CandidateSet

try:
    rels1 = session.query(CandidateSet).filter(CandidateSet.name == 'RsidPhenRel Set 1').one()
except:
    %time rels1 = ce1.extract(tables, 'RsidPhenRel Set 1', session)
    
print "%s relations extracted, e.g." % len(rels1)
for cand in rels1[:10]:
    print cand



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


[=====                                   ] 11%

In [23]:
from snorkel.models import CandidateSet
session.rollback()
session.query(CandidateSet).filter(CandidateSet.name == 'RsidPhenRel Set 2').delete()

1

In [None]:
from snorkel.models import CandidateSet

try:
    rels2 = session.query(CandidateSet).filter(CandidateSet.name == 'RsidPhenRel Set 2').one()
except:
    %time rels2 = ce2.extract(tables, 'RsidPhenRel Set 2', session)
    
print "%s relations extracted, e.g." % len(rels2)
for cand in rels2[:10]: 
    print cand

Finally, we merge the two sets of candiates into a single set.

In [10]:
from snorkel.models import CandidateSet

try:
    rels = session.query(CandidateSet).filter(CandidateSet.name == 'RsidPhenRel Canidates').one()
except:
    rels = CandidateSet(name='RsidPhenRel Canidates')
    for c in rels1: rels.append(c)
    for c in rels2: rels.append(c)

    session.add(rels)
    session.commit()

print '%d candidates in total' % len(rels)

3504 candidates in total


The code below lets us manually inspect our extractor on a single table that we hand-picked.

In [21]:
# hard_doc = [d for d in corpus.documents if d.name == '17903293'][0]

# hard_doc = [d for d in corpus.documents if d.name == '19197348'][0] # spanning phenotype cells
# hard_tables = [hard_doc.tables[3]]

# hard_doc = [d for d in corpus.documents if d.name == '19448621'][0] # spanning phenotype cells
# hard_tables = hard_doc.tables

hard_doc = [d for d in corpus.documents if d.name == '20585627'][0] # inferred cells
hard_tables = [hard_doc.tables[3]]

session.rollback()
session.query(CandidateSet).filter(CandidateSet.name == 'Test 2').delete()

ce = CandidateExtractor(RsidPhenRel, [unigrams, cells], [rsid_matcher, phen_matcher], throttler=row_align_filter, stop_on_duplicates=False)
# ce = CandidateExtractor(RsidPhenRel, [unigrams, spanning_cells], [rsid_matcher, phen_name_matcher], throttler=T_joint, stop_on_duplicates=False)
%time rels_test = ce.extract(hard_tables, 'Test 2', session)
print len(rels_test)
for rel in rels_test:
    print rel

CPU times: user 43.7 s, sys: 13.6 s, total: 57.3 s
Wall time: 58.2 s
57
RsidPhenRel(Span("rs12931267", parent=353074, chars=[0,9], words=[0,0]), Span("Freckling", parent=353023, chars=[0,8], words=[0,0]))
RsidPhenRel(Span("rs4812405", parent=353244, chars=[0,8], words=[0,0]), Span("Freckling", parent=353023, chars=[0,8], words=[0,0]))
RsidPhenRel(Span("rs761238", parent=353224, chars=[0,7], words=[0,0]), Span("Freckling", parent=353023, chars=[0,8], words=[0,0]))
RsidPhenRel(Span("rs7204478", parent=353134, chars=[0,8], words=[0,0]), Span("Freckling", parent=353023, chars=[0,8], words=[0,0]))
RsidPhenRel(Span("rs1805009", parent=353104, chars=[0,8], words=[0,0]), Span("Freckling", parent=353023, chars=[0,8], words=[0,0]))
RsidPhenRel(Span("rs1805008", parent=353094, chars=[0,8], words=[0,0]), Span("Freckling", parent=353023, chars=[0,8], words=[0,0]))
RsidPhenRel(Span("rs11861084", parent=353124, chars=[0,9], words=[0,0]), Span("Freckling", parent=353023, chars=[0,8], words=[0,0]))
Rsi

In [86]:
from snorkel.lf_helpers import _infer_cell
wtf_cell = hard_tables[0].cells[22]
print wtf_cell
print _infer_cell(wtf_cell, axis='col', direct=True, infer=True)

Cell(Doc: 20585627, Table: 1, Row: 2, Col: 0)
Cell(Doc: 20585627, Table: 1, Row: 1, Col: 0)


## Learning the correctness of relations

Next, we will train machine learning models to identify which phenotype candidates are actually correct.

### Generating a labeled set of examples

We first split data into an (unlabeled) training set (since we will use unsupervised risk estimation to train a candidate on it), and a dev/test set.

In [11]:
try:
    train_c = session.query(CandidateSet).filter(CandidateSet.name == 'RsidPhenRel Training Candidates').one()
    devtest_c = session.query(CandidateSet).filter(CandidateSet.name == 'RsidPhenRel Dev/Test Candidates').one()
except:
    # delete any previous sets with that name
    session.query(CandidateSet).filter(CandidateSet.name == 'RsidPhenRel Training Candidates').delete()
    session.query(CandidateSet).filter(CandidateSet.name == 'RsidPhenRel Dev/Test Candidates').delete()

    frac_test = 0.5

    # initialize the new sets
    train_c = CandidateSet(name='RsidPhenRel Training Candidates')
    devtest_c = CandidateSet(name='RsidPhenRel Dev/Test Candidates')

    # choose a random subset for the labeled set
    n_test = len(rels) * frac_test
    test_idx = set(np.random.choice(len(rels), size=(n_test,), replace=False))

    # add to the sets
    for i, c in enumerate(rels):
        if i in test_idx:
            devtest_c.append(c)
        else:
            train_c.append(c)

    # save the results
    session.add(train_c)
    session.add(devtest_c)
    session.commit()

print 'Initialized %d training and %d dev/testing candidates' % (len(train_c), len(devtest_c))



Initialized 1752 training and 1752 dev/testing candidates


### Labelling functions

Following the data programming approach, we define set of labeling functions. We will learn their accuracy via unsupervised learning and use them for classifying candidates.

In [12]:
from snorkel.lf_helpers import *
s=None
doc = [d for d in corpus.documents if d.name == '17903303'][0]
table = doc.tables[3]
for cell in table.cells:
    top_cells = get_aligned_cells(cell, 'col', infer=True)
    top_phrases = [phrase for cell in top_cells for phrase in cell.phrases]
# rels[0][1].parent.table.cells[0].phrases
# corpus.documents[0].phrases



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [13]:
from snorkel.lf_helpers import *

bad_words = ['rs number', 'rs id', 'rsid']

# negative LFs
def LF_number(m):
    txt = m[1].get_span()
    frac_num = len([ch for ch in txt if ch.isdigit()]) / float(len(txt))
    return -1 if len(txt) > 5 and frac_num > 0.4 or frac_num > 0.6 else 0

def LF_bad_phen_mentions(m):
    if cell_spans(m[1].parent.cell, m[1].parent.table, 'row'): return 0
    #     if m[1].context.cell.spans('row'): return 0
    top_cells = get_aligned_cells(m[1].parent.cell, 'col', infer=True)
    top_cells = [cell for cell in top_cells]
#     top_cells = m.span1.context.cell.aligned_cells(axis='col', induced=True)
    try:
        top_phrases = [phrase for cell in top_cells for phrase in cell.phrases]
    except:
        for cell in top_cells:
            print cell, cell.phrases
    if not top_phrases: return 0
    matching_phrases = []
    for phrase in top_phrases:
        if any (phen_matcher._f_ngram(word) for word in phrase.text.split(' ')):
            matching_phrases.append(phrase)
    small_matching_phrases = [phrase for phrase in matching_phrases if len(phrase.text) <= 25]
    return -1 if not small_matching_phrases else 0

def LF_bad_word(m):
    txt = m[1].get_span()
    return -1 if any(word in txt for word in bad_words) else 0

LF_tables_neg = [LF_number, LF_bad_phen_mentions]

# positive LFs
def LF_no_neg(m):
    return +1 if not any(LF(m) for LF in LF_tables_neg) else 0

LF_tables_pos = [LF_no_neg]

LFs = LF_tables_neg + LF_tables_pos

We generate features for the training set.

In [14]:
from snorkel.annotations import LabelManager
label_manager = LabelManager()

try:
    %time L_train = label_manager.load(session, train_c, 'RsidPhenRel LF Labels6')
except sqlalchemy.orm.exc.NoResultFound:
    %time L_train = label_manager.create(session, train_c, 'RsidPhenRel LF Labels6', f=LFs)

Generating annotations for 1752 candidates...
Loading sparse Label matrix...
CPU times: user 9min 23s, sys: 2min 42s, total: 12min 5s
Wall time: 12min 13s


Let's also look at some basic statistics.

In [18]:
L_train.lf_stats()

Unnamed: 0,conflicts,coverage,j,overlaps
LF_number,0,0.144242,0,0.072976
LF_bad_phen_mentions,0,0.152223,1,0.072976
LF_no_neg,0,0.776511,2,0.0


### Training a machine learning model

Next, we train a generative model, just like in the phenotype extraction notebook.

In [15]:
from snorkel.learning import NaiveBayes

gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=10000, rate=1e-2)

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



Training marginals (!= 0.5):	1752
Features:			3
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.084432
	Learning epoch = 250	Gradient mag. = 0.098711
	Learning epoch = 500	Gradient mag. = 0.107868
	Learning epoch = 750	Gradient mag. = 0.116008
	Learning epoch = 1000	Gradient mag. = 0.122948
	Learning epoch = 1250	Gradient mag. = 0.128657
	Learning epoch = 1500	Gradient mag. = 0.133226
	Learning epoch = 1750	Gradient mag. = 0.136810
	Learning epoch = 2000	Gradient mag. = 0.139583
	Learning epoch = 2250	Gradient mag. = 0.141712
	Learning epoch = 2500	Gradient mag. = 0.143340
	Learning epoch = 2750	Gradient mag. = 0.144582
	Learning epoch = 3000	Gradient mag. = 0.145530
	Learning epoch = 3250	Gradient mag. = 0.146254
	Learning epoch = 3500	Gradient mag. = 0.146807
	Learning epoch = 3750	Gradient mag. = 0.147231
	Learning epoch = 4000	Gradient mag. = 0.147556
	Learning epoch = 4250	Gradient mag. = 0.147805
	Learning epoch = 4500	Gradient mag. = 0.147996
	Learni

In [16]:
gen_model.w

array([ 9.92867574,  9.91268375,  0.98503744])

## Classify all the candidates

In [17]:
from snorkel.annotations import LabelManager
label_manager = LabelManager()

# delete existing labels
# session.rollback()
# session.query(AnnotationKeySet).filter(AnnotationKeySet.name == 'RsidPhenRel LF All Labels').delete()
%time L_all = label_manager.create(session, rels, 'RsidPhenRel LF All Labels', f=LFs)

Generating annotations for 3504 candidates...
Loading sparse Label matrix...
CPU times: user 13min 15s, sys: 3min 22s, total: 16min 37s
Wall time: 16min 47s


Save the results

In [18]:
preds = gen_model.odds(L_all)
good_rels = [(c[0].parent.document.name, c[0].get_span(), c[1].get_span()) for (c, p) in zip(rels, preds) if p > 0]
print len(good_rels), 'relations extracted, e.g.:'
print good_rels[:10]

# store relations to annotate
with open('rels.acronyms.extracted.tsv', 'w') as f:
    for doc_id, str1, str2 in good_rels:
        try:
            out = u'{}\t{}\t{}\n'.format(doc_id, unicode(str1), str2)
            f.write(out.encode("UTF-8"))
        except:
            print 'Error in saving:', str1, str2

2745 relations extracted, e.g.:
[(u'17903300', u'rs7202384', u'Mean BMI'), (u'17903300', u'rs10486301', u'Mean BMI'), (u'17903300', u'rs7533902', u'Mean BMI'), (u'17903300', u'rs2226351', u'Mean WC'), (u'17903300', u'rs711702', u'Mean BMI'), (u'17903300', u'rs2296465', u'Mean BMI'), (u'17903300', u'rs10517461', u'Mean WC'), (u'17903300', u'rs1875517', u'Mean WC'), (u'17903300', u'rs4312989', u'Mean WC'), (u'17903300', u'rs2361128', u'Mean BMI')]


## Resolve acronyms based on ones extracted earlier

In [49]:
from extractor.dictionary import Dictionary, unravel

D = Dictionary()
D.load('acronyms.extracted.all.tsv')
print len(D), 'definitions loaded'

326 definitions loaded


Use dictionary to resolve acronyms

In [50]:
new_rels = [ (doc_id, rs_id, unravel(doc_id, phen, D)) for doc_id, rs_id, phen in rels ]

## Evaluate extracted relations

Let's first evaluate the recall w.r.t. GWAS Central

In [52]:
print ([(pmid, rsid, phen) for pmid, rsid, phen in new_rels if pmid == '17903305'])

[('17903305', u'rs905883', u'Breast cancer'), ('17903305', u'rs7564590', u'Breast cancer'), ('17903305', u'rs7558615', u'Breast cancer'), ('17903305', u'rs9325782', u'Prostate cancer'), ('17903305', u'rs2410373', u'Prostate cancer')]


In [53]:
pmids = sorted(list({pmid for pmid, _, _ in new_rels}))

from db.kb import KnowledgeBase
kb = KnowledgeBase()
assocs = [assoc for pmid in pmids for assoc in kb.assoc_by_pmid(pmid) if assoc.source == 'gwas_central' and assoc.pvalue < 1e-5]
print len(pmids), len(assocs)

77 573


In [54]:
print pmids

['17903292', '17903293', '17903294', '17903295', '17903296', '17903297', '17903298', '17903300', '17903301', '17903302', '17903303', '17903304', '17903305', '17903306', '17903307', '17903308', '19043545', '19197348', '19430483', '19448621', '19557161', '19609347', '19721433', '19820699', '20066028', '20195266', '20395239', '20463881', '20526338', '20548944', '20585627', '20694148', '20838585', '20921969', '20927387', '21203500', '21347282', '21386085', '21483430', '21483845', '21552555', '21738479', '21738480', '21738491', '21931564', '22216198', '22291609', '22509378', '22558069', '22589738', '22832964', '22911880', '23028342', '23118974', '23251661', '23408906', '23696099', '23704328', '23754948', '23836780', '23935956', '23966867', '24324551', '24347629', '24376456', '24379826', '24386095', '24586186', '24886709', '24892410', '24903457', '24945404', '25087078', '25133637', '25188341', '25340798', '25367360']


In [28]:
# collect resolved relations
rel_dict = { (pmid, rsid) : set() for (pmid, rsid, phen) in new_rels }
for (pmid, rsid, phen) in new_rels:
    rel_dict[(pmid, rsid)].add(phen)

gold_rel_dict = { (a.paper.pubmed_id, a.snp.rs_id) : set() for a in assocs }
for a in assocs:
    gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)].add(a.phenotype.name)

NameError: name 'assocs' is not defined

First, evaluate recall: how many associations in GWAS central can we recover?

In [None]:
for a in assocs[:500]:
    s1 = gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    s2 = rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), {})
    if len(s1) != 1 or len(s2) != 1:
        print a.paper.pubmed_id, a.snp.rs_id, a.source
        print 'GWC:', gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
        print 'US: ', rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), None)
        print

Second question: can we learn any more SNPs than the ones that are already in GWAS central?

In [None]:
pmids = sorted(list({pmid for pmid, _, _ in new_rels if int(pmid) < 17903297}))

from db.kb import KnowledgeBase
kb = KnowledgeBase()
assocs = [assoc for pmid in pmids for assoc in kb.assoc_by_pmid(pmid) if assoc.source == 'gwas_central']
print len(assocs)

In [None]:
for a in assocs:
    s1 = gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    s2 = rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), {})
    print a.paper.pubmed_id, a.snp.rs_id, a.source
    print 'GWC:', gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    print 'US: ', rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), None)
    print

## Combine with extracted pvalue/rsid relations

In [57]:
pval_rsid_dict = dict()
pval_dict = dict() # combine all of the pvalues for a SNPs in the same document into one set
with open('pval-rsid.raw.tsv') as f:
    for line in f:
        pmid, rsid, table_id, row_id, col_id, pval = line.strip().split('\t')
        pval, table_id, row_id, col_id = float(pval), int(table_id), int(row_id), int(col_id)
        
        if pmid not in pval_rsid_dict: pval_rsid_dict[pmid] = dict()
        key = (rsid, table_id, row_id)
        if key not in pval_rsid_dict[pmid]: pval_rsid_dict[pmid][key] = set()
        pval_rsid_dict[pmid][key].add(pval)
                
        if pmid not in pval_dict: pval_dict[pmid] = dict()
        if rsid not in pval_dict[pmid]: pval_dict[pmid][rsid] = set()
        pval_dict[pmid][rsid].add(pval)

pval_dict0 = {pmid : {rsid : min(pval_dict[pmid][rsid]) for rsid in pval_dict[pmid]} for pmid in pval_dict}
pval_rsid_dict0 = {pmid : {key : min(pval_rsid_dict[pmid][key]) for key in pval_rsid_dict[pmid]} for pmid in pval_rsid_dict}
pval_dict = pval_dict0
pval_rsid_dict = pval_rsid_dict0

Plan. If phen/rsid has been extracted from tables: take its pvalue from pval_rsid_dict.

If not, we assume that paper has only one phenotype and we take the smallest reported pvalue in the paper.

Our goal for now is just to filter phen/rsid relations that have pval<1e-5.

#### Save all relations that are sufficiently small p-values

In [58]:
# preds = learner.predict_wmv(candidates)
predicted_candidates = [c for (c, p) in zip(candidates, preds) if p == 1]

import re
import unicodedata
def _normalize_str(s):
    try:
        s = s.encode('utf-8')
        return s
    except UnicodeEncodeError: 
        pass
    try:
        s = s.decode('utf-8')
        return s
    except UnicodeDecodeError: 
        pass    
    raise Exception()
    
def clean_rsid(rsid):
    return re.sub('/.+', '', rsid)

with open('phen-rsid.table.rel.all.tsv', 'w') as f:
    for c in predicted_candidates:
        pmid = c.span0.context.document.name
        rsid = c.span0.get_span()
        phen = c.span1.get_span()        
        table_id = c.span0.context.table.position
        row_num = c.span0.context.cell.row_num
        col_num = c.span0.context.cell.col_num # of the rsid

        phen = (unravel(pmid, phen, D))
        if isinstance(phen, unicode):
            phen = phen.encode('utf-8')
        
        try:
            pval = pval_rsid_dict[pmid].get((rsid, table_id, row_num), -1)
        except KeyError:
            pval = -1
#             continue
        if pval > 1e-5: continue

        out_str = '{pmid}\t{rsid}\t{phen}\t{pval}\ttable\t{table_id}\t{row}\t{col}\n'.format(
                    pmid=pmid, rsid=clean_rsid(rsid), phen=phen, pval=pval, table_id=table_id, row=row_num, col=col_num)
        f.write(out_str)

In [None]:
print [(c, c.span0.context.cell.row_num, unravel(c.span0.context.document.name, c.span1.get_span(), D)) for c in candidates if c.span0.get_span() == 'rs10500631']

In [42]:
pval_rsid_dict['17903294'].get(('rs10500631', 1, 5), -1)

-1

In [None]:
for x in pval_rsid_dict['17903294']:    
    print x, pval_rsid_dict['17903294'][x]