# Phenotype/SNP relation extraction from tables

Here we will demo the module that parses tables in papers and extracts relations between SNPs and phenotypes (in cases in which the paper discusses multiple phenotypes).

## Preparations

We start by configuring Jupyter and setting up our environment.

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle
import numpy as np
import sqlalchemy

# set the paths to snorkel and gwasdb
sys.path.append('../snorkel-tables')
sys.path.append('../src')
sys.path.append('../src/crawler')

# set up the directory with the input papers
abstract_dir = '../data/db/papers'

# set up matplotlib
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

# create a Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

### Load corpus

We load our usual corpus of GWAS papers.

In [2]:
from extractor.parser import UnicodeXMLTableDocParser
from snorkel.parser import XMLMultiDocParser

# Multi-document XML parser over the papers stored in `abstract_dir`.
# The `doc`, `text` and `id` arguments look like XPath expressions: each child
# of the root is a document, its <table> elements are the content to parse,
# and the article-id with pub-id-type="pmid" names the document
# -- presumably; confirm against XMLMultiDocParser.
# NOTE(review): UnicodeXMLTableDocParser is imported but not used in this cell.
xml_parser = XMLMultiDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

In [3]:
from snorkel.parser import CorpusParser, OmniParser
from snorkel.models import Corpus

# parses tables into rows, cols, cells...
table_parser = OmniParser(timeout=1000000)

try:
    corpus = session.query(Corpus).filter(Corpus.name == 'GWAS Table Corpus').one()
except:
    cp = CorpusParser(xml_parser, table_parser)
    %time corpus = cp.parse_corpus(name='GWAS Table Corpus', session=session)
    session.add(corpus)
    session.commit()

print 'Loaded corpus of %d documents' % len(corpus)

Loaded corpus of 589 documents


## Candidate extraction

### Define candidate matchers

#### RSid matcher

In [4]:
from snorkel.matchers import RegexMatchSpan
# Pattern: "rs" followed by digits, then zero or more "/<alleles>" suffixes of
# one or two nucleotide letters (A, T, C, G), anchored at the end of the span,
# e.g. "rs123" or "rs123/A".
rsid_matcher = RegexMatchSpan(rgx=r'rs\d+(/[ATCG]{1,2})*$')

#### Phenotype matchers

The first matcher checks if we are in a column whose header labels it as a phenotype column.

In [5]:
from snorkel.matchers import CellNameDictionaryMatcher

phen_words = ['trait', 'phenotype', 'outcome'] # words that denote phenotypes
# Configured on the column axis with case-insensitive matching against
# `phen_words`.
# NOTE(review): n_max=3 presumably bounds the n-gram length compared against
# the dictionary -- confirm against CellNameDictionaryMatcher.
phen_matcher = CellNameDictionaryMatcher(axis='col', d=phen_words, n_max=3, ignore_case=True)

The next matcher matches known phenotype names in cells that span an entire axis of the table.

In [6]:
from snorkel.matchers import DictionaryMatch
from db.kb import KnowledgeBase
from extractor.util import make_ngrams

# collect phenotype list
kb = KnowledgeBase()
# efo phenotypes
efo_phenotype_list0 = kb.get_phenotype_candidates(source='efo', peek=True) # TODO: remove peeking
efo_phenotype_list = list(make_ngrams(efo_phenotype_list0))
# mesh diseases
mesh_phenotype_list0 = kb.get_phenotype_candidates(source='mesh')
mesh_phenotype_list = list(make_ngrams(mesh_phenotype_list0))
# mesh chemicals
# NOTE(review): unlike the two lists above, the chemical names are used as-is
# and are not passed through make_ngrams -- confirm this is intended.
chem_phenotype_list = kb.get_phenotype_candidates(source='chemical')

# one dictionary built from all three sources; matching ignores case and is
# configured with the 'porter' stemmer
phenotype_names = efo_phenotype_list + mesh_phenotype_list + chem_phenotype_list
phen_name_matcher = DictionaryMatch(d=phenotype_names, ignore_case=True, stemmer='porter')

### Relation extraction

In [7]:
from snorkel.candidates import CandidateExtractor
from snorkel.throttlers import AlignmentThrottler, SeparatingSpanThrottler

# create a Snorkel class for the relation we will extract
from snorkel.models import candidate_subclass
RsidPhenRel = candidate_subclass('RsidPhenRel', ['rsid','phen'])

# define our candidate spaces
from snorkel.candidates import TableNgrams, TableCells, SpanningTableCells
unigrams = TableNgrams(n_max=1)
cells = TableCells()
spanning_cells = SpanningTableCells(axis='row')

# throttlers that restrict which (rsid, phenotype) pairs are kept: the first is
# configured on row alignment, the second on spans separating cells along a column
row_align_filter = AlignmentThrottler(axis='row', infer=True)
sep_span_filter = SeparatingSpanThrottler(align_axis='col')

# the first extractor looks at phenotype names in columns with a header indicating it's a phenotype
ce1 = CandidateExtractor(RsidPhenRel, [unigrams, cells], [rsid_matcher, phen_matcher], throttler=row_align_filter)

# the second extractor pairs rsids with known phenotype names (phen_name_matcher)
# found in row-spanning cells
ce2 = CandidateExtractor(RsidPhenRel, [unigrams, spanning_cells], [rsid_matcher, phen_name_matcher], throttler=sep_span_filter)

# collect the tables of every document; these will be searched for candidates
tables = [table for doc in corpus.documents for table in doc.tables]

We are now ready to perform relation extraction.

In [8]:
from snorkel.models import CandidateSet
# Roll back any uncommitted work on the session, then delete the stored
# candidate set named 'Test 1' (the name used by the single-table experiment
# further down).
session.rollback()
session.query(CandidateSet).filter(CandidateSet.name == 'Test 1').delete()

0

In [8]:
# Run the header-based extractor (ce1) over all tables and store the result as
# the candidate set 'RsidPhenRel Set 1'; %time reports how long it takes.
%time rels1 = ce1.extract(tables, 'RsidPhenRel Set 1', session)
print "%s relations extracted, e.g." % len(rels1)
# show the first ten candidates as a sanity check
for cand in rels1[:10]:
    print cand

CPU times: user 6h 3min 54s, sys: 2h 18min 8s, total: 8h 22min 2s
Wall time: 19h 34min 29s
3424 relations extracted, e.g.
RsidPhenRel(Span("rs7202384", parent=302858, chars=[0,8], words=[0,0]), Span("Mean BMI", parent=302857, chars=[0,7], words=[0,1]))
RsidPhenRel(Span("rs10486301", parent=302774, chars=[0,9], words=[0,0]), Span("Mean BMI", parent=302773, chars=[0,7], words=[0,1]))
RsidPhenRel(Span("rs7533902", parent=303021, chars=[0,8], words=[0,0]), Span("Mean BMI", parent=303020, chars=[0,7], words=[0,1]))
RsidPhenRel(Span("rs2226351", parent=303035, chars=[0,8], words=[0,0]), Span("Mean WC", parent=303034, chars=[0,6], words=[0,1]))
RsidPhenRel(Span("rs711702", parent=302922, chars=[0,7], words=[0,0]), Span("Mean BMI", parent=302921, chars=[0,7], words=[0,1]))
RsidPhenRel(Span("rs2296465", parent=302754, chars=[0,8], words=[0,0]), Span("Mean BMI", parent=302753, chars=[0,7], words=[0,1]))
RsidPhenRel(Span("rs10517461", parent=302901, chars=[0,9], words=[0,0]), Span("Mean WC", pare

In [9]:
# Run the spanning-cell extractor (ce2) over all tables and store the result as
# the candidate set 'RsidPhenRel Set 2'.
# NOTE(review): the recorded run of this cell ended in a "Duplicate candidates"
# ValueError, so rels2 may not exist afterwards.
%time rels2 = ce2.extract(tables, 'RsidPhenRel Set 2', session)
print "%s relations extracted, e.g." % len(rels2)
for cand in rels2[:10]: 
    print cand



ValueError: Duplicate candidates found in Candidate Set (RsidPhenRel Set 2).

Finally, we merge the two sets of candidates into a single set.

In [13]:
from snorkel.models import CandidateSet
rels = CandidateSet(name='Joint RsidPhenRel')

# copy every candidate of the first extractor into the joint set
for c in rels1: rels.append(c)
# NOTE(review): the candidates of the second extractor are currently left out
# (the line below is disabled); the recorded ce2 run failed with a
# "Duplicate candidates" ValueError.
# for c in rels2: rels.add(c)

# persist the joint candidate set
session.add(rels)
session.commit()

The code below lets us manually inspect our extractor on a single table that we hand-picked.

In [10]:
# Pick one document by its name (PubMed id) and inspect only its third table.
# hard_doc = [d for d in corpus.documents if d.name == '17903293'][0]
hard_doc = [d for d in corpus.documents if d.name == '19197348'][0] # spanning phenotype cells
hard_tables = [hard_doc.tables[2]]

# drop the previous 'Test 1' candidate set so the extraction can be re-run
session.rollback()
session.query(CandidateSet).filter(CandidateSet.name == 'Test 1').delete()
# rebuild both extractors with the same configuration as above; only ce2 is
# exercised in this cell
ce1 = CandidateExtractor(RsidPhenRel, [unigrams, cells], [rsid_matcher, phen_matcher], throttler=row_align_filter)
ce2 = CandidateExtractor(RsidPhenRel, [unigrams, spanning_cells], [rsid_matcher, phen_name_matcher], throttler=sep_span_filter)
%time rels_test = ce2.extract(hard_tables, 'Test 1', session)
print len(rels_test)
for rel in rels_test:
    print rel

CPU times: user 48 s, sys: 426 ms, total: 48.4 s
Wall time: 48.8 s
40
RsidPhenRel(Span("rs2222328", parent=68104, chars=[0,8], words=[0,0]), Span("Weight", parent=68152, chars=[0,5], words=[0,0]))
RsidPhenRel(Span("rs6462411", parent=68600, chars=[0,8], words=[0,0]), Span("Thyroid Stimulating Hormone", parent=68585, chars=[0,26], words=[0,2]))
RsidPhenRel(Span("rs1877431", parent=68717, chars=[0,8], words=[0,0]), Span("Thyroid Stimulating Hormone", parent=68585, chars=[0,26], words=[0,2]))
RsidPhenRel(Span("rs10486715", parent=68083, chars=[0,9], words=[0,0]), Span("Waist Circumference", parent=68100, chars=[0,18], words=[0,1]))
RsidPhenRel(Span("rs17718077", parent=68093, chars=[0,9], words=[0,0]), Span("Height", parent=68079, chars=[0,5], words=[0,0]))
RsidPhenRel(Span("rs925488", parent=68664, chars=[0,7], words=[0,0]), Span("Thyroid Stimulating Hormone", parent=68585, chars=[0,26], words=[0,2]))
RsidPhenRel(Span("rs10984516", parent=68737, chars=[0,9], words=[0,0]), Span("Thyroid S

## Learning the correctness of relations

### Creating a gold set

To create a gold set, we save all extracted relations into a csv file. We annotate it manually, and save the result to a second file. It contains pairs of phenotype and rsid strings; if that file exists, we take these as gold truth.

In [38]:
# store relations to annotate
with open('rels.acroynms.unnanotated.tsv', 'w') as f:
    for span_pair in new_candidates:
        doc_id = span_pair.span0.context.document.name
        table_id = span_pair.span0.context.table.position
        row_num = span_pair.span0.context.cell.row_num
        str1 = span_pair.span0.get_span()
        str2 = span_pair.span1.get_span()
        try:
            f.write('%s\t%s\t%d\t%s\t%s\n' % (doc_id, table_id, row_num, str1, str2))
        except:
            continue

In [16]:
# load annotations: maps (doc_id, table_id, rs_id, phen) -> +1 / -1
annotations = dict()
with open('rels.acronyms.annotated.txt') as f:
    text = f.read()
    # splitlines() accepts '\r' (the separator this cell used to split on) as
    # well as '\n' and '\r\n', so the file loads regardless of which tool
    # saved it.
    for line in text.splitlines():
        line = line.strip()
        if not line:
            # blank lines (e.g. after a trailing newline) used to break the
            # six-way unpacking below
            continue
        # the third column is read but not used in the key
        doc_id, table_id, col_n, rs_id, phen, res = line.split('\t')
        # any label other than 1 counts as a negative example
        res = 1 if int(res) == 1 else -1
        annotations[(doc_id, table_id, rs_id, phen)] = res

We will use the labeled examples as our test set.

In [None]:
def c2uid(c):
    """Return the annotation key of candidate `c`.

    The key is the tuple (document name, table position as a string, text of
    the first span, text of the second span), i.e. the same shape as the keys
    of the `annotations` dictionary.

    NOTE(review): this reads `c[i].parent`, while other cells of this notebook
    read `c.span0.context` -- confirm which accessor the installed Snorkel
    version provides.
    """
    doc_id = c[0].parent.document.name
    table_id = str(c[0].parent.table.position)
    str1 = c[0].get_span()
    str2 = c[1].get_span()
    return (doc_id, table_id, str1, str2)


# The rest of the notebook calls this helper as `spair2uid`, a name that is
# not defined anywhere in the visible code; keep it available as an alias.
spair2uid = c2uid

# Split into train and test set
train_c = CandidateSet(name='Training Rels')
test_c = CandidateSet(name='Test Rels')

for c in rels:
    uid = spair2uid(c)
    if uid in annotations:
        test_c.append(c)
        gold_candidates.append(c)
        gold_labels.append(annotations[uid])
    else:
        train_c.append(c)        
        training_candidates.append(c)


print "Training set size: %s" % len(training_candidates)
print "Gold set size: %s" % len(gold_candidates)
print "Positive labels in training set: %s" % len([c for c in training_candidates if annotations.get(spair2uid(c),0)==1])
print "Negative labels in training set: %s" % len([c for c in training_candidates if annotations.get(spair2uid(c),0)==-1])
print "Positive labels in gold set: %s" % len([c for c in gold_candidates if annotations[spair2uid(c)]==1])
print "Negative labels in gold set: %s" % len([c for c in gold_candidates if annotations[spair2uid(c)]==-1])

### Classify correct relations

In [20]:
# substrings checked by LF_bad_word below
bad_words = ['rs number', 'rs id', 'rsid']

# negative LFs
def LF_number(m):
    """Vote -1 when the second span of `m` consists largely of digits.

    Returns -1 if more than 60% of the characters are digits, or if the text
    is longer than 5 characters and more than 40% of them are digits.
    Returns 0 (abstain) otherwise, including for an empty span, which used to
    raise ZeroDivisionError.
    """
    txt = m.span1.get_span()
    if not txt:
        return 0
    frac_num = sum(1 for ch in txt if ch.isdigit()) / float(len(txt))
    # explicit grouping of the former `a and b or c` expression
    if frac_num > 0.6 or (len(txt) > 5 and frac_num > 0.4):
        return -1
    return 0

def LF_bad_phen_mentions(m):
    """Vote -1 when no short column-aligned phrase mentions a phenotype word.

    Abstains (0) when the cell of the second span spans a row, or when the
    cells aligned with it along the column axis contain no phrases.  Otherwise
    it collects the aligned phrases in which at least one space-separated word
    is accepted by `phen_matcher._f_span`, and returns 0 if any of those is at
    most 25 characters long, -1 if none is.
    """
    phen_cell = m.span1.context.cell
    if phen_cell.spans('row'):
        return 0

    column_phrases = []
    for aligned in phen_cell.aligned_cells(axis='col', induced=True):
        column_phrases.extend(aligned.phrases)
    if not column_phrases:
        return 0

    def mentions_phenotype(phrase):
        return any(phen_matcher._f_span(token) for token in phrase.text.split(' '))

    phenotype_phrases = [p for p in column_phrases if mentions_phenotype(p)]
    for p in phenotype_phrases:
        if len(p.text) <= 25:
            return 0
    return -1

def LF_bad_word(m):
    """Vote -1 when the second span contains an entry of `bad_words`, else 0.

    The check is a case-sensitive substring test.
    """
    phen_text = m.span1.get_span()
    for bad in bad_words:
        if bad in phen_text:
            return -1
    return 0

# NOTE(review): LF_bad_word is defined above but is not part of this list.
LF_tables_neg = [LF_number, LF_bad_phen_mentions]

# positive LFs
def LF_no_neg(m):
    """Vote +1 when every function in LF_tables_neg abstains on `m`, else 0."""
    for negative_lf in LF_tables_neg:
        if negative_lf(m):
            return 0
    return +1

LF_tables_pos = [LF_no_neg]

# all labeling functions, negative ones first
LF_tables = LF_tables_neg + LF_tables_pos

In [44]:
from snorkel.snorkel import TrainingSet
from snorkel.features import NgramFeaturizer

# Apply the labeling functions in LF_tables to the training candidates and
# featurize them.
# NOTE(review): TableNgramPairFeaturizer is not imported in the visible part
# of this notebook (only NgramFeaturizer is, and it is unused here) -- confirm
# where it is defined.
training_set = TrainingSet(training_candidates, LF_tables, featurizer=TableNgramPairFeaturizer())

Applying LFs...
Featurizing...
Building feature index...
Extracting features...
0/88256
5000/88256
10000/88256
15000/88256
20000/88256
25000/88256
30000/88256
35000/88256
40000/88256
45000/88256
50000/88256
55000/88256
60000/88256
65000/88256
70000/88256
75000/88256
80000/88256
85000/88256
LF Summary Statistics: 3 LFs applied to 2256 candidates
------------------------------------------------------------
Coverage (candidates w/ > 0 labels):		100.00%
Overlap (candidates w/ > 1 labels):		11.26%
Conflict (candidates w/ conflicting labels):	0.00%


In [45]:
from snorkel.snorkel import Learner
import snorkel.learning
from snorkel.learning import LogReg
from snorkel.learning_utils import GridSearch

learner = Learner(training_set, model=LogReg())

# Splitting into CV and test set: the first half of the gold candidates (and
# their labels) is held out for testing, the second half drives the grid search
n_half = len(gold_candidates) // 2
test_candidates, cv_candidates = gold_candidates[:n_half], gold_candidates[n_half:]
test_labels, cv_labels = gold_labels[:n_half], gold_labels[n_half:]

# grid over two values each of 'mu' and 'lf_w0'
gs = GridSearch(
    learner,
    ['mu', 'lf_w0'],
    [[1e-5, 1e-7], [1.0, 2.0]],
)
gs_stats = gs.fit(cv_candidates, cv_labels)

Testing mu = 1.00e-05, lf_w0 = 1.00e+00
Begin training for rate=0.01, mu=1e-05
	Learning epoch = 0	Gradient mag. = 0.009294
	Learning epoch = 250	Gradient mag. = 0.016126
	Learning epoch = 500	Gradient mag. = 0.033966
	Learning epoch = 750	Gradient mag. = 0.059514
Final gradient magnitude for rate=0.01, mu=1e-05: 0.060
Applying LFs...
Featurizing...
Testing mu = 1.00e-05, lf_w0 = 2.00e+00
Begin training for rate=0.01, mu=1e-05
	Learning epoch = 0	Gradient mag. = 0.017941
	Learning epoch = 250	Gradient mag. = 0.031757
	Learning epoch = 500	Gradient mag. = 0.060985
	Learning epoch = 750	Gradient mag. = 0.061324
Final gradient magnitude for rate=0.01, mu=1e-05: 0.062
Testing mu = 1.00e-07, lf_w0 = 1.00e+00
Begin training for rate=0.01, mu=1e-07
	Learning epoch = 0	Gradient mag. = 0.009294
	Learning epoch = 250	Gradient mag. = 0.016619
	Learning epoch = 500	Gradient mag. = 0.035820
	Learning epoch = 750	Gradient mag. = 0.059530
Final gradient magnitude for rate=0.01, mu=1e-07: 0.060
Testin

In [46]:
# score the learner on the held-out half of the gold candidates
learner.test_wmv(test_candidates, test_labels)

Applying LFs...
Featurizing...
Test set size:	741
----------------------------------------
Precision:	1.0
Recall:		1.0
F1 Score:	1.0
----------------------------------------
TP: 602 | FP: 0 | TN: 139 | FN: 0


In [47]:
# Debugging dump for document '19197348': for each of its candidates print the
# prediction, the gold label (None when unannotated), the LF votes, and the
# column-aligned phrases together with the words phen_matcher accepts.
# NOTE(review): `preds` is only assigned in a later cell (the line below is
# disabled), and `candidates` and `spair2uid` are not defined in the visible
# part of this notebook.
# preds = learner.predict_wmv(candidates)
acronyms = [spair2uid(c) for (c, p) in zip(candidates, preds) if p == 1]
mislabeled_cand = [(c,p, annotations.get(spair2uid(c), None)) for c, p in zip(candidates, preds)]# if p != annotations.get(spair2uid(c), p)]
for (c,p,g) in mislabeled_cand:
    if c.span0.context.document.name != '19197348': continue
    print c.span0.context.document.name, p, g
    print c.span0.context    
    print c.span0.get_span(), c.span1.get_span()
    txt = c.span1.get_span()
    print [LF(c) for LF in LF_tables]
    # same phrase collection and word test as in LF_bad_phen_mentions
    top_cells = c.span1.context.cell.aligned_cells(axis='col', induced=True)
    top_phrases = [phrase for cell in top_cells for phrase in cell.phrases]
    print top_phrases
    matching_phrases = []
    for phrase in top_phrases:
        print [(word,phen_matcher._f_span(word)) for word in phrase.text.split(' ')]
        if any (phen_matcher._f_span(word) for word in phrase.text.split(' ')):
            matching_phrases.append(phrase)
            print phrase
#         print phrase, [phen_matcher._f_span(word) for word in phrase.text.split(' ')]
    print

19197348 -1.0 None
Phrase('19197348', 2, 15, 0, u'rs6560749')
rs6560749 BMI
[0, 0, 1]
[Phrase('19197348', 2, 0, 0, u'Rank'), Phrase('19197348', 2, 12, 0, u'1'), Phrase('19197348', 2, 23, 0, u'Height'), Phrase('19197348', 2, 24, 0, u'1'), Phrase('19197348', 2, 35, 0, u'2'), Phrase('19197348', 2, 46, 0, u'Waist Circumference'), Phrase('19197348', 2, 47, 0, u'1'), Phrase('19197348', 2, 58, 0, u'2'), Phrase('19197348', 2, 69, 0, u'3'), Phrase('19197348', 2, 80, 0, u'4'), Phrase('19197348', 2, 91, 0, u'5'), Phrase('19197348', 2, 102, 0, u'Weight'), Phrase('19197348', 2, 103, 0, u'1'), Phrase('19197348', 2, 114, 0, u'2'), Phrase('19197348', 2, 125, 0, u'3'), Phrase('19197348', 2, 136, 0, u'Leptin'), Phrase('19197348', 2, 137, 0, u'1'), Phrase('19197348', 2, 148, 0, u'% body fat'), Phrase('19197348', 2, 149, 0, u'1'), Phrase('19197348', 2, 160, 0, u'2'), Phrase('19197348', 2, 171, 0, u'3'), Phrase('19197348', 2, 182, 0, u'4'), Phrase('19197348', 2, 193, 0, u'5'), Phrase('19197348', 2, 204, 0,

Save the results

In [48]:
preds = learner.predict_wmv(candidates)
rels = [(c.span0.context.document.name, c.span0.get_span(), c.span1.get_span()) for (c, p) in zip(candidates, preds) if p == 1]
print len(rels), 'relations extracted, e.g.:'
print rels[:10]

# store relations to annotate
with open('rels.acronyms.extracted.tsv', 'w') as f:
    for doc_id, str1, str2 in rels:
        try:
            out = u'{}\t{}\t{}\n'.format(doc_id, unicode(str1), str2)
            f.write(out.encode("UTF-8"))
        except:
            print 'Error in saving:', str1, str2

Applying LFs...
Featurizing...
2974 relations extracted, e.g.:
[('17903292', u'rs1158167', u'CysC'), ('17903292', u'rs1712790', u'UAE'), ('17903292', u'rs6977660', u'TSH'), ('17903292', u'rs9322817', u'TSH'), ('17903292', u'rs10499559', u'TSH'), ('17903292', u'rs9305354', u'UAE'), ('17903292', u'rs2145231', u'CysC'), ('17903292', u'rs723464', u'UAE'), ('17903292', u'rs2113379', u'UAE'), ('17903292', u'rs2839235', u'GFR')]


## Resolve acronyms based on ones extracted earlier

In [49]:
from extractor.dictionary import Dictionary, unravel

# load the previously extracted acronym definitions from disk
D = Dictionary()
D.load('acronyms.extracted.all.tsv')
print len(D), 'definitions loaded'

326 definitions loaded


Use dictionary to resolve acronyms

In [50]:
# same (doc_id, rs_id, phenotype) triples as `rels`, with each phenotype string
# passed through unravel() together with its document id and the dictionary D
new_rels = [ (doc_id, rs_id, unravel(doc_id, phen, D)) for doc_id, rs_id, phen in rels ]

## Evaluate extracted relations

Let's first evaluate the recall w.r.t. GWAS Central

In [51]:
for doc in corpus.documents:
    assocs = [assoc for assoc in kb.assoc_by_pmid(doc.name) if assoc.source == 'gwas_central' and assoc.pvalue < 1e-5]
    print doc.name, len(assocs), len([(pmid, rsid, phen) for pmid, rsid, phen in new_rels if pmid == doc.name])
    

17447842 22 0
17658951 26 0
17684544 32 0
17903292 14 90
17903293 70 79
17903294 42 100
17903295 38 190
17903296 70 141
17903297 50 108
17903298 4 101
17903300 12 52
17903301 20 177
17903302 27 111
17903303 16 111
17903304 6 103
17903305 17 5
17903306 4 71
17903307 21 87
17903308 6 34
17997608 42 0
18159244 15 0
18262040 27 0
18282107 4 0
18369459 3 0
18455228 4 0
18464913 48 0
18483556 14 0
18604267 4 0
18776929 2 0
18823527 4 0
18840781 5 0
18846228 4 0
18941528 2 0
19043545 16 6
19056611 8 0
19081515 4 0
19096518 4 0
19116933 2 0
19122664 12 0
19132087 4 0
19169254 8 0
19197348 33 68
19219042 2 0
19247474 21 0
19300482 4 0
19300499 4 0
19300500 2 0
19304780 2 0
19305408 13 0
19343178 20 0
19359265 2 0
19401414 7 0
19412175 2 0
19412176 2 0
19421330 10 0
19430480 50 0
19430483 9 25
19448621 6 10
19454037 4 0
19503088 5 0
19503597 50 0
19557161 4 3
19557197 7 0
19570815 5 0
19571809 76 0
19578366 3 0
19587794 4 0
19597492 3 0
19609347 36 12
19651812 2 0
19668339 21 0
19714205 2 0
1971

In [52]:
# show every resolved relation extracted from document '17903305'
print ([(pmid, rsid, phen) for pmid, rsid, phen in new_rels if pmid == '17903305'])

[('17903305', u'rs905883', u'Breast cancer'), ('17903305', u'rs7564590', u'Breast cancer'), ('17903305', u'rs7558615', u'Breast cancer'), ('17903305', u'rs9325782', u'Prostate cancer'), ('17903305', u'rs2410373', u'Prostate cancer')]


In [53]:
pmids = sorted(list({pmid for pmid, _, _ in new_rels}))

from db.kb import KnowledgeBase
kb = KnowledgeBase()
assocs = [assoc for pmid in pmids for assoc in kb.assoc_by_pmid(pmid) if assoc.source == 'gwas_central' and assoc.pvalue < 1e-5]
print len(pmids), len(assocs)

77 573


In [54]:
# list the ids of the documents for which at least one relation was extracted
print pmids

['17903292', '17903293', '17903294', '17903295', '17903296', '17903297', '17903298', '17903300', '17903301', '17903302', '17903303', '17903304', '17903305', '17903306', '17903307', '17903308', '19043545', '19197348', '19430483', '19448621', '19557161', '19609347', '19721433', '19820699', '20066028', '20195266', '20395239', '20463881', '20526338', '20548944', '20585627', '20694148', '20838585', '20921969', '20927387', '21203500', '21347282', '21386085', '21483430', '21483845', '21552555', '21738479', '21738480', '21738491', '21931564', '22216198', '22291609', '22509378', '22558069', '22589738', '22832964', '22911880', '23028342', '23118974', '23251661', '23408906', '23696099', '23704328', '23754948', '23836780', '23935956', '23966867', '24324551', '24347629', '24376456', '24379826', '24386095', '24586186', '24886709', '24892410', '24903457', '24945404', '25087078', '25133637', '25188341', '25340798', '25367360']


In [28]:
# collect resolved relations
rel_dict = { (pmid, rsid) : set() for (pmid, rsid, phen) in new_rels }
for (pmid, rsid, phen) in new_rels:
    rel_dict[(pmid, rsid)].add(phen)

gold_rel_dict = { (a.paper.pubmed_id, a.snp.rs_id) : set() for a in assocs }
for a in assocs:
    gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)].add(a.phenotype.name)

NameError: name 'assocs' is not defined

First, evaluate recall: how many associations in GWAS Central can we recover?

In [None]:
for a in assocs[:500]:
    s1 = gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    s2 = rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), {})
    if len(s1) != 1 or len(s2) != 1:
        print a.paper.pubmed_id, a.snp.rs_id, a.source
        print 'GWC:', gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
        print 'US: ', rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), None)
        print

Second question: can we learn any more SNPs than the ones that are already in GWAS Central?

In [None]:
pmids = sorted(list({pmid for pmid, _, _ in new_rels if int(pmid) < 17903297}))

from db.kb import KnowledgeBase
kb = KnowledgeBase()
assocs = [assoc for pmid in pmids for assoc in kb.assoc_by_pmid(pmid) if assoc.source == 'gwas_central']
print len(assocs)

In [None]:
for a in assocs:
    s1 = gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    s2 = rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), {})
    print a.paper.pubmed_id, a.snp.rs_id, a.source
    print 'GWC:', gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    print 'US: ', rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), None)
    print

## Combine with extracted pvalue/rsid relations

In [57]:
# pmid -> (rsid, table_id, row_id) -> p-values reported for that table row
pval_rsid_dict = dict()
# pmid -> rsid -> all p-values reported for that SNP anywhere in the document
pval_dict = dict()
with open('pval-rsid.raw.tsv') as f:
    for line in f:
        pmid, rsid, table_id, row_id, col_id, pval = line.strip().split('\t')
        pval = float(pval)
        table_id = int(table_id)
        row_id = int(row_id)
        col_id = int(col_id)

        key = (rsid, table_id, row_id)
        pval_rsid_dict.setdefault(pmid, dict()).setdefault(key, set()).add(pval)
        pval_dict.setdefault(pmid, dict()).setdefault(rsid, set()).add(pval)

# collapse every set of p-values to its minimum
pval_dict0 = {
    doc: {snp: min(values) for snp, values in per_snp.items()}
    for doc, per_snp in pval_dict.items()
}
pval_rsid_dict0 = {
    doc: {row_key: min(values) for row_key, values in per_row.items()}
    for doc, per_row in pval_rsid_dict.items()
}
pval_dict = pval_dict0
pval_rsid_dict = pval_rsid_dict0

Plan. If phen/rsid has been extracted from tables: take its pvalue from pval_rsid_dict.

If not, we assume that paper has only one phenotype and we take the smallest reported pvalue in the paper.

Our goal for now is just to filter phen/rsid relations that have pval<1e-5.

#### Save all relations that have sufficiently small p-values

In [58]:
# keep the candidates predicted positive; reuses `preds` from the earlier cell
# (the recomputation below is disabled)
# preds = learner.predict_wmv(candidates)
predicted_candidates = [c for (c, p) in zip(candidates, preds) if p == 1]

import re
import unicodedata
def _normalize_str(s):
    try:
        s = s.encode('utf-8')
        return s
    except UnicodeEncodeError: 
        pass
    try:
        s = s.decode('utf-8')
        return s
    except UnicodeDecodeError: 
        pass    
    raise Exception()
    
def clean_rsid(rsid):
    """Drop the first '/' and everything after it from `rsid`.

    A '/' that is the very last character is left in place, because the
    pattern requires at least one character after it.
    """
    allele_suffix = re.compile('/.+')
    return allele_suffix.sub('', rsid)

# Write one row per predicted relation: pmid, cleaned rsid, resolved phenotype,
# p-value, the literal 'table', and the table/row/column position of the rsid.
with open('phen-rsid.table.rel.all.tsv', 'w') as f:
    for c in predicted_candidates:
        rsid_context = c.span0.context
        pmid = rsid_context.document.name
        rsid = c.span0.get_span()
        table_id = rsid_context.table.position
        row_num = rsid_context.cell.row_num
        col_num = rsid_context.cell.col_num # of the rsid

        # resolve the phenotype text through the dictionary D and make sure it
        # is a byte string before formatting
        phen = unravel(pmid, c.span1.get_span(), D)
        if isinstance(phen, unicode):
            phen = phen.encode('utf-8')

        # -1 marks "no p-value found" (unknown document or unknown table row);
        # such relations pass the threshold test below and are written out
        pval = pval_rsid_dict.get(pmid, {}).get((rsid, table_id, row_num), -1)
        if pval > 1e-5:
            continue

        f.write('{pmid}\t{rsid}\t{phen}\t{pval}\ttable\t{table_id}\t{row}\t{col}\n'.format(
            pmid=pmid,
            rsid=clean_rsid(rsid),
            phen=phen,
            pval=pval,
            table_id=table_id,
            row=row_num,
            col=col_num,
        ))

In [None]:
# spot check: every candidate whose rsid text is 'rs10500631', with its row
# number and its resolved phenotype
print [(c, c.span0.context.cell.row_num, unravel(c.span0.context.document.name, c.span1.get_span(), D)) for c in candidates if c.span0.get_span() == 'rs10500631']

In [42]:
# look up the p-value stored for rs10500631 in table 1, row 5 of document
# '17903294'; -1 means no entry exists for that key
pval_rsid_dict['17903294'].get(('rs10500631', 1, 5), -1)

-1

In [None]:
# dump every (rsid, table, row) key and its minimum p-value for document '17903294'
for x in pval_rsid_dict['17903294']:    
    print x, pval_rsid_dict['17903294'][x]