# Evaluation notebook

Here, we will evaluate the relations extracted in all the other notebooks

In [9]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project sources take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
import cPickle

# Extend the import path with the local checkouts. Later cells import
# `snorkel.parser`, `extractor.parser` and `db.kb`; presumably these resolve
# from the directories below (confirm against the repository layout).
# These are relative paths, so the notebook must be run from its own directory.
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# Directory passed as `path` to the XML document parser in the parsing cell.
abstract_dir = '../data/db/papers'

# Render matplotlib figures inline and set the default figure size to
# 12 x 4 (width, height). The rcParams line is kept after the
# `%matplotlib inline` magic so the size setting is applied last.
import numpy as np
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

In [39]:
# NOTE(review): XMLDocParser is imported but not referenced in this cell.
from snorkel.parser import XMLDocParser, HTMLParser, CorpusParser
from extractor.parser import UnicodeXMLTableDocParser, UnicodeTableParser

# Document parser over the files under `abstract_dir` (defined in the setup
# cell). The `doc`, `text` and `id` arguments look like XPath expressions:
# presumably they select each document node, its <table> content, and its
# PubMed id respectively -- confirm against UnicodeXMLTableDocParser.
xml_parser = UnicodeXMLTableDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

table_parser = UnicodeTableParser()
# NOTE(review): html_parser is not passed to CorpusParser and is not used by
# any cell visible here. Its path repeats the directory in `abstract_dir`,
# with a trailing slash.
html_parser = HTMLParser(path='../data/db/papers/')
# max_docs=15 caps the number of parsed documents; the comparison cell below
# reports "15 documents" accordingly.
cp = CorpusParser(xml_parser, table_parser, max_docs=15)
# %time prints the CPU and wall-clock time of the parse.
%time corpus = cp.parse_corpus(name='GWAS Corpus')

CPU times: user 50.2 s, sys: 2.97 s, total: 53.2 s
Wall time: 1min 16s


## Loading results from other notebooks

#### Phenotype/rsid relations from tables

In [59]:
# Collect (pmid, rsid, phenotype) triples from the table-extraction results.
# Each line is tab-separated; only its first four columns are consumed, and
# the fourth (the p-value) is read but not stored.
table_associations = []
with open('phen-rsid.table.rel.tsv') as f:
    for line in f:
        pmid, rsid, phen, pval = line.strip().split('\t')[:4]
        table_associations += [(pmid, rsid, phen)]

# Summary: the number of triples, followed by the first five as a sample.
print('%d loaded, e.g.:' % len(table_associations))
print(table_associations[:5])

857 loaded, e.g.:
[('17903292', 'rs1158167', 'cystatin-c'), ('17903292', 'rs1712790', 'urinary albumin excretion'), ('17903292', 'rs6977660', 'thyroid stimulating hormone'), ('17903292', 'rs9322817', 'thyroid stimulating hormone'), ('17903292', 'rs10499559', 'thyroid stimulating hormone')]


#### PMID/Phenotype relations extracted from titles/abstracts

In [48]:
# Map each key in the first column (the sample output shows PubMed ids) to the
# phenotype string extracted from the paper's title/abstract.
# Every line must hold exactly two tab-separated columns; a later line with
# the same key overwrites the earlier entry.
text_associations = {}
with open('phenotypes.extracted.tsv') as f:
    for line in f:
        columns = line.strip().split('\t')
        name, phen = columns
        text_associations[name] = phen

# Summary: the number of entries, followed by five of them as a sample.
print('%d loaded, e.g.:' % len(text_associations))
print(list(text_associations.items())[:5])

100 loaded, e.g.:
[('20081856', 'major mood disorders'), ('19056611', 'Type 2 Diabetes'), ('20081858', 'glucose homeostasis|type 2 diabetes'), ('19503597', 'Uric Acid Concentrations.'), ('19734902', "Alzheimer's disease,")]


#### RSID/Pvalue relations extracted from tables

In [50]:
# Group rsids by paper: rsid_dict maps pmid -> set of rsids taken from the
# raw p-value table rows.
rsid_dict = {}
with open('pval-rsid.raw.tsv') as f:
    for line in f:
        # Six tab-separated columns are expected on every line.
        pmid, rsid, table_id, row_id, col_id, pval = line.strip().split('\t')
        # Convert the numeric columns; a malformed value raises ValueError.
        pval, table_id, row_id, col_id = (
            float(pval), int(table_id), int(row_id), int(col_id))

        # Drop rows whose p-value exceeds 1e-5. Written as a negated ">" so
        # that a NaN p-value is treated exactly as before (it is kept).
        if not (pval > 1e-5):
            rsid_dict.setdefault(pmid, set()).add(rsid)

## Construct a list of valid relations

In [None]:
# TODO: load p-values properly, use them to filter results downstream, and do acronym resolution here

In [61]:
# Build the final list of (pmid, rsid, phenotype) relations.
#
# Papers that already have table-derived relations keep those as they are.
# For every other paper present in `rsid_dict`, each of its rsids is paired
# with the single phenotype string extracted from the paper's title/abstract.
table_pmids = {pmid for pmid, _, _ in table_associations}

# Work on a copy so that extending `associations` does not also grow
# `table_associations`, which should keep holding only the table results.
associations = list(table_associations)

for pmid in rsid_dict:
    if pmid in table_pmids: continue
    # Raises KeyError if no phenotype was extracted from text for this paper.
    phen = text_associations[pmid]
    # Emit one relation per rsid of *this* paper. The previous code appended a
    # single tuple using whatever `rsid` was left over from an earlier cell's
    # loop. Sorting only makes the output order reproducible.
    for rsid in sorted(rsid_dict[pmid]):
        associations.append((pmid, rsid, phen))

## Comparing to GWAS central

In [40]:
from db.kb import KnowledgeBase

kb = KnowledgeBase()

# Gather the reference associations for every parsed document: for each
# document, query the knowledge base by the document name and keep only the
# entries whose source is 'gwas_central' and whose p-value is below 1e-5.
assocs = []
for doc in corpus.documents:
    for assoc in kb.assoc_by_pmid(doc.name):
        if assoc.source == 'gwas_central' and assoc.pvalue < 1e-5:
            assocs.append(assoc)

print('%d documents, %d associations' % (len(corpus.documents), len(assocs)))

15 documents, 308 associations


In [62]:
# Index our extracted relations: (pmid, rsid) -> set of phenotype strings.
rel_dict = {}
for (pmid, rsid, phen) in associations:
    rel_dict.setdefault((pmid, rsid), set()).add(phen)

# Index the reference associations the same way, keyed by the paper's
# pubmed_id and the SNP's rs_id, with the phenotype names as values.
gold_rel_dict = {}
for a in assocs:
    gold_key = (a.paper.pubmed_id, a.snp.rs_id)
    gold_rel_dict.setdefault(gold_key, set()).add(a.phenotype.name)

In [64]:
# TODO: why is Crohn's disease missing from our results? Why are weight/BMI
# etc. not extracted from the tables? Why are the trailing acronyms not resolved?
#
# For every reference association, print the phenotypes GWAS Central lists for
# its (pmid, rsid) pair ("GWC") next to the phenotypes we extracted for the
# same pair ("US"); "US" shows None when we extracted nothing for that pair.
for a in assocs:
    gold_phens = gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    # rel_dict is keyed by the PMID strings read from the TSV files, so the
    # knowledge-base id is converted with str() before the lookup.
    our_phens = rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), None)

    print('%s %s %s' % (a.paper.pubmed_id, a.snp.rs_id, a.source))
    print('GWC: %s' % (gold_phens,))
    print('US:  %s' % (our_phens,))
    print('')

17447842 rs11209026 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs11465804 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs1373692 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs10512734 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs4613763 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs1002922 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs1343151 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs4495224 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs348601 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs5743289 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs10889677 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17658951 rs9930506 gwas_central
GWC: set([u'Hip circumference'])
US:  None

17658951 rs8050136 gwas_central
GWC: set([u'Hip circumference'])
US:  None

17658951 rs9940128 gwas_central