# Evaluation notebook

Here, we will evaluate the relations extracted in all the other notebooks

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle
import numpy as np

# Extend the module search path so that snorkel and the project code under
# ../src (including ../src/crawler) can be imported by the cells below.
# All paths are relative, so they resolve against the notebook's working directory.
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# Directory passed as `path` to the XML document parser in the next cell.
abstract_dir = '../data/db/papers'

In [9]:
from snorkel.parser import XMLDocParser, HTMLParser, CorpusParser
from extractor.parser import UnicodeXMLTableDocParser, UnicodeTableParser

# Document parser configured on abstract_dir; the document id is taken from the
# text of each <article-id pub-id-type="pmid"> element.
# NOTE(review): text='.//table' presumably restricts the parsed content to
# table elements -- confirm against UnicodeXMLTableDocParser.
xml_parser = UnicodeXMLTableDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

table_parser = UnicodeTableParser()
# NOTE(review): html_parser is not passed to CorpusParser and is not referenced
# by any other cell visible in this notebook.
html_parser = HTMLParser(path='../data/db/papers/')
cp = CorpusParser(xml_parser, table_parser)
# Slow step: the recorded run below took about 23.5 minutes of wall time.
%time corpus = cp.parse_corpus(name='GWAS Corpus')

CPU times: user 14min 49s, sys: 47.4 s, total: 15min 37s
Wall time: 23min 30s


## Loading results from other notebooks

#### Phenotype/rsid relations from tables

In [3]:
table_associations = []
with open('phen-rsid.table.rel.all.tsv') as f:
    for line in f:
        fields = line.strip().split('\t')
        pmid, rsid, phen, pval = fields[:4]
        table_associations.append((pmid, rsid, phen))
        
print len(table_associations), 'loaded, e.g.:'
print table_associations[:5]

1939 loaded, e.g.:
[('17903292', 'rs1158167', 'cst3'), ('17903292', 'rs1712790', 'urinary albumin excretion'), ('17903292', 'rs6977660', 'thyroid stimulating hormone'), ('17903292', 'rs9322817', 'thyroid stimulating hormone'), ('17903292', 'rs10499559', 'thyroid stimulating hormone')]


#### PMID/Phenotype relations extracted from titles/abstracts

In [5]:
text_associations = dict()
with open('phenotypes.extracted.tsv') as f:
    for line in f:
        name, phen = line.strip().split('\t')
        text_associations[name] = phen
        
print len(text_associations), 'loaded, e.g.:'
print text_associations.items()[:5]

589 loaded, e.g.:
[('23738518', 'reading'), ('19056611', 'Type 2 Diabetes'), ('21298047', 'Dependence'), ('23104006', 'endometriosis'), ('20072603', 'Osteoporosis.')]


#### RSID/Pvalue relations extracted from tables

In [6]:
# pval_dict: pmid -> rsid -> set of every p-value read for that pair.
# rsid_dict: pmid -> set of rsids that have at least one p-value not above 1e-5.
rsid_dict = {}
pval_dict = {}
with open('pval-rsid.raw.tsv') as f:
    for line in f:
        pmid, rsid, table_id, row_id, col_id, pval = line.strip().split('\t')
        pval = float(pval)
        table_id, row_id, col_id = int(table_id), int(row_id), int(col_id)

        # Every p-value is recorded, regardless of significance.
        pval_dict.setdefault(pmid, {}).setdefault(rsid, set()).add(pval)

        # Only rows that are not above the threshold mark the rsid as significant.
        if not pval > 1e-5:
            rsid_dict.setdefault(pmid, set()).add(rsid)

#### Singleton RSids

In [51]:
# pmid -> set of rsids listed in the singleton file. Rows are split on any
# whitespace and must have exactly three columns; the middle column (table id)
# is unpacked but not stored.
singleton_rsids = {}
with open('rsids.singletons.all.tsv') as f:
    for line in f:
        pmid, table_id, rsid = line.strip().split()
        singleton_rsids.setdefault(pmid, set()).add(rsid)

17903305


## Construct a list of valid relations

In [6]:
# TODO: load p-values properly, use them to filter results downstream, and do acronym resolution here

In [53]:
import re

def clean(phen):
    """Normalise an extracted phenotype string.

    Removes every run of the characters ``( ) : .``, then drops a leading
    ``"or "`` (only if it is at the very start of the string), and finally
    strips surrounding whitespace.
    """
    # The substitutions are applied in this order; stripping happens last, so
    # an "or " preceded by whitespace is not removed.
    for pattern in ('[)():.]+', '^or '):
        phen = re.sub(pattern, '', phen)
    return phen.strip()

# Build the final list of (pmid, rsid, (global_phenotype, local_phenotype)) tuples
# from three sources, in this order:
#   1. phenotype/rsid pairs found in tables, kept only if the rsid is significant;
#   2. significant rsids that have no table phenotype (local phenotype is '');
#   3. singleton rsids that are not already covered by the significant set.
# A paper without an extracted title/abstract phenotype gets '' as its global
# phenotype in all three cases, and a paper without any significant rsid is
# treated as having an empty set, instead of raising KeyError.
associations = []

for (pmid, rsid, loc_phen) in table_associations:
    if rsid in rsid_dict.get(pmid, ()):
        glob_phen = clean(text_associations.get(pmid, ''))
        associations.append((pmid, rsid, (glob_phen, loc_phen)))

table_rsids = {(pmid, rsid) for pmid, rsid, _ in table_associations}
for pmid in rsid_dict:
    phen = clean(text_associations.get(pmid, ''))
    for rsid in rsid_dict[pmid]:
        if (pmid, rsid) in table_rsids: continue
        associations.append((pmid, rsid, (phen, '')))

for pmid, rsids in singleton_rsids.items():
    phen = clean(text_associations.get(pmid, ''))
    significant = rsid_dict.get(pmid, ())
    for rsid in rsids:
        if rsid in significant: continue
        associations.append((pmid, rsid, (phen, '')))

## Comparing to GWAS central

In [10]:
from db.kb import KnowledgeBase

kb = KnowledgeBase()
assocs = [assoc for doc in corpus.documents for assoc in kb.assoc_by_pmid(doc.name) if assoc.source == 'gwas_central' and assoc.pvalue < 1e-5]

print '%d documents, %d associations' % (len(corpus.documents), len(assocs))

589 documents, 7629 associations


In [57]:
# Predicted relations: (pmid, rsid) -> set of (global_phen, local_phen) pairs.
# The pmid part of the key is whatever `associations` holds (strings read from
# the TSV files).
rel_dict = {}
for (pmid, rsid, phen) in associations:
    rel_dict.setdefault((pmid, rsid), set()).add(phen)

# Gold relations: (pubmed_id, rs_id) -> set of phenotype names. The key uses
# a.paper.pubmed_id as-is; other cells wrap it in str() before looking up
# rel_dict, so the two dictionaries are not keyed identically.
gold_rel_dict = {}
for a in assocs:
    gold_rel_dict.setdefault((a.paper.pubmed_id, a.snp.rs_id), set()).add(a.phenotype.name)

In [65]:
# display directly the results we found
for doc in corpus.documents:
#     continue
    rsids_seen = set()
    for a in assocs:
        if str(a.paper.pubmed_id) != doc.name: continue
        rsids_seen.add(a.snp.rs_id)
        
        print a.paper.pubmed_id, a.snp.rs_id, a.source
        print 'GWC:', gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
        print 'US: ', rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), None)
        print
    
    for (pmid, rsid), phens in rel_dict.items():
        if pmid != doc.name: continue
        if rsid in rsids_seen: continue
        print pmid, rsid, phens
    print '---'
    

17447842 rs11209026 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs11465804 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs1373692 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs10512734 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs4613763 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs1002922 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs1343151 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs4495224 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs348601 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs5743289 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs10889677 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs11209026 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs11465804 gwas_central
GWC: set([u"Crohn's disease"])
US:  None

17447842 rs1373692 gwas_central
G

In [35]:
# PMIDs (as strings) that can be excluded from scoring. In the cells visible here
# the only reference is a commented-out `in BLACKLIST` filter in the scoring loop.
BLACKLIST=['19043545', '18464913', '17997608', '17447842', '19936222', '19503597', '20041166']

#### Error analysis over top 15 documents

Crohn's disease (1st paper): The table is badly mangled, but this could be solved with a specialized extractor that looks at induced cells.

Weight/bmi: The table is only a screenshot

Acronyms at the end: The acronym explanations are in the caption of the table

Some acronyms aren't resolved because an extra letter is added to them, e.g. AXYZ instead of XYZ

### Set up map of GWC to GWASDB phenotypes

In [79]:
out_set = set()
for a in assocs:
    s1 = gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    s2 = rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), None)
    if a.paper.pubmed_id == 23400010: print s1, s2
    if s1 and s2:
        for name1 in s1:
            for name2a, name2b in s2:
                name1 = str(name1)
                score = phen_scores.get((name1, name2a, name2b), '')
                out_set.add('%s\t%s\t%s\t%s\n' % (name1, name2a, name2b, score))

with open('phenotype.mapping.tsv', 'w') as f:                    
    for out_str in out_set:
        f.write(out_str)

set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set([u'Thiazide-induced adverse metabolic effects in hypertensive patients']) None
set(

Legend for the scores in the annotated mapping: 0 = wrong, 1 = acronym, 2 = imprecise, 3 = correct

In [78]:
phen_map = dict()
phen_scores = dict()
with open('phenotype.mapping.annotated.txt') as f:                    
    lines = f.read().split('\r')
    for line in lines:
        fields = line.strip().split('\t')
        if fields[0] == 'Thiazide-induced':
            print fields
        fields = [re.sub('"', '', f) for f in fields]
        try:
            orig_name, glob_name, loc_name, score = fields
        except ValueError:
            if len(fields) == 2:
                orig_name, glob_name = fields
                loc_name = ''
            else:
                orig_name, glob_name, loc_name = fields
            score = 3
        phen_scores[(orig_name, glob_name, loc_name)] = score
        if score in ('2', '3'):
            key = (glob_name, loc_name)
            if key not in phen_map: phen_map[key] = set()
            phen_map[key].add(orig_name)

### Look at results!

In [67]:
import re

def paper_contains(pmid, quote, paper_dir='../data/db/papers', whole_word=True):
    """Return True if the text `quote` occurs in the XML file of paper `pmid`.

    Args:
        pmid: PubMed id of the paper, as an int or a string; the file read is
            ``<paper_dir>/<pmid>.xml``.
        quote: literal text to look for (regex metacharacters are escaped).
        paper_dir: directory holding the paper XML files.
        whole_word: if True (default), a match must not be directly preceded or
            followed by an ASCII letter or digit, so that e.g. 'rs123' is not
            reported as present just because the paper mentions 'rs1234'.

    Returns:
        bool

    Raises:
        IOError: if the paper file cannot be opened.
    """
    import os  # local import: `os` is not imported at the top of the notebook

    with open(os.path.join(paper_dir, '%s.xml' % pmid)) as f:
        txt = f.read()

    pattern = re.escape(quote)
    if whole_word:
        pattern = '(?<![0-9A-Za-z])' + pattern + '(?![0-9A-Za-z])'
    return re.search(pattern, txt) is not None

In [85]:
# display directly the results we found
n_correct = 0
n_imprecise = 0
n_missing = 0
n_wrong = 0
n_total = 0
n_new = 0
invalid_per_paper = { a.paper.pubmed_id : 0 for a in assocs }
seen = set()
for a in assocs:
#     if str(a.paper.pubmed_id) in BLACKLIST: continue
    if (a.paper.pubmed_id, a.snp.rs_id) in seen: continue
    seen.add((a.paper.pubmed_id, a.snp.rs_id))
    gold_phen_set = gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    pred_phen_set = rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), {})
    tmp = list(pred_phen_set)
    if tmp and tmp[0][1].startswith('Age at'):
        n_correct += 1
        continue
    pred_phen_set2 = {m for p in pred_phen_set for m in phen_map.get(p, {})}
    if a.paper.pubmed_id == 23583980:
        print a.snp.rs_id, gold_phen_set, pred_phen_set, pred_phen_set2
    if gold_phen_set & pred_phen_set2: 
        n_correct += 1
        n_total += 1
    else:
#         if True:
        if paper_contains(a.paper.pubmed_id, a.snp.rs_id):
            if pred_phen_set2:
                n_wrong += 1
            else:
                n_missing += 1
            n_total += 1
            invalid_per_paper[a.paper.pubmed_id] += 1
#         print gold_phen_set
#         print pred_phen_set
#         print pred_phen_set2        
#         print

print n_correct, n_wrong, n_missing, n_total

print 'Biggest omissions:', sorted(invalid_per_paper.items(), key=lambda x: x[1], reverse=True)[:10]


rs7934606 set([u'Interstitial lung disease']) set([('pneumonias', '')]) set([])
rs2076295 set([u'Interstitial lung disease']) set([('pneumonias', '')]) set([])
rs2736100 set([u'Interstitial lung disease']) set([('pneumonias', '')]) set([])
rs1981997 set([u'Interstitial lung disease']) set([('pneumonias', '')]) set([])
rs6793295 set([u'Interstitial lung disease']) set([('pneumonias', '')]) set([])
rs12610495 set([u'Interstitial lung disease']) set([('pneumonias', '')]) set([])
rs2034650 set([u'Interstitial lung disease']) set([('pneumonias', '')]) set([])
rs2609255 set([u'Interstitial lung disease']) set([('pneumonias', '')]) set([])
rs1278769 set([u'Interstitial lung disease']) set([('pneumonias', '')]) set([])
rs4727443 set([u'Interstitial lung disease']) set([('pneumonias', '')]) set([])
rs11191865 set([u'Interstitial lung disease']) set([('pneumonias', '')]) set([])
rs1379326 set([u'Interstitial lung disease']) {} set([])
rs7005380 set([u'Interstitial lung disease']) {} set([])
rs24

#### Notes from all papers:

20195266: Add outcome column title
19197348: (Kosrae islanders) No global phenotype, need to handle phenotype headers
23382691: "Shows pleiotropy with cancer and...". "glycosylation" in phenotype dict but fails because of "n-" prefix. Immunoglobulin not in dict, but igg is, but is not picked up because too short. Need to add "associated with" as a feature, and handle dashed terms, and if still fails, add all phenotypes from title. Could take out long words (>12 chars?) from long phenotypes in dict for individual matching.
23455636: RSID/Allele format that you need to parse
22829776: Incorrect global phenotype. But "sex hormone-binding globulin" is in dict, but doesn't get matched...
22832960: Should be correct with new phenotype
23400010: All the SNPs are in text!!
22760553: citalopram in dict but connected to a dash in title. Add "side effects" to matched keywords?
17903296: Incorrect global phenotype, but now should be "bone mineral density", although it's still wrong. "bone mass and geometry" hard to match. Sub-phenotype acronyms sometimes not correctly resolved. In many cases, GWAS central is wrong, e.g. rs10514345, "lumbar spine bmd" is correct, not "Hip geometry, neck section modulus".
23583980: Why is "fibrosis" not matched? Is it the period? No: it's in the shortened dict, but not in the full one.

Idea: Match spans if they contain the same words, but potentially in a different order

#### Notes from first 100:

470 fully correct

19197348: Phenotype headers
18159244: Completely messed-up way of writing down p-values
17903302: Acronyms
17903305: Phenotype headers, errors (prostate vs breast cancer)
19043545: Mentions in text
17447842: Mentions in text (also: messed-up way of writing the table; induced cells should help)
17658951: Extraction from text
17903296: e.g. Neck phenotype acronym has an undocumented male/female suffix
19557161: In text, and one table entry strangely doesn't seem to have been found

In [54]:
paper_id = '19557161'
for a in assocs:
    if str(a.paper.pubmed_id) == paper_id and paper_contains(a.paper.pubmed_id, a.snp.rs_id):
        print a.phenotype.name, a.snp.rs_id, rel_dict.get((paper_id, a.snp.rs_id), None)

Waist-hip ratio rs2605100 None
Waist circumference rs987237 None
Waist circumference rs545854 None
Waist circumference rs6429082 None


In [167]:
# Export the final associations as TSV:
#   pmid, rsid, global phenotype, local phenotype ('-' if empty), smallest p-value
with open('associations.tsv', 'w') as f:
    for (pmid, rsid, (glob_phen, loc_phen)) in associations:
        # Associations that come from the singleton list may have no p-value
        # row at all; those are written as 'nan' instead of raising KeyError.
        pvals = pval_dict.get(pmid, {}).get(rsid)
        pval = min(pvals) if pvals else float('nan')
        loc_phen = loc_phen or '-'

        # %g keeps small p-values readable (e.g. 3e-08); %f would print them
        # as 0.000000.
        f.write('%s\t%s\t%s\t%s\t%g\n' % (pmid, rsid, glob_phen, loc_phen, pval))