# Evaluation notebook

Here, we will evaluate the relations extracted in all the other notebooks

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle
import numpy as np

# import snorkel and gwasdb
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# set up paths
abstract_dir = '../data/db/papers'

In [9]:
from snorkel.parser import XMLDocParser, HTMLParser, CorpusParser
from extractor.parser import UnicodeXMLTableDocParser, UnicodeTableParser

xml_parser = UnicodeXMLTableDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

table_parser = UnicodeTableParser()
html_parser = HTMLParser(path='../data/db/papers/')
cp = CorpusParser(xml_parser, table_parser)
%time corpus = cp.parse_corpus(name='GWAS Corpus')

CPU times: user 14min 49s, sys: 47.4 s, total: 15min 37s
Wall time: 23min 30s


## Loading results from other notebooks

#### Phenotype/rsid relations from tables

In [211]:
phen_table_associations = dict()
with open('phen-rsid.table.rel.all.tsv') as f:
    for line in f:
        fields = line.strip().split('\t')
        pmid, rsid, phen, pval, _, table_id, row_id, col_id = fields
        # note that we assume that an rsid at a givel location will correspond to 1 phenotype
        phen_table_associations[pmid, table_id, row_id, col_id] = (rsid, phen)
        
print len(phen_table_associations), 'loaded, e.g.:'
print phen_table_associations.items()[:5]

2130 loaded, e.g.:
[(('21347282', '1', '3', '1'), ('rs937254', 'HDL-C')), (('23836780', '0', '3', '1'), ('rs3213787', 'Glaucoma')), (('17903302', '4', '31', '2'), ('rs9307847', 'CB-PWV')), (('17903307', '3', '2', '2'), ('rs9325117', 'longitudinal slope of fef mean fev')), (('17903298', '3', '12', '3'), ('rs10510418', 'Fasting insulin'))]


#### PMID/Phenotype relations extracted from titles/abstracts

In [212]:
phen_text_associations = dict()
with open('phenotypes.extracted.tsv') as f:
    for line in f:
        pmid, phen = line.strip().split('\t')
        phen_text_associations[pmid] = phen
        
print len(phen_text_associations), 'loaded, e.g.:'
print phen_text_associations.items()[:5]

589 loaded, e.g.:
[('23738518', 'reading|language'), ('19056611', 'Type 2 Diabetes'), ('21298047', 'European-Origin|Substance Dependence'), ('23104006', 'endometriosis'), ('20072603', 'Osteoporosis')]


#### RSID/Pvalue relations extracted from tables

In [217]:
pval_table_associations = dict()
with open('pval-rsid.raw.tsv') as f:
    for line in f:
        pmid, rsid, table_id, row_id, col_id, pval = line.strip().split('\t')
        pval, table_id, row_id, col_id = float(pval), int(table_id), int(row_id), int(col_id)
        
        key = pmid, table_id, row_id, col_id, rsid
        if key not in pval_table_associations: pval_table_associations[key] = set()
        pval_table_associations[key].add(pval)

print len(pval_table_associations), 'loaded, e.g.:'
print pval_table_associations.items()[:5]        

10681 loaded, e.g.:
[(('22504420', 0, 33, 0, 'rs4790881'), set([0.0, 0.0006, 2e-06])), (('22396660', 1, 6, 1, 'rs11746443'), set([0.98])), (('17903302', 1, 21, 2, 'rs1408263'), set([2.4e-05])), (('17903294', 2, 20, 1, 'rs727979'), set([2.6e-05, 7e-06])), (('17903307', 1, 47, 1, 'rs1409149'), set([9e-06]))]


#### Singleton RSids

In [222]:
singleton_associations = dict()
with open('rsids.singletons.all.tsv') as f:
    for line in f:
        pmid, table_id, row_num, col_num, rsid = line.strip().split()
        singleton_associations[pmid, table_id, row_num, col_num] = rsid

print len(singleton_associations), 'loaded, e.g.:'
print singleton_associations.items()[:5]     

12309 loaded, e.g.:
[(('17903303', '2', '9', '2'), 'rs28207'), (('25742292', '1', '13', '5'), 'rs7919841'), (('17903302', '4', '31', '2'), 'rs9307847'), (('25129146', '0', '2', '0'), 'rs7447927'), (('19096518', '0', '3', '0'), 'rs563694')]


## Construct a list valid relations

In [215]:
# TODO: load pvalues properly, use them to filter stuff downstream, do acroynm resolution here

In [240]:
import re

def clean(phen):
    phen = re.sub('[)():.]+', '', phen)
    phen = re.sub('^or ', '', phen)
    phen = phen.strip()
    return phen

associations = []

# record table associations wth pval < 1e-5 or no pval reported
chosen_rsids = set()
tables_with_pvals = { table_id for 
                      (pmid, table_id, row_id, col_id), (rsid, loc_phen) in phen_table_associations.items() 
                      if (pmid, table_id, row_id, col_id, rsid) in pval_table_associations }
for (pmid, table_id, row_id, col_id), (rsid, loc_phen) in phen_table_associations.items():
    pvals = pval_table_associations.get((pmid, table_id, row_id, col_id, rsid), [-1])
    glob_phen = clean(phen_text_associations[pmid])
    if min(pvals) < 1e-5:
        associations.append((pmid, rsid, (glob_phen, loc_phen)))
        chosen_rsids.add((pmid, table_id, row_id, col_id, rsid))


# record associations with no local phenotype
for (pmid, table_id, row_id, col_id, rsid), pvals in pval_table_associations.items():
    # skip low-pvalue snps and snps that we already added
    if pvals and min(pvals) > 1e-5: continue
    if (pmid, table_id, row_id, col_id, rsid) in chosen_rsids: continue
    
    # append with global phenotype
    phen = clean(phen_text_associations[pmid])
    associations.append((pmid, rsid, (phen, '')))    
    chosen_rsids.add((pmid, table_id, row_id, col_id, rsid))

# # record singletons
# tables_found = {table_id for _, table_id, _, _, _ in pval_table_associations}
# for (pmid, table_id, row_id, col_id), rsid in singleton_associations.items():
#     if (pmid, table_id, row_id, col_id, rsid) in chosen_rsids: continue
#     if table_id in tables_found: continue
#     phen = clean(phen_text_associations[pmid])
#     associations.append((pmid, rsid, (phen, ''))) 

print len(associations), 'associations, e.g.:'
print associations[:5]     

9240 associations, e.g.:
[('21347282', 'rs937254', ('Coronary Heart Disease', 'HDL-C')), ('23836780', 'rs3213787', ('intraocular pressure|glaucoma', 'Glaucoma')), ('17903302', 'rs9307847', ('arterial stiffness|blood pressure', 'CB-PWV')), ('17903307', 'rs9325117', ('pulmonary function', 'longitudinal slope of fef mean fev')), ('17903298', 'rs10510418', ('diabetes-related traits', 'Fasting insulin'))]


## Comparing to GWAS central

In [133]:
from db.kb import KnowledgeBase

kb = KnowledgeBase()
assocs = [assoc for doc in corpus.documents for assoc in kb.assoc_by_pmid(doc.name) if assoc.source == 'gwas_central' and assoc.pvalue < 1e-5]

print '%d documents, %d associations' % (len(corpus.documents), len(assocs))

589 documents, 7629 associations


In [241]:
rel_dict = { (pmid, rsid) : set() for (pmid, rsid, phen) in associations }
for (pmid, rsid, phen) in associations:
    rel_dict[(pmid, rsid)].add(phen)

gold_rel_dict = { (a.paper.pubmed_id, a.snp.rs_id) : set() for a in assocs }
for a in assocs:
    gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)].add(a.phenotype.name)

In [242]:
# display directly the results we found
for doc in corpus.documents:
#     continue
    rsids_seen = set()
    for a in assocs:
        if str(a.paper.pubmed_id) != doc.name: continue
        rsids_seen.add(a.snp.rs_id)
        
#         # print old snps
#         print a.paper.pubmed_id, a.snp.rs_id, a.source
#         print 'GWC:', gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
#         print 'US: ', rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), None)
#         print
    
    # print new snps
    for (pmid, rsid), phens in rel_dict.items():
        if pmid != doc.name: continue
        if rsid in rsids_seen: continue
        print pmid, rsid, phens
    print '---'
    

---
---
---
17903292 rs10502302 set([('endocrine-related traits|kidney', 'Serum creatinine (exam 5)')])
17903292 rs723464 set([('endocrine-related traits|kidney', ''), ('endocrine-related traits|kidney', 'urinary albumin excretion')])
17903292 rs754958 set([('endocrine-related traits|kidney', 'Serum Phosphorous')])
17903292 rs727087 set([('endocrine-related traits|kidney', 'glomerular filtration rate')])
17903292 rs2885618 set([('endocrine-related traits|kidney', 'glomerular filtration rate')])
17903292 rs10495487 set([('endocrine-related traits|kidney', 'Uric acid')])
17903292 rs10486135 set([('endocrine-related traits|kidney', 'glomerular filtration rate')])
17903292 rs10484370 set([('endocrine-related traits|kidney', 'Serum Calcium')])
17903292 rs10512437 set([('endocrine-related traits|kidney', 'glomerular filtration rate')])
17903292 rs10489578 set([('endocrine-related traits|kidney', 'Serum creatinine (exam 6)'), ('endocrine-related traits|kidney', 'glomerular filtration rate (ex

KeyboardInterrupt: 

In [None]:
BLACKLIST=['19043545', '18464913', '17997608', '17447842', '19936222', '19503597', '20041166']

#### Error analysis over top 15 documents

Crohns disease (1st paper): Table is really messed up, but could solve it with a specialized extractor that looks at induced cells.

Weight/bmi: The table is only a screenshot

Acronyms at the end: The acronym explanations are in the caption of the table

Some acroynms aren't resolved because an extra letter is added to them e.g. AXYZ instead of XYZ

### Set up map of GWC to GWASDB phenotypes

In [199]:
out_set = set()
for a in assocs:
    s1 = gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    s2 = rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), None)
    if s1 and s2:
        for name1 in s1:
            for name2a, name2b in s2:
                name1 = str(name1)
                score = phen_scores.get((name1.lower(), name2a.lower(), name2b.lower()), '')
                out_set.add('%s\t%s\t%s\t%s\n' % (name1, name2a, name2b, score))

with open('phenotype.mapping.tsv', 'w') as f:                    
    for out_str in out_set:
        f.write(out_str)

Dictionary legend in 0: wrong, 1: acronym, 2: imprecise, 3: correct

In [200]:
phen_map = dict()
phen_scores = dict()
with open('phenotype.mapping.annotated.txt') as f:                    
    lines = f.read().split('\r')
    for line in lines:
        fields = line.strip().split('\t')
        if fields[0] == 'Thiazide-induced':
            print fields
        fields = [re.sub('"', '', f) for f in fields]
        try:
            orig_name, glob_name, loc_name, score = fields
        except ValueError:
            if len(fields) == 2:
                orig_name, glob_name = fields
                loc_name = ''
            else:
                orig_name, glob_name, loc_name = fields
            score = 3
        phen_scores[(orig_name.lower(), glob_name.lower(), loc_name.lower())] = score
        if score in ('2', '3'):
            key = (glob_name, loc_name)
            if key not in phen_map: phen_map[key] = set()
            phen_map[key].add(orig_name)

### Look at results!

In [118]:
import re

def paper_contains(pmid, quote):
    with open('../data/db/papers/%d.xml' % pmid) as f:
        txt = f.read()
        return True if re.search(quote, txt) else False

In [237]:
# display directly the results we found
n_correct = 0
n_imprecise = 0
n_missing = 0
n_wrong = 0
n_total = 0
n_new = 0
invalid_per_paper = { a.paper.pubmed_id : 0 for a in assocs }
seen = set()
for a in assocs:
#     if str(a.paper.pubmed_id) in BLACKLIST: continue
    if (a.paper.pubmed_id, a.snp.rs_id) in seen: continue
    seen.add((a.paper.pubmed_id, a.snp.rs_id))
    gold_phen_set = gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    pred_phen_set = rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), {})
    tmp = list(pred_phen_set)
    if tmp and tmp[0][1].startswith('Age at'):
        n_correct += 1
        continue
    pred_phen_set2 = {m for p in pred_phen_set for m in phen_map.get(p, {})}
    if gold_phen_set & pred_phen_set2: 
        n_correct += 1
        n_total += 1
    else:
#         if True:
        if paper_contains(a.paper.pubmed_id, a.snp.rs_id):
            if a.paper.pubmed_id == 20195266:
                print a.snp.rs_id, gold_phen_set, pred_phen_set, pred_phen_set2
            if pred_phen_set2:
                n_wrong += 1
            else:
                n_missing += 1
            n_total += 1
            invalid_per_paper[a.paper.pubmed_id] += 1
#         print gold_phen_set
#         print pred_phen_set
#         print pred_phen_set2        
#         print

print n_correct, n_wrong, n_missing, n_total

print 'Biggest omissions:', sorted(invalid_per_paper.items(), key=lambda x: x[1], reverse=True)[:15]


2530 33 395 2954
Biggest omissions: [(23400010, 14), (19197348, 12), (22747683, 9), (22675492, 8), (21533175, 8), (23202124, 7), (22493691, 7), (23903356, 7), (23583978, 7), (23563607, 7), (22216198, 7), (23381943, 7), (24068962, 6), (22961080, 6), (23300278, 5)]


#### Notes from all papers:

19197348: (Korsae islanders) No global phenotype, need to handle phenotype headers. Some not parsed because exact match not in dict, need to search for n-grams
23400010: All the SNPs are in text!!
19305408: Phenotype incorrect? QT Interval vs sudden cardian death. But QT interval is in abstract
22760553: citalopram in dict but connected to a dash in title. Add "side effects" to matched keywords?
22747683: They're in text
22675492: Estradiol gets pushed our because "testosterone," and "testostreone" both present
21533175: can't parse 'Serum dehydroepiandrosterone sulphate levels' (acronym in text)
23202124: It's all in text
22493691: pvalue/rel not parsed in table (WTF!)
23903356: all in text (not abstract)
23583978: in the text (some in abstract)
23563607: Phenotype headers not fully matched (e.g. Obesity type-2) and "trails" instead of traits



Idea: Match spans if they contain the same words, but potentially in a different order

#### Notes from first 100:

470 fully correct

19197348: Phenotype headers
18159244: Completely messed-up way of writing down p-values
17903302: Acronyms
17903305: Phenotype headers, errors (prostate vs breast cancer)
19043545: Mentions in text
17447842: Mentions in text (also: Messed-up way of writing table (induced cells should help)
17658951: Extraction from text
17903296: e.g. Neck phenotype acronym has an undocumented male/female suffix
19557161: In text, and one table entry strangely doesn't seem have been found

In [86]:
paper_id = '19557161'
for a in assocs:
    if str(a.paper.pubmed_id) == paper_id and paper_contains(a.paper.pubmed_id, a.snp.rs_id):
        print a.phenotype.name, a.snp.rs_id, rel_dict.get((paper_id, a.snp.rs_id), None)

Waist-hip ratio rs2605100 None
Waist circumference rs987237 None
Waist circumference rs545854 None
Waist circumference rs6429082 None


In [89]:
with open('associations.tsv', 'w') as f:
    for (pmid, rsid, (glob_phen, loc_phen)) in associations:
        pvals = pval_dict.get(pmid, None)
        pval = min(pvals.get(rsid, [-1])) if pvals else -1
        loc_phen = '-' if not loc_phen else loc_phen

        f.write('%s\t%s\t%s\t%s\t%f\n' % (pmid, rsid, glob_phen, loc_phen, pval))