# Evaluation notebook

Here, we will evaluate the relations extracted in all the other notebooks

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle
import numpy as np

# import snorkel and gwasdb
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# set up paths
abstract_dir = '../data/db/papers'

In [2]:
from snorkel.parser import XMLDocParser, HTMLParser, CorpusParser
from extractor.parser import UnicodeXMLTableDocParser, UnicodeTableParser

xml_parser = UnicodeXMLTableDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

table_parser = UnicodeTableParser()
html_parser = HTMLParser(path='../data/db/papers/')
cp = CorpusParser(xml_parser, table_parser)
%time corpus = cp.parse_corpus(name='GWAS Corpus')

CPU times: user 16min 26s, sys: 49.8 s, total: 17min 16s
Wall time: 24min 13s


## Loading results from other notebooks

#### Phenotype/rsid relations from tables

In [44]:
phen_table_associations = dict()
with open('phen-rsid.table.rel.all.tsv') as f:
    for line in f:
        fields = line.strip().split('\t')
        pmid, rsid, phen, pval, _, table_id, row_id, col_id = fields
        pval, table_id, row_id, col_id = float(pval), int(table_id), int(row_id), int(col_id)
        # note that we assume that an rsid at a givel location will correspond to 1 phenotype
        phen_table_associations[pmid, table_id, row_id, col_id] = (rsid, phen)
        
print len(phen_table_associations), 'loaded, e.g.:'
print phen_table_associations.items()[:5]

2130 loaded, e.g.:
[(('21738491', 1, 3, 2), ('rs12143842', 'QT')), (('20585627', 2, 8, 1), ('rs12931267', 'Red hair')), (('17903306', 4, 31, 1), ('rs2195926', 'SDNN')), (('17903301', 2, 9, 1), ('rs10514431', 'left ventricular diastolic dimension')), (('24376456', 3, 36, 2), ('rs2410182', 'Coronary heart disease'))]


#### PMID/Phenotype relations extracted from titles/abstracts

In [57]:
phen_text_associations = dict()
with open('phenotypes.extracted.tsv') as f:
    for line in f:
        pmid, phen = line.strip().split('\t')
        phen_text_associations[pmid] = phen
        
print len(phen_text_associations), 'phenotypes loaded, e.g.:'
print phen_text_associations.items()[:5]                           

589 phenotypes loaded, e.g.:
[('23738518', 'reading|language'), ('19056611', 'Type 2 Diabetes'), ('21298047', 'European-Origin|Substance Dependence'), ('23104006', 'endometriosis'), ('20072603', 'Osteoporosis')]
322 table ids loaded, e.g.:
[('23936387', 5), ('20935630', 0), ('24778558', 0), ('22589738', 1), ('22479202', 3)]


#### RSID/Pvalue relations extracted from tables

In [95]:
pval_table_associations = dict()
with open('pval-rsid.raw.cols.tsv') as f:
    for line in f:
        pmid, rsid, table_id, row_id, col_id, pval = line.strip().split('\t')
        pval, table_id, row_id, col_id = float(pval), int(table_id), int(row_id), int(col_id)
        
        key = pmid, table_id, row_id, col_id, rsid
        if key not in pval_table_associations: pval_table_associations[key] = set()
        pval_table_associations[key].add(pval)

print len(pval_table_associations), 'relations loaded, e.g.:'
print pval_table_associations.items()[:5]      

11057 relations loaded, e.g.:
[(('22504420', 0, 33, 0, 'rs4790881'), set([0.0, 0.0006, 2e-06])), (('22396660', 1, 6, 1, 'rs11746443'), set([0.98])), (('24094242', 1, 19, 0, 'rs7267979'), set([0.0])), (('17903302', 1, 21, 2, 'rs1408263'), set([2.4e-05])), (('17903294', 2, 20, 1, 'rs727979'), set([2.6e-05, 7e-06]))]


#### Table annotations

In [103]:
non_gwas_tables = set()
with open('table-annotations.tsv') as f:
    for line in f:
        pmid, table_id, pval_found, lod_found = line.strip().split('\t')
        if int(pval_found) or int(lod_found):
            non_gwas_tables.add((pmid, int(table_id)))
                                  
print len(non_gwas_tables), 'suspcious table ids loaded, e.g.:'
print list(non_gwas_tables)[:5]   

671 suspcious table ids loaded, e.g.:
[('21273288', 1), ('19343178', 1), ('23028341', 2), ('23844046', 3), ('24143190', 1)]


#### Singleton RSids

In [79]:
singleton_associations = dict()
with open('rsids.singletons.all.tsv') as f:
    for line in f:
        pmid, table_id, row_id, col_id, rsid = line.strip().split()
        table_id, row_id, col_id = int(table_id), int(row_id), int(col_id)
        singleton_associations[pmid, table_id, row_id, col_id] = rsid

print len(singleton_associations), 'loaded, e.g.:'
print singleton_associations.items()[:5]

n_singletons_by_table = dict()
for (pmid, table_id, row_id, col_id), rsid in singleton_associations.items():
    key = pmid, table_id
    if key not in n_singletons_by_table: n_singletons_by_table[key] = 0
    n_singletons_by_table[key] += 1

12309 loaded, e.g.:
[(('23128233', 0, 17, 2), 'rs9264942'), (('25129146', 0, 8, 0), 'rs1642764'), (('23754948', 1, 6, 0), 'rs10478424'), (('17903301', 2, 9, 1), 'rs10514431'), (('17903307', 1, 66, 1), 'rs1393593')]


## Construct a list valid relations

In [7]:
# TODO: load pvalues properly, use them to filter stuff downstream, do acroynm resolution here

In [104]:
import re

def clean(phen):
    phen = re.sub('[)():.]+', '', phen)
    phen = re.sub('^or ', '', phen)
    phen = phen.strip()
    return phen

associations = []

tables_found = {(pmid, table_id) for pmid, table_id, _, _, _ in pval_table_associations} # with pvalues
tables_found.update(non_gwas_tables)

# record table associations wth pval < 1e-5 or no pval reported
chosen_rsids = set()
for (pmid, table_id, row_id, col_id), (rsid, loc_phen) in phen_table_associations.items():
    pvals = pval_table_associations.get((pmid, table_id, row_id, col_id, rsid), [])
    glob_phen = clean(phen_text_associations[pmid])
    if (pvals and min(pvals) < 1e-5) or (not pvals and (pmid, table_id) not in tables_found):
        associations.append((pmid, rsid, (glob_phen, loc_phen)))
        chosen_rsids.add((pmid, table_id, row_id, col_id, rsid))


# record associations with no local phenotype
for (pmid, table_id, row_id, col_id, rsid), pvals in pval_table_associations.items():
    # skip low-pvalue snps and snps that we already added
    if pvals and min(pvals) > 1e-5: continue
    if (pmid, table_id, row_id, col_id, rsid) in chosen_rsids: continue
    
    # append with global phenotype
    phen = clean(phen_text_associations[pmid])
    associations.append((pmid, rsid, (phen, '')))    
    chosen_rsids.add((pmid, table_id, row_id, col_id, rsid))

# record singletons
n_singletons_added = 0
for (pmid, table_id, row_id, col_id), rsid in singleton_associations.items():
    if (pmid, table_id, row_id, col_id, rsid) in chosen_rsids: continue
    if (pmid, table_id) in tables_found: continue
    if n_singletons_by_table[(pmid, table_id)] < 30: continue
    phen = clean(phen_text_associations[pmid])
    associations.append((pmid, rsid, (phen, ''))) 
    n_singletons_added += 1

print len(associations), 'associations, e.g.:'
print associations[:5]    
print 'There were %d singletons' % n_singletons_added

7838 associations, e.g.:
[('21738491', 'rs12143842', ('Sudden Cardiac Death', 'QT')), ('20585627', 'rs12931267', ('Common Traits', 'Red hair')), ('24376456', 'rs2410182', ('blood pressure', 'Coronary heart disease')), ('17903295', 'rs2543600', ('longevity|age-related phenotypes', 'Age at death')), ('20838585', 'rs726914', ('Cardiovascular Disease', 'systolic BP'))]
There were 357 singletons


## Comparing to GWAS central

In [43]:
print ('17903296', 1) not in tables_found

False


In [16]:
from db.kb import KnowledgeBase

kb = KnowledgeBase()
assocs = [assoc for doc in corpus.documents for assoc in kb.assoc_by_pmid(doc.name) if assoc.source == 'gwas_central' and assoc.pvalue < 1e-5]

print '%d documents, %d associations' % (len(corpus.documents), len(assocs))

589 documents, 7583 associations


### Set up map of GWC to GWASDB phenotypes

In [105]:
rel_dict = { (pmid, rsid) : set() for (pmid, rsid, phen) in associations }
for (pmid, rsid, phen) in associations:
    rel_dict[(pmid, rsid)].add(phen)

gold_rel_dict = { (a.paper.pubmed_id, a.snp.rs_id) : set() for a in assocs }
for a in assocs:
    gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)].add(a.phenotype.name)

In [199]:
out_set = set()
for a in assocs:
    s1 = gold_rel_dict[(a.paper.pubmed_id, a.snp.rs_id)]
    s2 = rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), None)
    if s1 and s2:
        for name1 in s1:
            for name2a, name2b in s2:
                name1 = str(name1)
                score = phen_scores.get((name1.lower(), name2a.lower(), name2b.lower()), '')
                out_set.add('%s\t%s\t%s\t%s\n' % (name1, name2a, name2b, score))

with open('phenotype.mapping.tsv', 'w') as f:                    
    for out_str in out_set:
        f.write(out_str)

Dictionary legend in 0: wrong, 1: acronym, 2: imprecise, 3: correct

In [11]:
phen_map = dict()
phen_scores = dict()
with open('phenotype.mapping.annotated.txt') as f:                    
    lines = f.read().split('\r')
    for line in lines:
        fields = line.strip().split('\t')
        fields = [re.sub('"', '', f) for f in fields]
        try:
            orig_name, glob_name, loc_name, score = fields
        except ValueError:
            if len(fields) == 2:
                orig_name, glob_name = fields
                loc_name = ''
            else:
                orig_name, glob_name, loc_name = fields
            score = 3
        phen_scores[(orig_name.lower(), glob_name.lower(), loc_name.lower())] = score
        if score in ('2', '3'):
            key = (glob_name, loc_name)
            if key not in phen_map: phen_map[key] = set()
            phen_map[key].add(orig_name)

### Look at recall relative to GWAS Central

In [13]:
import re

def paper_contains(pmid, quote):
    with open('../data/db/papers/%d.xml' % pmid) as f:
        txt = f.read()
        return True if re.search(quote, txt) else False

In [112]:
# display directly the results we found
n_correct = 0
n_imprecise = 0
n_missing = 0
n_wrong = 0
n_total = 0
n_new = 0
invalid_per_paper = { a.paper.pubmed_id : 0 for a in assocs }
seen = set()
for a in assocs:
    if (a.paper.pubmed_id, a.snp.rs_id, a.phenotype.name) in seen: continue
    seen.add((a.paper.pubmed_id, a.snp.rs_id, a.phenotype.name))
    
    gold_phen_set = {a.phenotype.name}
    pred_phen_set = rel_dict.get((str(a.paper.pubmed_id), a.snp.rs_id), {})
    tmp = list(pred_phen_set)
    if tmp and tmp[0][1].startswith('Age at'):
        n_correct += 1
        continue
    pred_phen_set2 = {m for p in pred_phen_set for m in phen_map.get(p, {})}
    if gold_phen_set & pred_phen_set2: 
        n_correct += 1
        n_total += 1
    else:
        if paper_contains(a.paper.pubmed_id, a.snp.rs_id):
            if a.paper.pubmed_id == 23509613:
                print a.snp.rs_id, gold_phen_set, pred_phen_set, pred_phen_set2
            if pred_phen_set2:
                n_wrong += 1
            else:
                n_missing += 1
            n_total += 1
            invalid_per_paper[a.paper.pubmed_id] += 1

print n_correct, n_wrong, n_missing, n_total

print 'Biggest omissions:', sorted(invalid_per_paper.items(), key=lambda x: x[1], reverse=True)[:15]


2574 42 482 3087
Biggest omissions: [(19197348, 16), (18159244, 15), (23400010, 14), (21810271, 11), (24121790, 10), (24058526, 10), (22747683, 9), (22675492, 8), (21533175, 8), (23202124, 7), (22493691, 7), (22504420, 7), (21738487, 7), (23903356, 7), (23583978, 7)]


#### Notes from all papers:

19197348: (Korsae islanders) No global phenotype, need to handle phenotype headers. Some not parsed because exact match not in dict, need to search for n-grams
23400010: All the SNPs are in text!!
19305408: Phenotype incorrect? QT Interval vs sudden cardian death. But QT interval is in abstract
22760553: citalopram in dict but connected to a dash in title. Add "side effects" to matched keywords?
22747683: They're in text
22675492: Estradiol gets pushed our because "testosterone," and "testostreone" both present
21533175: can't parse 'Serum dehydroepiandrosterone sulphate levels' (acronym in text)
23202124: It's all in text
22493691: pvalue/rel not parsed in table (WTF!)
23903356: all in text (not abstract)
23583978: in the text (some in abstract)
23563607: Phenotype headers not fully matched (e.g. Obesity type-2) and "trails" instead of traits



Idea: Match spans if they contain the same words, but potentially in a different order

#### Notes from first 100:

470 fully correct

19197348: Phenotype headers
18159244: Completely messed-up way of writing down p-values
17903302: Acronyms
17903305: Phenotype headers, errors (prostate vs breast cancer)
19043545: Mentions in text
17447842: Mentions in text (also: Messed-up way of writing table (induced cells should help)
17658951: Extraction from text
17903296: e.g. Neck phenotype acronym has an undocumented male/female suffix
19557161: In text, and one table entry strangely doesn't seem have been found

### Look at precision

In [111]:
# # write relations that are not in our dataset
# import random

# new_assocs = []
# for doc in corpus.documents:
#     rsids_seen = set()
#     for a in assocs:
#         if str(a.paper.pubmed_id) != doc.name: continue
#         rsids_seen.add(a.snp.rs_id)
    
#     # print new snps
#     for (pmid, rsid), phens in rel_dict.items():
#         if pmid != doc.name: continue
#         if rsid in rsids_seen: continue
#         new_assocs.append((pmid, rsid, phens))
# #         print pmid, rsid, phens
# #     print '---'

random.shuffle(new_assocs)
random_subset = list(new_assocs[:50])
random_subset = sorted(random_subset)
with open('rels.discovered.tsv', 'w') as f:
    for new_a in random_subset:
        print new_a
    

('17903305', 'rs208354', set([('prostate cancer|breast', '')]))
('18483556', 'rs9392056', set([('Hair Color|Skin Pigmentation', '')]))
('19812673', 'rs4742409', set([('AUTISM', '')]))
('20418485', 'rs1993116', set([('vitamin D levels', '')]))
('20585627', 'rs154659', set([('Common Traits', 'Freckling')]))
('20661308', 'rs10119', set([('Serum Calcium', '')]))
('20927387', 'rs5756506', set([('Cell Traits|Red Blood Cell', 'MCH')]))
('20932310', 'rs11069178', set([('\xcf\x84 protein levels', '')]))
('21273288', 'rs1004565', set([('NT-proBNP', '')]))
('21297633', 'rs1893217', set([('ulcerative colitis', '')]))
('21408207', 'rs2248932', set([('Systemic Lupus Erythematosus|Autoantibody', '')]))
('21437268', 'rs1573496', set([('Cancers|Conducted|Upper Aerodigestive Tract', '')]))
('21448238', 'rs10275320', set([('migraine', '')]))
('21552555', 'rs9930333', set([('Obesity|Obesity-Related Traits', '')]))
('21738491', 'rs174230', set([('Sudden Cardiac Death', '')]))
('21931564', 'rs2216405', set(

In [89]:
with open('associations.tsv', 'w') as f:
    for (pmid, rsid, (glob_phen, loc_phen)) in associations:
        pvals = pval_dict.get(pmid, None)
        pval = min(pvals.get(rsid, [-1])) if pvals else -1
        loc_phen = '-' if not loc_phen else loc_phen

        f.write('%s\t%s\t%s\t%s\t%f\n' % (pmid, rsid, glob_phen, loc_phen, pval))