# SNP/Phenotype detection from raw text

In [1]:
%load_ext autoreload
%autoreload 2

import sys

In [2]:
# Make the snorkel checkout and the project sources importable
sys.path.extend(['../snorkel', '../src'])

# Directory that holds the paper XML files
abstract_dir = '../data/db/papers'

## Assemble a corpus

We will look at abstracts for now

In [3]:
# NOTE(review): XMLDocParser is imported but not used in the visible cells.
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLDocParser

# Parser over the XML files under abstract_dir; the string arguments are XPath
# expressions.
xml_parser = UnicodeXMLDocParser(
    path=abstract_dir,
    # each child element of a file's root is treated as one document
    doc='./*',
    # text nodes found inside the paragraphs of the abstract
    text='.//abstract/p//text()',
    # text of the <article-id> element whose pub-id-type is "pmid"
    id='.//article-id[@pub-id-type="pmid"]/text()',
    # presumably keeps the parsed XML tree on each document -- TODO confirm
    keep_xml_tree=True)

Warning: there are unicode chars that cause trouble, e.g.:

In [23]:
import unicodedata
# U+2212 is the Unicode minus sign, not the ASCII hyphen-minus
print u'\u2212 8'
# U+00D7 is the multiplication sign
txt=u'\xd7'
# NFKD normalisation followed by ASCII encoding with 'ignore' drops characters
# that have no ASCII decomposition; the recorded output for this line is empty
print unicodedata.normalize('NFKD', txt).encode('ascii','ignore')
# For comparison: the character printed unchanged
print unicode(txt)

−8

×


Get the actual docs:

In [20]:
import lxml.etree as et
with open('../data/db/papers/19305408.xml') as f:
    for i,doc in enumerate(et.parse(f).xpath('./*')):
        print doc
        print doc.xpath('.//abstract/p//text()')

<Element article at 0x106398ea8>
[u'QT interval duration reflecting myocardial repolarization on the electrocardiogram is a heritable risk factor for sudden cardiac death and drug-induced arrhythmias. We conducted a meta-analysis of 3 genome-wide association studies in 13,685 individuals of European ancestry from the Framingham Heart Study, the Rotterdam Study and the Cardiovascular Health Study. We observed associations at P < 5\xd710', u'\u22128', ' with variants in ', 'NOS1AP', ', ', 'KCNQ1, KCNE1', ', ', 'KCNH2', ' and ', 'SCN5A', ', known to be involved in myocardial repolarization and Mendelian Long QT Syndromes. Associations at five novel loci included 16q21 near ', 'NDRG4', ' and ', 'GINS3', ', 6q22 near ', 'PLN', ', 1p36 near ', 'RNF207', ', 16p13 near ', 'LITAF', ' and 17q12 near ', 'LIG3', ' and ', 'RIFFL', u'. Collectively, the 14 independent variants at these 10 loci explain 5.4\u20136.5% of variation in QT interval. Identifying the causal variants and defining their impac

In [4]:
from snorkel.parser import SentenceParser
from snorkel.parser import Corpus

sent_parser = SentenceParser()
%time corpus = Corpus(xml_parser, sent_parser)

Parsing documents...
Parsing contexts...
Parsed 589 documents and 4799 contexts
CPU times: user 8.52 s, sys: 934 ms, total: 9.45 s
Wall time: 1min 22s


Test the results:

In [5]:
# Spot check: take one parsed document (index 2) and show its repr
doc = corpus.get_docs()[2]
print doc

Document(id='21298047', file='21298047.xml', text=u'Substance dependence or addiction is a complex environmental and genetic disorder that results in serious health and socio-economic consequences. Multiple substance dependence categories together, rather than any one individual addiction outcome, may explain the genetic variability of such disorder. In our study, we defined a composite substance dependence phenotype derived from six individual diagnoses: addiction to nicotine, alcohol, marijuana, cocaine, opiates or other drugs as a whole. Using data from several genomewide case-control studies, we identified a strong (Odds ratio \u200a=\u200a1.77) and significant (p-value\u200a=\u200a7E-8) association signal with a novel gene, PBX/knotted 1 homeobox 2 (PKNOX2), on chromosome 11 with the composite phenotype in European-origin women. The association signal is not as significant when individual outcomes for addiction are considered, or in males or African-origin population. Our findings

In [9]:
# First context stored for that document; the recorded output shows a Sentence
# carrying words, lemmas, POS tags and dependency information
sent = corpus.get_contexts_in(doc.id)[0]
print sent

Sentence(id='21298047-0', doc_id='21298047', doc_name='21298047.xml', sent_id=0, words=[u'Substance', u'dependence', u'or', u'addiction', u'is', u'a', u'complex', u'environmental', u'and', u'genetic', u'disorder', u'that', u'results', u'in', u'serious', u'health', u'and', u'socio-economic', u'consequences', u'.'], lemmas=[u'substance', u'dependence', u'or', u'addiction', u'be', u'a', u'complex', u'environmental', u'and', u'genetic', u'disorder', u'that', u'result', u'in', u'serious', u'health', u'and', u'socio-economic', u'consequence', u'.'], poses=[u'NN', u'NN', u'CC', u'NN', u'VBZ', u'DT', u'JJ', u'JJ', u'CC', u'JJ', u'NN', u'WDT', u'VBZ', u'IN', u'JJ', u'NN', u'CC', u'JJ', u'NNS', u'.'], dep_parents=[2, 11, 2, 2, 11, 11, 11, 11, 8, 8, 0, 13, 11, 16, 16, 13, 16, 19, 16, 11], dep_labels=[u'compound', u'nsubj', u'cc', u'conj', u'cop', u'det', u'amod', u'amod', u'cc', u'conj', u'ROOT', u'nsubj', u'acl:relcl', u'case', u'amod', u'nmod', u'cc', u'amod', u'conj', u'punct'], char_offsets=[

## Try extracting rs-ids first

In [6]:
# Make the crawler package and its db sub-package importable
sys.path.extend(['../src/crawler', '../src/crawler/db'])

# Project-local helpers: knowledge base access and gold-set evaluation
from kb import KnowledgeBase
from extractor.util import gold_rsid_stats, gold_rsid_precision

# Snorkel candidate-extraction machinery
from snorkel.candidates import Candidates, Ngrams
from snorkel.matchers import DictionaryMatch, RegexMatchSpan, Union

Get a list of all the rs-ids we know.

In [7]:
# Knowledge-base handle; the cells below query it for rs-ids, p-values,
# phenotypes and paper titles
kb = KnowledgeBase()

In [11]:
# rs-id strings offered by the knowledge base; used further down as the
# dictionary for dictionary matching. Print one entry as a sanity check.
rs_ids = kb.get_rsid_candidates()
print rs_ids[0]

rs1000113


Get a gold set:

In [11]:
# Print the tokenized sentence of the first five candidates.
# NOTE(review): `c` is not assigned anywhere in the visible notebook, so this
# cell fails on a fresh kernel. It presumably referred to a candidate set such
# as the `rsid_c` built further down -- confirm, then rename and move this cell
# below the extraction step.
for ngram in c.get_candidates()[:5]:
    print ngram.sentence['words']

[u'We', u'identified', u'two', u'independent', u'single-nucleotide', u'polymorphisms', u'associated', u'with', u'circulating', u'retinol', u'levels', u',', u'which', u'are', u'located', u'near', u'the', u'transthyretin', u'-LRB-', u'TTR', u'-RRB-', u'and', u'retinol', u'binding', u'protein', u'4', u'-LRB-', u'RBP4', u'-RRB-', u'genes', u'which', u'encode', u'major', u'carrier', u'proteins', u'of', u'retinol', u':', u'rs1667255', u'-LRB-', u'P', u'=', u'2.30', u'\xd7', u'10', u'\u2212', u'17', u'-RRB-', u'and', u'rs10882272', u'-LRB-', u'P', u'=', u'6.04', u'\xd7', u'10', u'\u2212', u'12', u'-RRB-', u'.']
[u'A', u'secondary', u'analysis', u'including', u'an', u'additional', u'211', u'cases', u'and', u'285', u'controls', u'from', u'two', u'closely-related', u'Latin-American', u'population', u'isolates', u'from', u'the', u'Central', u'Valley', u'of', u'Costa', u'Rica', u'and', u'Antioquia', u',', u'Colombia', u'also', u'identified', u'rs7868992', u'as', u'the', u'top', u'signal', u'-LRB-'

In [12]:
# Gold standard: one (document id, rs-id) pair for every rs-id that the
# knowledge base lists under the PMID of a corpus document.
gold_set = frozenset([
    (doc.id, rs_id)
    for doc in corpus.get_docs()
    for rs_id in kb.rsids_by_pmid(int(doc.id))
])

In [13]:
# Keep only the rs-id half of each (doc_id, rs_id) gold pair. This is a list,
# so an rs-id paired with several documents appears several times.
gold_set_rsids = [rs_id for doc_id, rs_id in gold_set]

### Extract candidates:

In [14]:
# Define a candidate space
# (n_max=1: the rs-id candidates in the recorded output are single-token spans)
ngrams = Ngrams(n_max=1)

# Define matchers
# - dictionary of every rs-id returned by kb.get_rsid_candidates()
dict_rsid_matcher = DictionaryMatch(d=rs_ids, longest_match_only=False)
# - dictionary restricted to the rs-ids that occur in the gold set
gold_rsid_matcher = DictionaryMatch(d=gold_set_rsids, longest_match_only=False)
# - pattern-based: the literal "rs" followed by one or more digits
regx_rsid_matcher = RegexMatchSpan(rgx=r'rs\d+')
# - combination of the full dictionary and the regex matcher
#   (presumably accepts a span if either one matches -- TODO confirm)
rsid_matcher = Union(dict_rsid_matcher, regx_rsid_matcher)

# collect candidates
# gold_dict_c: gold-dictionary matches only; rsid_c: dictionary + regex matches
%time gold_dict_c = Candidates(ngrams, gold_rsid_matcher, corpus.get_contexts())
%time rsid_c = Candidates(ngrams, rsid_matcher, corpus.get_contexts())

Extracting candidates...
CPU times: user 3.6 s, sys: 47.4 ms, total: 3.64 s
Wall time: 3.65 s
Extracting candidates...
CPU times: user 4.06 s, sys: 15.8 ms, total: 4.08 s
Wall time: 4.08 s


In [15]:
# Report how many rs-id candidates were extracted and display the first five
print 'Got %d candidates, e.g.:' % len(rsid_c.get_candidates())
rsid_c.get_candidates()[:5]

Got 994 candidates, e.g.:


[<Ngram("rs1667255", id=21878437-2:704-712, chars=[704,712], words=[38,38]),
 <Ngram("rs7868992", id=22889924-4:983-991, chars=[983,991], words=[30,30]),
 <Ngram("rs10198628", id=22589742-17:2336-2345, chars=[2336,2345], words=[0,0]),
 <Ngram("rs455804", id=22807686-14:2226-2233, chars=[2226,2233], words=[3,3]),
 <Ngram("rs3093077", id=22291609-5:1571-1579, chars=[1571,1579], words=[98,98])]

### Statistics

Statistics on all the rsid candidates:

In [16]:
# Recall/precision report for the dictionary + regex candidates against the
# gold (document id, rs-id) pairs; the helper prints the report itself
gold_rsid_stats(rsid_c.get_candidates(), gold_set)

# of gold annotations	= 8384
# of candidates		= 792
Candidate recall	= 0.068
Candidate precision	= 0.722


Statistics on the candidates extracted via dictionary matching.

Recall is how many SNPs in the gold set are actually mentioned in the abstract/txt.
Precision is how many recovered SNPs are also found in the gold set.

The gold set consists of (pmid, rsid) tuples, so in some cases a paper mentions an rsid from another paper, but this new mention is not recorded in the gold set.

In [17]:
# Same report, restricted to candidates found by the gold-rs-id dictionary matcher
gold_rsid_stats(gold_dict_c.get_candidates(), gold_set)

# of gold annotations	= 8384
# of candidates		= 621
Candidate recall	= 0.068
Candidate precision	= 0.921


Some debugging on the largest rsID candidate set:

In [18]:
# Candidates returned by gold_rsid_precision, printed with their sentence for
# manual inspection (presumably the candidates that are not backed by a gold
# pair -- confirm in extractor.util)
strange_ngrams = gold_rsid_precision(rsid_c.get_candidates(), gold_set)
# NOTE(review): gold_dict is not used in the visible cells; dict() over
# (doc_id, rs_id) pairs also keeps only one rs-id per doc_id
gold_dict = dict(gold_set)
for ngram in strange_ngrams[:5]:
    print ngram
    # print corpus.get_doc(ngram.doc_id)
    print ' '.join(ngram.sentence['words'])

<Ngram("rs3093077", id=22291609-5:1571-1579, chars=[1571,1579], words=[98,98])
Novel signals include : for IL-6 , in the ABO gene -LRB- rs657152 , p = 2.13 × 10 − 29 -RRB- ; for ESR , at the HBB -LRB- rs4910472 , p = 2.31 × 10 − 11 -RRB- and UCN119B / SPPL3 -LRB- rs11829037 , p = 8.91 × 10 − 10 -RRB- loci ; for MCP-1 , near its receptor CCR2 -LRB- rs17141006 , p = 7.53 × 10 − 13 -RRB- and in CADM3 -LRB- rs3026968 , p = 7.63 × 10 − 13 -RRB- ; for hsCRP , within the CRP gene -LRB- rs3093077 , p = 5.73 × 10 − 21 -RRB- , near DARC -LRB- rs3845624 , p = 1.43 × 10 − 10 -RRB- , UNC119B / SPPL3 -LRB- rs11829037 , p = 1.50 × 10 − 14 -RRB- , and ICOSLG/AIRE -LRB- rs113459440 , p = 1.54 × 10 − 08 -RRB- loci .
<Ngram("rs1572312", id=25729143-7:991-999, chars=[991,999], words=[9,9])
The C allele of the most significant SNP , rs1572312 , was associated with high values of V .
<Ngram("rs1572312", id=25729143-9:1130-1138, chars=[1130,1138], words=[6,6])
Furthermore , the frequency of the rs1572312 C a

## Get candidate p-values

Need to use regular expressions for this.

In [19]:
from snorkel.matchers import RegexMatchSpan

Test out the regexp.

In [20]:
import re

#p (whitespace)? = (whitespace)? \d+ .? \d? (whitespace?) "x symbol" (whitespace?) 10 (whitespace?) "- symbol" \w?\d+
# rgx = u'p\s*=\s*\d+\.?\d*\s*\xd7\s*10\s*\u2212\s*\d+'
rgx = u'\d+\.?\d*\s*\xd7\s*10\s*\u2212\s*\d+'
sentence = ' '.join(strange_ngrams[4].sentence['words'])
print sentence
print re.search(rgx, sentence).group()

Next , case-control studies resulted in remaining three SNPs -LRB- NFIA-AS2 rs1572312 , TSHR rs7144481 , RBFOX1 rs7191721 -RRB- associated with endurance athlete status .


AttributeError: 'NoneType' object has no attribute 'group'

In [21]:
# Wrap the p-value regex defined in the previous cell as a span matcher
pval_matcher = RegexMatchSpan(rgx=rgx)
# A tokenized p-value covers several tokens (the recorded candidates span five
# words), so allow n-grams of up to 5 tokens. This rebinds the `ngrams`
# used earlier for rs-ids.
ngrams = Ngrams(n_max=5)
%time pval_c = Candidates(ngrams, pval_matcher, corpus.get_contexts())

Extracting candidates...
CPU times: user 13 s, sys: 84.5 ms, total: 13.1 s
Wall time: 13.1 s


In [29]:
# Report how many p-value candidates were extracted and print the first five;
# the matched spans contain non-ASCII characters (the multiplication and minus signs)
print 'Got %d candidates, e.g.:' % len(pval_c.get_candidates())
for candidate in pval_c.get_candidates()[:5]:
    print unicode(candidate)

Got 1013 candidates, e.g.:
<Ngram("1.2×10 −15", id=22792071-4:978-987, chars=[978,987], words=[50,54])
<Ngram("1.2×10 −11", id=20173748-2:485-494, chars=[485,494], words=[46,50])
<Ngram("1.85×10 −20", id=18846228-5:782-792, chars=[782,792], words=[41,45])
<Ngram("6.2×10 −5", id=24586183-6:1550-1558, chars=[1550,1558], words=[60,64])
<Ngram("1.5×10 −7", id=18464913-6:1097-1105, chars=[1097,1105], words=[60,64])


### Statistics

First, how many p-values that should be present do we extract?

In [58]:
# we need to define a p-value -> float converter
from extractor.util import pvalue_to_float, gold_pval_stats, gold_pval_precision

print pvalue_to_float(u"6.2×10 −5")
print pvalue_to_float(u"1.85×10 −20")

6.2e-05
1.85e-20


In [27]:
from kb import KnowledgeBase

# Fresh knowledge-base handle, then the gold standard for p-values: one
# (document id, p-value) pair for every p-value the knowledge base lists under
# the PMID of a corpus document.
kb = KnowledgeBase()
gold_set_pvals = frozenset([
    (doc.id, pval)
    for doc in corpus.get_docs()
    for pval in kb.pvals_by_pmid(int(doc.id))
])

In [45]:
print 'Found %d gold mentions, e.g.:' % len(gold_set_pvals)
print list(gold_set_pvals)[:5]
print

print gold_pval_stats(pval_c.get_candidates(), gold_set_pvals)

Found 5289 gold mentions, e.g.:
[('23770605', 1e-14), ('23326517', 7.000000000000001e-32), ('19936222', 2e-53), ('24586186', 3e-164), ('24999842', 3e-11)]

# of gold annotations	= 2730
# of candidates		= 798
Candidate recall	= 0.182
Candidate precision	= 0.623
None


Some debugging...

In [70]:
# Print whatever gold_pval_precision returns for the p-value candidates.
# NOTE(review): the recorded output is a long list of (document id, float)
# tuples, while the following cell treats the return value as n-gram objects --
# the helper presumably changed between runs; re-run and trim this output.
print gold_pval_precision(pval_c.get_candidates(), gold_set_pvals)

[('23900074', -10.0), ('22479202', -4.0), ('21878437', -5.0), ('22832960', -8.0), ('21738479', -8.0), ('19714205', -6.0), ('22291609', -29.0), ('18483556', -62.0), ('22737229', -12.0), ('21729881', -8.0), ('24586183', -4.0), ('22368281', -16.0), ('21079607', -7.0), ('19412175', -8.0), ('22479202', -43.0), ('25436638', -3.0), ('24586183', -5.0), ('19503597', -54.0), ('24940741', -11.0), ('23565137', -8.0), ('21814517', -8.0), ('20852633', -11.0), ('22479419', -5.0), ('23459443', -8.0), ('21533024', -16.0), ('24379826', -3.0), ('19503597', -26.0), ('22005930', -4.0), ('21326860', -5.0), ('21124946', -5.0), ('24023788', -3.0), ('23583978', -11.0), ('21124955', -11.0), ('23900074', -4.0), ('25017104', -16.0), ('18282107', -5.0), ('21273288', -10.0), ('23966204', -37.0), ('19915575', -5.0), ('20585626', -25.0), ('20838585', -24.0), ('21460840', -21.0), ('18776929', -7.0), ('21418511', -4.0), ('19056611', -14.0), ('21533024', -9.0), ('22479202', -5.0), ('22558097', -8.0), ('20921969', -5.0),

In [76]:
gold_pval_dict = { doc_id : [] for doc_id, pval in gold_set_pvals }
for doc_id, pval in gold_set_pvals:
    gold_pval_dict[doc_id].append(pval)
    
strange_ngrams = gold_pval_precision(pval_c.get_candidates(), gold_set_pvals)
for ngram in strange_ngrams[:5]:
    print unicode(ngram)
    print ' '.join(ngram.sentence['words'])
    print gold_pval_dict[ngram.doc_id]
    print

<Ngram("6.2×10 −5", id=24586183-6:1550-1558, chars=[1550,1558], words=[60,64])
The MAGI3 and BACH2 variants were associated with an increased risk of hyperthyroidism , which was replicated in an independent cohort of patients with Graves ' disease -LRB- OR : 1.37 , 95 % CI 1.22 -- 1.54 , P = 1.2 × 10 − 7 and OR : 1.25 , 95 % CI 1.12 -- 1.39 , P = 6.2 × 10 − 5 -RRB- .
[1e-09, 3e-06, 3e-07, 3.0000000000000004e-08, 4e-08, 7e-13, 4e-07, 2e-16, 6e-07, 2e-08, 1e-07]

<Ngram("2.44×10 −3", id=18776929-6:1145-1154, chars=[1145,1154], words=[31,35])
Two SNPs of the PLCL1 gene , rs892515 and rs9789480 , surrounded by the four SNPs identified in our GWAS , achieved p values of 8.62 × 10 − 3 and 2.44 × 10 − 3 , respectively , for association with hip BS .
[2e-06]

<Ngram("5.43 × 10 −4", id=21483430-5:1469-1480, chars=[1469,1480], words=[113,117])
Associated regions affiliated with memory included the entorhinal cortex -LRB- rs821639 , p = 4.11 × 10 − 5 ; rs2356606 , p = 4.71 × 10 − 4 -RRB- , cingul

In [None]:
# Look up the gold p-values stored for one specific paper.
# NOTE(review): the PMID is passed as a string here, whereas the other
# pvals_by_pmid call in this notebook passes int(doc.id) -- confirm which type
# the method expects.
kb.pvals_by_pmid('23382691')

Potential analysis question: how many rsIDs that we extracted earlier also have an associated p-value?

We will say that if an rsID and a p-value occur in the same document, they have been identified correctly.

So we extract from our gold set tuples of the form (pmid, rsID, p-value). We compare them to candidate tuples, where an rsID and a p-value are associated if they occur in the same sentence.

This seems like something to be checked at the relation extraction stage.

## Extract phenotypes

In [83]:
# Define a candidate space
# (n-grams of up to 4 tokens; this rebinds the `ngrams` used for p-values)
ngrams = Ngrams(n_max=4)

# collect phenotype list
# NOTE(review): KnowledgeBase is imported from `db.kb` here but from `kb` in
# earlier cells; both resolve only because '../src/crawler' and
# '../src/crawler/db' were both appended to sys.path.
from db.kb import KnowledgeBase
kb = KnowledgeBase()
phenotype_list = kb.get_phenotype_candidates()
# TODO: load disease names from NCBI

# Define matchers
# dictionary match against the phenotype names, with case-insensitive and
# Porter-stemmed matching requested via the keyword arguments
phen_matcher = DictionaryMatch(d=phenotype_list, longest_match_only=False, ignore_case=True, stemmer='porter')

# collect candidates
%time phen_c = Candidates(ngrams, phen_matcher, corpus.get_contexts())

Extracting candidates...
CPU times: user 21.2 s, sys: 142 ms, total: 21.4 s
Wall time: 21.4 s


In [84]:
# Report how many phenotype candidates were extracted and print the first five
print 'Got %d candidates, e.g.:' % len(phen_c.get_candidates())
for candidate in phen_c.get_candidates()[:5]:
    print candidate

Got 2429 candidates, e.g.:
<Ngram("quantitative trait", id=20190752-4:597-614, chars=[597,614], words=[3,4])
<Ngram("anthropometric traits", id=21423719-8:1757-1777, chars=[1757,1777], words=[20,21])
<Ngram("systemic sclerosis", id=21779181-0:164-181, chars=[164,181], words=[27,28])
<Ngram("type 2 diabetes", id=25102180-11:1961-1975, chars=[1961,1975], words=[5,7])
<Ngram("hepatic", id=21423719-13:2404-2410, chars=[2404,2410], words=[15,15])


### Statistics

In [106]:
from db.kb import KnowledgeBase
kb = KnowledgeBase() # reload
gold_set_phens = frozenset([ (doc.id, phen) for doc in corpus.get_docs() for phen in kb.phens_by_pmid(int(doc.id)) ])

print 'Found %d gold mentions, e.g.:' % len(gold_set_phens)
print list(gold_set_phens)[:5]

Found 723 gold mentions, e.g.:
[('24478790', u'telomere length'), ('21552555', u'obesity'), ('18483556', u'black vs. red hair color'), ('22359512', u'phospholipid levels (plasma)'), ('22479202', u'adiponectin levels')]


In [104]:
from extractor.util import gold_phen_stats
# Recall/precision report for the phenotype candidates against the gold
# (document id, phenotype) pairs; the helper prints the report itself
gold_phen_stats(phen_c.get_candidates(), gold_set_phens)

# of gold annotations	= 723
# of candidates		= 2429
Candidate recall	= 0.939
Candidate precision	= 0.280


Why is recall not perfect? Let's look at the gold phenotypes that were not found:

In [114]:
from extractor.util import gold_phen_recall
# Fresh knowledge-base handle, used below to look up paper titles
kb = KnowledgeBase()

# (document id, phenotype) pairs returned by gold_phen_recall -- judging by the
# variable name, the gold phenotypes with no matching candidate (confirm in
# extractor.util)
phen_not_found = gold_phen_recall(phen_c.get_candidates(), gold_set_phens)
# For the first few, print the phenotype, the paper title and the parsed
# document so the text can be compared by eye
for doc_id, phen in phen_not_found[:5]:
    print phen
    print kb.title_by_pmid(doc_id)
    print corpus.get_doc(doc_id)
    print

word reading
A genome-wide association study for reading and language abilities in two population cohorts.
Document(id='23738518', file='23738518.xml', text=u'Candidate genes have been identified for both reading and language, but most of the heritable variance in these traits remains unexplained. Here, we report a genome-wide association meta-analysis of two large cohorts: population samples of Australian twins and siblings aged 12\u201325\u2009years ( n \u2009=\u20091177 from 538 families), and a younger cohort of children of the UK Avon Longitudinal Study of Parents and their Children (aged 8 and 9\u2009years; maximum  n \u2009=\u20095472). Suggestive association was indicated for reading measures and non-word repetition (NWR), with the greatest support found for single nucleotide polymorphisms (SNPs) in the pseudogene,  ABCC13  ( P \u2009=\u20097.34\u2009\xd7\u200910 \u22128 ), and the gene,  DAZAP1  ( P \u2009=\u20091.32\u2009\xd7\u200910 \u22126 ). Gene-based analyses showed sign

## Now let's do some learning!