# SNP/Phenotype detection from raw text

In [1]:
%load_ext autoreload
%autoreload 2

import sys

In [2]:
# Make the snorkel and gwasdb sources importable from this notebook.
sys.path.extend(['../snorkel', '../src'])

# Directory that is handed to the XML document parser.
abstract_dir = '../data/db/papers'

## Assemble a corpus

We will look only at abstracts for now.

In [3]:
# NOTE(review): XMLDocParser is imported but not used in the cells visible here.
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLDocParser

# Configure the parser that reads the paper XML files under abstract_dir.
xml_parser = UnicodeXMLDocParser(
    path=abstract_dir,
    doc='./*',  # XPath: every child element of the root (presumably one document each)
    text='.//abstract/p//text()',  # XPath: all text nodes below the <p> elements of <abstract>
    id='.//article-id[@pub-id-type="pmid"]/text()',  # XPath: text of the article-id whose pub-id-type is "pmid"
    keep_xml_tree=True)  # presumably keeps the parsed XML tree on each document -- TODO confirm

Warning: some Unicode characters cause trouble, e.g.:

In [23]:
import unicodedata
# U+2212 (MINUS SIGN), which shows up in exponents in the abstract text.
print u'\u2212 8'
# U+00D7 (MULTIPLICATION SIGN).
txt=u'\xd7'
# NFKD normalization followed by an ASCII encode with 'ignore' drops the
# character entirely (the recorded output for this line is empty).
print unicodedata.normalize('NFKD', txt).encode('ascii','ignore')
print unicode(txt)

−8

×


Get the actual docs:

In [20]:
import lxml.etree as et
with open('../data/db/papers/19305408.xml') as f:
    for i,doc in enumerate(et.parse(f).xpath('./*')):
        print doc
        print doc.xpath('.//abstract/p//text()')

<Element article at 0x106398ea8>
[u'QT interval duration reflecting myocardial repolarization on the electrocardiogram is a heritable risk factor for sudden cardiac death and drug-induced arrhythmias. We conducted a meta-analysis of 3 genome-wide association studies in 13,685 individuals of European ancestry from the Framingham Heart Study, the Rotterdam Study and the Cardiovascular Health Study. We observed associations at P < 5\xd710', u'\u22128', ' with variants in ', 'NOS1AP', ', ', 'KCNQ1, KCNE1', ', ', 'KCNH2', ' and ', 'SCN5A', ', known to be involved in myocardial repolarization and Mendelian Long QT Syndromes. Associations at five novel loci included 16q21 near ', 'NDRG4', ' and ', 'GINS3', ', 6q22 near ', 'PLN', ', 1p36 near ', 'RNF207', ', 16p13 near ', 'LITAF', ' and 17q12 near ', 'LIG3', ' and ', 'RIFFL', u'. Collectively, the 14 independent variants at these 10 loci explain 5.4\u20136.5% of variation in QT interval. Identifying the causal variants and defining their impac

In [4]:
from snorkel.parser import SentenceParser
from snorkel.parser import Corpus

# Build the corpus from the XML document parser and a sentence parser.
sent_parser = SentenceParser()
# Slow step: the recorded run took about 1 min 11 s of wall time.
%time corpus = Corpus(xml_parser, sent_parser)

Parsing documents...
Parsing contexts...
Parsed 589 documents and 4799 contexts
CPU times: user 8.94 s, sys: 917 ms, total: 9.85 s
Wall time: 1min 11s


Test the results:

In [8]:
# Sanity check: print the third document returned by the corpus.
doc = corpus.get_docs()[2]
print doc

Document(id='21298047', file='21298047.xml', text=u'Substance dependence or addiction is a complex environmental and genetic disorder that results in serious health and socio-economic consequences. Multiple substance dependence categories together, rather than any one individual addiction outcome, may explain the genetic variability of such disorder. In our study, we defined a composite substance dependence phenotype derived from six individual diagnoses: addiction to nicotine, alcohol, marijuana, cocaine, opiates or other drugs as a whole. Using data from several genomewide case-control studies, we identified a strong (Odds ratio \u200a=\u200a1.77) and significant (p-value\u200a=\u200a7E-8) association signal with a novel gene, PBX/knotted 1 homeobox 2 (PKNOX2), on chromosome 11 with the composite phenotype in European-origin women. The association signal is not as significant when individual outcomes for addiction are considered, or in males or African-origin population. Our findings

In [9]:
# Sanity check: print the first context (sentence) that belongs to `doc`.
sent = corpus.get_contexts_in(doc.id)[0]
print sent

Sentence(id='21298047-0', doc_id='21298047', doc_name='21298047.xml', sent_id=0, words=[u'Substance', u'dependence', u'or', u'addiction', u'is', u'a', u'complex', u'environmental', u'and', u'genetic', u'disorder', u'that', u'results', u'in', u'serious', u'health', u'and', u'socio-economic', u'consequences', u'.'], lemmas=[u'substance', u'dependence', u'or', u'addiction', u'be', u'a', u'complex', u'environmental', u'and', u'genetic', u'disorder', u'that', u'result', u'in', u'serious', u'health', u'and', u'socio-economic', u'consequence', u'.'], poses=[u'NN', u'NN', u'CC', u'NN', u'VBZ', u'DT', u'JJ', u'JJ', u'CC', u'JJ', u'NN', u'WDT', u'VBZ', u'IN', u'JJ', u'NN', u'CC', u'JJ', u'NNS', u'.'], dep_parents=[2, 11, 2, 2, 11, 11, 11, 11, 8, 8, 0, 13, 11, 16, 16, 13, 16, 19, 16, 11], dep_labels=[u'compound', u'nsubj', u'cc', u'conj', u'cop', u'det', u'amod', u'amod', u'cc', u'conj', u'ROOT', u'nsubj', u'acl:relcl', u'case', u'amod', u'nmod', u'cc', u'amod', u'conj', u'punct'], char_offsets=[

## Try extracting rs-ids first

In [106]:
# Make the crawler package and its db subpackage importable.
sys.path.extend(['../src/crawler', '../src/crawler/db'])

# import new libs
from kb import KnowledgeBase
from extractor.util import gold_rsid_stats, gold_rsid_precision

from snorkel.candidates import Ngrams
from snorkel.matchers import DictionaryMatch, RegexMatchSpan, Union
from snorkel.candidates import Candidates

Get a list of all the rs-ids we know.

In [6]:
# Handle on the knowledge base; queried for known rs-ids and per-paper gold annotations.
kb = KnowledgeBase()

In [8]:
# All rs-ids the knowledge base knows about; used as the dictionary for matching.
rs_ids = kb.get_rsid_candidates()
print rs_ids[0]

rs1000113


Get a gold set:

In [11]:
# NOTE(review): `c` is not defined in any cell visible in this notebook --
# presumably a Candidates object from a removed or later cell. Unless it is
# defined elsewhere, this cell fails on a fresh "Restart & Run All";
# confirm which candidate set was meant.
for ngram in c.get_candidates()[:5]:
    print ngram.sentence['words']

[u'We', u'identified', u'two', u'independent', u'single-nucleotide', u'polymorphisms', u'associated', u'with', u'circulating', u'retinol', u'levels', u',', u'which', u'are', u'located', u'near', u'the', u'transthyretin', u'-LRB-', u'TTR', u'-RRB-', u'and', u'retinol', u'binding', u'protein', u'4', u'-LRB-', u'RBP4', u'-RRB-', u'genes', u'which', u'encode', u'major', u'carrier', u'proteins', u'of', u'retinol', u':', u'rs1667255', u'-LRB-', u'P', u'=', u'2.30', u'\xd7', u'10', u'\u2212', u'17', u'-RRB-', u'and', u'rs10882272', u'-LRB-', u'P', u'=', u'6.04', u'\xd7', u'10', u'\u2212', u'12', u'-RRB-', u'.']
[u'A', u'secondary', u'analysis', u'including', u'an', u'additional', u'211', u'cases', u'and', u'285', u'controls', u'from', u'two', u'closely-related', u'Latin-American', u'population', u'isolates', u'from', u'the', u'Central', u'Valley', u'of', u'Costa', u'Rica', u'and', u'Antioquia', u',', u'Colombia', u'also', u'identified', u'rs7868992', u'as', u'the', u'top', u'signal', u'-LRB-'

In [12]:
# Gold standard: every (pmid, rs-id) pair the knowledge base lists for a
# paper in the corpus. doc.id is a string, the lookup takes an int.
gold_set = frozenset([
    (doc.id, rs_id)
    for doc in corpus.get_docs()
    for rs_id in kb.rsids_by_pmid(int(doc.id))
])

In [111]:
# Flatten the gold pairs to their rs-ids for dictionary matching; an rs-id
# listed under several pmids is repeated in this list.
gold_set_rsids = [rs_id for doc_id, rs_id in gold_set]

### Extract candidates:

In [113]:
# Define a candidate space
# n_max=1: presumably restricts candidates to single-token spans -- TODO confirm
ngrams = Ngrams(n_max=1)

# Define matchers
# rs-ids known to the knowledge base
dict_rsid_matcher = DictionaryMatch(d=rs_ids, longest_match_only=False)
# only the rs-ids that appear in the gold set
gold_rsid_matcher = DictionaryMatch(d=gold_set_rsids, longest_match_only=False)
# anything of the form "rs" followed by digits, whether known or not
regx_rsid_matcher = RegexMatchSpan(rgx=r'rs\d+')
# combination of the dictionary and regex matchers
# (presumably accepts a span matched by either -- TODO confirm)
rsid_matcher = Union(dict_rsid_matcher, regx_rsid_matcher)

# collect candidates
%time gold_dict_c = Candidates(ngrams, gold_rsid_matcher, corpus.get_contexts())
%time rsid_c = Candidates(ngrams, rsid_matcher, corpus.get_contexts())

Extracting candidates...
CPU times: user 3.55 s, sys: 76.4 ms, total: 3.63 s
Wall time: 3.61 s
Extracting candidates...
CPU times: user 3.94 s, sys: 78.8 ms, total: 4.02 s
Wall time: 4 s


In [108]:
# Report the candidate count; the last expression displays the first five candidates.
print 'Got %d candidates, e.g.:' % len(rsid_c.get_candidates())
rsid_c.get_candidates()[:5]

Got 994 candidates, e.g.:


[<Ngram("rs1667255", id=21878437-2:704-712, chars=[704,712], words=[38,38]),
 <Ngram("rs7868992", id=22889924-4:983-991, chars=[983,991], words=[30,30]),
 <Ngram("rs10198628", id=22589742-17:2336-2345, chars=[2336,2345], words=[0,0]),
 <Ngram("rs455804", id=22807686-14:2226-2233, chars=[2226,2233], words=[3,3]),
 <Ngram("rs3093077", id=22291609-5:1571-1579, chars=[1571,1579], words=[98,98])]

### Statistics

Statistics on all the rsid candidates:

In [110]:
# Recall/precision of the combined (dictionary + regex) candidates against the gold pairs.
gold_rsid_stats(rsid_c.get_candidates(), gold_set)

# of gold annotations	= 8384
# of candidates		= 792
Candidate recall	= 0.068
Candidate precision	= 0.722


Statistics on the candidates extracted via dictionary matching.

Recall is how many SNPs in the gold set are actually mentioned in the abstract/txt.
Precision is how many recovered SNPs are also found in the gold set.

The gold set consists of (pmid, rsid) tuples, so in some cases a paper mentions an rsid that belongs to another paper, and this new mention is not recorded in the gold set.

In [114]:
# Same statistics for the candidates matched against the gold rs-id dictionary only.
gold_rsid_stats(gold_dict_c.get_candidates(), gold_set)

# of gold annotations	= 8384
# of candidates		= 621
Candidate recall	= 0.068
Candidate precision	= 0.921


Some debugging on the largest rsID candidate set:

In [22]:
# NOTE(review): presumably returns the candidates that are not in the gold
# set (the ones hurting precision) -- confirm against extractor.util.
strange_ngrams = gold_rsid_precision(rsid_c.get_candidates(), gold_set)
# NOTE(review): gold_dict is not used in the visible cells; dict() over
# (pmid, rs-id) pairs also keeps only one rs-id per pmid.
gold_dict = dict(gold_set)
# Print each candidate followed by the tokenized sentence it came from.
for ngram in strange_ngrams[:5]:
    print ngram
    # print corpus.get_doc(ngram.doc_id)
    print ' '.join(ngram.sentence['words'])

<Ngram("rs10795668", id=23300701-7:1058-1067, chars=[1058,1067], words=[4,4])
For the known locus rs10795668 -LRB- 10p14 -RRB- , we found an interacting SNP rs367615 -LRB- 5q21 -RRB- with replication p = 0.01 and combined p = 4.19 × 10 − 8 .
<Ngram("rs2075650", id=21418511-8:1498-1506, chars=[1498,1506], words=[20,20])
In conclusion , the major locus determining familial longevity up to high age as detected by GWAS was marked by rs2075650 , which tags the deleterious effects of the ApoE ε4 allele .
<Ngram("rs11552708", id=22558069-5:1095-1104, chars=[1095,1104], words=[10,10])
Furthermore , three SNPs , rs4985726 , rs3803800 , and rs11552708 in TNFRSF13B and TNFSF13 , were indicated to be associated with serum levels of IgG -LRB- P < 2.3 × 10 − 3 -RRB- and IgM -LRB- P < 0.018 -RRB- , while rs3803800 and rs11552708 were associated with IgA -LRB- P < 0.013 -RRB- .
<Ngram("rs1051730", id=19247474-4:1251-1259, chars=[1251,1259], words=[29,29])
In the chr15q25 .1 region spanning the nicotin

## Get candidate p-values

Need to use regular expressions for this.

In [15]:
from snorkel.matchers import RegexMatchSpan

Test out the regexp.

In [95]:
import re

# Pattern for p-values in scientific notation such as "2.13 x 10 - 29":
#   digits, optional '.', optional digits, optional whitespace,
#   MULTIPLICATION SIGN (U+00D7), optional whitespace, "10", optional whitespace,
#   MINUS SIGN (U+2212), optional whitespace, digits
# Variant that additionally requires a leading "p =":
# rgx = u'p\s*=\s*\d+\.?\d*\s*\xd7\s*10\s*\u2212\s*\d+'
rgx = u'\d+\.?\d*\s*\xd7\s*10\s*\u2212\s*\d+'
# Try the pattern on one tokenized sentence; re.search yields only the first match.
sentence = ' '.join(strange_ngrams[4].sentence['words'])
print sentence
print re.search(rgx, sentence).group()

Novel signals include : for IL-6 , in the ABO gene -LRB- rs657152 , p = 2.13 × 10 − 29 -RRB- ; for ESR , at the HBB -LRB- rs4910472 , p = 2.31 × 10 − 11 -RRB- and UCN119B / SPPL3 -LRB- rs11829037 , p = 8.91 × 10 − 10 -RRB- loci ; for MCP-1 , near its receptor CCR2 -LRB- rs17141006 , p = 7.53 × 10 − 13 -RRB- and in CADM3 -LRB- rs3026968 , p = 7.63 × 10 − 13 -RRB- ; for hsCRP , within the CRP gene -LRB- rs3093077 , p = 5.73 × 10 − 21 -RRB- , near DARC -LRB- rs3845624 , p = 1.43 × 10 − 10 -RRB- , UNC119B / SPPL3 -LRB- rs11829037 , p = 1.50 × 10 − 14 -RRB- , and ICOSLG/AIRE -LRB- rs113459440 , p = 1.54 × 10 − 08 -RRB- loci .
2.13 × 10 − 29


In [101]:
# Match p-values with the regex defined in `rgx`.
pval_matcher = RegexMatchSpan(rgx=rgx)
# NOTE: this rebinds `ngrams` (previously built with n_max=1). Five tokens are
# presumably needed because a tokenized p-value spans five words
# (mantissa, multiplication sign, "10", minus sign, exponent).
ngrams = Ngrams(n_max=5)
%time pval_c = Candidates(ngrams, pval_matcher, corpus.get_contexts())

Extracting candidates...
CPU times: user 13.5 s, sys: 99.6 ms, total: 13.6 s
Wall time: 13.8 s


In [104]:
# Report the candidate count and print the first five p-value candidates.
print 'Got %d candidates, e.g.:' % len(pval_c.get_candidates())
for candidate in pval_c.get_candidates()[:5]:
    # unicode() rather than str(): presumably because the text contains non-ASCII characters
    print unicode(candidate)

Got 1013 candidates, e.g.:
<Ngram("1.2×10 −15", id=22792071-4:978-987, chars=[978,987], words=[50,54])
<Ngram("1.2×10 −11", id=20173748-2:485-494, chars=[485,494], words=[46,50])
<Ngram("1.85×10 −20", id=18846228-5:782-792, chars=[782,792], words=[41,45])
<Ngram("6.2×10 −5", id=24586183-6:1550-1558, chars=[1550,1558], words=[60,64])
<Ngram("1.5×10 −7", id=18464913-6:1097-1105, chars=[1097,1105], words=[60,64])


### Statistics

First, how many p-values that should be present do we extract?

In [122]:
# we need to define a p-value -> float converter

# mantissa, MULTIPLICATION SIGN (U+00D7), "10", MINUS SIGN (U+2212), exponent.
# Regex escapes are doubled so that only \xd7 and \u2212 are interpreted by
# the string literal; the pattern itself is unchanged.
_PVALUE_RGX = re.compile(u'(\\d+\\.?\\d*)\\s*\xd7\\s*10\\s*\u2212\\s*(\\d+)')


def pvalue_to_float(pstr):
    """Convert a p-value written as "<mantissa> x 10 - <exponent>" to a float.

    The multiplication sign must be U+00D7 and the minus sign U+2212, with
    optional whitespace around them. The first such occurrence in ``pstr``
    is used.

    NOTE(review): the sample gold values printed from kb.pvals_by_pmid look
    like -log10(p) (e.g. 7.30) rather than raw p-values -- confirm the scale
    before comparing them with the result of this function.

    Args:
        pstr: text containing the p-value, e.g. a candidate span.

    Returns:
        The p-value as a float, or None if ``pstr`` contains no match.
    """
    match = _PVALUE_RGX.search(pstr)
    if match is None:
        return None

    mantissa, exponent = match.groups()
    # Let float() parse "<mantissa>e-<exponent>" in one step: computing
    # mantissa * 10 ** -exponent rounds twice and can be off in the last digit.
    return float(u'%se-%s' % (mantissa, exponent))

In [123]:
# Spot-check the converter on two of the extracted candidate strings.
print pvalue_to_float(u"6.2×10 −5")
print pvalue_to_float(u"1.85×10 −20")

6.2e-05
1.85e-20


In [127]:
from kb import KnowledgeBase
# Fresh knowledge base handle for the p-value lookups.
kb = KnowledgeBase()
# Gold standard for p-values: every (pmid, value) pair the knowledge base
# lists for a paper in the corpus. doc.id is a string, the lookup takes an int.
gold_set_pvals = frozenset([
    (doc.id, pval)
    for doc in corpus.get_docs()
    for pval in kb.pvals_by_pmid(int(doc.id))
])

In [128]:
# Size of the p-value gold set plus a few sample (pmid, value) entries.
print 'Found %d gold mentions, e.g.:' % len(gold_set_pvals)
print list(gold_set_pvals)[:5]

Found 5289 gold mentions, e.g.:
[('23382691', 7.301029995663981), ('24816252', 20.522878745280337), ('25064009', 15.698970004336019), ('24097068', 16.397940008672037), ('23382691', 5.301029995663981)]


Potential analysis question: how many rsIDs that we extracted earlier also have an associated p-value?

We will say that if an rsID and a p-value occur in the same document, they have been identified correctly.

So we extract from our gold set tuples of the form (pmid, rsID, p-value). We compare them to candidate tuples, where an rsID and a p-value are associated if they occur in the same sentence.

This seems like something to be checked at the relation extraction stage.