# Phenotype/SNP relation extraction from tables

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle

# import snorkel and gwasdb
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# set up paths
abstract_dir = '../data/db/papers'

# set up matplotlib
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

## Load corpus

In [2]:
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLTableDocParser

# Document parser reading from `abstract_dir`.
# NOTE(review): `doc`, `text` and `id` look like XPath expressions -- presumably
# `doc` selects each document element, `text` the <table> elements to parse and
# `id` the PMID used as the document name; confirm against the parser class.
xml_parser = UnicodeXMLTableDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

In [3]:
from snorkel.parser import HTMLParser
from extractor.parser import UnicodeTableParser
from snorkel.parser import CorpusParser
import cPickle

table_parser = UnicodeTableParser()
html_parser = HTMLParser(path='../data/db/papers/')

corpus_name = 'gwas-table-corpus.pkl'

try:
    with open(corpus_name,"r") as pkl:
        corpus = cPickle.load(pkl)
except:
    cp = CorpusParser(xml_parser, table_parser, max_docs=15)
    %time corpus = cp.parse_corpus(name='GWAS Corpus')
    # pickling currently doesn't work...
#     with open(corpus_name,"w") as pkl:
#         corpus = cPickle.dump(corpus, pkl)

CPU times: user 52.3 s, sys: 2.8 s, total: 55.1 s
Wall time: 1min 35s


## Candidate extraction

### RSid Extraction

In [4]:
from snorkel.matchers import DictionaryMatch, RegexMatchSpan, Union
from snorkel.candidates import EntityExtractor
from snorkel.candidates import TableNgrams

from db.kb import KnowledgeBase

# Define a candidate space
ngrams = TableNgrams(n_max=1)

# Get a list of all the RSids we know
kb = KnowledgeBase()
rs_ids = kb.get_rsid_candidates()

# Define matchers
dict_rsid_matcher = DictionaryMatch(d=rs_ids, longest_match_only=False)
regx_rsid_matcher = RegexMatchSpan(rgx=r'rs\d+')
rsid_matcher = Union(dict_rsid_matcher, regx_rsid_matcher)

rsid_extractor = EntityExtractor(ngrams, rsid_matcher)
%time rs_candidates = rsid_extractor.extract(corpus.get_tables(), name='all')

CPU times: user 1.07 s, sys: 7.75 ms, total: 1.08 s
Wall time: 1.08 s


In [5]:
# Preview the first ten RSid candidates, followed by the total count
for cand in rs_candidates[:10]: 
    print cand
print "%s candidates extracted" % len(rs_candidates)
# Show the context object of the first candidate and the cell it belongs to
print rs_candidates[0].context
print rs_candidates[0].context.cell

Span("rs2076756", context=None, chars=[0,8], words=[0,0])
Span("rs1992662", context=None, chars=[0,8], words=[0,0])
Span("rs1992660", context=None, chars=[0,8], words=[0,0])
Span("rs1793004", context=None, chars=[0,8], words=[0,0])
Span("rs10521209", context=None, chars=[0,9], words=[0,0])
Span("rs2631372", context=None, chars=[0,8], words=[0,0])
Span("rs2925757", context=None, chars=[0,8], words=[0,0])
Span("rs6947579", context=None, chars=[0,8], words=[0,0])
Span("rs1553575", context=None, chars=[0,8], words=[0,0])
Span("rs10484545", context=None, chars=[0,9], words=[0,0])
1577 candidates extracted
Phrase('17684544', 1, 20, 0, u'rs2076756')
Cell('17684544', 1, 20, u'rs2076756')


#### Statistics

In [6]:
from extractor.util import gold_rsid_stats, gold_rsid_precision

# Gold (document name, RSid) pairs: for every corpus document, the RSids the
# knowledge base returns for that document's name interpreted as a PMID.
gold_set = frozenset([
    (doc.name, rs_id)
    for doc in corpus.documents
    for rs_id in kb.rsids_by_pmid(int(doc.name))
])
# The RSid half of each gold pair
gold_set_rsids = [rs_id for doc_id, rs_id in gold_set]

# Compare the extracted candidates against the gold pairs
gold_rsid_stats(rs_candidates, gold_set)

# of gold annotations	= 178
# of candidates		= 1371
Candidate recall	= 0.978
Candidate precision	= 0.127


Interesting: some SNPs never seem to be mentioned at all (e.g. rs12122100), while others (e.g. rs727153) appear only in the text.
Occasionally a SNP is not picked up for a different, still unexplained reason: see rs13314993.

In [8]:
# Text of the cells returned by row_cells() for the first RSid candidate
# (presumably the cells sharing its table row -- confirm against the Span API)
cells = rs_candidates[0].row_cells()
[cell.text for cell in cells]

[u'1',
 u'rs2076756',
 u'16',
 u'49,314,382',
 u'CARD15, intron',
 u'0.26',
 u'0.43',
 u'1.93E-13',
 u'2.04E-12',
 u'2.1 (1.58-2.80)',
 u'0.27',
 u'0.41',
 u'6.80E-20',
 u'1.39E-21',
 u'1.71 (1.42-2.05)',
 u'5.90E-08']

### Phenotypes

In [12]:
from snorkel.matchers import DictionaryMatch, RegexMatchSpan, Union, CellNameMatcher
from snorkel.candidates import EntityExtractor
from snorkel.candidates import TableNgrams

# Define a candidate space
ngrams = TableNgrams(n_max=1)

# Create a list of possible words that could denote phenotypes
phen_words = ['trait', 'phenotype']

# Define matchers
# dict_row_matcher = DictionaryMatch(d=phen_words, longest_match_only=False, stemmer='porter')
# cell_row_matcher = CellNameMatcher(row_matcher=dict_row_matcher, cand_space=ngrams)
dict_col_matcher = DictionaryMatch(d=phen_words, longest_match_only=False, stemmer='porter')
# cell_col_matcher = CellNameMatcher(col_matcher=dict_col_matcher, cand_space=ngrams)
# phen_matcher = Union(cell_row_matcher, cell_col_matcher)
phen_matcher = CellNameMatcher(col_matcher=dict_col_matcher, cand_space=ngrams)

phen_extractor = EntityExtractor(ngrams, phen_matcher)
%time phen_candidates = phen_extractor.extract(corpus.get_tables(), name='all')

CPU times: user 2min 39s, sys: 1.62 s, total: 2min 40s
Wall time: 2min 42s


In [13]:
# Total number of phenotype candidates
print "%s candidates extracted" % len(phen_candidates)
# Sample of candidates 40..79: name of the source document, then the span itself
for cand in phen_candidates[40:80]: 
    print cand.context.document.name
    print unicode(cand)
    # optional debugging output: the candidate's row / column n-grams
#     print [span for span in cand.row_ngrams()]
#     print [span for span in cand.col_ngrams()]
    print
# Context, source document name and cell of the first candidate
print phen_candidates[0].context
print phen_candidates[0].context.document.name
print phen_candidates[0].context.cell

5337 candidates extracted
17903292
Span("TSH", context=None, chars=[29,31], words=[4,4])

17903292
Span(")", context=None, chars=[32,36], words=[5,5])

17903292
Span("Mean", context=None, chars=[0,3], words=[0,0])

17903292
Span("of", context=None, chars=[5,6], words=[1,1])

17903292
Span("TSH", context=None, chars=[8,10], words=[2,2])

17903292
Span("exam", context=None, chars=[12,15], words=[3,3])

17903292
Span("3", context=None, chars=[17,17], words=[4,4])

17903292
Span("&", context=None, chars=[19,19], words=[5,5])

17903292
Span("4", context=None, chars=[21,21], words=[6,6])

17903292
Span("Luteinizing", context=None, chars=[0,10], words=[0,0])

17903292
Span("hormone", context=None, chars=[12,18], words=[1,1])

17903292
Span("(LH) ", context=None, chars=[20,24], words=[2,4])

17903292
Span("LH", context=None, chars=[21,22], words=[3,3])

17903292
Span(") **", context=None, chars=[23,27], words=[4,5])

17903292
Span("**", context=None, chars=[25,26], words=[5,5])

17903292
Span(

### Relations

In [14]:
from snorkel.candidates import RelationExtractor
# Relation extractor built from the RSid extractor and the phenotype extractor
relation_extractor = RelationExtractor(rsid_extractor, phen_extractor)

In [None]:
# Run the relation extractor over every table in the corpus (timed),
# then print the number of relations and the first ten as examples
%time candidates = relation_extractor.extract(corpus.get_tables(), name='all')
print "%s relations extracted, e.g." % len(candidates)
for cand in candidates[:10]: 
    print cand