# Phenotype/SNP relation extraction from tables

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle

# import snorkel and gwasdb
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# set up paths
abstract_dir = '../data/db/papers'

# set up matplotlib
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

## Load corpus

In [2]:
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLTableDocParser

# Settings for the document parser. The selector strings look like
# XPath expressions (documents, their tables, and the PMID used as the
# document id) -- confirm exact semantics against UnicodeXMLTableDocParser.
xml_parser_options = dict(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True
)

xml_parser = UnicodeXMLTableDocParser(**xml_parser_options)

In [3]:
from snorkel.parser import HTMLParser
from extractor.parser import UnicodeTableParser
from snorkel.parser import CorpusParser
import cPickle

table_parser = UnicodeTableParser()
html_parser = HTMLParser(path='../data/db/papers/')

corpus_name = 'gwas-table-corpus.pkl'

try:
    with open(corpus_name,"r") as pkl:
        corpus = cPickle.load(pkl)
except:
    cp = CorpusParser(xml_parser, table_parser, max_docs=15)
    %time corpus = cp.parse_corpus(name='GWAS Corpus')
    # pickling currently doesn't work...
#     with open(corpus_name,"w") as pkl:
#         corpus = cPickle.dump(corpus, pkl)

CPU times: user 52.5 s, sys: 2.84 s, total: 55.4 s
Wall time: 1min 21s


## Candidate extraction

### RSid Extraction

In [4]:
from snorkel.matchers import DictionaryMatch, RegexMatchSpan, Union
from snorkel.candidates import EntityExtractor
from snorkel.candidates import TableNgrams

from db.kb import KnowledgeBase

# Define a candidate space
ngrams = TableNgrams(n_max=1)

# Get a list of all the RSids we know
kb = KnowledgeBase()
rs_ids = kb.get_rsid_candidates()

# Define matchers
dict_rsid_matcher = DictionaryMatch(d=rs_ids, longest_match_only=False)
regx_rsid_matcher = RegexMatchSpan(rgx=r'rs\d+')
rsid_matcher = Union(dict_rsid_matcher, regx_rsid_matcher)

rsid_extractor = EntityExtractor(ngrams, rsid_matcher)
# %time rs_candidates = rsid_extractor.extract(corpus.get_tables(), name='all')

In [6]:
for cand in rs_candidates[:10]: 
    print cand
print "%s candidates extracted" % len(rs_candidates)
print rs_candidates[0].context
print rs_candidates[0].context.cell

Span("rs2076756", context=None, chars=[0,8], words=[0,0])
Span("rs1992662", context=None, chars=[0,8], words=[0,0])
Span("rs1992660", context=None, chars=[0,8], words=[0,0])
Span("rs1793004", context=None, chars=[0,8], words=[0,0])
Span("rs10521209", context=None, chars=[0,9], words=[0,0])
Span("rs2631372", context=None, chars=[0,8], words=[0,0])
Span("rs2925757", context=None, chars=[0,8], words=[0,0])
Span("rs6947579", context=None, chars=[0,8], words=[0,0])
Span("rs1553575", context=None, chars=[0,8], words=[0,0])
Span("rs10484545", context=None, chars=[0,9], words=[0,0])
1577 candidates extracted
Phrase('17684544', 1, 20, 0, u'rs2076756')
Cell('17684544', 1, 20, u'rs2076756')


#### Statistics

In [7]:
from extractor.util import gold_rsid_stats, gold_rsid_precision

# Build the gold standard: one (document name, RSid) pair for every RSid
# the knowledge base lists for a document. The document name is converted
# with int() for the lookup (presumably it is the PMID -- confirm).
gold_pairs = []
for doc in corpus.documents:
    for rs_id in kb.rsids_by_pmid(int(doc.name)):
        gold_pairs.append((doc.name, rs_id))
gold_set = frozenset(gold_pairs)

# RSids only, without the document they came from.
gold_set_rsids = [rs_id for doc_id, rs_id in gold_set]

gold_rsid_stats(rs_candidates, gold_set)

# of gold annotations	= 178
# of candidates		= 1371
Candidate recall	= 0.978
Candidate precision	= 0.127


Interesting: some SNPs never seem to be mentioned (e.g. rs12122100), while others (e.g. rs727153) appear only in the text.
Sometimes a SNP is not picked up for a different, less obvious reason: see rs13314993.

In [8]:
# Cell containing the first RSid candidate.
first_rsid_cell = rs_candidates[0].context.cell

# Texts of the cells that aligned_cells('row') returns for it.
cells = first_rsid_cell.aligned_cells('row')
[cell.text for cell in cells]

[u'1',
 u'16',
 u'49,314,382',
 u'CARD15, intron',
 u'0.26',
 u'0.43',
 u'1.93E-13',
 u'2.04E-12',
 u'2.1 (1.58-2.80)',
 u'0.27',
 u'0.41',
 u'6.80E-20',
 u'1.39E-21',
 u'1.71 (1.42-2.05)',
 u'5.90E-08']

### Phenotypes

In [5]:
from snorkel.matchers import DictionaryMatch, Union, CellNameMatcher, CellDictNameMatcher
from snorkel.candidates import EntityExtractor
from snorkel.candidates import TableNgrams

# Define a candidate space
ngrams = TableNgrams(n_max=3)

# Create a list of possible words that could denote phenotypes
phen_words = ['trait', 'phenotype']

# Define matchers
# dict_row_matcher = DictionaryMatch(d=phen_words, longest_match_only=False, stemmer='porter')
# cell_row_matcher = CellNameMatcher(row_matcher=dict_row_matcher, cand_space=ngrams)
# dict_col_matcher = DictionaryMatch(d=phen_words, longest_match_only=False, stemmer='porter')
# cell_col_matcher = CellNameMatcher(col_matcher=dict_col_matcher, cand_space=ngrams)
# phen_matcher = Union(cell_row_matcher, cell_col_matcher)
# phen_matcher = CellNameMatcher(col_matcher=dict_col_matcher, cand_space=ngrams)
phen_matcher = CellDictNameMatcher(axis='col', d=phen_words, n_max=3, ignore_case=True)

phen_extractor = EntityExtractor(ngrams, phen_matcher)
# %time phen_candidates = phen_extractor.extract(corpus.get_tables(), name='all')

In [11]:
print "%s candidates extracted" % len(phen_candidates)
for cand in phen_candidates[0:10]: 
    print cand.context.document.name
    print unicode(cand)
#     print [span for span in cand.row_ngrams()]
#     print [span for span in cand.col_ngrams()]
#     print
print phen_candidates[0].context
print phen_candidates[0].context.document.name
print phen_candidates[0].context.cell

9457 candidates extracted
17903292
Span("Serum Creatinine", context=None, chars=[0,15], words=[0,1])
17903292
Span("Serum", context=None, chars=[0,4], words=[0,0])
17903292
Span("Creatinine", context=None, chars=[6,15], words=[1,1])
17903292
Span("Change in serum", context=None, chars=[0,14], words=[0,2])
17903292
Span("in serum creatinine", context=None, chars=[7,25], words=[1,3])
17903292
Span("Change in", context=None, chars=[0,8], words=[0,1])
17903292
Span("in serum", context=None, chars=[7,14], words=[1,2])
17903292
Span("serum creatinine", context=None, chars=[10,25], words=[2,3])
17903292
Span("Change", context=None, chars=[0,5], words=[0,0])
17903292
Span("in", context=None, chars=[7,8], words=[1,1])
Phrase('17903292', 0, 10, 0, u'Serum Creatinine')
17903292
Cell('17903292', 0, 10, u'Serum Creatinine')


### Relations

In [6]:
from snorkel.candidates import AlignedTableRelationExtractor

# Combine the RSid and phenotype extractors into a relation extractor
# configured for the 'row' axis with induced=True.
relation_extractor = AlignedTableRelationExtractor(
    rsid_extractor,
    phen_extractor,
    axis='row',
    induced=True
)

In [16]:
tables = corpus.get_tables()
# mini_tables = [tables[8]]
mini_tables = [fun_table]
%time candidates = relation_extractor.extract(mini_tables, name='all')
print "%s relations extracted, e.g." % len(candidates)
for cand in candidates[:40]: 
    print cand

CPU times: user 13.8 s, sys: 128 ms, total: 13.9 s
Wall time: 13.9 s
573 relations extracted, e.g.
SpanPair(Span("rs10511884", context=None, chars=[0,9], words=[0,0]), Span("Interleukin-6, C-reactive", context=None, chars=[0,24], words=[0,2]))
SpanPair(Span("rs10511884", context=None, chars=[0,9], words=[0,0]), Span(", C-reactive protein", context=None, chars=[13,32], words=[1,3]))
SpanPair(Span("rs10511884", context=None, chars=[0,9], words=[0,0]), Span("C-reactive protein and", context=None, chars=[15,36], words=[2,4]))
SpanPair(Span("rs10511884", context=None, chars=[0,9], words=[0,0]), Span("protein and Fibrinogen", context=None, chars=[26,47], words=[3,5]))
SpanPair(Span("rs10511884", context=None, chars=[0,9], words=[0,0]), Span("Interleukin-6,", context=None, chars=[0,13], words=[0,1]))
SpanPair(Span("rs10511884", context=None, chars=[0,9], words=[0,0]), Span(", C-reactive", context=None, chars=[13,24], words=[1,2]))
SpanPair(Span("rs10511884", context=None, chars=[0,9], words=[

In [7]:
tables = corpus.get_tables()
print tables[8]

Table('17903292', 1)


In [8]:
# Collect every table that belongs to document '17658951'.
fun_tables = []
for t in tables:
    if t.document.name=='17658951':
        fun_tables.append(t)

In [9]:
# Pick document '17903293' (IndexError if it is not in the corpus) and
# take the table at index 2 for the experiments that follow.
matching_docs = [d for d in corpus.documents if d.name == '17903293']
fun_doc = matching_docs[0]
fun_table = fun_doc.tables[2]

In [10]:
from snorkel.models import Span
# fun_cell = fun_table.cells[10]
# fun_cell = fun_table.cells[17]
fun_cell = fun_table.cells[24]
fun_phrase = fun_cell.phrases[0]
fun_span = Span(char_start=0, char_end=10, context=fun_phrase)

print fun_cell.text, fun_cell.row_num, fun_cell.col_num

rs2831617 5 1


In [12]:
head_cell=fun_span.context.cell.head_cell(axis='row', induced=True)
print head_cell.row_num

3


In [13]:
# Attribute names for the axis we align along (rows) and the other axis.
other_axis = 'col'
other_axis_name = 'col_num'
axis_name = 'row_num'

# Every other cell of the table sharing fun_cell's value on `axis_name`.
fun_cell_axis_value = getattr(fun_cell, axis_name)
cells = []
for cell in fun_cell.table.cells:
    if getattr(cell, axis_name) == fun_cell_axis_value and cell != fun_cell:
        cells.append(cell)
def induced_or_real(c):
    """Return `c` itself when it has non-blank text, else a substitute.

    A cell whose text is empty or whitespace-only is replaced by whatever
    `c.first_aligned_nonempty_cell(other_axis)` returns, where
    `other_axis` is the notebook-level variable of that name.
    """
    if not c.text or c.text.isspace():
        return c.first_aligned_nonempty_cell(other_axis)
    return c
# Resolve each same-row cell exactly once (blank cells are swapped for the
# result of induced_or_real), then drop the ones that resolved to None.
resolved_cells = (induced_or_real(cell) for cell in cells)
aligned_cells = [resolved for resolved in resolved_cells
                 if resolved is not None]

In [14]:
# Same lookup through the Cell API: cells aligned with fun_cell along the
# 'row' axis with induced=True. This rebinds `aligned_cells`.
aligned_cells = fun_cell.aligned_cells('row', induced=True)

In [15]:
# Print the reference cell (text, row, column), then each aligned cell
# with its row and column, and finally the whole list at once.
print fun_cell.text, fun_cell.row_num, fun_cell.col_num
for cell in aligned_cells:
    print cell, cell.row_num, cell.col_num
print aligned_cells

rs2831617 5 1
Cell('17903293', 2, 9, u'Interleukin-6, C-reactive protein and Fibrinogen') 3 0
Cell('17903293', 2, 25, u'21') 5 2
Cell('17903293', 2, 26, u'28481515') 5 3
Cell('17903293', 2, 27, u'6.2*10 -4') 5 4
Cell('17903293', 2, 28, u'0.0027') 5 5
Cell('17903293', 2, 22, u'IL2RA , RBM17') 4 6
[Cell('17903293', 2, 9, u'Interleukin-6, C-reactive protein and Fibrinogen'), Cell('17903293', 2, 25, u'21'), Cell('17903293', 2, 26, u'28481515'), Cell('17903293', 2, 27, u'6.2*10 -4'), Cell('17903293', 2, 28, u'0.0027'), Cell('17903293', 2, 22, u'IL2RA , RBM17')]
