# Phenotype/SNP relation extraction from tables

In [1]:
# Reload edited project modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

import sys
import cPickle

# Make the snorkel checkout and the project sources (gwasdb) importable;
# the paths are relative to the notebook's working directory.
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# Directory holding the paper files that the corpus parser reads.
abstract_dir = '../data/db/papers'

# Render figures inline and use a wide default figure size.
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

## Load corpus

In [4]:
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLTableDocParser

# Document parser over the files under `abstract_dir`.
# The doc/text/id arguments look like XPath expressions: presumably `doc`
# selects each document node, `text` its <table> elements and `id` the PMID
# used as the document name -- TODO confirm against UnicodeXMLTableDocParser.
xml_parser = UnicodeXMLTableDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

In [9]:
from snorkel.parser import HTMLParser
from extractor.parser import UnicodeTableParser
from snorkel.parser import CorpusParser
import cPickle

table_parser = UnicodeTableParser()
html_parser = HTMLParser(path='../data/db/papers/')

corpus_name = 'gwas-table-corpus.pkl'

try:
    with open(corpus_name,"r") as pkl:
        corpus = cPickle.load(pkl)
except:
    cp = CorpusParser(xml_parser, table_parser, max_docs=15)
    %time corpus = cp.parse_corpus(name='GWAS Corpus')
    # pickling currently doesn't work...
#     with open(corpus_name,"w") as pkl:
#         corpus = cPickle.dump(corpus, pkl)

CPU times: user 53.9 s, sys: 3.28 s, total: 57.2 s
Wall time: 1min 22s


## Candidate extraction

### RSid Extraction

In [17]:
from snorkel.matchers import DictionaryMatch, RegexMatchSpan, Union
from snorkel.candidates import EntityExtractor
from snorkel.candidates import TableNgrams

from db.kb import KnowledgeBase

# Define a candidate space
ngrams = TableNgrams(n_max=1)

# Get a list of all the RSids we know
kb = KnowledgeBase()
rs_ids = kb.get_rsid_candidates()

# Define matchers
dict_rsid_matcher = DictionaryMatch(d=rs_ids, longest_match_only=False)
regx_rsid_matcher = RegexMatchSpan(rgx=r'rs\d+')
rsid_matcher = Union(dict_rsid_matcher, regx_rsid_matcher)

rsid_extractor = EntityExtractor(ngrams, rsid_matcher)
# %time rs_candidates = rsid_extractor.extract(corpus.get_tables(), name='all')

In [6]:
for cand in rs_candidates[:10]: 
    print cand
print "%s candidates extracted" % len(rs_candidates)
print rs_candidates[0].context
print rs_candidates[0].context.cell

Span("rs2076756", context=None, chars=[0,8], words=[0,0])
Span("rs1992662", context=None, chars=[0,8], words=[0,0])
Span("rs1992660", context=None, chars=[0,8], words=[0,0])
Span("rs1793004", context=None, chars=[0,8], words=[0,0])
Span("rs10521209", context=None, chars=[0,9], words=[0,0])
Span("rs2631372", context=None, chars=[0,8], words=[0,0])
Span("rs2925757", context=None, chars=[0,8], words=[0,0])
Span("rs6947579", context=None, chars=[0,8], words=[0,0])
Span("rs1553575", context=None, chars=[0,8], words=[0,0])
Span("rs10484545", context=None, chars=[0,9], words=[0,0])
1577 candidates extracted
Phrase('17684544', 1, 20, 0, u'rs2076756')
Cell('17684544', 1, 20, u'rs2076756')


#### Statistics

In [7]:
from extractor.util import gold_rsid_stats, gold_rsid_precision

# Gold annotations: one (document name, rsid) pair for every rsid that the
# knowledge base lists under the document's PMID.
gold_set = frozenset(
    (doc.name, rs_id)
    for doc in corpus.documents
    for rs_id in kb.rsids_by_pmid(int(doc.name))
)
# Just the rsid strings of the gold pairs.
gold_set_rsids = [pair[1] for pair in gold_set]

gold_rsid_stats(rs_candidates, gold_set)

# of gold annotations	= 178
# of candidates		= 1371
Candidate recall	= 0.978
Candidate precision	= 0.127


Interesting: some SNPs seem never to be mentioned at all (e.g. rs12122100), while others (e.g. rs727153) appear only in the text, not in a table.
Occasionally a SNP is missed for some other reason that is not yet understood: see rs13314993.

In [8]:
# Show the text of every cell that aligned_cells('row') returns for the
# table cell containing the first rsid candidate.
# NOTE: the name `cells` is rebound to a CellSpace() in the phenotype
# extraction cell further down, so do not rely on it across cells.
cells = rs_candidates[0].context.cell.aligned_cells('row')
[cell.text for cell in cells]

[u'1',
 u'16',
 u'49,314,382',
 u'CARD15, intron',
 u'0.26',
 u'0.43',
 u'1.93E-13',
 u'2.04E-12',
 u'2.1 (1.58-2.80)',
 u'0.27',
 u'0.41',
 u'6.80E-20',
 u'1.39E-21',
 u'1.71 (1.42-2.05)',
 u'5.90E-08']

### Phenotypes

In [10]:
from snorkel.matchers import DictionaryMatch, Union, CellNameMatcher, CellDictNameMatcher
from snorkel.candidates import EntityExtractor
from snorkel.candidates import TableNgrams, CellSpace

# Define a candidate space
# (`ngrams` is only referenced by the commented-out matchers below; the active
# extractor works on whole cells via `cells`.)
ngrams = TableNgrams(n_max=9)
cells = CellSpace()

# Create a list of possible words that could denote phenotypes
phen_words = ['trait', 'phenotype']

# Define matchers
# Earlier matcher experiments, kept for reference:
# dict_row_matcher = DictionaryMatch(d=phen_words, longest_match_only=False, stemmer='porter')
# cell_row_matcher = CellNameMatcher(row_matcher=dict_row_matcher, cand_space=ngrams)
# dict_col_matcher = DictionaryMatch(d=phen_words, longest_match_only=False, stemmer='porter')
# cell_col_matcher = CellNameMatcher(col_matcher=dict_col_matcher, cand_space=ngrams)
# phen_matcher = Union(cell_row_matcher, cell_col_matcher)
# phen_matcher = CellNameMatcher(col_matcher=dict_col_matcher, cand_space=ngrams)
# Active matcher: presumably keeps cells whose column header contains one of
# `phen_words` (case-insensitive) -- TODO confirm against CellDictNameMatcher.
phen_matcher = CellDictNameMatcher(axis='col', d=phen_words, n_max=3, ignore_case=True)

# Extract phenotype candidates from every table of the corpus (timed).
phen_extractor = EntityExtractor(cells, phen_matcher)
%time phen_candidates = phen_extractor.extract(corpus.get_tables(), name='all')

CPU times: user 20.9 s, sys: 254 ms, total: 21.2 s
Wall time: 21.1 s


In [15]:
print "%s candidates extracted" % len(phen_candidates)
for cand in phen_candidates[0:10]: 
    print cand.context.document.name, cand.context.table, cand.context.cell
    print unicode(cand)
#     print [span for span in cand.row_ngrams()]
#     print [span for span in cand.col_ngrams()]
#     print
print
print phen_candidates[0].context
print phen_candidates[0].context.document.name, phen_candidates[0].context.table
print phen_candidates[0].context.cell

1851 candidates extracted
17903292 Table('17903292', 0) Cell('17903292', 0, 10, u'Serum Creatinine')
Span("Serum Creatinine", context=None, chars=[0,15], words=[0,1])
17903292 Table('17903292', 0) Cell('17903292', 0, 15, u'Change in serum creatinine')
Span("Change in serum creatinine", context=None, chars=[0,25], words=[0,3])
17903292 Table('17903292', 0) Cell('17903292', 0, 20, u'Glomerular Filtration Rate (GFR)')
Span("Glomerular Filtration Rate (GFR)", context=None, chars=[0,35], words=[0,5])
17903292 Table('17903292', 0) Cell('17903292', 0, 25, u'Chronic Kidney Disease')
Span("Chronic Kidney Disease", context=None, chars=[0,21], words=[0,2])
17903292 Table('17903292', 0) Cell('17903292', 0, 30, u'Cystatin C')
Span("Cystatin C", context=None, chars=[0,9], words=[0,1])
17903292 Table('17903292', 0) Cell('17903292', 0, 35, u'Uric acid')
Span("Uric acid", context=None, chars=[0,8], words=[0,1])
17903292 Table('17903292', 0) Cell('17903292', 0, 40, u'Urinary Albumin Excretion')
Span("Ur

### Relations

In [21]:
from snorkel.candidates import AlignedTableRelationExtractor
# Relation extractor built from the rsid and phenotype entity extractors;
# axis='row' and induced=True are handed straight to the extractor.
relation_extractor = AlignedTableRelationExtractor(rsid_extractor, phen_extractor, axis='row', induced=True)
tables = corpus.get_tables()

# create smaller subsets for evaluation/debugging
# (table index 8, PMID '17903293' and table index 2 are hard-coded; each
# lookup raises IndexError if the parsed corpus does not contain them)
easy_tables = [tables[8]]
# hard_tables = [t for t in tables if t.document.name=='17658951']
hard_doc = [d for d in corpus.documents if d.name == '17903293'][0]
hard_tables = [hard_doc.tables[2]]

In [None]:
# Run relation extraction over every table (timed), then preview the first
# 40 extracted relations.
%time candidates = relation_extractor.extract(tables, name='all')
print "%s relations extracted, e.g." % len(candidates)
for cand in candidates[:40]: 
    print cand

In [None]:
fun_c = [candidate for candidate in candidates if candidate.context.document.name != '17658951']
for c in fun_c[:10]:
    print fun_c

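Here, we remove nested candidates, i.e. candidates whose second span is strictly contained in the second span of another candidate from the same context.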

In [None]:
# load existing candidates into a dict
span_dict = { str(span_pair.span1.context) : list() for span_pair in candidates }
for span_pair in candidates:
    span = span_pair.span1
    span_dict[str(span.context)].append( (span.char_start, span.char_end) )

def nested(ivl1, ivl2):
    """Return True if interval `ivl1` lies inside a different interval `ivl2`.

    Both arguments are (start, end) pairs. Identical intervals are not
    considered nested, so a span never counts as nested in itself.
    """
    is_distinct = ivl1 != ivl2
    is_contained = ivl2[0] <= ivl1[0] <= ivl1[1] <= ivl2[1]
    return is_distinct and is_contained

new_candidates = list()
for span_pair in candidates:
    span = span_pair.span1
    span_ivl = span.char_start, span.char_end
    span_name = str(span.context)
    if all([not nested(span_ivl, other_ivl) for other_ivl in span_dict[span_name]]):
        new_candidates.append(span_pair)
        
print len(candidates) - len(new_candidates), 'candidates dropped, now we have', len(new_candidates)
# phen_c = new_phen_c

#### Debugging

In [20]:
from snorkel.models import Span
# Hand-picked document / table / cell used to debug cell alignment.
fun_tables = [t for t in tables if t.document.name=='17658951']
fun_doc = [d for d in corpus.documents if d.name == '17903293'][0]
fun_table = fun_doc.tables[2]

# Other cells that were inspected:
# fun_cell = fun_table.cells[10]
# fun_cell = fun_table.cells[17]
fun_cell = fun_table.cells[24]
fun_phrase = fun_cell.phrases[0]
# NOTE(review): the recorded output shows this cell's text is 'rs2831617'
# (9 characters), and the spans printed earlier use an inclusive end offset
# (chars=[0,8] for 9 characters), so char_end=10 looks out of range -- confirm.
fun_span = Span(char_start=0, char_end=10, context=fun_phrase)

# Cell text, its row/column position, and the table's position in the document.
print fun_cell.text, fun_cell.row_num, fun_cell.col_num, fun_table.position

rs2831617 5 1 2


In [14]:
# Cells that aligned_cells reports as row-aligned with the debug cell, called
# with the same induced=True flag that the relation extractor is built with.
aligned_cells = fun_cell.aligned_cells('row', induced=True)

In [15]:
# Print the debug cell, then each aligned cell with its row/column numbers,
# to check the alignment by eye; the last line prints the raw list.
print fun_cell.text, fun_cell.row_num, fun_cell.col_num
for cell in aligned_cells:
    print cell, cell.row_num, cell.col_num
print aligned_cells

rs2831617 5 1
Cell('17903293', 2, 9, u'Interleukin-6, C-reactive protein and Fibrinogen') 3 0
Cell('17903293', 2, 25, u'21') 5 2
Cell('17903293', 2, 26, u'28481515') 5 3
Cell('17903293', 2, 27, u'6.2*10 -4') 5 4
Cell('17903293', 2, 28, u'0.0027') 5 5
Cell('17903293', 2, 22, u'IL2RA , RBM17') 4 6
[Cell('17903293', 2, 9, u'Interleukin-6, C-reactive protein and Fibrinogen'), Cell('17903293', 2, 25, u'21'), Cell('17903293', 2, 26, u'28481515'), Cell('17903293', 2, 27, u'6.2*10 -4'), Cell('17903293', 2, 28, u'0.0027'), Cell('17903293', 2, 22, u'IL2RA , RBM17')]


## Learning the correctness of relations

### Creating a gold set

To create a gold set, we save all extracted relations to a tab-separated (TSV) file. We annotate that file manually and save the result to a second file, which contains pairs of phenotype and rsid strings; if the second file exists, we take its contents as ground truth.

In [28]:
# store relations to annotate
with open('candidates.unnanotated.tsv', 'w') as f:
    for span_pair in new_candidates:
        doc_id = span_pair.span0.context.document.name
        table_id = span_pair.span0.context.table.position
        row_num = span_pair.span0.context.document.name
        str1 = span_pair.span0.get_span()
        str2 = span_pair.span1.get_span()
        try:
            f.write('%s\t%s\t%s\%s\t%s\n' % (doc_id, table_id, row_num, str1, str2))
        except:
            continue