# Phenotype acronym extraction

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle

# Make the local snorkel checkout and the gwasdb sources (../src) importable.
# These are relative paths, so they are resolved against the kernel's
# working directory.
sys.path.append('../snorkel')
sys.path.append('../src')
sys.path.append('../src/crawler')

# Directory passed as `path` to the XML document parsers in later cells.
abstract_dir = '../data/db/papers'

# Render figures inline and set the default figure size (width, height).
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

## Load corpus

In [2]:
from snorkel.parser import XMLDocParser
from extractor.parser import UnicodeXMLTableDocParser

# Document parser for the table corpus. The string arguments are XPath
# expressions; how each one is consumed is defined by
# UnicodeXMLTableDocParser (not shown in this notebook).
xml_parser = UnicodeXMLTableDocParser(
    path=abstract_dir,
    doc='./*',                                       # children of the root element
    text='.//table',                                 # every <table> descendant
    id='.//article-id[@pub-id-type="pmid"]/text()',  # text of the article-id tagged as PMID
    keep_xml_tree=True)

In [3]:
from snorkel.parser import HTMLParser
from extractor.parser import UnicodeTableParser
from snorkel.parser import CorpusParser
import cPickle

table_parser = UnicodeTableParser()
html_parser = HTMLParser(path='../data/db/papers/')

corpus_name = 'gwas-table-corpus.pkl'

try:
    with open(corpus_name,"r") as pkl:
        corpus = cPickle.load(pkl)
except:
    cp = CorpusParser(xml_parser, table_parser, max_docs=15)
    %time corpus = cp.parse_corpus(name='GWAS Corpus')
    # pickling currently doesn't work...
#     with open(corpus_name,"w") as pkl:
#         corpus = cPickle.dump(corpus, pkl)

CPU times: user 52 s, sys: 2.78 s, total: 54.7 s
Wall time: 1min 25s


## Candidate extraction

### From tables

In [4]:
from snorkel.matchers import DictionaryMatch, Union, CellNameMatcher, CellDictNameMatcher
from snorkel.candidates import EntityExtractor
from snorkel.candidates import TableNgrams, CellSpace

# Define a candidate space
cells = CellSpace()

# Keyword lists handed to the matchers below: one for acronym columns and
# one for phenotype columns.
acro_words = ['abbreviation', 'acronym']
phen_words = ['trait', 'phenotype']

# Define matchers. Both use axis='col' with the keyword lists above and
# ignore case; presumably a cell matches when its column name contains one
# of the keywords -- confirm against CellDictNameMatcher.
phen_matcher = CellDictNameMatcher(axis='col', d=phen_words, n_max=3, ignore_case=True)
acro_matcher = CellDictNameMatcher(axis='col', d=acro_words, n_max=3, ignore_case=True)

# One entity extractor per matcher, both drawing from the same cell space.
phen_extractor = EntityExtractor(cells, phen_matcher)
acro_extractor = EntityExtractor(cells, acro_matcher)

In [5]:
from snorkel.candidates import AlignedTableRelationExtractor
# Combine the phenotype and acronym entity extractors into a relation
# extractor (axis='row'; presumably pairs candidates that share a table row --
# confirm against AlignedTableRelationExtractor).
relation_extractor = AlignedTableRelationExtractor(phen_extractor, acro_extractor, axis='row', induced=False)
tables = corpus.get_tables()

# create smaller subsets for evaluation/debugging
# NOTE(review): the table index 8, the document name '17903293' and the table
# index 2 are hard-coded; with a different corpus these lookups can raise
# IndexError.
easy_tables = [tables[8]]
# hard_tables = [t for t in tables if t.document.name=='17658951']
hard_doc = [d for d in corpus.documents if d.name == '17903293'][0]
hard_tables = [hard_doc.tables[2]]

In [52]:
# %time candidates = relation_extractor.extract(tables, name='all')
print "%s relations extracted, e.g." % len(candidates)
for cand in candidates[:100]: 
    print cand

228 relations extracted, e.g.
SpanPair(Span("CD40 Ligand, serum & plasma", context=None, chars=[0,26], words=[0,5]), Span("CD40L", context=None, chars=[0,4], words=[0,0]))
SpanPair(Span("2", context=None, chars=[0,0], words=[0,0]), Span("CD40L", context=None, chars=[0,4], words=[0,0]))
SpanPair(Span("C-reactive protein", context=None, chars=[0,17], words=[0,1]), Span("CRP", context=None, chars=[0,2], words=[0,0]))
SpanPair(Span("5", context=None, chars=[0,0], words=[0,0]), Span("CRP", context=None, chars=[0,2], words=[0,0]))
SpanPair(Span("Intercellular adhesion molecule-1", context=None, chars=[0,32], words=[0,2]), Span("ICAM1", context=None, chars=[0,4], words=[0,0]))
SpanPair(Span("1", context=None, chars=[0,0], words=[0,0]), Span("ICAM1", context=None, chars=[0,4], words=[0,0]))
SpanPair(Span("Interleukin-6", context=None, chars=[0,12], words=[0,0]), Span("IL6", context=None, chars=[0,2], words=[0,0]))
SpanPair(Span("1", context=None, chars=[0,0], words=[0,0]), Span("IL6", context=

In [12]:
# Same listing as above, prefixed with the table that the context of each
# candidate's first span refers to.
for cand in candidates[:100]: 
    print cand.span0.context.table, cand

Table('17903293', 0) SpanPair(Span("CD40 Ligand, serum & plasma", context=None, chars=[0,26], words=[0,5]), Span("CD40L", context=None, chars=[0,4], words=[0,0]))
Table('17903293', 0) SpanPair(Span("2", context=None, chars=[0,0], words=[0,0]), Span("CD40L", context=None, chars=[0,4], words=[0,0]))
Table('17903293', 0) SpanPair(Span("C-reactive protein", context=None, chars=[0,17], words=[0,1]), Span("CRP", context=None, chars=[0,2], words=[0,0]))
Table('17903293', 0) SpanPair(Span("5", context=None, chars=[0,0], words=[0,0]), Span("CRP", context=None, chars=[0,2], words=[0,0]))
Table('17903293', 0) SpanPair(Span("Intercellular adhesion molecule-1", context=None, chars=[0,32], words=[0,2]), Span("ICAM1", context=None, chars=[0,4], words=[0,0]))
Table('17903293', 0) SpanPair(Span("1", context=None, chars=[0,0], words=[0,0]), Span("ICAM1", context=None, chars=[0,4], words=[0,0]))
Table('17903293', 0) SpanPair(Span("Interleukin-6", context=None, chars=[0,12], words=[0,0]), Span("IL6", cont

### From text

In [37]:
from snorkel.parser import SentenceParser, CorpusParser
from extractor.parser import UnicodeXMLDocParser, GWASXMLDocParser

# Document parser for the text corpus; the string arguments are XPath
# expressions.
# NOTE(review): this rebinds `xml_parser` and `cp`, which the table-corpus
# cell above also reads; re-running that cell after this one would build the
# table corpus with this parser instead.
xml_parser = GWASXMLDocParser(
    path=abstract_dir,
    doc='./*',
    title='.//front//article-title//text()',         # text inside the article title
    abstract='.//abstract//p//text()',               # text of the abstract paragraphs
    par1='.//body/p[1]//text()',                     # text of the first <p> child of <body>
    id='.//article-id[@pub-id-type="pmid"]/text()',  # text of the article-id tagged as PMID
    keep_xml_tree=True)

sent_parser = SentenceParser()
cp = CorpusParser(xml_parser, sent_parser, max_docs=15)
%time text_corpus = cp.parse_corpus(name='GWAS Text Corpus')

CPU times: user 551 ms, sys: 587 ms, total: 1.14 s
Wall time: 7.25 s


In [32]:
from snorkel.candidates import Ngrams
from snorkel.matchers import RegexMatchSpan
from snorkel.candidates import EntityExtractor, RelationExtractor

# Define a candidate space
ngrams6 = Ngrams(n_max=6)
ngrams3 = Ngrams(n_max=3)

# Define matchers
# Both patterns look for "(" followed by up to five ASCII letters and then
# ")" or ";". The phenotype pattern additionally requires some text and a
# space in front of the parenthesis.
# NOTE(review): `{0,5}` also admits zero letters, i.e. "()" -- confirm that
# this is intended.
# NOTE(review): these rebind `phen_matcher` / `acro_matcher` from the table
# section; the table extractors keep the objects they were built with.
phen_matcher = RegexMatchSpan(rgx=r'.+ \([a-zA-Z]{0,5}[\);]')
acro_matcher = RegexMatchSpan(rgx=r'\([a-zA-Z]{0,5}[\);]')

# Extractors
phen_txt_extractor = EntityExtractor(ngrams6, phen_matcher)
acro_txt_extractor = EntityExtractor(ngrams3, acro_matcher)

# Filtering function
def overlap_filter_fn(span0, span1):
    """Tell whether span1 lies inside span0, by character offsets.

    Returns True when
    span0.char_start <= span1.char_start <= span1.char_end <= span0.char_end,
    and False otherwise.
    """
    outer_lo, outer_hi = span0.char_start, span0.char_end
    inner_lo, inner_hi = span1.char_start, span1.char_end
    if outer_lo <= inner_lo <= inner_hi <= outer_hi:
        return True
    return False

# Relation extractor: combines the phenotype and acronym text extractors and
# passes overlap_filter_fn as the pair filter.
# NOTE(review): despite the `txt_tab_` prefix, this same extractor is applied
# to both the table-corpus sentences and the abstract sentences below.
txt_tab_extractor = RelationExtractor(phen_txt_extractor, acro_txt_extractor, filter_fn=overlap_filter_fn)

Extract acronyms from tables

In [55]:
# %time txt_tab_c = acro_txt_extractor.extract(corpus.get_sentences(), name='all')
# Apply the text relation extractor to the sentences of the table corpus and
# report how many candidates it produced.
%time txt_tab_c = txt_tab_extractor.extract(corpus.get_sentences(), name='all')
print len(txt_tab_c), 'candidates extracted from text in tables'

CPU times: user 1.16 s, sys: 77.4 ms, total: 1.23 s
Wall time: 1.2 s
325 candidates extracted from text in tables


Extract acronyms from abstracts

In [56]:
# Apply the same text relation extractor to the sentences of the text corpus
# and report how many candidates it produced.
%time txt_txt_c = txt_tab_extractor.extract(text_corpus.get_sentences(), name='all')
print len(txt_txt_c), 'candidates extracted from text in abstracts'

CPU times: user 938 ms, sys: 49.6 ms, total: 988 ms
Wall time: 976 ms
169 candidates extracted from text in abstracts


In [39]:
# List up to 100 candidates from the table-corpus sentences, each prefixed
# with the name of the document its first span belongs to.
for cand in txt_tab_c[:100]:
    print cand.span0.context.document.name, cand

17684544 SpanPair(Span("sub-entity of inflammatory bowel disease (IBD)", context=None, chars=[22,67], words=[7,14]), Span("(IBD)", context=None, chars=[63,67], words=[12,14]))
17684544 SpanPair(Span("of inflammatory bowel disease (IBD)", context=None, chars=[33,67], words=[8,14]), Span("(IBD)", context=None, chars=[63,67], words=[12,14]))
17684544 SpanPair(Span("inflammatory bowel disease (IBD)", context=None, chars=[36,67], words=[9,14]), Span("(IBD)", context=None, chars=[63,67], words=[12,14]))
17684544 SpanPair(Span("bowel disease (IBD)", context=None, chars=[49,67], words=[10,14]), Span("(IBD)", context=None, chars=[63,67], words=[12,14]))
17684544 SpanPair(Span("disease (IBD)", context=None, chars=[55,67], words=[11,14]), Span("(IBD)", context=None, chars=[63,67], words=[12,14]))
17903292 SpanPair(Span(") and urinary albumin excretion (UAE)", context=None, chars=[31,67], words=[5,12]), Span("(UAE)", context=None, chars=[63,67], words=[10,12]))
17903292 SpanPair(Span("and urinary 

## Learning the correctness of relations

In [71]:
from snorkel.candidates import UnionExtractor

# Contexts and extractors are listed in matching order: tables, sentences of
# the table corpus, sentences of the text corpus.
contexts = [corpus.get_tables(), corpus.get_sentences(), text_corpus.get_sentences()]
extractors = [relation_extractor, txt_tab_extractor, txt_tab_extractor]
joint_extractor = UnionExtractor(extractor_list=extractors, context_list=contexts)
# Combined candidate set used for the gold-set export below.
all_c = joint_extractor.union()
print len(all_c)

552


### Creating a gold set

To create a gold set, we save all extracted relations to a tab-separated file. We annotate that file manually and save the result to a second file, which contains pairs of phenotype and acronym strings together with a label; if that second file exists, we take its labels as ground truth.

In [72]:
# store relations to annotate
with open('acronyms.unannotated.tsv', 'w') as f:
    for span_pair in all_c:
        doc_id = span_pair.span0.context.document.name
        str1 = span_pair.span0.get_span()
        str2 = span_pair.span1.get_span()
        try:
            f.write('%s\t%s\t%s\n' % (doc_id, str1, str2))
        except:
            continue

We now load the results of our annotation

In [77]:
import os


def parse_annotations(text):
    """Parse annotated relations into a {(doc_id, str1, str2): label} dict.

    Each non-blank line of `text` must hold four tab-separated fields:
    doc_id, first string, second string, and the annotation. The label is 1
    when the annotation field is the string '1' and -1 otherwise. Lines may
    be separated by '\\r', '\\n' or '\\r\\n'; blank lines are ignored.

    Raises ValueError if a non-blank line does not have exactly four fields.
    """
    parsed = dict()
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        doc_id, str1, str2, res = line.split('\t')
        # The field is read as text, so compare against '1', not the integer
        # 1 (an int comparison is never true and labels every pair -1).
        parsed[(doc_id, str1, str2)] = 1 if res == '1' else -1
    return parsed


annotations = dict()
annotations_path = 'acronyms.anotated.txt'
# The annotated file is optional: without it there is simply no gold set.
if os.path.exists(annotations_path):
    with open(annotations_path) as f:
        annotations = parse_annotations(f.read())
else:
    print('%s not found; no gold annotations loaded' % annotations_path)