# Phenotype acronym extraction

This module looks at both tables and text to identify acronyms used to refer to phenotypes. These acronyms are then used to further expand SNP/phenotype relations.

## Preparations

We start by configuring Jupyter and setting up our environment.

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import cPickle
import numpy as np
import sqlalchemy

# set the paths to snorkel and gwasdb
sys.path.append('../snorkel-tables')
sys.path.append('../src')
sys.path.append('../src/crawler')

# set up the directory with the input papers
abstract_dir = '../data/db/papers'

# set up matplotlib
import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)

# create a Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

## Load corpus

We load our usual corpus of GWAS papers.

### Tables corpus

Some of the acronyms are found in tables. We parse these as in the other table-based modules.

In [2]:
from snorkel.parser import XMLMultiDocParser

xml_parser = XMLMultiDocParser(
    path=abstract_dir,
    doc='./*',
    text='.//table',
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

In [3]:
from snorkel.parser import CorpusParser, OmniParser
from snorkel.models import Corpus

# parses tables into rows, cols, cells...
table_parser = OmniParser(timeout=1000000)

try:
    table_corpus = session.query(Corpus).filter(Corpus.name == 'GWAS Table Corpus').one()
except:
    cp = CorpusParser(xml_parser, table_parser)
    %time table_corpus = cp.parse_corpus(name='GWAS Table Corpus', session=session)
    session.add(table_corpus)
    session.commit()

print 'Loaded corpus of %d documents' % len(table_corpus)

Loaded corpus of 589 documents


### Text corpus

We also seek mentions of acronyms in the paper text.

The following parser extracts sentences from each paper abstract, title, and the first 5 paragraphs.

In [4]:
from snorkel.parser import SentenceParser
from snorkel.parser import CorpusParser
from snorkel.models import Corpus

from extractor.parser import UnicodeXMLDocParser, GWASXMLDocParser

xml_parser = GWASXMLDocParser(
    path=abstract_dir,
    doc='./*',
    title='.//front//article-title//text()',
    abstract='.//abstract//p//text()',
    n_par=5,
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

sent_parser = SentenceParser()

try:
    text_corpus = session.query(Corpus).filter(Corpus.name == 'GWAS Text Corpus').one()
except:
    cp = CorpusParser(xml_parser, sent_parser)
    %time text_corpus = cp.parse_corpus(name='GWAS Text Corpus', session=session)
    session.add(text_corpus)
    session.commit()

print 'Loaded corpus of %d documents' % len(text_corpus)



CPU times: user 50.7 s, sys: 2.16 s, total: 52.8 s
Wall time: 9min 37s
Loaded corpus of 589 documents


## Candidate extraction

Next, we generate candidates from both tables and text.

### From phenotype / acronym tables

Many papers have tables with an acronym column and a phenotype column. In this section, we extract candidates from these tables.

We define matchers for cells whose header contains a word that is indicative of a phenotype or an acronym.

In [5]:
# create a Snorkel class for the relation we will extract
# (a two-argument candidate: acronym first, phenotype second)
from snorkel.models import candidate_subclass
AcroPhenRel = candidate_subclass('AcroPhenRel', ['acro','phen'])

# Define a candidate space
from snorkel.candidates import TableCells
cells = TableCells()

# Header words that mark a column as holding acronyms or phenotypes.
# Note that 'phenotype' appears in both lists.
acro_words = ['abbreviation', 'acronym', 'phenotype']
phen_words = ['trait', 'phenotype', 'description']

# Define matchers (column-wise, case-insensitive, using the word lists above)
from snorkel.matchers import CellNameDictionaryMatcher
phen_matcher = CellNameDictionaryMatcher(axis='col', d=phen_words, n_max=3, ignore_case=True)
acro_matcher = CellNameDictionaryMatcher(axis='col', d=acro_words, n_max=3, ignore_case=True)

# we will be looking only at aligned cells
from snorkel.throttlers import AlignmentThrottler
row_align_filter = AlignmentThrottler(axis='row', infer=True)

# create the candidate extractor; argument order (acro, phen) matches AcroPhenRel
from snorkel.candidates import CandidateExtractor
ce1 = CandidateExtractor(AcroPhenRel, [cells, cells], [acro_matcher, phen_matcher], throttler=row_align_filter)

# collect the tables that will be searched for candidates
tables = [table for doc in table_corpus.documents for table in doc.tables]

We are now ready to perform relation extraction.

In [None]:
from snorkel.models import CandidateSet

try:
    tab_rels = session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Tables Set').one()
except:
    %time tab_rels = ce1.extract(tables, 'AcroPhenRel Tables Set', session)
    
print "%s relations extracted, e.g." % len(tab_rels)
for cand in tab_rels[:10]:
    print cand



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))




In [13]:
# NOTE(review): acro_extractor, phen_extractor and corpus are not defined in
# any cell visible in this notebook; this cell looks like a leftover from an
# earlier extractor API - confirm before running.
from snorkel.candidates import AlignedTableRelationExtractor
relation_extractor = AlignedTableRelationExtractor(acro_extractor, phen_extractor, axis='row', induced=True)
tables = corpus.get_tables()

In [14]:
# Run the aligned-table relation extractor over `tables` and show the first
# five candidates. `table_c` is an alias for the same `candidates` object.
%time candidates = relation_extractor.extract(tables, name='all')
table_c = candidates
print "%s relations extracted, e.g." % len(table_c)
for cand in table_c[:5]: 
    print cand

CPU times: user 29min 20s, sys: 18.9 s, total: 29min 39s
Wall time: 30min 8s
267 relations extracted, e.g.
SpanPair(Span("Candidate gene", context=None, chars=[0,13], words=[0,1]), Span("PHENOTYPE", context=None, chars=[0,8], words=[0,0]))
SpanPair(Span("PHENOTYPE", context=None, chars=[0,8], words=[0,0]), Span("Candidate gene", context=None, chars=[0,13], words=[0,1]))
SpanPair(Span("CST3", context=None, chars=[0,3], words=[0,0]), Span("CysC", context=None, chars=[0,3], words=[0,0]))
SpanPair(Span("CysC", context=None, chars=[0,3], words=[0,0]), Span("CST3", context=None, chars=[0,3], words=[0,0]))
SpanPair(Span("CST3", context=None, chars=[0,3], words=[0,0]), Span("CysC", context=None, chars=[0,3], words=[0,0]))


### From table phrases

Another way of defining acronyms is in text, e.g. as in "Body Mass Index (BMI)". We are now going to extract such candidates from phrases that are found in paper tables.

In [None]:
# Define a candidate space: n-grams of up to 3 words (acronyms) and up to 8 words (phenotypes)
from snorkel.candidates import OmniNgrams
ngrams3 = OmniNgrams(n_max=3)
ngrams8 = OmniNgrams(n_max=8)

# Define matchers (these rebind phen_matcher / acro_matcher from the table-header cell).
# acro: a parenthesised token of 1-10 letters, digits, '_' or '-', closed by ')' or ';'
# phen: any text, a space, then such a parenthesised token
from snorkel.matchers import RegexMatchSpan
phen_matcher = RegexMatchSpan(rgx=r'.+ \([a-zA-Z0-9_-]{1,10}[\);]')
acro_matcher = RegexMatchSpan(rgx=r'\([a-zA-Z0-9_-]{1,10}[\);]')

# We only look at phenotype and acronym matches that overlap
from snorkel.throttlers import OverlapThrottler, WordLengthThrottler, CombinedThrottler
overlap_filter = OverlapThrottler()
# length_filter = WordLengthThrottler(op='max', idx=1, lim=15)
# ovl_len_filter = CombinedThrottler([overlap_filter, length_filter])

# create the candidate extractor; argument order (acro, phen) matches AcroPhenRel
from snorkel.candidates import CandidateExtractor
ce2 = CandidateExtractor(AcroPhenRel, [ngrams3, ngrams8], [acro_matcher, phen_matcher], throttler=overlap_filter)

Let us now extract these candidates.

In [None]:
from snorkel.models import CandidateSet

try:
    txt_tab_rels = session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Tables Set 2').one()
except:
    %time txt_tab_rels = ce2.extract(table_corpus.documents, 'AcroPhenRel Tables Set 2', session)
    
print "%s relations extracted, e.g." % len(txt_tab_rels)
for cand in txt_tab_rels[:10]:
    print cand

### From text

Finally, we repeat the same extraction process for candidates that are found in text.

In [None]:
# Define a candidate space (rebinds ngrams3 / ngrams8 to sentence n-grams)
from snorkel.candidates import Ngrams
ngrams3 = Ngrams(n_max=3)
ngrams8 = Ngrams(n_max=8)

# Define matchers
# acro: a parenthesised token of 1-10 letters, digits, '_' or '-', closed by ')' or ';'
# phen: any text, a space, then such a parenthesised token
from snorkel.matchers import RegexMatchSpan
phen_matcher = RegexMatchSpan(rgx=r'.+ \([a-zA-Z0-9_-]{1,10}[\);]')
acro_matcher = RegexMatchSpan(rgx=r'\([a-zA-Z0-9_-]{1,10}[\);]')

# We only look at phenotype and acronym matches that overlap
from snorkel.throttlers import OverlapThrottler, WordLengthThrottler, CombinedThrottler
overlap_filter = OverlapThrottler()
# length_filter = WordLengthThrottler(op='max', idx=1, lim=15)
# ovl_len_filter = CombinedThrottler([overlap_filter, length_filter])

# create the candidate extractor used for paper text (sentences)
from snorkel.candidates import CandidateExtractor
ce3 = CandidateExtractor(AcroPhenRel, [ngrams3, ngrams8], [acro_matcher, phen_matcher], throttler=overlap_filter)

We extract the candidates.

In [None]:
from snorkel.models import CandidateSet

try:
    txt_txt_rels = session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Text Set').one()
except:
    sentences = [s for doc in corpus for s in doc.sentences]
    %time txt_txt_rels = ce2.extract(sentences, 'AcroPhenRel Text Set', session)
    
print "%s relations extracted, e.g." % len(txt_txt_rels)
for cand in txt_txt_rels[:10]:
    print cand

### Combining the results

Finally, we merge all the candidates into a single set.

In [None]:
from snorkel.models import CandidateSet

try:
    rels = session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Canidates').one()
except:
    rels = CandidateSet(name='AcroPhenRel Canidates')
    for c in rels1: rels.append(c)
    for c in rels2: rels.append(c)
    for c in rels3: rels.append(c)

    session.add(rels)
    session.commit()

print '%d candidates in total' % len(rels)

In [6]:
# NOTE(review): this cell re-parses at most 15 documents (max_docs=15) without
# passing a session, and rebinds xml_parser, sent_parser, cp and text_corpus,
# replacing the corpus loaded in the "Text corpus" section - confirm it is
# still wanted.
from snorkel.parser import SentenceParser, CorpusParser
from extractor.parser import UnicodeXMLDocParser, GWASXMLDocParser

xml_parser = GWASXMLDocParser(
    path=abstract_dir,
    doc='./*',
    title='.//front//article-title//text()',
    abstract='.//abstract//p//text()',
    n_par=5,
    id='.//article-id[@pub-id-type="pmid"]/text()',
    keep_xml_tree=True)

sent_parser = SentenceParser()
cp = CorpusParser(xml_parser, sent_parser, max_docs=15)
%time text_corpus = cp.parse_corpus(name='GWAS Text Corpus')

CPU times: user 839 ms, sys: 59 ms, total: 898 ms
Wall time: 10.7 s


In [7]:
# NOTE(review): this cell uses EntityExtractor / RelationExtractor, while the
# cells above use CandidateExtractor; it appears to belong to an earlier
# version of the notebook - confirm.
from snorkel.candidates import Ngrams, CellSpace, TableNgrams
from snorkel.matchers import RegexMatchSpan
from snorkel.candidates import EntityExtractor, RelationExtractor, UnionExtractor

# Define a candidate space
ngrams8 = Ngrams(n_max=8)
ngrams3 = Ngrams(n_max=3)
cells = CellSpace()
table_ngrams3 = TableNgrams(n_max=3)

# Define matchers
# acro: a parenthesised token of 1-10 letters, digits, '_' or '-', closed by ')' or ';'
# phen: any text, a space, then such a parenthesised token
phen_matcher = RegexMatchSpan(rgx=r'.+ \([a-zA-Z0-9_-]{1,10}[\);]')
acro_matcher = RegexMatchSpan(rgx=r'\([a-zA-Z0-9_-]{1,10}[\);]')

# Extractors: phenotypes from 8-grams or whole cells, acronyms from 3-grams
phen_txt_ngram_extractor = EntityExtractor(ngrams8, phen_matcher)
phen_txt_cells_extractor = EntityExtractor(cells, phen_matcher)
acro_txt_ngram_extractor = EntityExtractor(ngrams3, acro_matcher)
acro_txt_cells_extractor = EntityExtractor(table_ngrams3, acro_matcher)

# Filtering functions
def overlap_filter_fn(span0, span1):
    """Keep a span pair only when span0 lies inside span1.

    The pair is rejected when both contexts expose a ``cell`` attribute and
    the two cells differ, or when span1's text has 15 or more
    whitespace-separated words. Otherwise the result is whether span0's
    character range is contained in span1's.
    """
    both_in_cells = hasattr(span0.context, 'cell') and hasattr(span1.context, 'cell')
    if both_in_cells and span0.context.cell != span1.context.cell:
        return False

    n_words = len(span1.get_span().split())
    if n_words >= 15:
        return False

    inner_start, inner_end = span0.char_start, span0.char_end
    outer_start, outer_end = span1.char_start, span1.char_end
    return outer_start <= inner_start <= inner_end <= outer_end

# Relation extractor
txt_tab_ngram_extractor = RelationExtractor(acro_txt_ngram_extractor, phen_txt_ngram_extractor, filter_fn=overlap_filter_fn)
txt_tab_cells_extractor = RelationExtractor(acro_txt_cells_extractor, phen_txt_cells_extractor, filter_fn=overlap_filter_fn)

Extract acronyms from tables

In [8]:
%time txt_tab_c = txt_tab_ngram_extractor.extract(corpus.get_phrases(), name='all')
print len(txt_tab_c), 'candidates extracted from text in tables'

CPU times: user 3.4 s, sys: 262 ms, total: 3.67 s
Wall time: 3.76 s
1305 candidates extracted from text in tables


Extract acronyms from full table cells

In [9]:
%time txt_cel_c = txt_tab_cells_extractor.extract(corpus.get_tables(), name='all')
print len(txt_cel_c), 'candidates extracted from text in full table cells'

CPU times: user 5.37 s, sys: 119 ms, total: 5.49 s
Wall time: 5.42 s
761 candidates extracted from text in full table cells


Extract acronyms from abstracts

In [10]:
%time txt_txt_c = txt_tab_ngram_extractor.extract(text_corpus.get_sentences(), name='all')
print len(txt_txt_c), 'candidates extracted from text in abstracts'

CPU times: user 2.05 s, sys: 31.2 ms, total: 2.09 s
Wall time: 2.09 s
884 candidates extracted from text in abstracts


## Creating a gold set

It will be helpful to have a list of gold labels against which to evaluate the accuracy of our system.

We are going to load here a list of candidates that we have previously labeled by hand.

In [15]:
# Gold labels keyed by (pmid, acronym, phenotype); values are +1 / -1.
# Each line of the file holds: pmid, phenotype, acronym, label (tab-separated).
annotations = dict()
with open('acronyms.anotated.txt') as f:
    # splitlines() copes with \r, \n and \r\n line endings alike, whereas
    # splitting on '\r' only works for files that use bare carriage returns.
    for line in f.read().splitlines():
        line = line.strip()
        if not line:
            # a blank (e.g. trailing) line has no fields to unpack
            continue
        doc_id, phen, acro, res = line.split('\t')
        res = 1 if int(res) == 1 else -1
        annotations[(doc_id, acro, phen)] = res

The format of this file is: pmid, phenotype, acronym, label. We originally generated it from 100 random candidates.

## Learning the correctness of relations extracted from tables

Next, we are going to use a machine learning classifier to identify correct acronyms among our set of candidates.

First, we are going to train a classifier for candidates that have been extracted from tables (that had a phenotype and an acronym column).

### Creating training and test sets

We first split the data into an (unlabeled) training set (since we will use unsupervised risk estimation to train a classifier on it) and a dev/test set.

In [None]:
try:
    tab_train_c = session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Table Training Candidates').one()
    tab_devtest_c = session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Table Dev/Test Candidates').one()
except:
    # delete any previous sets with that name
    session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Table Training Candidates').delete()
    session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Table Dev/Test Candidates').delete()

    # helpers/config
    frac_test = 0.5
    def r2id(r):
        doc_id = r[0].parent.document.name
        str1, str2 = r[0].get_span(), r[1].get_span()
        return (doc_id, str1, str2)

    # initialize the new sets
    tab_train_c = CandidateSet(name='AcroPhenRel Table Training Candidates')
    tab_devtest_c = CandidateSet(name='AcroPhenRel Table Dev/Test Candidates')

    # choose a random subset for the labeled set
    n_test = len(tab_rels) * frac_test
    test_idx = set(np.random.choice(len(tab_rels), size=(n_test,), replace=False))

    # add to the sets
    for i, c in enumerate(tab_rels):
        if i in test_idx:
            tab_devtest_c.append(c)
        elif r2id(c) in annotations:
            tab_devtest_c.append(c)
        else:
            tab_train_c.append(c)

    # save the results
    session.add(tab_train_c)
    session.add(tab_devtest_c)
    session.commit()

print 'Initialized %d training and %d dev/testing candidates' % (len(tab_train_c), len(tab_devtest_c))
print "Positive labels in dev/test set: %s" % len([c for c in tab_devtest_c if annotations.get(r2id(c),0)==1])
print "Negative labels in dev/test set: %s" % len([c for c in tab_devtest_c if annotations.get(r2id(c),0)==-1])

In [17]:
def spair2uid(span_pair):
    """Return the ``(document name, span0 text, span1 text)`` key of a span pair.

    The document is read through ``span0.context``.
    """
    first, second = span_pair.span0, span_pair.span1
    return (first.context.document.name, first.get_span(), second.get_span())

# Split into train and test set
# NOTE(review): `candidates` is the list produced by the aligned-table
# extractor cell (In [14]), not one of the CandidateSets built above - confirm.
training_candidates = []
gold_candidates     = []
gold_labels         = []
# relies on Python 2 integer division to yield an int usable as a slice index
n_half = len(candidates)/2
# first half: annotated candidates become gold, the rest go to training
for c in candidates[:n_half]:
    uid = spair2uid(c)
    if uid in annotations:
        gold_candidates.append(c)
        gold_labels.append(annotations[uid])
    else:
        training_candidates.append(c)
# second half: everything goes to training, annotated or not
training_candidates.extend(candidates[n_half:])
gold_labels = np.array(gold_labels)
print "Training set size: %s" % len(training_candidates)
print "Gold set size: %s" % len(gold_candidates)
print "Positive labels in training set: %s" % len([c for c in training_candidates if annotations.get(spair2uid(c),0)==1])
print "Negative labels in training set: %s" % len([c for c in training_candidates if annotations.get(spair2uid(c),0)==-1])
print "Positive labels in gold set: %s" % len([c for c in gold_candidates if annotations[spair2uid(c)]==1])
print "Negative labels in gold set: %s" % len([c for c in gold_candidates if annotations[spair2uid(c)]==-1])

Training set size: 210
Gold set size: 57
Positive labels in training set: 0
Negative labels in training set: 0
Positive labels in gold set: 36
Negative labels in gold set: 21


### Labeling functions

Following the data programming approach, we define a set of labeling functions. We will learn their accuracy via unsupervised learning and use them for classifying candidates.

In [18]:
def LF1_digits(m):
    """Vote -1 when more than half of the second span's characters are digits, +1 otherwise."""
    txt = m[1].get_span()
    n_digits = sum(1 for ch in txt if ch.isdigit())
    if n_digits / float(len(txt)) > 0.5:
        return -1
    return +1
def LF1_short(m):
    """Vote -1 when the second span's text is shorter than 5 characters; abstain (0) otherwise."""
    if len(m[1].get_span()) < 5:
        return -1
    return 0

LF_tables = [LF1_digits, LF1_short]

We compute the LFs on our training set.

In [None]:
from snorkel.annotations import LabelManager
label_manager = LabelManager()

# Load the LF labels stored under this name for the table training candidates;
# when the lookup raises NoResultFound, create them by applying LF_tables.
try:
    %time L_tab_train = label_manager.load(session, tab_train_c, 'AcroPhenRel Table Training LF Labels')
except sqlalchemy.orm.exc.NoResultFound:
    %time L_tab_train = label_manager.create(session, tab_train_c, 'AcroPhenRel Table Training LF Labels', f=LF_tables)

And we learn their accuracy.

In [None]:
from snorkel.learning import NaiveBayes

# Fit the table model on the training LF labels only (no gold labels are passed in)
tab_model = NaiveBayes()
tab_model.train(L_tab_train, n_iter=10000, rate=1e-2)

### Evaluating the model's accuracy

We evaluate our accuracy on the dev/test set.

In [None]:
from snorkel.annotations import LabelManager
label_manager = LabelManager()

try:
    %time L_tab_devtest = label_manager.load(session, tab_devtest_c, 'AcroPhenRel Table Dev/Test LF Labels')
except sqlalchemy.orm.exc.NoResultFound:
    %time L_tab_devtest = label_manager.create(session, tab_devtest_c, 'AcroPhenRel Table Dev/Test LF Labels', f=LF_tables)

L_tab_test_gold = np.array([annotations.get(r2uid[r],0) for r in tab_devtest_c])
tab_model.score(L_tab_devtest, L_tab_test_gold, tab_devtest_c)

In [23]:
learner.test_wmv(test_candidates, test_labels)

Applying LFs...
Featurizing...
Test set size:	28
----------------------------------------
Precision:	1.0
Recall:		1.0
F1 Score:	1.0
----------------------------------------
TP: 14 | FP: 0 | TN: 14 | FN: 0


### Candidate classification

Finally, we classify the entire set of candidates. We start by applying the labelling functions.

In [None]:
from snorkel.annotations import LabelManager
label_manager = LabelManager()

# Load the LF labels stored for the full set of table candidates, or create
# them with LF_tables when none are stored under this name.
try:
    %time L_tab_all = label_manager.load(session, tab_rels, 'AcroPhenRel Table LF Labels')
except sqlalchemy.orm.exc.NoResultFound:
    %time L_tab_all = label_manager.create(session, tab_rels, 'AcroPhenRel Table LF Labels', f=LF_tables)

We use the model to predict which ones are correct.

In [None]:
scores = tab_model.odds(L_tab_all)
tab_acronyms = [spair2uid(c) for (c, s) in zip(tab_rels, scores) if s > 0]

print 'Identified %d acronyms predicted to be correct, e.g.' % len(tab_acronyms)
print tab_acronyms[:10]

## Learning the correctness of relations extracted from text

Next, we repeat our classification procedure on relations that have been extracted from phrases.

We start by creating the set of all phrase relations.

In [None]:
from snorkel.models import CandidateSet

try:
    txt_rels = session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Text Canidates').one()
except:
    txt_rels = CandidateSet(name='AcroPhenRel Text Canidates')
    for c in txt_tab_rels: txt_rels.append(c)
    for c in txt_txt_rels: txt_rels.append(c)

    session.add(txt_rels)
    session.commit(txt_rels)

print 'Collected %d candidates from phrases' % len(txt_rels)

### Creating training and test sets

We first split the data into an (unlabeled) training set (since we will use unsupervised risk estimation to train a classifier on it) and a dev/test set.

In [None]:
try:
    txt_train_c = session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Phrase Training Candidates').one()
    txt_devtest_c = session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Phrase Dev/Test Candidates').one()
except:
    # delete any previous sets with that name
    session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Phrase Training Candidates').delete()
    session.query(CandidateSet).filter(CandidateSet.name == 'AcroPhenRel Phrase Dev/Test Candidates').delete()

    # helpers/config
    frac_test = 0.5
    def r2id(r):
        doc_id = r[0].parent.document.name
        str1, str2 = r[0].get_span(), r[1].get_span()
        return (doc_id, str1, str2)

    # initialize the new sets
    txt_train_c = CandidateSet(name='AcroPhenRel Phrase Training Candidates')
    txt_devtest_c = CandidateSet(name='AcroPhenRel Phrase Dev/Test Candidates')

    # choose a random subset for the labeled set
    n_test = len(txt_rels) * frac_test
    test_idx = set(np.random.choice(len(txt_rels), size=(n_test,), replace=False))

    # add to the sets
    for i, c in enumerate(txt_rels):
        if i in test_idx:
            txt_devtest_c.append(c)
        elif r2id(c) in annotations:
            txt_devtest_c.append(c)
        else:
            txt_train_c.append(c)

    # save the results
    session.add(txt_train_c)
    session.add(txt_devtest_c)
    session.commit()

print 'Initialized %d training and %d dev/testing candidates' % (len(txt_train_c), len(txt_devtest_c))
print "Positive labels in dev/test set: %s" % len([c for c in txt_devtest_c if annotations.get(r2id(c),0)==1])
print "Negative labels in dev/test set: %s" % len([c for c in txt_devtest_c if annotations.get(r2id(c),0)==-1])

### Labelling functions

Following the data programming approach, we define a set of labeling functions. We will learn their accuracy via unsupervised learning and use them for classifying candidates.

In [32]:
import re
from bs4 import BeautifulSoup as soup
from snorkel.lf_helpers import get_left_tokens

# helper fn
def r2id(r):
    """Return the ``(document name, first span text, second span text)`` key of a candidate.

    The document is read through the first span's ``parent``.
    """
    first, second = r[0], r[1]
    return (first.parent.document.name, first.get_span(), second.get_span())

# positive LFs
def LF_acro_matches(m):
    """Vote +1 when the acronym equals the initials of the phenotype's words.

    The comparison ignores case and is only made when the acronym has as many
    characters as the phenotype has whitespace-separated words; otherwise the
    function abstains (0).
    """
    _, acro, phen = r2id(m)
    tokens = phen.strip().split()
    if len(acro) != len(tokens):
        return 0
    initials = ''.join(tok[0] for tok in tokens)
    return +1 if initials.lower() == acro.lower() else 0

def LF_acro_matches_with_dashes(m):
    """Vote +1 when the acronym equals the initials of the phenotype's words,
    treating both spaces and dashes as word separators.

    The comparison ignores case. Abstains (0) when the number of words differs
    from the acronym length or when the initials do not match.
    """
    _, acro, phen = r2id(m)
    # re.split yields empty strings for leading, trailing or adjacent
    # separators (e.g. "a - b"); drop them so that w[0] cannot raise
    # IndexError and the word count reflects real words only.
    words = [w for w in re.split(' |-', phen) if w]
    if words and len(acro) == len(words):
        w_acro = ''.join([w[0] for w in words])
        if w_acro.lower() == acro.lower():
            return +1
    return 0

def LF_acro_first_letter(m):
    """Vote +1 when the acronym starts with the first letter of the phenotype.

    Abstains (0) when the phenotype contains no lowercase letter, when the
    acronym is empty or has more characters than the phenotype has words, or
    when the first letters differ (case-insensitive).
    """
    _, acro, phen = r2id(m)
    if not any(l.islower() for l in phen): return 0
    words = phen.strip().split()
    if not acro or not words: return 0
    if len(acro) <= len(words):
        # Compare first letter with first letter. Comparing the whole first
        # word against a single character could only ever match one-letter words.
        if words[0][0].lower() == acro[0].lower():
            return +1
    return 0

def LF_acro_prefix(m):
    """Vote +1 when the acronym and the phenotype (dashes removed) share their
    first two characters, ignoring case; abstain (0) otherwise."""
    _, acro, phen = r2id(m)
    phen_head = phen.replace('-', '')[:2].lower()
    acro_head = acro[:2].lower()
    return +1 if phen_head == acro_head else 0

def LF_acro_matches_last_letters(m):
    """Vote +1 when the acronym, minus its first one or two characters, equals
    the initials of the phenotype's words.

    Abstains (0) when the initials of the word to the left plus the phenotype
    words already spell the whole acronym, and when the shortened acronym has
    fewer than 3 characters or does not match. Comparisons ignore case.
    """
    # NOTE(review): left_text is not imported in this cell (only
    # get_left_tokens is) - confirm where it comes from.
    _, acro, phen = r2id(m)
    tokens = phen.strip().split()
    context_tokens = left_text(m[1], window=1).split() + tokens
    if ''.join(t[0] for t in context_tokens).lower() == acro.lower():
        return 0

    initials = ''.join(t[0] for t in tokens).lower()
    for n_dropped in (1, 2):
        tail = acro[n_dropped:]
        if len(tail) >= 3 and len(tail) == len(tokens) and initials == tail.lower():
            return +1
    return 0

def LF_full_cell(m):
    """If only phrase in cell is A B C (XYZ), then it's correct.

    Votes +1 when the second span's text equals the text of the table cell it
    sits in, either raw or with markup stripped. Abstains (0) when the span's
    parent has no ``cell`` attribute or when the texts differ.
    """
    if not hasattr(m[1].parent, 'cell'): return 0
    cell = m[1].parent.cell
    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser is installed, so results can differ between machines.
    # 'lxml' is the parser the warning printed in this notebook recommends.
    txt_cell = soup(cell.text, 'lxml').text if cell.text is not None else ''
    txt_span = m[1].get_span()
    return 1 if cell.text == txt_span or txt_cell == txt_span else 0

def LF_start(m):
    """Vote +1 when the phenotype span starts its context, or follows
    punctuation, and shares its first character with the acronym.

    The character comparison ignores case. Abstains (0) otherwise.
    """
    # NOTE(review): left_text is not imported in this cell (only
    # get_left_tokens is) - confirm where it comes from.
    punc = ',.;!?()\'"'
    phen_span = m[1]
    at_boundary = (phen_span.get_word_start() == 0
                   or any(ch in punc for ch in left_text(phen_span, window=1)))
    if not at_boundary:
        return 0
    _, acro, phen = r2id(m)
    return +1 if phen[0].lower() == acro[0].lower() else 0

LF_txt_pos = [LF_acro_matches, LF_acro_matches_with_dashes, LF_acro_first_letter, LF_acro_prefix, LF_acro_matches_last_letters, LF_full_cell, LF_start]

# negative LFs
def LF_no_pos(m):
    """Vote -1 when none of the positive LFs in LF_txt_pos fires; abstain (0) otherwise."""
    for positive_lf in LF_txt_pos:
        if positive_lf(m):
            return 0
    return -1

def LF_short(m):
    """Vote -1 when the acronym is exactly one character long; abstain (0) otherwise."""
    acro = r2id(m)[1]
    if len(acro) == 1:
        return -1
    return 0

def LF_lc(m):
    """Vote -1 when every character of the acronym is lowercase; abstain (0) otherwise.

    An empty acronym also yields -1, since no character fails the check.
    """
    acro = r2id(m)[1]
    for ch in acro:
        if not ch.islower():
            return 0
    return -1

def LF_uc(m):
    """Vote -1 when the phenotype contains no lowercase letter; abstain (0) otherwise."""
    _, acro, phen = r2id(m)
    # Every other labeling function in this notebook votes -1, 0 or +1; the
    # previous -2 fell outside that label set.
    return -1 if not any(l.islower() for l in phen) else 0

def LF_punc(m):
    """Vote -1 when the phenotype text contains any of ``,.;!?()``; abstain (0) otherwise."""
    # NOTE(review): the phrase matchers in this notebook match phenotype spans
    # that include the parenthesised acronym; if that text reaches this
    # function unstripped, the '(' alone makes it fire on every such
    # candidate - confirm.
    phen = r2id(m)[2]
    punc = ',.;!?()'
    for ch in phen:
        if ch in punc:
            return -1
    return 0
    

LF_txt_neg = [LF_no_pos, LF_short, LF_lc, LF_uc, LF_punc]

LF_txt = LF_txt_pos + LF_txt_neg

We compute the LFs on our training set.

In [None]:
from snorkel.annotations import LabelManager
label_manager = LabelManager()

# Load the LF labels stored under this name for the phrase training candidates;
# when the lookup raises NoResultFound, create them by applying LF_txt.
try:
    %time L_txt_train = label_manager.load(session, txt_train_c, 'AcroPhenRel Phrase Training LF Labels')
except sqlalchemy.orm.exc.NoResultFound:
    %time L_txt_train = label_manager.create(session, txt_train_c, 'AcroPhenRel Phrase Training LF Labels', f=LF_txt)

And we learn their accuracy.

In [None]:
from snorkel.learning import NaiveBayes

# Fit the phrase model on the training LF labels only (no gold labels are passed in)
txt_model = NaiveBayes()
txt_model.train(L_txt_train, n_iter=10000, rate=1e-2)

### Evaluating the model's accuracy

We evaluate our accuracy on the dev/test set.

In [None]:
from snorkel.annotations import LabelManager
label_manager = LabelManager()

# Load the LF labels stored for the phrase dev/test candidates, or create them
# with LF_txt when none are stored under this name.
try:
    %time L_txt_devtest = label_manager.load(session, txt_devtest_c, 'AcroPhenRel Phrase Dev/Test LF Labels')
except sqlalchemy.orm.exc.NoResultFound:
    %time L_txt_devtest = label_manager.create(session, txt_devtest_c, 'AcroPhenRel Phrase Dev/Test LF Labels', f=LF_txt)

In [None]:
# Gold label for every dev/test candidate, 0 when it was not hand-annotated.
# The key helper defined in this notebook is r2id and it is a function, so it
# is called rather than subscripted (`r2uid[r]` named nothing that exists).
L_txt_test_gold = np.array([annotations.get(r2id(r),0) for r in txt_devtest_c])
txt_model.score(L_txt_devtest, L_txt_test_gold, txt_devtest_c)

In [37]:
text_learner.test_wmv(test_candidates, test_labels)

Applying LFs...
Featurizing...
Test set size:	97
----------------------------------------
Precision:	0.615384615385
Recall:		1.0
F1 Score:	0.761904761905
----------------------------------------
TP: 24 | FP: 15 | TN: 58 | FN: 0


### Candidate classification

Finally, we classify the entire set of candidates. We start by applying the labelling functions.

In [None]:
from snorkel.annotations import LabelManager
label_manager = LabelManager()

try:
    %time L_txt_all = label_manager.load(session, txt_rels, 'AcroPhenRel Table LF Labels')
except sqlalchemy.orm.exc.NoResultFound:
    %time L_txt_all = label_manager.create(session, txt_rels, 'AcroPhenRel Table LF Labels', f=LF_txt)

We use the model to predict which ones are correct.

In [None]:
# Keep the (doc, acronym, phenotype) keys of the candidates whose score from
# the phrase model is positive. Note that `scores` is rebound here.
scores = txt_model.odds(L_txt_all)
txt_acronyms = [r2id(c) for (c, s) in zip(txt_rels, scores) if s > 0]

print 'Identified %d acronyms predicted to be correct, e.g.' % len(txt_acronyms)
print txt_acronyms[:10]

In [38]:
# NOTE(review): text_learner, text_c and spairtxt2uid are not defined in any
# cell visible in this notebook; this cell looks like a leftover from an
# earlier API - confirm before running.
text_preds = text_learner.predict_wmv(text_c)
text_acronyms = [spairtxt2uid(c) for (c, p) in zip(text_c, text_preds) if p == 1]
print text_acronyms[:10]
# candidates whose prediction disagrees with their gold annotation
# (unannotated candidates default to their own prediction, so they never count)
mislabeled_cand = [(c,p, annotations.get(spair2uid(c), None)) for c, p in zip(text_c, text_preds) if p != annotations.get(spair2uid(c), p)]
# for (c,p,g) in mislabeled_cand[:50]:
#     _, phen, acro = spairtxt2uid(c)
#     print c.span0.context.document.name, p, g
#     print c.span0.context    
#     print c.span0.get_span(), c.span1.get_span()
#     print [LF(c) for LF in LF_txt]
#     print

Applying LFs...
Featurizing...
[('17903292', u'GFR', u'Glomerular Filtration Rate'), ('17903292', u'TSH', u'Thyroid stimulation hormone'), ('17903292', u'DHEAS', u'Dehydroepiandrosterone sulfate'), ('17903292', u'Mb', u'Physical Location'), ('17903292', u'Mb', u'Physical Location'), ('17903292', u'Mb', u'Physical location'), ('17903292', u'GEE', u'Mean p-value'), ('17903292', u'FBAT', u'Mean p-value'), ('17903292', u'GEE', u'P_value'), ('17903294', u'min-max', u'Sample size')]
17903292 1.0 -1
Phrase('17903292', 1, 3, 0, u'Physical Location (Mb)')
(Mb) Physical Location (Mb)
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

17903292 1.0 -1
Phrase('17903292', 1, 189, 0, u'Physical Location (Mb)')
(Mb) Physical Location (Mb)
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

17903292 1.0 -1
Phrase('17903292', 1, 383, 0, u'Physical location (Mb)')
(Mb) Physical location (Mb)
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

17903292 1.0 -1
Phrase('17903292', 2, 5, 0, u'Mean p-value (GEE)')
(GEE) Mean p-value (GEE)
[0, 0, 0, 0

In [None]:
# punc=','
# for (c) in (text_c):
#     if c.span0.context.document.name != '17903294': continue
#     _, phen, acro = spairtxt2uid(c)
#     print c.span0.context.document.name
#     print c.span0.context    
#     print c.span0.get_span(), c.span1.get_span()
#     print [LF(c) for LF in LF_txt]
#     print

### Store the predicted candidates

In [41]:
len(acronyms)

693

In [42]:
acronyms = tab_acronyms + txt_acronyms
print '%d acronyms resolved' % len(acronyms)

# store relations to annotate
with open('acronyms.extracted.all.tsv', 'w') as f:
    for doc_id, str1, str2 in acronyms:
        try:
            out = u'{}\t{}\t{}\n'.format(doc_id, unicode(str2), str1)
            f.write(out.encode("UTF-8"))
        except:
            print 'ERROR:', str1, str2