# Tutorial: Extracting P-values and Table Metadata from GWAS Literature

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
import json

# Passed as `parallelism=` to both the corpus-parsing and candidate-extraction steps below.
PARALLEL = 1
# Database connection string (a local SQLite file).
# NOTE(review): presumably read by SnorkelSession when it connects — confirm.
os.environ['SNORKELDB'] = 'sqlite:///gwas_nature.db'

In [2]:
from snorkel.contrib.fonduer import SnorkelSession

# Session object used by every `session.query(...)` call in the later cells.
session = SnorkelSession()

In [3]:
from snorkel.contrib.fonduer.models import candidate_subclass

# Candidate type for this task. Presumably the list names the candidate's
# arguments, here a single one called `pvalue` — TODO confirm against the Snorkel docs.
Pvalue = candidate_subclass('Pvalue', ['pvalue'])

In [4]:
from snorkel.contrib.fonduer import HTMLPreprocessor, OmniParser

# The corpus root comes from the DATA environment variable; this raises
# KeyError if DATA is not set.
docs_path = os.environ['DATA'] + '/gwas/db/papers'

# Passed as `max_docs=`; presumably caps how many documents are read — TODO confirm.
max_docs = 1000
doc_preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

In [5]:
# Parser configured with the structural and lingual options on and the visual option off.
corpus_parser = OmniParser(structural=True, lingual=True, visual=False)
# Slow step: the recorded run shown below took roughly 1h 51min of wall time.
%time corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

Clearing existing...
Running UDF...
CPU times: user 38min 21s, sys: 19min 55s, total: 58min 17s
Wall time: 1h 51min 12s


In [6]:
from snorkel.contrib.fonduer.models import Document, Phrase

# Sanity check: number of Document and Phrase rows after parsing
# (Python 2 print statements).
print "Documents:", session.query(Document).count()
print "Phrases:", session.query(Phrase).count()

Documents: 589
Phrases: 1317142


In [7]:
# All documents, ordered by name; this list is the input to candidate extraction.
docs = session.query(Document).order_by(Document.name).all()

In [8]:
from snorkel.matchers import RegexMatchSpan, Union

# p-value matchers
#
# Character codes used in the patterns:
#   \xb7 = middle dot, \xd7 = multiplication sign, \u2009 = thin space,
#   \u2212 = minus sign, \u2013 = en dash, \u2012 = figure dash.
#
# rgx1: "mantissa x 10 - exponent" notation, e.g. "5.2 x 10-8". The mantissa
# starts with a non-zero digit and may use '.' or a middle dot as the decimal
# mark; the multiplier may be a multiplication sign, a middle dot or '*'.
rgx1 = u'[1-9]\d?[\xb7\.]?\d*[\s\u2009]*[\xd7\xb7\*][\s\u2009]*10[\s\u2009]*[-\u2212\u2013\u2012][\s\u2009]*\d+'
pval_rgx_matcher1 = RegexMatchSpan(rgx=rgx1)
# rgx2: e-notation with a negative exponent, e.g. "5.2e-8" or "5.2 E -8".
rgx2 = u'[1-9]\d?[\xb7\.]?\d*[\s\u2009]*[eE][\s\u2009]*[-\u2212\u2013\u2012][\s\u2009]*\d+'
pval_rgx_matcher2 = RegexMatchSpan(rgx=rgx2)
# rgx3: plain decimals with at least four zeros after the decimal point,
# followed by at least one more digit, e.g. "0.00001".
rgx3 = u'0\.0000+\d+'
pval_rgx_matcher3 = RegexMatchSpan(rgx=rgx3)
# Combined matcher built from the three matchers above.
pval_rgx_matcher = Union(pval_rgx_matcher1, pval_rgx_matcher2, pval_rgx_matcher3)

In [9]:
from snorkel.contrib.fonduer.fonduer.candidates import OmniNgrams

# N-gram generator with n_max=7 (hence "heptagrams").
# NOTE(review): split_tokens=[] presumably disables extra token splitting — confirm.
heptagrams = OmniNgrams(n_max=7, split_tokens=[])

In [10]:
from snorkel.contrib.fonduer.candidates import CandidateExtractor

# Wire the Pvalue candidate class to its n-gram generator and regex matcher
# (one of each, passed as single-element lists).
candidate_extractor = CandidateExtractor(Pvalue, [heptagrams], [pval_rgx_matcher])

# Run extraction over all documents, storing the results under split 0.
%time candidate_extractor.apply(docs, split=0, parallelism=PARALLEL)

Clearing existing...
Running UDF...
CPU times: user 9min 54s, sys: 7.62 s, total: 10min 2s
Wall time: 10min


Passing `split=0` assigns these `Candidates` to the training set; recall that the train/dev/test sets correspond to splits 0/1/2.

In [11]:
# Load every Pvalue candidate stored under split 0 and report how many were found.
candidates = session.query(Pvalue).filter(Pvalue.split == 0).all()
print "Number of candidates:", len(candidates)

Number of candidates: 12309


In [12]:
import re

# One alternation that accepts any of the three p-value patterns rgx1/rgx2/rgx3.
pvalue_rgx = u'{}|{}|{}'.format(rgx1, rgx2, rgx3)
# Compiled case-insensitively with Unicode-aware character classes.
# extract_metadata() calls `.match`, which only tests the start of the text.
pvalue_matcher = re.compile(pvalue_rgx, flags=(re.I|re.UNICODE))

def overlap(a1, b1, a2, b2):
    """Tell whether the closed ranges [a1, b1] and [a2, b2] share a point.

    Ranges that merely touch at an endpoint count as overlapping.
    """
    # First range lies entirely before the second.
    if b1 < a2:
        return False
    # First range lies entirely after the second.
    if a1 > b2:
        return False
    return True


def extract_metadata(phrase):
    """Collect the text of table cells that act as metadata for `phrase`.

    A cell of the same table is kept when it ends at row index 2 or above
    (presumably the header region — verify against the table layout), its
    column range overlaps that of `phrase`, and its text does not begin with
    a p-value.

    Returns:
        A list of UTF-8 encoded strings, in the order of `phrase.table.phrases`.
    """
    collected = []
    for cell in phrase.table.phrases:
        # Keep only cells that end within the first rows of the table.
        if not (cell.row_end <= 2):
            continue
        # Keep only cells sharing at least one column with the target phrase.
        if not overlap(cell.col_start, cell.col_end, phrase.col_start, phrase.col_end):
            continue
        # Cells that themselves start with a p-value are data, not metadata.
        if pvalue_matcher.match(cell.text):
            continue
        collected.append(cell.text.encode('utf-8'))
    return collected


def make_line(phrase, candidate=None):
    """Build one output row for a p-value candidate found inside a table.

    Args:
        phrase: The phrase containing the candidate. It must belong to a
            table and expose `document.name`, `table.position`, `row_start`
            and `col_start`.
        candidate: The candidate whose first argument holds the p-value span.
            If omitted, the module-level loop variable `c` is used, so
            existing `make_line(phrase)` call sites keep working. New code
            should pass the candidate explicitly rather than rely on that
            hidden state.

    Returns:
        A list of six UTF-8 encoded strings:
        [doc_id, table_idx, row, col, pvalue, metadata], where table_idx is
        1-indexed and pvalue is left-justified to a width of 10.

    Raises:
        NameError: if `candidate` is omitted and no module-level `c` exists.
    """
    if candidate is None:
        # Backward-compatible fallback on the caller's loop variable.
        candidate = c

    doc_id = phrase.document.name
    table_idx = phrase.table.position + 1  # make table_idx 1-indexed
    row = phrase.row_start
    col = phrase.col_start
    pvalue = candidate[0].get_span().ljust(10)  # pad for uniform width
    metadata = extract_metadata(phrase)

    fields = [doc_id, table_idx, row, col, pvalue, metadata]
    # `unicode` is the Python 2 text type; the metadata list is stringified as a whole.
    return [unicode(field).encode('utf-8') for field in fields]

In [13]:
import csv

# Tab-separated output: one row per p-value candidate that sits inside a table.
OUTFILE = 'pvalue_metadata.tsv'

# Binary mode is what the Python 2 csv module expects for its file objects.
with open(OUTFILE, 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["doc_id", "table_index", "rows", "cols", "p-value", "metadata"])
    # NOTE: the loop variable must be named `c`, because make_line(phrase)
    # picks up the current candidate from the module-level `c`.
    for c in candidates:
        phrase = c.get_parent()
        # Candidates outside a table have no table position to report.
        if not phrase.table:
            continue
        line = make_line(phrase)
        writer.writerow(line)

### Match discovered p-values with other information

In [26]:
import math
import re
from collections import defaultdict

# Whitespace, including the thin space (U+2009) that the p-value patterns allow.
_PVALUE_SPACE_RGX = re.compile(u'[\\s\u2009]+', flags=re.UNICODE)
# "<multiplication sign | middle dot | asterisk>10" directly before a minus or dash,
# i.e. the "x 10^-n" part of scientific notation.
_PVALUE_TIMES_TEN_RGX = re.compile(u'[\xd7\xb7\\*]10(?=[-\u2212\u2013\u2012])', flags=re.UNICODE)
# Unicode minus sign, en dash and figure dash, all used as minus signs in papers.
_PVALUE_DASH_RGX = re.compile(u'[\u2212\u2013\u2012]', flags=re.UNICODE)


def _parse_pvalue(raw):
    """Convert a matched p-value string to a positive float, or None.

    Accepts every spelling the candidate patterns allow: optional (thin)
    spaces, a multiplication sign, middle dot or '*' as multiplier, Unicode
    minus or dash characters, and a middle dot as decimal mark.

    Args:
        raw: The p-value text, as UTF-8 bytes or as a unicode string.

    Returns:
        The parsed value if it is a number greater than zero, else None.
    """
    text = raw.decode('utf-8') if isinstance(raw, bytes) else raw
    text = _PVALUE_SPACE_RGX.sub(u'', text)
    text = _PVALUE_TIMES_TEN_RGX.sub(u'E', text)
    text = _PVALUE_DASH_RGX.sub(u'-', text)
    # Any middle dot still left is a decimal mark inside the mantissa.
    text = text.replace(u'\xb7', u'.')
    try:
        value = float(text)
    except ValueError:
        return None
    # log10 is only defined for strictly positive values.
    return value if value > 0 else None


# Maps (doc_id, 0-indexed table position as a string) to a list of
# (log10 p-value, row, col, metadata) tuples found in that table.
pvalue_dict = defaultdict(list)
# NOTE: the loop variable must be named `c`, because make_line(phrase)
# picks up the current candidate from the module-level `c`.
for c in candidates:
    phrase = c.get_parent()
    if not phrase.table:
        continue
    doc_id, table_idx, row, col, pvalue, metadata = make_line(phrase)

    pvalue = _parse_pvalue(pvalue)
    if pvalue is None:
        continue

    # make_line reports 1-indexed tables; the lookup key uses the 0-indexed position.
    table_0idx = str(int(table_idx) - 1)
    pvalue_log10 = math.log10(pvalue)
    pvalue_dict[(doc_id, table_0idx)].append((pvalue_log10, row, col, metadata))
print("DONE")

DONE


In [28]:
import csv

# Source: "https://raw.githubusercontent.com/kuleshov/gwaskb/master/notebooks/results/nb-output/pval-rsid.filtered.tsv"
rsid_readfile = 'pval-rsid.filtered.csv'
rsid_writefile = 'pval-rsid.metadata.csv'

# Two log10 p-values closer than this are treated as the same measurement.
LOG10_TOLERANCE = 0.001

# Binary mode is what the Python 2 csv module expects; it also matches the
# earlier export cell.
with open(rsid_readfile, 'rb') as readfile, open(rsid_writefile, 'wb') as writefile:
    reader = csv.reader(readfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    writer = csv.writer(writefile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    # One name per written field: the six input fields plus the appended metadata.
    writer.writerow(["doc_id", "rsid", "table_index", "row", "col", "pvalue_log10", "metadata"])
    for line in reader:
        # Blank lines come back from csv.reader as empty rows; skip them.
        if not line:
            continue
        doc_id, rsid, table, row, col, pval_log10 = line
        metadata = ''
        # .get() avoids adding empty entries to the defaultdict on a miss.
        for opt in pvalue_dict.get((doc_id, table), ()):
            # Take the first extracted p-value in the same table that agrees
            # with the reported one.
            if abs(float(pval_log10) - opt[0]) < LOG10_TOLERANCE:
                metadata = opt[-1]
                break
        writer.writerow(line + [metadata])