# Snorkel corpus preprocess
This notebook takes snorkel-compatible input files (corpus, entities, gold relation labels) and creates `snorkel.db`, where all of the above are persisted.
Re-run the notebook to drop and re-create the database.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pickle

In [3]:
from snorkel import SnorkelSession
# Open the Snorkel session; every query, delete and commit in this notebook goes through it.
session = SnorkelSession()
from snorkel.models import  Document, Sentence

In [4]:
import glob
import pandas as pd

## Load the corpus

In [5]:
# Corpus selection.
# The gold set (train/dev/test) is always loaded. Set INCLUDE_NCBI_PARSED = True to
# additionally load the NCBI_parsed outgoing-citation documents (kept consistent with
# the gold set's named entities). Note that the split cell further down raises on any
# document ID that is in neither the training/development nor the test ID set, so it
# has to be adapted before enabling this.
INCLUDE_NCBI_PARSED = False

GOLDSET_GLOB = "/home/antonis/data/biocreative6/goldset/*/*.txt"
NCBI_PARSED_GLOB = "/home/antonis/data/biocreative6/NCBI_parsed/outgoing_citations/*.txt"

corpus_paths = glob.glob(GOLDSET_GLOB)
if INCLUDE_NCBI_PARSED:
    corpus_paths = glob.glob(NCBI_PARSED_GLOB) + corpus_paths

# glob returns matches in arbitrary order; sort so the path index is reproducible.
txt_corpus = pd.Series(sorted(corpus_paths), name='paths')

In [6]:
# Persist the path index (one document path per row) for CSVPathsPreprocessor.
# header=False is passed explicitly: Series.to_csv wrote no header by default in older
# pandas but writes one from pandas 0.24 on, which would put the series name 'paths'
# into the file as its first row.
# NOTE(review): assumes CSVPathsPreprocessor treats every row as a path — confirm.
txt_corpus.to_csv('full_corpus_paths.csv', index=False, header=False)

In [7]:
from snorkel.parser import TextDocPreprocessor, CSVPathsPreprocessor
# path = "/home/antonis/data/biocreative6/corpus/training/"
# doc_preprocessor = TextDocPreprocessor(path)

# The path index written to disk by the previous cell drives the document preprocessor.
corpus_index_csv = 'full_corpus_paths.csv'
csv_preprocessor = CSVPathsPreprocessor(corpus_index_csv)

In [8]:
from snorkel.parser import CorpusParser
from snorkel.utils_cdr import TaggerOneTagger
from snorkel.parser.spacy_parser import Spacy


# TaggerOne-based tagger; its `tag` method is handed to CorpusParser as `fn`.
tagger_one = TaggerOneTagger(
    fname_tags='/home/antonis/data/biocreative6/entities/unary_tags.pkl.bz2',
    fname_mesh='/home/antonis/data/biocreative6/entities/mesh_dict.pkl.bz2',
)

# Build exactly one CorpusParser, using its default parser. Creating
# CorpusParser(parser=Spacy(), fn=tagger_one.tag) first and overwriting it right away
# only instantiates a Spacy parser that is never used; to parse with spaCy instead,
# pass parser=Spacy() in the call below.
corpus_parser = CorpusParser(fn=tagger_one.tag)
corpus_parser.apply(list(csv_preprocessor))

# Inspect DB contents.
# The message is formatted explicitly because print("label", value) prints a tuple
# under Python 2.
print("Documents: %d" % session.query(Document).count())
print("Sentences: %d" % session.query(Sentence).count())

Clearing existing...
Running UDF...

('Documents:', 2432)
('Sentences:', 24740)


### Split dataset into dev, train, test

In [9]:
# Load the PubMed ID groups from disk.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
pubmed_ids_path = '/home/antonis/data/biocreative6/pubmed_ids_citations.pickle'
with open(pubmed_ids_path, 'rb') as ids_file:
    pubmed_ids = pickle.load(ids_file)

In [10]:
# Show which ID groups the pickle contains (displayed as the cell output).
pubmed_ids.keys()

['development',
 'incoming_citations',
 'training',
 'test_gs',
 'outgoing_citations',
 'chemdner_silver']

In [11]:
# Train IDs = training + development groups merged; test IDs = the 'test_gs' group.
train_ids = set(pubmed_ids['training']) | set(pubmed_ids['development'])
test_ids = set(pubmed_ids['test_gs'])

In [12]:
# split sentences

In [13]:
# Empty sentence sets for the two splits, filled by the loop in the next cell.
train_sents = set()
test_sents = set()

# All parsed documents, ordered by name.
docs = session.query(Document).order_by(Document.name).all()

In [14]:
# Assign every sentence to the split of its parent document.
# The split depends only on the document name, so it is decided once per document
# rather than once per sentence. Documents without sentences are skipped and therefore
# never raise, even if their ID is in neither ID set.
for doc in docs:
    doc_sentences = doc.sentences
    if not doc_sentences:
        continue
    if doc.name in train_ids:
        train_sents.update(doc_sentences)
    elif doc.name in test_ids:
        test_sents.update(doc_sentences)
    else:
        raise Exception('ID <{0}> not found in any id set'.format(doc.name))

## Candidate extraction

In [8]:
from snorkel.models import Candidate, candidate_subclass

# Candidate class named 'REGULATOR' over the ('Chemical', 'Gene') argument pair;
# used below for extraction, gold-label loading and all candidate queries.
REGULATOR = candidate_subclass('REGULATOR', ['Chemical', 'Gene'])

In [16]:
from snorkel.candidates import PretaggedCandidateExtractor

# Extractor that produces REGULATOR candidates for the entity types 'Chemical' and 'Gene'.
# NOTE(review): presumably relies on the entity tags attached during corpus parsing — confirm.
candidate_extractor = PretaggedCandidateExtractor(REGULATOR, ['Chemical', 'Gene'])

In [17]:
# Report the number of sentences per split (train first, then test).
# A parenthesised single-argument print gives the same output on Python 2 and Python 3.
for sents in (train_sents, test_sents):
    print(len(sents))

16588
8152


In [18]:
# Extract candidates per split: split 0 = train sentences, split 1 = test sentences.
for k, sents in enumerate([train_sents, test_sents]):
    candidate_extractor.apply(sents, split=k)
    n_candidates = session.query(REGULATOR).filter(REGULATOR.split == k).count()
    # Formatted explicitly because print("label", value) prints a tuple under Python 2.
    print("Number of candidates: %d" % n_candidates)

Clearing existing...
Running UDF...

('Number of candidates:', 24848)
Clearing existing...
Running UDF...

('Number of candidates:', 13394)


# Import gold labels

In [30]:
from utils import load_external_labels

In [31]:
from snorkel.db_helpers import reload_annotator_labels
# Forwarded unchanged to reload_annotator_labels below.
filter_label_split = False

# Earlier invocation, kept for reference:
# load_external_labels(session,
#                      REGULATOR,
#                      FPATH='../../data/biocreative6/gold_rels_snorkel_format.tsv'
# #                      id_fname='../../data/biocreative6/pubmed_ids_extended.pickle'
#                     )

# Load the external gold labels into the db; reload=False because the reload is
# triggered explicitly right after.
gold_tsv = '/home/antonis/data/biocreative6/entities/gold_rels_complete.tsv'
load_external_labels(session, REGULATOR, tsv_path=gold_tsv, reload=False)

# Manually reload the 'gold' annotations from the db into the table, once per split
# (this step was moved out of load_external_labels).
for split_id in (0, 1):
    reload_annotator_labels(session, REGULATOR, 'gold', split=split_id,
                            filter_label_split=filter_label_split, debug=True)


AnnotatorLabels created: 11873
AnnotatorLabels not matched to candidates (split=0): 309670
AnnotatorLabels created: 5776
AnnotatorLabels not matched to candidates (split=1): 315767


In [37]:
# Commit the session's pending changes (this cell follows the gold-label import).
session.commit()

In [11]:
from snorkel.db_helpers import reload_annotator_labels


In [15]:
# Reload the 'gold' labels for split 0 again, this time with filter_label_split=True.
# In the recorded run this created 0 AnnotatorLabels, versus 11873 for split 0 with
# filter_label_split=False.
reload_annotator_labels(session, REGULATOR, 'gold', split=0, filter_label_split= True, debug=True)

AnnotatorLabels created: 0
AnnotatorLabels not matched to candidates (split=0): 309670


## Drop all candidates without mapped label

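**Known issue (unexplained):** the unlabelled candidates are not dropped when these cells are run in this notebook, but they are dropped when the same steps are run in the snorkel baseline LSTM prediction notebook.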

In [5]:
from sqlalchemy import  and_,any_,or_
from snorkel.models import Candidate

In [14]:
# Count REGULATOR candidates (any split) that have at least one label attached.
labelled_candidates = session.query(REGULATOR).filter(REGULATOR.labels.any())
labelled_candidates.count()

0

In [38]:
# Count split-0 candidates that have at least one label attached.
# Chained filter() calls are combined with AND, like a single and_(...).
labelled_split0 = (session.query(REGULATOR)
                   .filter(REGULATOR.split == 0)
                   .filter(REGULATOR.labels.any()))
labelled_split0.count()

0

In [34]:
# Count split-0 candidates that have no label attached.
# Chained filter() calls are combined with AND, like a single and_(...).
unlabelled_split0 = (session.query(REGULATOR)
                     .filter(REGULATOR.split == 0)
                     .filter(~REGULATOR.labels.any()))
unlabelled_split0.count()

24848

In [9]:
# Collect the ids of all candidates in splits 0 and 1 (train and test in this notebook)
# that have no label attached; these are the candidates to drop.
unlabelled_query = session.query(REGULATOR).filter(
    and_(REGULATOR.split.in_([0, 1]), ~REGULATOR.labels.any())
)
todrop = [candidate.id for candidate in unlabelled_query.all()]

In [10]:
# A parenthesised single-argument print gives the same output on Python 2 and Python 3.
print('Will drop %i candidates' % len(todrop))

Will drop 26311 candidates


In [None]:
# drop from REGULATOR table

In [12]:
# Delete the unlabelled rows from the REGULATOR table.
# The ids are deleted in batches: a single IN (...) with tens of thousands of bound
# parameters exceeds SQLite's default variable limit (999 in older builds). All batches
# run in the same transaction and are committed together, so the result is the same
# as one big delete. An empty `todrop` simply deletes nothing.
DELETE_BATCH_SIZE = 500
for batch_start in range(0, len(todrop), DELETE_BATCH_SIZE):
    batch_ids = todrop[batch_start:batch_start + DELETE_BATCH_SIZE]
    session.query(REGULATOR).filter(REGULATOR.id.in_(batch_ids)).delete(synchronize_session=False)
session.commit()

In [15]:
# Verify the deletion: count REGULATOR candidates that still have no label (expected 0).
regulators_without_label = session.query(REGULATOR).filter(~REGULATOR.labels.any())
regulators_without_label.count()

0

In [None]:
# drop from Candidate table

In [14]:
# Delete the same ids from the base Candidate table.
# The ids are deleted in batches: a single IN (...) with tens of thousands of bound
# parameters exceeds SQLite's default variable limit (999 in older builds). All batches
# run in the same transaction and are committed together, so the result is the same
# as one big delete. An empty `todrop` simply deletes nothing.
DELETE_BATCH_SIZE = 500
for batch_start in range(0, len(todrop), DELETE_BATCH_SIZE):
    batch_ids = todrop[batch_start:batch_start + DELETE_BATCH_SIZE]
    session.query(Candidate).filter(Candidate.id.in_(batch_ids)).delete(synchronize_session=False)
session.commit()

In [16]:
# Verify the deletion: count Candidate rows that still have no label (expected 0).
candidates_without_label = session.query(Candidate).filter(~Candidate.labels.any())
candidates_without_label.count()

0

~~~ NOT RUN

# Exporting candidates from snorkel to sklearn for ML model training

In [21]:
# from sklearn_bridge import export_snorkel_candidates

In [22]:
# # export candidates for train, dev, test dataset
# candidates = dict()
# nr_cands_extracted=0
# for i in range(3): #for train,dev,test export only labelled candidates 
#     candidates[i] = export_snorkel_candidates(session,REGULATOR, i, False)
#     print 'Extracted %i candidates from split = %i '%(len(candidates[i].keys()), i)
#     nr_cands_extracted += len(candidates[i].keys())
    
# print 'Extracted %i candidates in total'%nr_cands_extracted

Extracted 7281 candidates from split = 0 
Extracted 4592 candidates from split = 1 
Extracted 5776 candidates from split = 2 
Extracted 17649 candidates in total


In [23]:
# with open('candidates_TrainDevTest-baseline.pickle', 'wb') as f:
#     pickle.dump(dict(candidates),f)

# #########################################
# Once this is done, results are persisted into snorkel.db and this step is no longer required, unless more documents are added.
# #########################################