## Candidate Generation

Generate pairwise candidates from entities tagged in documents loaded above.

Note: Before continuing, it is often worth running the `scripts/tagging-frequency.ipynb` notebook to ensure that there are no problematic tags.

In [1]:
from snorkel import SnorkelSession
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence, Candidate
from tcre import supervision
import numpy as np
import pandas as pd
import tqdm

In [2]:
# Open the Snorkel database session used by the queries in this notebook.
session = SnorkelSession()

In [3]:
# Sanity check: number of Document rows currently stored in the database.
session.query(Document).count()

4521

In [4]:
# Pull every stored Sentence into memory; the cell ends by displaying how many were loaded.
# Alternative subsets, kept for reference (swap in to work on part of the corpus):
#   sents = session.query(Sentence).limit(250000).all()
#   sents = session.query(Sentence).limit(500000).all()[250000:]
# NOTE(review): these limits have no ORDER BY, so which rows they return may
# not be stable between runs -- confirm before relying on them.
sentence_query = session.query(Sentence)
sents = sentence_query.all()
len(sents)

789519

In [5]:
from snorkel.candidates import PretaggedCandidateExtractor

# Build one pretagged extractor per candidate class returned by the
# supervision module, pairing each class with its entity types.
classes = supervision.get_candidate_classes()
candidate_extractors = []
for relation_def in classes.values():
    candidate_extractors.append(
        PretaggedCandidateExtractor(relation_def.subclass, relation_def.entity_types)
    )

In [6]:
def apply_extraction(sents, split, batch_size=50000):
    """Run every candidate extractor over ``sents`` in batches.

    For each extractor in the notebook-level ``candidate_extractors`` list the
    sentences are divided into near-equal batches, each batch is passed to
    ``extractor.apply`` with ``clear=False`` (existing candidates are kept),
    and the number of candidates stored for ``split`` is printed afterwards.

    Args:
        sents: Iterable of sentences to extract candidates from.
        split: Split identifier handed to the extractor and used to filter
            the candidate count query.
        batch_size: Maximum number of sentences per batch; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive, got {}'.format(batch_size))

    sentences = list(sents)
    if not sentences:
        # np.array_split rejects a section count of zero, so there is nothing
        # useful to do (and nothing to crash on) when no sentences are given.
        print('No sentences provided for split {}; skipping candidate extraction'.format(split))
        return

    # The batching depends only on the input, not on the extractor, so the
    # batch count and the split itself are computed once up front.
    n_batch = int(np.ceil(len(sentences) / batch_size))
    batches = np.array_split(sentences, n_batch)

    for extractor in candidate_extractors:
        relation_class = extractor.udf_init_kwargs['candidate_class']
        print('Beginning candidate extraction for split {}, relation type {}, num batches {}'.format(
            split, relation_class.__name__, n_batch
        ))
        for batch in tqdm.tqdm(batches):
            # clear=False so that earlier batches (and relation types) are not wiped
            extractor.apply(batch, split=split, clear=False, progress_bar=False)
        print('Number of candidates generated for split {}, relation type {} = {}'.format(
            split, relation_class.__name__,
            session.query(relation_class).filter(relation_class.split == split).count()
        ))

# Extract candidates from all loaded sentences, assigning them to the SPLIT_INFER split.
apply_extraction(sents, supervision.SPLIT_INFER)

  0%|          | 0/16 [00:00<?, ?it/s]

Beginning candidate extraction for split 9, relation type InducingCytokine, num batches 16
Running UDF...


  6%|▋         | 1/16 [00:30<07:34, 30.28s/it]

Running UDF...


 12%|█▎        | 2/16 [00:50<06:23, 27.36s/it]

Running UDF...


 19%|█▉        | 3/16 [01:08<05:18, 24.51s/it]

Running UDF...


 25%|██▌       | 4/16 [01:34<04:57, 24.77s/it]

Running UDF...


 31%|███▏      | 5/16 [01:55<04:22, 23.90s/it]

Running UDF...


 38%|███▊      | 6/16 [02:12<03:37, 21.73s/it]

Running UDF...


 44%|████▍     | 7/16 [02:46<03:47, 25.31s/it]

Running UDF...


 50%|█████     | 8/16 [03:18<03:37, 27.24s/it]

Running UDF...


 56%|█████▋    | 9/16 [03:38<02:55, 25.10s/it]

Running UDF...


 62%|██████▎   | 10/16 [03:55<02:16, 22.82s/it]

Running UDF...


 69%|██████▉   | 11/16 [04:48<02:38, 31.76s/it]

Running UDF...


 75%|███████▌  | 12/16 [05:25<02:13, 33.49s/it]

Running UDF...


 81%|████████▏ | 13/16 [05:51<01:33, 31.23s/it]

Running UDF...


 88%|████████▊ | 14/16 [06:12<00:56, 28.16s/it]

Running UDF...


 94%|█████████▍| 15/16 [07:04<00:35, 35.38s/it]

Running UDF...


100%|██████████| 16/16 [07:52<00:00, 39.08s/it]


Number of candidates generated for split 9, relation type InducingCytokine = 91839
Beginning candidate extraction for split 9, relation type SecretedCytokine, num batches 16


  0%|          | 0/16 [00:00<?, ?it/s]

Running UDF...


  6%|▋         | 1/16 [00:20<05:07, 20.50s/it]

Running UDF...


 12%|█▎        | 2/16 [00:34<04:20, 18.62s/it]

Running UDF...


 19%|█▉        | 3/16 [00:47<03:40, 16.97s/it]

Running UDF...


 25%|██▌       | 4/16 [01:05<03:24, 17.08s/it]

Running UDF...


 31%|███▏      | 5/16 [01:20<03:01, 16.47s/it]

Running UDF...


 38%|███▊      | 6/16 [01:31<02:29, 14.93s/it]

Running UDF...


 44%|████▍     | 7/16 [01:54<02:35, 17.29s/it]

Running UDF...


 50%|█████     | 8/16 [02:16<02:29, 18.67s/it]

Running UDF...


 56%|█████▋    | 9/16 [02:29<02:00, 17.17s/it]

Running UDF...


 62%|██████▎   | 10/16 [02:42<01:34, 15.69s/it]

Running UDF...


 69%|██████▉   | 11/16 [03:12<01:40, 20.07s/it]

Running UDF...


 75%|███████▌  | 12/16 [03:46<01:36, 24.15s/it]

Running UDF...


 81%|████████▏ | 13/16 [04:03<01:06, 22.15s/it]

Running UDF...


 88%|████████▊ | 14/16 [04:18<00:39, 19.88s/it]

Running UDF...


 94%|█████████▍| 15/16 [04:53<00:24, 24.58s/it]

Running UDF...


100%|██████████| 16/16 [05:24<00:00, 26.57s/it]


Number of candidates generated for split 9, relation type SecretedCytokine = 91839
Beginning candidate extraction for split 9, relation type InducingTranscriptionFactor, num batches 16


  0%|          | 0/16 [00:00<?, ?it/s]

Running UDF...


  6%|▋         | 1/16 [00:13<03:21, 13.43s/it]

Running UDF...


 12%|█▎        | 2/16 [00:24<02:59, 12.83s/it]

Running UDF...


 19%|█▉        | 3/16 [00:35<02:36, 12.03s/it]

Running UDF...


 25%|██▌       | 4/16 [00:48<02:28, 12.40s/it]

Running UDF...


 31%|███▏      | 5/16 [00:59<02:12, 12.07s/it]

Running UDF...


 38%|███▊      | 6/16 [01:09<01:54, 11.43s/it]

Running UDF...


 44%|████▍     | 7/16 [01:24<01:51, 12.40s/it]

Running UDF...


 50%|█████     | 8/16 [01:40<01:48, 13.56s/it]

Running UDF...


 56%|█████▋    | 9/16 [01:50<01:28, 12.58s/it]

Running UDF...


 62%|██████▎   | 10/16 [02:00<01:10, 11.71s/it]

Running UDF...


 69%|██████▉   | 11/16 [02:21<01:13, 14.66s/it]

Running UDF...


 75%|███████▌  | 12/16 [02:41<01:04, 16.08s/it]

Running UDF...


 81%|████████▏ | 13/16 [02:54<00:45, 15.18s/it]

Running UDF...


 88%|████████▊ | 14/16 [03:06<00:28, 14.25s/it]

Running UDF...


 94%|█████████▍| 15/16 [03:37<00:19, 19.13s/it]

Running UDF...


100%|██████████| 16/16 [04:01<00:00, 20.60s/it]


Number of candidates generated for split 9, relation type InducingTranscriptionFactor = 36535


In [7]:
# Summarise how many candidates exist per relation type and split.
cands = session.query(Candidate.type, Candidate.split).all()
# Name the columns explicitly so the summary does not depend on pandas
# inferring them from the row objects, and so an empty candidate table
# yields an empty summary instead of a KeyError in groupby.
pd.DataFrame(cands, columns=['type', 'split']).groupby(['type', 'split']).size()

type                           split
inducing_cytokine              9        91839
inducing_transcription_factor  9        36535
secreted_cytokine              9        91839
dtype: int64