In [1]:
import os
import os.path as osp
import pandas as pd
import numpy as np
import snorkel
from snorkel import SnorkelSession
from tcre.env import *
from tcre.supervision import *
# Open the Snorkel session used by every `session.query(...)` / `session.commit()` call below.
session = SnorkelSession()
# `get_candidate_classes` is not imported by name, so it presumably comes from one of the
# `tcre` star imports above — TODO confirm. `classes` is not referenced again in the visible cells.
classes = get_candidate_classes()

In [2]:
# Display the integer split ids, in the order DEV, TEST, TRAIN, INFER, VAL.
# These names are not imported explicitly; presumably they come from the `tcre` star imports — TODO confirm.
SPLIT_DEV, SPLIT_TEST, SPLIT_TRAIN, SPLIT_INFER, SPLIT_VAL

(1, 3, 0, 2, 4)

### Refactor 1

Move a random sample of the unlabeled "infer" candidates to the training split, move the remaining unlabeled ones to a new "infer" split, and leave the annotated candidates behind as the "test" set:

In [3]:
from snorkel.models import Candidate
# Count how many candidates are stored under each split id.
# NOTE(review): the output saved under this cell equals the distribution shown after the moves
# further down, so it was presumably captured on a re-run — confirm.
pd.Series([split for (split,) in session.query(Candidate.split).all()]).value_counts()

2    305916
0    120166
1      1756
3       612
dtype: int64

In [2]:
from snorkel.models import Candidate, GoldLabel
# Every (Candidate, GoldLabel) pair for candidates in the split that SPLIT_INFER names.
# NOTE(review): a candidate taken from this split reports `split == 3` further down, while the
# constants displayed at the top of the notebook give SPLIT_INFER == 2 — the constant was
# presumably renumbered after this ran; confirm before re-executing.
annot_cands = (
    session.query(Candidate, GoldLabel)
    .join(GoldLabel)
    .filter(Candidate.split == SPLIT_INFER)
    .all()
)
len(annot_cands)

612

In [3]:
from snorkel.models import Candidate, GoldLabel
# Every candidate, labeled or not, in the split that SPLIT_INFER names.
infer_cands = (
    session.query(Candidate)
    .filter(Candidate.split == SPLIT_INFER)
    .all()
)
len(infer_cands)

396528

In [4]:
# Ids of the candidates that carry a gold label (each row is a (Candidate, GoldLabel) pair).
annot_ids = {cand.id for cand, _gold in annot_cands}
len(annot_ids)

612

In [5]:
# Candidates from the queried split whose id is not among the annotated ones; these get moved.
mv_cands = [cand for cand in infer_cands if cand.id not in annot_ids]
len(mv_cands)

395916

In [11]:
# Sanity check on counts and presence
from snorkel.models import StableLabel, GoldLabel
gcandids = [r.candidate_id for r in session.query(GoldLabel).all()]
# Membership tests against a list are linear, which made the check below quadratic over
# the candidates being moved; a set gives the same answer with constant-time lookups.
gcandid_set = set(gcandids)
# None of the candidates selected for moving may have a gold label.
assert not any(c.id in gcandid_set for c in mv_cands)
# The candidates left behind must be exactly as many as the annotated ones.
assert len(infer_cands) - len(mv_cands) == len(annot_cands)
len(infer_cands) - len(mv_cands)

612

In [20]:
# Inspect the first candidate selected for moving.
c = mv_cands[0]
c

InducingCytokine(Span("b'interleukin-4'", sentence=128832, chars=[90,102], words=[17,17]), Span("b'Th2'", sentence=128832, chars=[75,77], words=[14,14]))

In [21]:
# Split id currently stored on that candidate.
c.split

3

In [32]:
# Positional indices 0..len(mv_cands)-1 into `mv_cands`.
idx = pd.Series(np.arange(len(mv_cands)))
# Draw 90,000 of those positions with a fixed seed; the cells below move these to split 0 (training).
idx_train = idx.sample(n=90000, random_state=1).values
# Every position not drawn above; the cells below move these to split 2 (the new infer split).
idx_infer = np.setdiff1d(idx, idx_train)
len(idx), len(idx_train), len(idx_infer)

(395916, 90000, 305916)

In [34]:
# The two index sets must together account for every position in `idx`.
assert len(idx) == len(idx_train) + len(idx_infer)

In [35]:
# `idx` must have one position per candidate in `mv_cands`.
assert len(idx) == len(mv_cands)

In [33]:
# Move the sampled candidates to the training split (from 3 -> 0).
# SPLIT_TRAIN (displayed as 0 at the top of the notebook) replaces the bare literal so the
# target split is named, matching how SPLIT_VAL is used for the validation move.
for i in idx_train:
    mv_cands[i].split = SPLIT_TRAIN
# Persist the reassigned split values.
session.commit()

In [36]:
# Move the rest to a new infer split (from 3 -> 2)
# * from now on, the old INFER split (3) will contain only labeled candidates and should be referred to as TEST
# NOTE(review): the literal 2 matches the SPLIT_INFER value displayed at the top of the notebook, but the
# cells above reached the old split 3 through SPLIT_INFER, so the constant was presumably renumbered
# after this ran — confirm before swapping the literal for the constant.
for i in idx_infer:
    mv_cands[i].split = 2 
session.commit()

In [37]:
# Recount candidates per split id now that the moves are committed
from snorkel.models import Candidate
pd.Series([split for (split,) in session.query(Candidate.split).all()]).value_counts()

2    305916
0    120166
1      1756
3       612
dtype: int64

### Refactor 2

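Move part of the labeled test candidates to the validation split, stratified by label within each candidate type (100 candidates per type stay in test):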

In [7]:
# All candidates currently stored in the test split.
test_cands = (
    session.query(Candidate)
    .filter(Candidate.split == SPLIT_TEST)
    .all()
)
len(test_cands)

612

In [10]:
# Number of test candidates per candidate type.
pd.Series([cand.type for cand in test_cands]).value_counts()

inducing_transcription_factor    217
secreted_cytokine                198
inducing_cytokine                197
dtype: int64

In [13]:
# Peek at the value of one candidate's first gold label.
# NOTE(review): the only visible assignment of `c` is `c = mv_cands[0]` in Refactor 1, which picks a
# candidate without gold labels, so the value shown presumably came from a different binding of `c`
# (an earlier or deleted cell) — confirm.
c.gold_labels[0].value

-1

In [16]:
# One row per test candidate: id, candidate type, number of gold labels, first gold-label value.
df = pd.DataFrame(
    [
        (cand.id, cand.type, len(cand.gold_labels), cand.gold_labels[0].value)
        for cand in test_cands
    ],
    columns=['id', 'type', 'num_lbls', 'lbl'],
)
# Every test candidate must have exactly one gold label ...
assert df['num_lbls'].eq(1).all()
# ... and that label must be either -1 or 1.
assert df['lbl'].isin([-1, 1]).all()
df.head()

Unnamed: 0,id,type,num_lbls,lbl
0,32545,inducing_cytokine,1,-1
1,33626,inducing_cytokine,1,-1
2,33781,inducing_cytokine,1,-1
3,34318,inducing_cytokine,1,-1
4,34476,inducing_cytokine,1,-1


In [17]:
# Count rows per (type, lbl) pair and pivot so rows are candidate types and columns are label values.
df.groupby(['type', 'lbl']).size().unstack()

lbl,-1,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
inducing_cytokine,170,27
inducing_transcription_factor,177,40
secreted_cytokine,159,39


In [27]:
from sklearn.model_selection import train_test_split

# For each candidate type, split that type's test candidates in two, stratified by gold label.
# The first part returned by train_test_split is what gets moved to validation; the second
# part (test_size=100 rows per type) is left in the test split.
mv_cand_ids = {}
for k, g in df.groupby('type'):
    new_val_ids, old_test_ids = train_test_split(g['id'], stratify=g['lbl'], test_size=100, random_state=1)
    # The two parts must together account for every row of this type.
    assert len(new_val_ids) + len(old_test_ids) == len(g)
    # Keep only the ids destined for validation, keyed by candidate type.
    mv_cand_ids[k] = new_val_ids.values

In [28]:
# Per candidate type: how many ids move to validation, and their gold-label balance
lbl_by_id = df.set_index('id')['lbl']
for k, v in mv_cand_ids.items():
    print(k, len(v), lbl_by_id.loc[v].value_counts().to_dict())

inducing_cytokine 97 {-1: 84, 1: 13}
inducing_transcription_factor 117 {-1: 95, 1: 22}
secreted_cytokine 98 {-1: 79, 1: 19}


In [30]:
# Total number of candidates moving to validation, across all types
np.sum([len(ids) for ids in mv_cand_ids.values()])

312

In [29]:
# Grab candidate objects for above ids
# Build the id -> candidate lookup once; previously this dict was rebuilt from all of
# `test_cands` for every single id being looked up.
test_cands_by_id = {c.id: c for c in test_cands}
# Flatten the per-type id arrays, in dict order, into one list of candidate objects.
mv_cands = [test_cands_by_id[cid] for v in mv_cand_ids.values() for cid in v]
len(mv_cands)

312

In [32]:
# Re-check the per-type counts, this time on the candidate objects themselves
pd.Series([cand.type for cand in mv_cands]).value_counts()

inducing_transcription_factor    117
secreted_cytokine                 98
inducing_cytokine                 97
dtype: int64

In [33]:
# Display the split id that the next cell assigns to the selected candidates.
SPLIT_VAL

4

In [34]:
# Move the selected test candidates to the validation split (from 3 -> 4)
for c in mv_cands:
    c.split = SPLIT_VAL
session.commit()

In [37]:
# Final candidate counts per split, keyed by the name SPLIT_MAP gives each split id
from snorkel.models import Candidate
pd.Series([SPLIT_MAP[split] for (split,) in session.query(Candidate.split).all()]).value_counts()

infer         305916
train         120166
dev             1756
validation       312
test             300
dtype: int64