# Dev Set Supervised Modeling 

Produce models and predictions for use as labeling functions.  The process for this is:

- **dev** predictions
    1. K fold split on **dev** candidates (with fairly large K)
    2. Train a model on each fold using **val** split for validation and out-of-sample fold as **test**
- **train** predictions
    1. Train on all **dev** with **val** for validation
    2. Apply to all **train** candidates
- Concatenate predictions for both sets, stored in large dict keyed by id, expose in LFs

In [1]:
import pandas as pd
import numpy as np
import snorkel
from snorkel import SnorkelSession
from snorkel.models import Candidate
from tcre.env import *
from tcre.supervision import *
session = SnorkelSession()
classes = get_candidate_classes()

### Generate Splits

In [2]:
output_dir = osp.join(RESULTS_DATA_DIR, 'training-dev', 'run1')
output_dir

'/lab/data/results/training-dev/run1'

In [3]:
def to_df(cands):
    return pd.DataFrame([
        dict(id=c.id, type=c.type, label=c.gold_labels[0].value if c.gold_labels else -1)
        for c in cands
    ])

def get_all_ids(split):
    return to_df(session.query(Candidate).filter(Candidate.split == split).all())\
        .groupby('type')['id'].unique().apply(list)

In [19]:
SPLIT_MAP

{0: 'train', 1: 'dev', 2: 'infer', 3: 'test', 4: 'val'}

In [20]:
# Load all ids for each split (may take a few minutes)
ids = {
    SPLIT_MAP[k]: get_all_ids(k)
    for k in [SPLIT_DEV, SPLIT_VAL, SPLIT_TEST, SPLIT_TRAIN]
}

pd.DataFrame([
    dict(split=split, task=task, nids=len(ids[split][task]))
    for split in ids.keys()
    for task in ids[split].keys()
])

Unnamed: 0,nids,split,task
0,48656,train,inducing_cytokine
1,22983,train,inducing_transcription_factor
2,48527,train,secreted_cytokine
3,673,dev,inducing_cytokine
4,410,dev,inducing_transcription_factor
5,673,dev,secreted_cytokine
6,125108,infer,inducing_cytokine
7,55572,infer,inducing_transcription_factor
8,125236,infer,secreted_cytokine
9,100,test,inducing_cytokine


In [21]:
# Load dev candidates and labels for stratification
df = to_df(session.query(Candidate).filter(Candidate.split.in_([SPLIT_DEV])).all())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1756 entries, 0 to 1755
Data columns (total 3 columns):
id       1756 non-null int64
label    1756 non-null int64
type     1756 non-null object
dtypes: int64(2), object(1)
memory usage: 41.2+ KB


In [22]:
df.groupby(['type', 'label']).size().unstack()

label,-1,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
inducing_cytokine,540,133
inducing_transcription_factor,324,86
secreted_cytokine,592,81


In [23]:
# Build splits with larger fold partition of dev set as training data and held-out
# portion as the prediction target (with validation and test as usual)
from sklearn.model_selection import StratifiedKFold

def get_split(k, g, val_ids, test_ids):
    cv = StratifiedKFold(n_splits=10, random_state=TCRE_SEED)
    for train, test in cv.split(g['id'].values.reshape((-1, 1)), g['label']):
        yield pd.Series({
            'train': list(g['id'].iloc[train].values), 
            'predict': list(g['id'].iloc[test].values), 
            'val': list(val_ids[k]),
            'test': list(test_ids[k])
        })

splits = {}
for k, g in df.groupby('type'):
    splits[k] = []
    for split in get_split(k, g, ids['val'], ids['test']):
        splits[k].append(split)

In [24]:
splits.keys()

dict_keys(['inducing_cytokine', 'inducing_transcription_factor', 'secreted_cytokine'])

In [25]:
# Show number of ids to be written in splits files (by fold and task)
pd.DataFrame([
    {**dict(task=k, fold=i), **fold.apply(len).to_dict()}
    for k in splits.keys()
    for i, fold in enumerate(splits[k])
])

Unnamed: 0,fold,predict,task,test,train,val
0,0,68,inducing_cytokine,100,605,97
1,1,68,inducing_cytokine,100,605,97
2,2,68,inducing_cytokine,100,605,97
3,3,67,inducing_cytokine,100,606,97
4,4,67,inducing_cytokine,100,606,97
5,5,67,inducing_cytokine,100,606,97
6,6,67,inducing_cytokine,100,606,97
7,7,67,inducing_cytokine,100,606,97
8,8,67,inducing_cytokine,100,606,97
9,9,67,inducing_cytokine,100,606,97


In [26]:
# Write split arrays to individual files for each fold
for k, split in splits.items():
    for i, fold in enumerate(split):
        split_file = osp.join(output_dir, 'splits', f'dev_{k}_fold_{i}.json')
        fold.to_json(split_file, orient='index')
print(f"All fold splits written to {osp.join(output_dir, 'splits')}")

All fold splits written to /lab/data/results/training-dev/run1/splits


In [27]:
# Finally, write a split file for each task across the entire dev set
# with the train partition as the prediction target
for k in splits.keys():
    split_file = osp.join(output_dir, 'splits', f'dev_{k}_fold_all.json')
    split_data = pd.Series({
        'train': ids['dev'][k],
        'val': ids['val'][k],
        'test': ids['test'][k],
        'predict': ids['train'][k]
    })
    split_data.to_json(split_file, orient='index')
print(f"Complete splits written to {osp.join(output_dir, 'splits')}")

Complete splits written to /lab/data/results/training-dev/run1/splits


## Training

### Complete Dev Model

In [49]:
cmd_fmt = """mkdir -p {log_dir} && \
python /lab/repos/t-cell-relation-extraction/src/tcre/exec/v1/cli.py \
--relation-class={relation_class} \
--device="cuda:1" \
--output-dir={output_dir} \
train \
--splits-file={splits_file} \
--{checkpoints}-checkpoints \
--use-swaps \
--use-positions \
--no-secondary \
--marker-list=doub_01 \
--wrd-embedding-type=w2v_frozen \
--model-size=L \
--weight-decay=.001 \
--dropout=0.5 > {log_dir}/log.txt 2>&1
"""

In [45]:
def get_cmd(relation_class, fold, use_checkpoints):
    args = dict(
        relation_class=relation_class,
        splits_file=osp.join(output_dir, 'splits', f'dev_{relation_class}_fold_{fold}.json'),
        checkpoints='use' if use_checkpoints else 'no',
        output_dir=osp.join(output_dir, 'models', f'{relation_class}_fold_{fold}', 'data'),
        log_dir=osp.join(output_dir, 'models', f'{relation_class}_fold_{fold}')
    )
    return cmd_fmt.format(**args)

In [46]:
cmd = get_cmd('inducing_cytokine', 0, False)
#cmd = get_cmd('inducing_cytokine', 'all', False)
print(cmd)

mkdir -p /lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0 && python /lab/repos/t-cell-relation-extraction/src/tcre/exec/v1/cli.py --relation-class=inducing_cytokine --device="cuda:1" --output-dir=/lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/data train --splits-file=/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_0.json --no-checkpoints --save-predictions --use-swaps --use-positions --no-secondary --marker-list=doub_01 --wrd-embedding-type=w2v_frozen --model-size=L --weight-decay=.001 --dropout=0.5 > /lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/log.txt 2>&1



In [95]:
!mkdir -p /lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0 && \
python /lab/repos/t-cell-relation-extraction/src/tcre/exec/v1/cli.py \
--relation-class=inducing_cytokine --device="cuda:1" \
--output-dir=/lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/data \
train \
--splits-file=/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_0.json \
--use-checkpoints --use-swaps --use-positions --no-secondary --marker-list=doub_01 \
--wrd-embedding-type=w2v_frozen --model-size=L --weight-decay=.001 --dropout=0.5 

INFO:__main__:Gathering candidates for splits at "/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_0.json"
INFO:__main__:Split sizes = {'train': 605, 'val': 97, 'test': 100}
INFO:__main__:Found 802 candidates
INFO:__main__:Modeling config:
{     'batch_size': 32,
      'bidirectional': False,
      'cell_type': 'LSTM',
      'device': 'cuda:1',
      'dropout': 0.5,
      'entity_types': ['cytokine', 'immune_cell_type'],
      'label': 'relation_class=inducing_cytokine:splits_file=/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_0.json:marker_list=doub_01:use_secondary=False:use_swaps=True:use_lower=False:use_positions=True:use_checkpoints=True:wrd_embedding_type=w2v_frozen:model_size=L:bidirectional=False:cell_type=LSTM:learning_rate=0.005:weight_decay=0.001:dropout=0.5:vocab_limit=50000:device=cuda:1:batch_size=32:output_dir=/lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/data:seed=3832',
      'learning_rate': 0.005,
    

In [98]:
# !mkdir -p /lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0 && \
# python /lab/repos/t-cell-relation-extraction/src/tcre/exec/v1/cli.py \
# --relation-class=inducing_cytokine --device="cuda:1" \
# --output-dir=/lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/data \
# train \
# --splits-file=/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_0.json \
# --use-checkpoints --use-swaps --use-positions --no-secondary --marker-list=doub_01 \
# --wrd-embedding-type=w2v_frozen --model-size=L --weight-decay=.001 --dropout=0.5 

In [75]:
!mkdir -p /lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0 && \
python /lab/repos/t-cell-relation-extraction/src/tcre/exec/v1/cli.py \
--relation-class=inducing_cytokine --device="cpu" \
--output-dir=/lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/data \
predict \
--splits-file=/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_0.json 

INFO:__main__:Gathering candidates for splits at "/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_0.json"
INFO:__main__:Split sizes = {'predict': 68}
INFO:__main__:Found 68 candidates
INFO:__main__:Gathering predictions
INFO:__main__:Loading model state from checkpoint dir /lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/data/checkpoints
  "num_layers={}".format(dropout, num_layers))
INFO:__main__:Restored model with arguments: {'hidden_dim': 20, 'wrd_embed_dim': None, 'pos_embed_dim': 10, 'dropout': 0.5, 'bidirectional': False, 'cell_type': 'LSTM', 'device': 'cpu'}
INFO:__main__:Collecting features
100%|█████████████████████████████████████████| 68/68 [00:00<00:00, 1026.20it/s]
INFO:__main__:Sample feature records:

INFO:__main__:{     'e0_dist': [-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26],
      'e0_text': 'IL-12',
      'e1_dist': [     -24, -23, -22, -21, -20, -19, -18, -17, -16, -15, -1

In [76]:
dfp = pd.read_json('/lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/data/predictions.json')
dfp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68 entries, 0 to 9
Data columns (total 3 columns):
id        68 non-null int64
y_true    68 non-null int64
y_pred    68 non-null float64
dtypes: float64(1), int64(2)
memory usage: 2.1 KB


In [78]:
# %matplotlib inline
# dfp['y_pred'].hist(bins=32)

In [149]:
dfp[dfp['y_pred'] > 0].head(10)

Unnamed: 0,id,y,y_pred
0,170518,0,1
1,93524,0,1
4,79382,0,1
5,53695,0,1
12,248902,0,1
13,242713,0,1
15,221591,0,1
19,53746,0,1
23,685,0,1
25,228239,0,1


In [153]:
[
    (c, c.get_parent().text) for c in 
    session.query(Candidate).filter(Candidate.id.in_([170518, 93524, 79382, 53965])).all()
]

[(SecretedCytokine(Span("b'IL-2'", sentence=315891, chars=[61,64], words=[10,10]), Span("b'Th17'", sentence=315891, chars=[18,21], words=[3,3])),
  'Although Ets-1 KO Th17 cells were resistant to inhibition by IL-2, other IL-2–induced responses, such as induction of CD25, were intact in Ets-1 KO Th cells (unpublished data).'),
 (InducingCytokine(Span("b'IL-10'", sentence=527307, chars=[0,4], words=[0,0]), Span("b'Tr1'", sentence=527307, chars=[37,39], words=[6,6])),
  'IL-10 was originally shown to induce Tr1 cells .'),
 (InducingCytokine(Span("b'IL-12'", sentence=687300, chars=[0,4], words=[0,0]), Span("b'Th1'", sentence=687300, chars=[25,27], words=[4,4])),
  'IL-12 is responsible for Th1 polarization of lymphocytes.'),
 (InducingCytokine(Span("b'IL-4'", sentence=1370692, chars=[0,3], words=[0,0]), Span("b'Th2'", sentence=1370692, chars=[16,18], words=[2,2])),
  'IL-4 stimulates Th2 proliferation in an autocrine manner.')]

In [86]:
rc = os.system(cmd)

In [None]:
/lab/repos/t-cell-relation-extraction/src/tcre/exec/v1/cli.py --relation-class=inducing_transcription_factor --device="cuda:1" \
--output-dir=/lab/data/results/label-modeling-v3/run2/config_inducing_transcription_factor_0000 --marker-list=mult_01 --no-secondary \ 
--use-swaps --wrd-embedding-type=w2v_frozen --model-size=L --weight-decay=0 --use-positions --dropout=0.0 > /lab/data/results/label-modeling-v3/run2/config_inducing_transcription_factor_0000/log.txt 2>&1