# Dev Set Supervised Modeling 

Produce models and predictions for use as labeling functions.  The process for this is:

- **dev** predictions
    1. K fold split on **dev** candidates (with fairly large K)
    2. Train a model on each fold, using the **val** split for validation and the held-out fold as **test**
- **train** predictions
    1. Train on all **dev** with **val** for validation
    2. Apply to all **train** candidates
- Concatenate the predictions for both sets into one large dict keyed by candidate id, and expose that dict to the labeling functions (LFs)

In [1]:
import os

import numpy as np
import pandas as pd
import snorkel
from snorkel import SnorkelSession
from snorkel.models import Candidate

from tcre.env import *
from tcre.supervision import *
# Database session used by every candidate query in the cells below
session = SnorkelSession()
# NOTE(review): `classes` is not referenced in any cell visible here; the helper comes from
# the tcre star imports and may have side effects — confirm before removing
classes = get_candidate_classes()

### Generate Splits

In [3]:
# Every artifact written by this notebook (splits, models, logs) lives under this run directory
run_path_parts = ('training-dev', 'run1')
output_dir = osp.join(RESULTS_DATA_DIR, *run_path_parts)
output_dir

'/lab/data/results/training-dev/run1'

In [60]:
def to_df(cands):
    """Summarize candidates as a DataFrame with one row per candidate.

    Args:
        cands: Iterable of candidate objects exposing `id`, `type` and `gold_labels`.

    Returns:
        DataFrame with columns `id`, `label` and `type`. `label` is the value of the
        candidate's first gold label, or -1 when the candidate has no gold labels.
        The columns are present even when `cands` is empty.
    """
    records = [
        dict(id=c.id, type=c.type, label=c.gold_labels[0].value if c.gold_labels else -1)
        for c in cands
    ]
    # An explicit schema keeps downstream groupby('type') working on empty input and makes
    # the column order independent of the pandas version
    return pd.DataFrame(records, columns=['id', 'label', 'type'])

def get_all_ids(split):
    """Return the unique candidate ids in `split`, grouped by candidate type.

    The result is a Series indexed by candidate type whose values are lists of ids.
    """
    split_candidates = session.query(Candidate).filter(Candidate.split == split).all()
    ids_by_type = to_df(split_candidates).groupby('type')['id'].unique()
    return ids_by_type.apply(list)

In [63]:
# Collect candidate ids per relation type for each split (queried in val, train, dev order)
val_ids, train_ids, dev_ids = [
    get_all_ids(split_name) for split_name in (SPLIT_VAL, SPLIT_TRAIN, SPLIT_DEV)
]

# Number of validation candidates available for each task
dict(zip(val_ids.index, map(len, val_ids)))

{'inducing_cytokine': 97,
 'inducing_transcription_factor': 117,
 'secreted_cytokine': 98}

In [34]:
# Load dev candidates and labels for stratification
df = to_df(session.query(Candidate).filter(Candidate.split.in_([SPLIT_DEV])).all())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1756 entries, 0 to 1755
Data columns (total 3 columns):
id       1756 non-null int64
label    1756 non-null int64
type     1756 non-null object
dtypes: int64(2), object(1)
memory usage: 41.2+ KB


In [35]:
# Class balance per task: one row per candidate type, one column per label value
label_counts = df.groupby(['type', 'label']).size()
label_counts.unstack()

label,-1,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
inducing_cytokine,540,133
inducing_transcription_factor,324,86
secreted_cytokine,592,81


In [52]:
from sklearn.model_selection import StratifiedKFold

def get_split(k, g, val_ids, n_splits=10):
    """Yield one train/test/val id split per stratified fold for a single task.

    Args:
        k: Task (candidate type) name, used to look up that task's ids in `val_ids`.
        g: DataFrame of candidates for task `k` with `id` and `label` columns.
        val_ids: Mapping from task name to the validation candidate ids for that task.
        n_splits: Number of folds; defaults to 10, the value that was previously hard-coded.

    Yields:
        pd.Series with keys 'train', 'test' and 'val', each holding a list of candidate ids.
        'train' and 'test' partition the ids in `g`; 'val' is the same list for every fold.
    """
    # random_state is intentionally not passed: it has no effect unless shuffle=True, and
    # scikit-learn >= 0.24 raises a ValueError for that combination. Without shuffling the
    # folds are deterministic and follow the row order of `g`.
    cv = StratifiedKFold(n_splits=n_splits)
    ids = g['id']
    # The feature matrix only supplies the number of samples; stratification uses the labels
    for train, test in cv.split(ids.values.reshape((-1, 1)), g['label']):
        yield pd.Series({
            'train': list(ids.iloc[train].values),
            'test': list(ids.iloc[test].values),
            'val': list(val_ids[k])
        })

# Map each task (candidate type) to the list of its per-fold train/test/val splits
splits = {
    task: list(get_split(task, task_df, val_ids))
    for task, task_df in df.groupby('type')
}

In [53]:
# Sanity check: `splits` holds one entry per candidate type found in the dev frame
splits.keys()

dict_keys(['inducing_cytokine', 'inducing_transcription_factor', 'secreted_cytokine'])

In [57]:
# Size of the train/test/val id lists for every (task, fold index) pair
fold_sizes = {}
for task, folds in splits.items():
    for fold_idx, fold in enumerate(folds):
        fold_sizes[(task, fold_idx)] = fold.apply(len).to_dict()
fold_sizes

{('inducing_cytokine', 0): {'train': 605, 'test': 68, 'val': 97},
 ('inducing_cytokine', 1): {'train': 605, 'test': 68, 'val': 97},
 ('inducing_cytokine', 2): {'train': 605, 'test': 68, 'val': 97},
 ('inducing_cytokine', 3): {'train': 606, 'test': 67, 'val': 97},
 ('inducing_cytokine', 4): {'train': 606, 'test': 67, 'val': 97},
 ('inducing_cytokine', 5): {'train': 606, 'test': 67, 'val': 97},
 ('inducing_cytokine', 6): {'train': 606, 'test': 67, 'val': 97},
 ('inducing_cytokine', 7): {'train': 606, 'test': 67, 'val': 97},
 ('inducing_cytokine', 8): {'train': 606, 'test': 67, 'val': 97},
 ('inducing_cytokine', 9): {'train': 606, 'test': 67, 'val': 97},
 ('inducing_transcription_factor', 0): {'train': 368, 'test': 42, 'val': 117},
 ('inducing_transcription_factor', 1): {'train': 368, 'test': 42, 'val': 117},
 ('inducing_transcription_factor', 2): {'train': 368, 'test': 42, 'val': 117},
 ('inducing_transcription_factor', 3): {'train': 368, 'test': 42, 'val': 117},
 ('inducing_transcriptio

In [59]:
# Write split arrays to individual files for each fold
for k, split in splits.items():
    for i, fold in enumerate(split):
        split_file = osp.join(output_dir, 'splits', f'dev_{k}_fold_{i}.json')
        fold.to_json(split_file, orient='index')
print(f"All fold splits written to {osp.join(output_dir, 'splits')}")

All splits written to /lab/data/results/training-dev/run1/splits


In [69]:
# Finally, write a split file for each task across the entire dev set
for k in splits.keys():
    split_file = osp.join(output_dir, 'splits', f'dev_{k}_fold_all.json')
    ids = pd.Series({
        'train': dev_ids[k],
        'val': val_ids[k],
        'test': train_ids[k]
    })
    ids.to_json(split_file, orient='index')
print(f"Complete splits written to {osp.join(output_dir, 'splits')}")

Complete splits written to /lab/data/results/training-dev/run1/splits


## Training

### Complete Dev Model

In [110]:
# Single-line shell command template for one training run. The placeholders
# (relation_class, splits_file, checkpoints, output_dir, log_file) are filled via str.format;
# stdout and stderr are both redirected to the log file.
cmd_fmt = ' '.join([
    'python /lab/repos/t-cell-relation-extraction/src/tcre/exec/v1/cli.py',
    '--relation-class={relation_class}',
    '--splits-file={splits_file}',
    '--{checkpoints}-checkpoints',
    '--save-predictions',
    '--use-swaps',
    '--use-positions',
    '--no-secondary',
    '--marker-list=doub_01',
    '--wrd-embedding-type=w2v_frozen',
    '--model-size=L',
    '--weight-decay=.001',
    '--dropout=0.5',
    '--device="cuda:1"',
    '--output-dir={output_dir}',
    '> {log_file}',
    '2>&1',
]) + '\n'

In [111]:
def get_cmd(relation_class, fold, use_checkpoints):
    """Render the training shell command for one relation class and fold.

    Args:
        relation_class: Relation/task name; selects the splits file and model directory.
        fold: Fold identifier embedded in the splits file name and model directory name.
        use_checkpoints: When truthy the command gets `--use-checkpoints`, otherwise
            `--no-checkpoints`.

    Returns:
        The `cmd_fmt` template filled in with paths under `output_dir`.
    """
    run_name = f'{relation_class}_fold_{fold}'
    model_dir = osp.join(output_dir, 'models', run_name)
    checkpoint_mode = 'use' if use_checkpoints else 'no'
    return cmd_fmt.format(
        relation_class=relation_class,
        splits_file=osp.join(output_dir, 'splits', f'dev_{run_name}.json'),
        checkpoints=checkpoint_mode,
        output_dir=osp.join(model_dir, 'data'),
        log_file=osp.join(model_dir, 'log.txt'),
    )

In [112]:
# Build the command for the first inducing_cytokine fold (checkpoints disabled) and show it;
# print is used so the command appears as plain text rather than as an escaped repr
cmd = get_cmd(relation_class='inducing_cytokine', fold=0, use_checkpoints=False)
print(cmd)

python /lab/repos/t-cell-relation-extraction/src/tcre/exec/v1/cli.py --relation-class=inducing_cytokine --splits-file=/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_0.json --no-checkpoints --save-predictions --use-swaps --use-positions --no-secondary --marker-list=doub_01 --wrd-embedding-type=w2v_frozen --model-size=L --weight-decay=.001 --dropout=0.5 --device="cuda:1" --output-dir=/lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/data > /lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/log.txt 2>&1



In [138]:
# Run fold 0 of inducing_cytokine straight from the shell so the training log streams into the
# notebook. The model directory is created first. Unlike the command printed above, this
# invocation passes --use-checkpoints.
!mkdir -p /lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0 && \
python /lab/repos/t-cell-relation-extraction/src/tcre/exec/v1/cli.py --relation-class=inducing_cytokine \
--splits-file=/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_0.json \
--use-checkpoints --save-predictions --use-swaps --use-positions --no-secondary --marker-list=doub_01 \
--wrd-embedding-type=w2v_frozen --model-size=L --weight-decay=.001 --dropout=0.5 --device="cuda:1" \
--output-dir=/lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/data 
# Log redirection is disabled for this run; the redirect target is kept here for reference:
#/lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/log.txt 2>&1

INFO:__main__:Gathering candidates for splits at "/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_0.json"
INFO:__main__:Found 770 candidates (split sizes = {'train': 605, 'test': 68, 'val': 97}
INFO:__main__:Initializing /lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/data
INFO:__main__:Training config:
{     'batch_size': 32,
      'device': 'cuda:1',
      'dropout': 0.5,
      'entity_types': ['cytokine', 'immune_cell_type'],
      'label': 'relation_class=inducing_cytokine:splits_file=/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_0.json:marker_list=doub_01:use_secondary=False:use_swaps=True:use_lower=False:use_positions=True:use_checkpoints=True:wrd_embedding_type=w2v_frozen:model_size=L:learning_rate=0.005:weight_decay=0.001:dropout=0.5:vocab_limit=50000:device=cuda:1:batch_size=32:model_dir=/lab/data/results/training-dev/run1/models/inducing_cytokine_fold_0/data',
      'learning_rate': 0.005,
      'marker_list': 

INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=250.
INFO:tcre.modeling.training:Epoch[1] Iteration[10] Loss: 0.5288 LR: 0.005
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Training Results - Epoch: 1  Count: 605 Loss: 0.69 Accuracy: 0.803 F1: 0.000
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Validation Results - Epoch: 1  Count: 97 Loss: 0.69 Accuracy: 0.866 F1: 0.000
INFO:tcre.modeling.training:Epoch[2] Iteration[20] Loss: 0.6708 LR: 0.005
INFO:tcre.modeling.training:Epoch[2] Iteration[30] Loss:

INFO:tcre.modeling.training:Epoch[11] Iteration[200] Loss: 0.3342 LR: 0.005
INFO:ignite.engine.engine.Engine:Epoch[11] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Training Results - Epoch: 11  Count: 605 Loss: 0.69 Accuracy: 0.825 F1: 0.209
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Validation Results - Epoch: 11  Count: 97 Loss: 0.69 Accuracy: 0.876 F1: 0.143
INFO:tcre.modeling.training:Epoch[12] Iteration[210] Loss: 0.4585 LR: 0.005
INFO:tcre.modeling.training:Epoch[12] Iteration[220] Loss: 0.4345 LR: 0.005
INFO:ignite.engine.engine.Engine:Epoch[12] Comp

INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Validation Results - Epoch: 20  Count: 97 Loss: 0.77 Accuracy: 0.763 F1: 0.258
INFO:tcre.modeling.training:Epoch[21] Iteration[390] Loss: 0.1278 LR: 0.005
INFO:ignite.engine.engine.Engine:Epoch[21] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Training Results - Epoch: 21  Count: 605 Loss: 0.66 Accuracy: 0.889 F1: 0.617
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Validation Results - Epo

INFO:tcre.modeling.training:Epoch[30] Iteration[570] Loss: 0.0022 LR: 0.00125
INFO:ignite.engine.engine.Engine:Epoch[30] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Training Results - Epoch: 30  Count: 605 Loss: 0.63 Accuracy: 0.982 F1: 0.952
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Validation Results - Epoch: 30  Count: 97 Loss: 0.75 Accuracy: 0.804 F1: 0.387
INFO:tcre.modeling.training:Epoch[31] Iteration[580] Loss: 0.0331 LR: 0.00125
INFO:ignite.engine.engine.Engine:Epoch[31] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run s

INFO:tcre.modeling.training:Epoch[40] Iteration[750] Loss: 0.1042 LR: 0.00125
INFO:tcre.modeling.training:Epoch[40] Iteration[760] Loss: 0.0015 LR: 0.00125
INFO:ignite.engine.engine.Engine:Epoch[40] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Training Results - Epoch: 40  Count: 605 Loss: 0.62 Accuracy: 0.985 F1: 0.961
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Validation Results - Epoch: 40  Count: 97 Loss: 0.75 Accuracy: 0.804 F1: 0.424
INFO:tcre.modeling.training:Epoch[41] Iteration[770] Loss: 0.0141 LR: 0.00125
INFO:ignite.engine.engine.Engine:Epoch[41

INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Validation Results - Epoch: 49  Count: 97 Loss: 0.78 Accuracy: 0.753 F1: 0.333
INFO:tcre.modeling.training:Epoch[50] Iteration[940] Loss: 0.0175 LR: 0.00125
INFO:tcre.modeling.training:Epoch[50] Iteration[950] Loss: 0.0007 LR: 0.00125
INFO:ignite.engine.engine.Engine:Epoch[50] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Training Results - Epoch: 50  Count: 605 Loss: 0.62 Accuracy: 0.998 F1: 0.996
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run co

INFO:tcre.modeling.training:Epoch[59] Iteration[1110] Loss: 0.0169 LR: 0.0003125
INFO:tcre.modeling.training:Epoch[59] Iteration[1120] Loss: 0.0036 LR: 0.0003125
INFO:ignite.engine.engine.Engine:Epoch[59] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Training Results - Epoch: 59  Count: 605 Loss: 0.62 Accuracy: 1.000 F1: 1.000
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:00
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken 00:00:00
INFO:tcre.modeling.training:Validation Results - Epoch: 59  Count: 97 Loss: 0.77 Accuracy: 0.763 F1: 0.343
INFO:tcre.modeling.training:Epoch[60] Iteration[1130] Loss: 0.0138 LR: 0.0003125
INFO:tcre.modeling.training:Epoc

In [86]:
# Launch the training command built above; its output goes to the log file named in `cmd`
rc = os.system(cmd)
# Fail loudly instead of continuing with missing or partial model outputs
if rc != 0:
    raise RuntimeError(f'Training command failed with exit status {rc}; check the log file named in `cmd`')

In [None]:
# NOTE: scratch reference only. This is a bare CLI invocation (no `!` or `python` prefix) that
# points at a different experiment directory (label-modeling-v3/run2). It is not valid Python,
# so it is kept commented out to let the notebook run top to bottom.
# /lab/repos/t-cell-relation-extraction/src/tcre/exec/v1/cli.py --relation-class=inducing_transcription_factor --device="cuda:1" \
# --output-dir=/lab/data/results/label-modeling-v3/run2/config_inducing_transcription_factor_0000 --marker-list=mult_01 --no-secondary \
# --use-swaps --wrd-embedding-type=w2v_frozen --model-size=L --weight-decay=0 --use-positions --dropout=0.0 > /lab/data/results/label-modeling-v3/run2/config_inducing_transcription_factor_0000/log.txt 2>&1