# Dev Set Supervised Modeling 

Produce models and predictions for use as labeling functions.  The process for this is:

- **dev** predictions
    1. K fold split on **dev** candidates (with fairly large K)
    2. Train a model on each fold's training portion, using the **val** split for validation and the **test** split for testing as usual, then predict on the held-out (out-of-sample) fold
- **train** predictions
    1. Train on all **dev** with **val** for validation
    2. Apply to all **train** candidates
- Concatenate the predictions for both sets, store them in a large dict keyed by candidate id, and expose them in LFs

In [1]:
import pandas as pd
import numpy as np
import tqdm
import snorkel
import os
import os.path as osp
from snorkel import SnorkelSession
from snorkel.models import Candidate
from tcre.env import *
from tcre.supervision import *
from tcre.exec.v1 import cli_client
# Open the Snorkel session used for the candidate queries in this notebook
session = SnorkelSession()
# Candidate class lookup; each entry's `.field` is used as a task name below
classes = get_candidate_classes()

In [2]:
# Number of cross-validation folds built over the dev candidates
n_cv_splits = 10
# Grid-search results are read from `search_dir`; splits and trained models
# for this run are written under `output_dir`
search_dir = osp.join(RESULTS_DATA_DIR, 'gridsearch-dev', 'run1')
output_dir = osp.join(RESULTS_DATA_DIR, 'training-dev', 'run1')
(output_dir, search_dir, n_cv_splits)

('/lab/data/results/training-dev/run1',
 '/lab/data/results/gridsearch-dev/run1',
 10)

In [3]:
# One task per candidate class, named by that class's `field` attribute
tasks = list(map(lambda name: classes[name].field, classes))
tasks

['inducing_cytokine', 'secreted_cytokine', 'inducing_transcription_factor']

In [4]:
# Zero-padded, three-digit fold identifiers, one per CV split
folds = [f'{fold_idx:03d}' for fold_idx in range(n_cv_splits)]
folds

['000', '001', '002', '003', '004', '005', '006', '007', '008', '009']

## Generate Splits

In [5]:
def to_df(cands):
    """Summarize candidates as a DataFrame with one row per candidate.

    Args:
        cands: Iterable of candidate objects exposing ``id``, ``type`` and
            ``gold_labels`` (a possibly empty sequence whose items have a
            ``value`` attribute).

    Returns:
        DataFrame with columns ``id``, ``type`` and ``label``. ``label`` is the
        value of the candidate's first gold label, or -1 when it has none.
    """
    records = [
        dict(id=c.id, type=c.type, label=c.gold_labels[0].value if c.gold_labels else -1)
        for c in cands
    ]
    # Name the columns explicitly so that an empty candidate list still yields
    # a frame with the expected schema; callers group and filter by column name
    # and would otherwise fail with a KeyError.
    return pd.DataFrame(records, columns=['id', 'type', 'label'])

def get_all_ids(split):
    """Return the unique candidate ids of ``split`` as one list per candidate type.

    The result is a Series indexed by candidate type whose values are lists of ids.
    """
    cands = session.query(Candidate).filter(Candidate.split == split).all()
    ids_by_type = to_df(cands).groupby('type')['id'].unique()
    return ids_by_type.apply(list)

In [6]:
# Display the mapping from integer split codes to split names
SPLIT_MAP

{0: 'train', 1: 'dev', 2: 'infer', 3: 'test', 4: 'val'}

In [7]:
# Load all ids for each split (may take a few minutes); the result maps
# split name -> Series of id lists indexed by task
ids = dict(
    (SPLIT_MAP[split_code], get_all_ids(split_code))
    for split_code in (SPLIT_DEV, SPLIT_VAL, SPLIT_TEST, SPLIT_TRAIN)
)

# Summarize how many ids were loaded per split and task
pd.DataFrame([
    {'split': split_name, 'task': task_name, 'nids': len(ids[split_name][task_name])}
    for split_name in ids
    for task_name in ids[split_name].keys()
])

Unnamed: 0,nids,split,task
0,673,dev,inducing_cytokine
1,410,dev,inducing_transcription_factor
2,673,dev,secreted_cytokine
3,97,val,inducing_cytokine
4,117,val,inducing_transcription_factor
5,98,val,secreted_cytokine
6,100,test,inducing_cytokine
7,100,test,inducing_transcription_factor
8,100,test,secreted_cytokine
9,48656,train,inducing_cytokine


In [8]:
# Dev candidates with their gold labels; the labels drive the stratified
# fold assignment further down
df = to_df(
    session.query(Candidate)
    .filter(Candidate.split.in_([SPLIT_DEV]))
    .all()
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1756 entries, 0 to 1755
Data columns (total 3 columns):
id       1756 non-null int64
label    1756 non-null int64
type     1756 non-null object
dtypes: int64(2), object(1)
memory usage: 41.2+ KB


In [9]:
# Count dev candidates per (type, label); unstack turns the labels into columns
df.groupby(['type', 'label']).size().unstack()

label,-1,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
inducing_cytokine,540,133
inducing_transcription_factor,324,86
secreted_cytokine,592,81


In [10]:
# Build splits with larger fold partition of dev set as training data and held-out
# portion as the prediction target (with validation and test as usual)
from sklearn.model_selection import StratifiedKFold

def get_split(k, g, val_ids, test_ids):
    """Yield one split definition per stratified cross-validation fold of ``g``.

    Args:
        k: Key used to look up this task's ids in ``val_ids`` and ``test_ids``.
        g: DataFrame with ``id`` and ``label`` columns; folds are stratified
            by ``label``.
        val_ids: Mapping from ``k`` to the validation ids.
        test_ids: Mapping from ``k`` to the test ids.

    Yields:
        A ``pd.Series`` with keys ``train`` (in-fold ids), ``predict``
        (held-out fold ids), ``val`` and ``test``, each holding a list of ids.
    """
    # `random_state` only has an effect together with `shuffle=True`; without
    # it older scikit-learn silently ignores the seed and newer releases raise
    # a ValueError. Shuffling with the fixed seed keeps the folds reproducible.
    cv = StratifiedKFold(n_splits=n_cv_splits, shuffle=True, random_state=TCRE_SEED)
    for fit_idx, heldout_idx in cv.split(g['id'].values.reshape((-1, 1)), g['label']):
        yield pd.Series({
            'train': list(g['id'].iloc[fit_idx].values),
            'predict': list(g['id'].iloc[heldout_idx].values),
            'val': list(val_ids[k]),
            'test': list(test_ids[k])
        })

# Build the list of per-fold split definitions for every task
splits = {}
for task in tasks:
    task_df = df[df['type'] == task]
    assert len(task_df) > 0, f'No candidates found for task {task}'
    splits[task] = list(get_split(task, task_df, ids['val'], ids['test']))

In [11]:
# List the tasks for which splits were generated
splits.keys()

dict_keys(['inducing_cytokine', 'secreted_cytokine', 'inducing_transcription_factor'])

In [12]:
# Tabulate, per task and fold, how many ids each partition of the split holds
pd.DataFrame([
    {'task': task_name, 'fold': fold_idx, **fold.apply(len).to_dict()}
    for task_name, task_folds in splits.items()
    for fold_idx, fold in enumerate(task_folds)
])

Unnamed: 0,fold,predict,task,test,train,val
0,0,68,inducing_cytokine,100,605,97
1,1,68,inducing_cytokine,100,605,97
2,2,68,inducing_cytokine,100,605,97
3,3,67,inducing_cytokine,100,606,97
4,4,67,inducing_cytokine,100,606,97
5,5,67,inducing_cytokine,100,606,97
6,6,67,inducing_cytokine,100,606,97
7,7,67,inducing_cytokine,100,606,97
8,8,67,inducing_cytokine,100,606,97
9,9,67,inducing_cytokine,100,606,97


In [13]:
# Write split arrays to individual files for each fold
splits_dir = osp.join(output_dir, 'splits')
# Create the target directory once up front; `exist_ok` replaces the
# per-file check-then-create and keeps the cell safe to re-run
os.makedirs(splits_dir, exist_ok=True)
for task, split in splits.items():
    for i, fold in enumerate(split):
        split_file = osp.join(splits_dir, f'dev_{task}_fold_{i:03d}.json')
        fold.to_json(split_file, orient='index')
print(f"All fold splits written to {splits_dir}")

All fold splits written to /lab/data/results/training-dev/run1/splits


In [14]:
# Finally, write a split file for each task across the entire dev set
# with the train partition as the prediction target
splits_dir = osp.join(output_dir, 'splits')
# Ensure the directory exists so this cell does not depend on an earlier
# cell having created it
os.makedirs(splits_dir, exist_ok=True)
for task in splits.keys():
    split_file = osp.join(splits_dir, f'dev_{task}_fold_all.json')
    split_data = pd.Series({
        'train': ids['dev'][task],
        'val': ids['val'][task],
        'test': ids['test'][task],
        'predict': ids['train'][task]
    })
    split_data.to_json(split_file, orient='index')
print(f"Complete splits written to {splits_dir}")

Complete splits written to /lab/data/results/training-dev/run1/splits


## Training

Load model configs from grid search results:

In [15]:
# Each row is one model configuration; the rows are later converted to dicts
# and merged into the training options
dfcfg = pd.read_json(osp.join(search_dir, 'top_model_configurations.json'))
dfcfg

Unnamed: 0,bidirectional,cell_type,dropout,learning_rate,marker_list,model_size,use_positions,use_secondary,use_swaps,weight_decay,wrd_embedding_type
0,False,LSTM,0.5,0.01,doub_01,XL,True,False,False,0.001,w2v_frozen
1,False,LSTM,0.5,0.01,doub_01,XL,True,False,False,0.001,w2v_trained


Train all models and create predictions for each:

In [16]:
# CLI wrapper used below to build and execute the train/predict commands.
# NOTE(review): `require_options=True` with `exceptions=[...]` presumably forces
# every CLI option to be passed explicitly except the listed ones — confirm
# against cli_client.Client.
client = cli_client.Client(require_options=True, exceptions=['log_level', 'seed', 'batch_size', 'vocab_limit', 'use_lower', 'save_keys'])

In [None]:
# Shell template: run `cmd` and send both stdout and stderr to `log_file`
CMD_FORMAT = "{cmd} > {log_file} 2>&1"

def run_predictions(folds, tasks, cfgs, device='"cuda:1"'):
    """Train one model per (task, fold, config) and generate its predictions.

    For every combination, a model directory is created under
    ``output_dir/models``, the train command is executed with its output
    logged to ``train_log.txt``, and the predict command is then executed
    with its output logged to ``predict_log.txt``. Both commands use the
    splits file previously written for that task and fold.

    Args:
        folds: Fold identifiers as used in the split file names
            (e.g. ``'000'`` or ``'all'``).
        tasks: Relation class names, passed as the CLI ``relation_class``.
        cfgs: Sequence of dicts with training options, one per configuration.
        device: Value passed as the CLI ``device`` option; the default keeps
            the previously hard-coded device.
    """
    n_runs = len(tasks) * len(folds) * len(cfgs)
    # The context manager closes the progress bar even if a run raises
    with tqdm.tqdm(total=n_runs) as pbar:
        for task in tasks:
            for fold in folds:
                for i, cfg in enumerate(cfgs):
                    pbar.set_description(f'Processing task={task}, fold={fold}, config={i}')
                    splits_file = osp.join(output_dir, 'splits', f'dev_{task}_fold_{fold}.json')
                    model_dir = osp.join(output_dir, 'models', f'dev_{task}_fold_{fold}_cfg_{i:03d}')
                    train_log_file = osp.join(model_dir, 'train_log.txt')
                    predict_log_file = osp.join(model_dir, 'predict_log.txt')
                    data_dir = osp.join(model_dir, 'data')

                    # The log files live in model_dir, so it must exist before
                    # the shell redirection in CMD_FORMAT runs
                    os.makedirs(model_dir, exist_ok=True)

                    cli_args = dict(relation_class=task, device=device, output_dir=data_dir)

                    # Train with the configuration's options plus the fold's splits
                    train_cmd = client.cmd(
                        cli=cli_args,
                        train={**cfg, **dict(use_checkpoints=True, splits_file=splits_file)}
                    )
                    client.execute(CMD_FORMAT.format(cmd=train_cmd, log_file=train_log_file))

                    # Predict using the same output directory and splits file
                    predict_cmd = client.cmd(
                        cli=cli_args,
                        predict=dict(splits_file=splits_file)
                    )
                    client.execute(CMD_FORMAT.format(cmd=predict_cmd, log_file=predict_log_file))
                    pbar.update(1)

# Single-task runs kept for debugging (uncomment to try one fold first):
#run_predictions(['all'], ['secreted_cytokine'], dfcfg.to_dict(orient='records'))
#run_predictions(['000'], ['secreted_cytokine'], dfcfg.to_dict(orient='records'))
# Full run: every CV fold plus the 'all' split, for every task and configuration
run_predictions(folds + ['all'], tasks, dfcfg.to_dict(orient='records'))

Processing task=inducing_cytokine, fold=000, config=1:   2%|▏         | 1/66 [00:33<36:30, 33.70s/it]

In [56]:
# !ls /lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_000.json

In [55]:
# !python /lab/repos/t-cell-relation-extraction/src/tcre/exec/v1/cli.py --relation-class=inducing_cytokine --device="cpu" --output-dir=/lab/data/results/training-dev/run1/models/dev_inducing_cytokine_fold_000_cfg_000/data predict --splits-file=/lab/data/results/training-dev/run1/splits/dev_inducing_cytokine_fold_000.json

In [30]:
# dfh = pd.read_json('/lab/data/results/training-dev/run1/models/dev_secreted_cytokine_fold_000_cfg_000/data/history.json')
# dfh.set_index(['epoch', 'type'])['f1'].unstack()

In [19]:
# Inspect the predictions written for the 'all' fold of secreted_cytokine
# (config 0). The path is derived from `output_dir`, like every other path in
# this notebook, instead of repeating the absolute location.
predictions_file = osp.join(
    output_dir, 'models', 'dev_secreted_cytokine_fold_all_cfg_000', 'data', 'predictions.json'
)
dfp = pd.read_json(predictions_file)
dfp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48527 entries, 0 to 9999
Data columns (total 3 columns):
id        48527 non-null int64
y_true    48527 non-null int64
y_pred    48527 non-null float64
dtypes: float64(1), int64(2)
memory usage: 1.5 MB


In [78]:
# %matplotlib inline
# dfp['y_pred'].hist(bins=32)

In [20]:
# Show the rows with the highest predicted scores
dfp.sort_values('y_pred', ascending=False).head()

Unnamed: 0,id,y_true,y_pred
26720,278083,0,0.964123
13977,60695,0,0.96005
46931,367475,0,0.959832
31602,299321,0,0.959384
7649,19321,0,0.95917


In [21]:
# Unique candidate ids among the 10 highest-scoring predictions
list(dfp.sort_values('y_pred', ascending=False)['id'].head(10).unique())

[278083, 60695, 367475, 299321, 19321, 54598, 288078, 331541, 297000, 308685]

In [22]:
# Look up the highest-scoring candidates and show each with its parent's text.
# The ids are derived from `dfp` rather than pasted from an earlier output, so
# the cell stays correct when predictions are regenerated. They are cast to
# plain `int` because database drivers may not accept numpy integer types.
top_ids = [
    int(cand_id)
    for cand_id in dfp.sort_values('y_pred', ascending=False)['id'].head(10).unique()
]
[
    (c, c.get_parent().text) for c in 
    session.query(Candidate).filter(Candidate.id.in_(top_ids)).all()
]

[(SecretedCytokine(Span("b'IL-9'", sentence=59599, chars=[65,68], words=[13,13]), Span("b'Th9'", sentence=59599, chars=[51,53], words=[10,10])),
  'The latest addition to the list of subsets, termed Th9, secretes IL-9 as the signature cytokine and may play a role in several inflammatory disorders.'),
 (SecretedCytokine(Span("b'IL-6'", sentence=220721, chars=[70,73], words=[13,13]), Span("b'naive T'", sentence=220721, chars=[34,40], words=[6,7])),
  'The generation of Th17 cells from naive T cells activated with IL-1β, IL-6 and IL-23 in the absence of TGF-β has been reported.'),
 (SecretedCytokine(Span("b'IL-10'", sentence=320315, chars=[312,316], words=[60,60]), Span("b'Tregs'", sentence=320315, chars=[284,288], words=[54,54])),
  "G-MSCs' infusion modulated the function of multiple innate and adaptive immune cells through the COX/PGE2 pathway, resulting in a decreased infiltration of DCs, CD81 T-cells, Th-17, and MCs, a suppression of a variety of inflammatory cytokines, a reciprocal 