In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import itertools
import tqdm
import os.path as osp
from snorkel import SnorkelSession
from tcre.env import *
from tcre import supervision
from tcre.exec.v1 import bert
# Session object handed to every `bert.run_transformer_modeling` call below.
session = SnorkelSession()
# Collection of candidate classes; it is iterated and indexed by key further down,
# and each entry's `.field` attribute is used as the task label.
classes = supervision.get_candidate_classes()

In [2]:
# Root directory for this run: per-task modeling data and the exported scores CSV
# are both written underneath it.
# RESULTS_DATA_DIR is presumably provided by the `tcre.env` star import -- TODO confirm.
output_dir = osp.join(RESULTS_DATA_DIR, 'modeling-bert', 'run1')
output_dir

'/lab/data/results/modeling-bert/run1'

In [3]:
# Display the default BERT weights path; it is one of the two `bert_path` options
# in the hyperparameter grid.
bert.DEFAULT_BERT_PATH

'/lab/data/scibert/scibert_scivocab_uncased'

In [4]:
# Hyperparameter grid, a subset of the values suggested in
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
# `keys` names the grid axes in the same order as the value lists given to
# itertools.product below; each combination becomes one config dict.
keys = ['learning_rate', 'num_train_epochs', 'per_gpu_train_batch_size', 'max_seq_length', 'bert_path']
configs = [
    dict(zip(keys, combo))
    for combo in itertools.product(
        [5e-5, 3e-5, 2e-5],                            # learning_rate
        [2, 4, 6],                                     # num_train_epochs
        [16, 32],                                      # per_gpu_train_batch_size
        [128],                                         # max_seq_length
        ['bert-base-uncased', bert.DEFAULT_BERT_PATH]  # bert_path
    )
]
configs[:5]

[{'learning_rate': 5e-05,
  'num_train_epochs': 2,
  'per_gpu_train_batch_size': 16,
  'max_seq_length': 128,
  'bert_path': 'bert-base-uncased'},
 {'learning_rate': 5e-05,
  'num_train_epochs': 2,
  'per_gpu_train_batch_size': 16,
  'max_seq_length': 128,
  'bert_path': '/lab/data/scibert/scibert_scivocab_uncased'},
 {'learning_rate': 5e-05,
  'num_train_epochs': 2,
  'per_gpu_train_batch_size': 32,
  'max_seq_length': 128,
  'bert_path': 'bert-base-uncased'},
 {'learning_rate': 5e-05,
  'num_train_epochs': 2,
  'per_gpu_train_batch_size': 32,
  'max_seq_length': 128,
  'bert_path': '/lab/data/scibert/scibert_scivocab_uncased'},
 {'learning_rate': 5e-05,
  'num_train_epochs': 4,
  'per_gpu_train_batch_size': 16,
  'max_seq_length': 128,
  'bert_path': 'bert-base-uncased'}]

In [5]:
def get_model_name(bert_path):
    """Map a BERT weights path to the short model label stored with the scores.

    Args:
        bert_path: Either the literal 'bert-base-uncased' or
            ``bert.DEFAULT_BERT_PATH``.

    Returns:
        'bert-base-uncased' or 'bert-scibert-uncased' respectively.

    Raises:
        ValueError: If ``bert_path`` is neither of the two supported values.
    """
    base_label = 'bert-base-uncased'
    # The base model is identified by its own name, so path and label coincide.
    if bert_path == base_label:
        return base_label
    if bert_path != bert.DEFAULT_BERT_PATH:
        raise ValueError('BERT path "{}" not valid'.format(bert_path))
    return 'bert-scibert-uncased'

def get_scores_for_class(candidate_class):
    """Run the whole config grid for one candidate class and collect its scores.

    Reads the notebook-level ``configs``, ``output_dir`` and ``session``.

    Args:
        candidate_class: Candidate class to model; its ``field`` attribute
            names the sub-directory of ``output_dir`` passed as ``data_dir``.

    Returns:
        The per-config results of ``bert.run_transformer_modeling`` joined
        with ``pd.concat``, each tagged with ``config_index`` (position in
        ``configs``) and ``model`` (label from ``get_model_name``).

    Raises:
        ValueError: If any config carries a ``bert_path`` that
            ``get_model_name`` rejects. Raised before any modeling run starts.
    """
    # Resolve every model label up front: an unsupported path then fails
    # immediately instead of after the slow modeling runs that precede it.
    model_names = [get_model_name(config['bert_path']) for config in configs]
    data_dir = osp.join(output_dir, candidate_class.field)
    res = []
    for i, config in tqdm.tqdm(list(enumerate(configs))):
        bert_path = config['bert_path']
        # bert_path is passed as its own argument, so it is kept out of the config dict.
        cfg = {k: v for k, v in config.items() if k != 'bert_path'}
        scores = bert.run_transformer_modeling(
            session, candidate_class, config=cfg, bert_path=bert_path,
            data_dir=data_dir, print_commands=False, clear=True
        )
        res.append(scores.assign(config_index=i, model=model_names[i]))
    return pd.concat(res)

def get_scores():
    """Run the config grid for every candidate class and stack the results.

    Iterates the notebook-level ``classes`` collection, announces each class
    on stdout, and delegates the modeling to ``get_scores_for_class``.

    Returns:
        The ``pd.concat`` of all per-class results, each tagged with a
        ``task`` column holding the class's ``field`` value.
    """
    frames = []
    for key in classes:
        target = classes[key]
        task_name = target.field
        print(f'Running modeling for class "{task_name}"')
        frames.append(get_scores_for_class(target).assign(task=task_name))
    return pd.concat(frames)

# Full grid run over every candidate class. The recorded progress bars show
# roughly 45 minutes per class (36 configs each), so re-run this cell deliberately.
df = get_scores()
df.head()

  0%|          | 0/36 [00:00<?, ?it/s]

Running modeling for class "inducing_cytokine"


100%|██████████| 36/36 [47:03<00:00, 79.39s/it]  
  0%|          | 0/36 [00:00<?, ?it/s]

Running modeling for class "secreted_cytokine"


100%|██████████| 36/36 [46:00<00:00, 80.00s/it]
  0%|          | 0/36 [00:00<?, ?it/s]

Running modeling for class "inducing_transcription_factor"


100%|██████████| 36/36 [43:39<00:00, 74.92s/it]


Unnamed: 0,metric,value,split,config_index,model,task
0,acc_and_f1,0.756539,train,0,bert-base-uncased,inducing_cytokine
1,accuracy,0.737931,train,0,bert-base-uncased,inducing_cytokine
2,f1,0.775148,train,0,bert-base-uncased,inducing_cytokine
3,n,290.0,train,0,bert-base-uncased,inducing_cytokine
4,precision,0.678756,train,0,bert-base-uncased,inducing_cytokine


In [6]:
# Best test-split F1 per task for the SciBERT model.
# Note: this is the maximum over every config evaluated on the test split, so it
# is an optimistic upper bound rather than the score of a validation-selected model.
(
    df
    .query("metric == 'f1' and split == 'test' and model == 'bert-scibert-uncased'")
    .groupby(['task'])['value']
    .max()
)

task
inducing_cytokine                0.781250
inducing_transcription_factor    0.750000
secreted_cytokine                0.718563
Name: value, dtype: float64

In [7]:
# For each (task, model) pair, pick the config index with the highest validation F1.
# Ties go to whichever row `sort_values` places last.
top_configs = (
    df
    .query("metric == 'f1' and split == 'val'")
    .groupby(['task', 'model'])
    .apply(lambda grp: grp.sort_values('value')['config_index'].iloc[-1])
)
top_configs

task                           model               
inducing_cytokine              bert-base-uncased       28
                               bert-scibert-uncased    23
inducing_transcription_factor  bert-base-uncased       34
                               bert-scibert-uncased     9
secreted_cytokine              bert-base-uncased        6
                               bert-scibert-uncased    19
dtype: int64

In [8]:
# Keep only the score rows belonging to the selected config of each (task, model)
# pair, then drop the config index since one config per pair remains.
df_top = (
    df
    .set_index(['task', 'model', 'config_index'])
    .loc[[tuple(key) for key in top_configs.reset_index().values]]
    .reset_index()
    .drop(columns='config_index')
)
# Exactly one value must remain per task/metric/model/split combination.
assert (df_top.groupby(['task', 'metric', 'model', 'split']).size() == 1).all()
df_top.head()

Unnamed: 0,task,model,metric,value,split
0,inducing_cytokine,bert-base-uncased,acc_and_f1,0.841408,train
1,inducing_cytokine,bert-base-uncased,accuracy,0.837931,train
2,inducing_cytokine,bert-base-uncased,f1,0.844884,train
3,inducing_cytokine,bert-base-uncased,n,290.0,train
4,inducing_cytokine,bert-base-uncased,precision,0.810127,train


In [9]:
# F1 rows of the selected configs, across all splits.
df_top.query("metric == 'f1'")

Unnamed: 0,task,model,metric,value,split
2,inducing_cytokine,bert-base-uncased,f1,0.844884,train
9,inducing_cytokine,bert-base-uncased,f1,0.680851,val
16,inducing_cytokine,bert-base-uncased,f1,0.610169,test
23,inducing_cytokine,bert-scibert-uncased,f1,0.882155,train
30,inducing_cytokine,bert-scibert-uncased,f1,0.826087,val
37,inducing_cytokine,bert-scibert-uncased,f1,0.678571,test
44,inducing_transcription_factor,bert-base-uncased,f1,0.824645,train
51,inducing_transcription_factor,bert-base-uncased,f1,0.787234,val
58,inducing_transcription_factor,bert-base-uncased,f1,0.710744,test
65,inducing_transcription_factor,bert-scibert-uncased,f1,0.875576,train


## Export

In [10]:
# Build the export frame: spell out the split names and normalise the
# 'acc' metric label to 'accuracy'; every other metric label is left as is.
dfe = df_top.assign(
    split=df_top['split'].map({'train': 'training', 'val': 'validation', 'test': 'test'}),
    metric=df_top['metric'].replace({'acc': 'accuracy'}),
)
# `map` yields NaN for any split label missing from the mapping; fail loudly if so.
assert dfe['split'].notnull().all()
dfe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 5 columns):
task      126 non-null object
model     126 non-null object
metric    126 non-null object
value     126 non-null float64
split     126 non-null object
dtypes: float64(1), object(4)
memory usage: 5.0+ KB


In [11]:
# Preview the export frame with the renamed split labels.
dfe.head()

Unnamed: 0,task,model,metric,value,split
0,inducing_cytokine,bert-base-uncased,acc_and_f1,0.841408,training
1,inducing_cytokine,bert-base-uncased,accuracy,0.837931,training
2,inducing_cytokine,bert-base-uncased,f1,0.844884,training
3,inducing_cytokine,bert-base-uncased,n,290.0,training
4,inducing_cytokine,bert-base-uncased,precision,0.810127,training


In [12]:
# Write the selected-config scores into the run's output directory.
path = osp.join(output_dir, 'scores.csv')
# index=False: the frame's positional index is not written to the CSV.
dfe.to_csv(path, index=False)
path

'/lab/data/results/modeling-bert/run1/scores.csv'