## BERT Fine-Tuning Example

This notebook will export training and evaluation data for fine-tuning and scoring of a BERT model

See: 
- [Running Pytorch-Transformers on Custom Datasets](https://medium.com/dsnet/running-pytorch-transformers-on-custom-datasets-717fd9e10fe2)
- [pytorch-transformers-extensions](https://github.com/nikhilno1/nlp_projects/tree/f5e4ae159970b6fd613d2c2181265db336acc934/pytorch-transformers-extensions) (slight abstractions for running pytorch-transformers on custom datasets)
- [pytorch-transformers](https://github.com/huggingface/pytorch-transformers)

In [2]:
%matplotlib inline
from snorkel import SnorkelSession
from tcre.env import *
from tcre import supervision
from tcre.supervision import SPLIT_DEV, SPLIT_VAL, SPLIT_TEST, REL_FIELD_INDUCING_CYTOKINE, ENT_TYP_CT_L
from tcre.modeling import features
import numpy as np
import pandas as pd
session = SnorkelSession()
classes = supervision.get_candidate_classes()

In [3]:
candidate_class = {classes[c].field: classes[c] for c in classes}[REL_FIELD_INDUCING_CYTOKINE]
candidate_class

CandidateClass({'index': 0, 'name': 'InducingCytokine', 'field': 'inducing_cytokine', 'label': 'Induction', 'abbr': 'indck', 'entity_types': ['cytokine', 'immune_cell_type'], 'subclass': <class 'snorkel.models.candidate.InducingCytokine'>})

In [4]:
from snorkel.models import Candidate
splits = [SPLIT_DEV, SPLIT_VAL, SPLIT_TEST]
cands = session.query(Candidate).filter(Candidate.split.in_(frozenset(splits))).filter(Candidate.type == candidate_class.field).all()
len(cands)

1674

In [5]:
mcand = pd.DataFrame([dict(split=c.split, cand=c) for c in cands]).groupby('split')['cand'].unique().to_dict()
mcand.keys()

dict_keys([1, 2, 3])

In [24]:
from tcre.modeling import sampling

In [30]:
df_cand, df_dist = sampling.get_modeling_splits(session, target_split_map={'dev': 'train', 'val': 'val'})

In [31]:
df_cand.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 998 entries, 831 to 2967
Data columns (total 4 columns):
id       998 non-null int64
label    998 non-null int64
split    998 non-null object
task     998 non-null object
dtypes: int64(2), object(2)
memory usage: 39.0+ KB


In [32]:
df_cand['split'].value_counts()

train    766
val      232
Name: split, dtype: int64

In [6]:
import tempfile
import shutil
import os

DEFAULT_CONFIG = dict(max_seq_length=128, learning_rate=2e-5, num_train_epochs=3.0)
DEFAULT_BERT_PATH = osp.join('/lab/data/scibert', 'scibert_scivocab_uncased')

def run_transformer_model(session, config=DEFAULT_CONFIG, data_dir=None):
    df_cand, df_dist = sampling.get_modeling_splits(session, target_split_map={'dev': 'train', 'val': 'val', 'test': 'test'})
    
    def get_cands(split):
        cands = df_cand[df_cand['split'] == split]['id'].unique()
        cands = session.query(Candidate).filter(Candidate.id.in_(frozenset(cands)))
        return cands
    
    train_cands, eval_cands, test_cands = [get_cands(s) for s in ['train', 'val', 'test']]
    
    
CMD_TRAIN = """
python run_dataset.py --task_name tcre --do_train \
--do_lower_case --data_dir {data_dir} \
--model_type bert --model_name_or_path {model_name_or_path} \
--max_seq_length {max_seq_length} --learning_rate {learning_rate} --num_train_epochs {learning_rate} \
--overwrite_output_dir \
--output_dir {output_dir}
"""

CMD_EVAL = """
python run_dataset.py --task_name tcre --do_eval \
--do_lower_case --data_dir {data_dir} \
--model_type bert --model_name_or_path {model_name_or_path} \
--max_seq_length {max_seq_length} --learning_rate {learning_rate} --num_train_epochs {learning_rate} \
--overwrite_output_dir \
--output_dir {output_dir}
"""
    
def run_training(train_cands, eval_cands, config=DEFAULT_CONFIG, data_dir=None, bert_path=DEFAULT_BERT_PATH):
    
    cands = []
    cands += list(zip(['train'] * len(train_cands), train_cands)) 
    cands += list(zip(['dev'] * len(eval_cands), eval_cands)) 
    df = pd.DataFrame([
        dict(guid=c.id, text=features.get_scibert_text(c), split=s, label=str(features.get_label(c)))
        for (s, c) in cands
    ])
    assert df['label'].isin(['0', '1']).all()
    
    if data_dir is None:
        data_dir = tempfile.mkdtemp()
    input_dir = osp.join(data_dir, 'input')
    output_dir = osp.join(data_dir, 'output')
    log_file = osp.join(data_dir, 'log.txt')
    if osp.exists(input_dir):
        shutil.rmtree(input_dir)
    os.makedirs(input_dir, exist_ok=True)
    if osp.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    if osp.exists(log_file):
        os.remove(log_file)
    df.to_csv(osp.join(input_dir, 'data.csv'), index=False)
    
    
    cmd = CMD_TRAIN.format(data_dir=input_dir, output_dir=output_dir, model_name_or_path=bert_path, **config)
    cmd += " >> {}".format(log_file)
    rc = os.system(cmd)
    if rc != 0:
        raise ValueError('Command failed with return code {}:\n{}'.format(rc, cmd))
        
    
    pd.read_json('/tmp/scibert/inducing_cytokine/output/scores.json', lines=True).iloc[0]
    return cmd

In [7]:
config = {}
train_cands = mcand[SPLIT_DEV]
eval_cands = mcand[SPLIT_VAL]
data_dir = osp.join('/tmp', 'scibert', candidate_class.field)
os.makedirs(data_dir, exist_ok=True)
cmd = run_transformer({}, train_cands, eval_cands, data_dir=data_dir)

In [8]:
print(cmd)


    python run_dataset.py --task_name tcre --do_train --do_eval     --do_lower_case --data_dir /tmp/scibert/inducing_cytokine/input     --model_type bert --model_name_or_path /lab/data/scibert/scibert_scivocab_uncased     --max_seq_length 128 --learning_rate 2e-5 --num_train_epochs 3.0     --overwrite_output_dir     --output_dir /tmp/scibert/inducing_cytokine/output
    


In [103]:
# from pytorch_transformers import file_utils
# file_utils.PYTORCH_PRETRAINED_BERT_CACHE
# from pytorch_transformers import BertForSequenceClassification
# model = BertForSequenceClassification.from_pretrained('/tmp/scibert/inducing_cytokine/output')

In [14]:
# Training + Evaluation
# Note: /lab/data/scibert/scibert_scivocab_uncased/bert_config.json must be renamed to config.json
!python run_dataset.py --task_name tcre --do_train --do_eval \
--do_lower_case --data_dir /tmp/scibert/inducing_cytokine/input \
--model_type bert --model_name_or_path /lab/data/scibert/scibert_scivocab_uncased \
--max_seq_length 128 --learning_rate 2e-5 --num_train_epochs 8.0 \
--evaluate_during_training \
--overwrite_output_dir \
--output_dir /tmp/scibert/inducing_cytokine/output

09/10/2019 00:42:07 - INFO - pytorch_transformers.modeling_utils -   loading configuration file /lab/data/scibert/scibert_scivocab_uncased/config.json
09/10/2019 00:42:07 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "tcre",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 31090
}

09/10/2019 00:42:07 - INFO - pytorch_transformers.tokenization_utils -   Model name '/lab/data/scibert/scibert_scivocab_uncased' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-unca

In [22]:
# Evaluation only using model trained above:
!python run_dataset.py --task_name tcre --do_eval \
--do_lower_case --data_dir /tmp/scibert/inducing_cytokine/input \
--model_type bert --model_name_or_path /tmp/scibert/inducing_cytokine/output \
--max_seq_length 128 --learning_rate 2e-5 --num_train_epochs 8.0 \
--evaluate_during_training \
--output_dir /tmp/scibert/inducing_cytokine/output

09/10/2019 14:25:38 - INFO - pytorch_transformers.modeling_utils -   loading configuration file /tmp/scibert/inducing_cytokine/output/config.json
09/10/2019 14:25:38 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "tcre",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 31090
}

09/10/2019 14:25:38 - INFO - pytorch_transformers.tokenization_utils -   Model name '/tmp/scibert/inducing_cytokine/output' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-

In [23]:
!cat /tmp/scibert/inducing_cytokine/output/scores.json

{"acc":0.9172661871,"f1":0.5306122449,"acc_and_f1":0.723939216,"precision":0.5,"recall":0.5652173913,"n":278.0,"rate":0.0827338129}

In [42]:
pd.read_json('/tmp/scibert/inducing_cytokine/output/scores.json', lines=True).iloc[0].rename('value').rename_axis('metric').reset_index()

Unnamed: 0,metric,value
0,acc,0.917266
1,acc_and_f1,0.723939
2,f1,0.530612
3,n,278.0
4,precision,0.5
5,rate,0.082734
6,recall,0.565217
