## BERT Fine-Tuning Example

This notebook exports training and evaluation data for fine-tuning and scoring a BERT model, and builds the command line used to run the fine-tuning.

See: 
- [pytorch-transformers-extensions](https://github.com/nikhilno1/nlp_projects/tree/f5e4ae159970b6fd613d2c2181265db336acc934/pytorch-transformers-extensions) (slight abstractions for running pytorch-transformers on custom datasets)
- [pytorch-transformers](https://github.com/huggingface/pytorch-transformers)

In [53]:
%matplotlib inline
from snorkel import SnorkelSession
from tcre.env import *
from tcre.supervision import SPLIT_DEV, SPLIT_VAL, SPLIT_TEST, REL_FIELD_INDUCING_CYTOKINE, ENT_TYP_CT_L
from tcre.modeling import features
import numpy as np
import pandas as pd
# Open the Snorkel database session used by the queries below
session = SnorkelSession()
# NOTE(review): `supervision` is not imported by name in this cell; presumably it is
# exposed through `from tcre.env import *` -- confirm, or import it explicitly.
classes = supervision.get_candidate_classes()

In [22]:
# Index the candidate classes by their relation field and pick the inducing-cytokine one
candidate_class = dict(
    (classes[key].field, classes[key]) for key in classes
)[REL_FIELD_INDUCING_CYTOKINE]
candidate_class

CandidateClass({'index': 0, 'name': 'InducingCytokine', 'field': 'inducing_cytokine', 'label': 'Induction', 'abbr': 'indck', 'entity_types': ['cytokine', 'immune_cell_type'], 'subclass': <class 'snorkel.models.candidate.InducingCytokine'>})

In [54]:
from snorkel.models import Candidate
# Load every candidate of the selected relation type that sits in the dev, val or test split
splits = [SPLIT_DEV, SPLIT_VAL, SPLIT_TEST]
cands = (
    session.query(Candidate)
    .filter(Candidate.split.in_(frozenset(splits)))
    .filter(Candidate.type == candidate_class.field)
    .all()
)
len(cands)

1424

In [68]:
# Map each split id to the array of unique candidates assigned to that split
mcand = (
    pd.DataFrame([{'split': cand.split, 'cand': cand} for cand in cands])
    .groupby('split')['cand']
    .unique()
    .to_dict()
)
mcand.keys()

dict_keys([1, 2, 3])

In [72]:
# Reload the features module so that edits to features.py are picked up without
# restarting the kernel. `imp` is deprecated (and removed in Python 3.12);
# `importlib.reload` is the supported replacement.
import importlib
importlib.reload(features)

<module 'tcre.modeling.features' from '/lab/repos/t-cell-relation-extraction/src/tcre/modeling/features.py'>

In [100]:
import tempfile
import shutil
import os

def run_transformer(config, train_cands, eval_cands, data_dir=None):
    """Export candidates to CSV and build the ``run_dataset.py`` fine-tuning command.

    The candidates are written to ``<data_dir>/input/data.csv`` with the columns
    ``guid``, ``text``, ``split`` and ``label``, and ``<data_dir>/output`` is
    (re)created empty.  Nothing is executed here: the shell command is only
    returned as a string.

    Args:
        config: Optional dict of overrides for the generated command.  Recognized
            keys (all optional): ``bert_path``, ``max_seq_length``,
            ``learning_rate`` and ``num_train_epochs``.  ``{}`` or ``None``
            reproduces the default command.
        train_cands: Candidates exported with ``split='train'``.
        eval_cands: Candidates exported with ``split='dev'``.
        data_dir: Working directory; a fresh temporary directory is created when
            omitted.  WARNING: any existing ``input`` and ``output``
            subdirectories of ``data_dir`` are deleted.

    Returns:
        str: The command line (paths and values are shell-quoted).

    Raises:
        ValueError: If both candidate collections are empty.
        AssertionError: If any exported label is not ``'0'`` or ``'1'``.
    """
    import shlex  # local import keeps this cell self-contained

    config = config or {}

    # Tag every candidate with the split name written to the CSV
    tagged = [('train', c) for c in train_cands] + [('dev', c) for c in eval_cands]
    if not tagged:
        raise ValueError('No candidates provided; nothing to export')

    df = pd.DataFrame([
        dict(guid=c.id, text=features.get_scibert_text(c), split=s, label=str(features.get_label(c)))
        for (s, c) in tagged
    ])
    assert df['label'].isin(['0', '1']).all(), 'Labels must all be 0 or 1'

    if data_dir is None:
        data_dir = tempfile.mkdtemp()
    input_dir = os.path.join(data_dir, 'input')
    output_dir = os.path.join(data_dir, 'output')
    # Start from clean directories so stale files from earlier exports cannot leak in
    for path in (input_dir, output_dir):
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path, exist_ok=True)
    df.to_csv(os.path.join(input_dir, 'data.csv'), index=False)

    default_bert_path = os.path.join('/lab/data/scibert', 'scibert_scivocab_uncased')

    def _arg(value):
        # Quote so that paths containing spaces or shell metacharacters stay one argument
        return shlex.quote(str(value))

    # The trailing backslashes are string-literal line continuations, so the command
    # comes out as a single shell line.
    cmd = """
    python run_dataset.py --task_name tcre --do_train --do_eval \
    --do_lower_case --data_dir {input_dir} \
    --model_type bert --model_name_or_path {bert_path} \
    --max_seq_length {max_seq_length} --learning_rate {learning_rate} --num_train_epochs {num_train_epochs} \
    --overwrite_output_dir \
    --output_dir {output_dir}
    """.format(
        input_dir=_arg(input_dir),
        bert_path=_arg(config.get('bert_path', default_bert_path)),
        max_seq_length=_arg(config.get('max_seq_length', '128')),
        learning_rate=_arg(config.get('learning_rate', '2e-5')),
        num_train_epochs=_arg(config.get('num_train_epochs', '3.0')),
        output_dir=_arg(output_dir),
    )

    return cmd

In [101]:
# Export the DEV split as training data and the VAL split as evaluation data,
# then build the fine-tuning command for this relation type.
config = {}
train_cands = mcand[SPLIT_DEV]
eval_cands = mcand[SPLIT_VAL]
data_dir = osp.join('/tmp', 'scibert', candidate_class.field)
os.makedirs(data_dir, exist_ok=True)
# Pass the `config` defined above rather than a second, unrelated empty dict
cmd = run_transformer(config, train_cands, eval_cands, data_dir=data_dir)

In [102]:
# Display the generated fine-tuning command
print(cmd)


    python run_dataset.py --task_name tcre --do_train --do_eval     --do_lower_case --data_dir /tmp/scibert/inducing_cytokine/input     --model_type bert --model_name_or_path /lab/data/scibert/scibert_scivocab_uncased     --max_seq_length 128 --learning_rate 2e-5 --num_train_epochs 3.0     --output_dir /tmp/scibert/inducing_cytokine/output
    


In [103]:
# from pytorch_transformers import file_utils
# file_utils.PYTORCH_PRETRAINED_BERT_CACHE

In [110]:
# from pytorch_transformers import BertForSequenceClassification
# model = BertForSequenceClassification.from_pretrained('/tmp/scibert/inducing_cytokine/output')

In [105]:
# Fine-tune and evaluate starting from the SciBERT (scivocab, uncased) weights.
# NOTE: /lab/data/scibert/scibert_scivocab_uncased/bert_config.json must be renamed to config.json
!python run_dataset.py --task_name tcre --do_train --do_eval \
--do_lower_case --data_dir /tmp/scibert/inducing_cytokine/input \
--model_type bert --model_name_or_path /lab/data/scibert/scibert_scivocab_uncased \
--max_seq_length 128 --learning_rate 2e-5 --num_train_epochs 3.0 \
--evaluate_during_training \
--overwrite_output_dir \
--output_dir /tmp/scibert/inducing_cytokine/output

In [111]:
# Same data and hyperparameters, but starting from the stock `bert-base-uncased` weights.
# NOTE(review): this uses the same --output_dir as the SciBERT run together with
# --overwrite_output_dir, so results already in that directory may be overwritten.
!python run_dataset.py --task_name tcre --do_train --do_eval \
--do_lower_case --data_dir /tmp/scibert/inducing_cytokine/input \
--model_type bert --model_name_or_path bert-base-uncased \
--max_seq_length 128 --learning_rate 2e-5 --num_train_epochs 3.0 \
--overwrite_output_dir \
--evaluate_during_training \
--output_dir /tmp/scibert/inducing_cytokine/output

09/09/2019 22:46:45 - INFO - pytorch_transformers.modeling_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /root/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
09/09/2019 22:46:45 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "tcre",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

09/09/2019 22:46:45 - INFO - pytorch_transformers.file_uti