### Understanding Bert ###

In [None]:
### Google Colab Mount Drive ###

# Colab-only helper module used to attach Google Drive to this runtime.
from google.colab import drive

# Mounting asks for authorization interactively; once granted, the Drive
# is mounted at the path given here.
drive.mount('/content/drive')

In [None]:
# %cd ./where_I_need_to_be

In [None]:
# load all transformers
!pip install transformers

In [None]:
# use new version of tensorflow
%tensorflow_version 2.x
import tensorflow
print(tensorflow.__version__)

In [None]:
'''
upload bert model folder, for example, 
uncased_L-12_H-768_A-12
which is bert-base-uncased
The folder consists of:
bert_config.json: 
    specifies the model structure
    Loading the configuration file and using this file to 
    initialize a model does NOT load the model weights. It 
    only affects the model's configuration.
bert_model.ckpt.data-00000-of-00001 
bert_model.ckpt.index
bert_model.ckpt.meta
    Store the weights and biases of model.
vocab.txt:
    Bert's vocab file
'''

#### 1. Next-sentence prediction with pretrained bert ###

In [None]:
from transformers import BertForNextSentencePrediction, BertConfig, BertModel, BertForPreTraining, BertTokenizer

In [None]:
from torch.nn.functional import softmax

In [None]:
# Three ways to load model and tokenizer.
# The variants run top to bottom and each one rebinds `model` / `tokenizer`,
# so the objects left in scope after this cell are the ones created by #3.

# --- #1: pretrained weights and vocabulary, looked up by model name ---------
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# --- #2: architecture only -- this is NOT a pretrained model ----------------
# Constructing the model directly from a config only builds the network
# structure; no checkpoint is read, so every weight is randomly initialised.
# That explains both symptoms seen with this variant:
#   * wrong predictions  -> the weights are random, nothing has been learned;
#   * logits that change -> new random weights on every construction, and a
#     freshly constructed module is in training mode (dropout active), unlike
#     the models returned by from_pretrained(), which are put in eval mode.
# Calling model.eval() would remove the dropout noise but not fix the
# predictions; use #1 or #3 to get trained weights.
# (Uses the imported `BertConfig`; `modeling_bert` was never imported. The
# path now matches the '../uncased_L-12_H-768_A-12' folder used everywhere
# else in this notebook.)
config = BertConfig.from_json_file('../uncased_L-12_H-768_A-12/bert_config.json')
model = BertForNextSentencePrediction(config)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# --- #3: custom TensorFlow checkpoint (correct and stable) ------------------
# The config describes the architecture; from_tf=True makes from_pretrained
# read the weights from the TF checkpoint. BertForPreTraining carries both the
# masked-LM head and the next-sentence head.
config = BertConfig.from_json_file('../uncased_L-12_H-768_A-12/bert_config.json')
model = BertForPreTraining.from_pretrained('../uncased_L-12_H-768_A-12/bert_model.ckpt.index', from_tf=True, config=config)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
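# Note: #3 is slower than #1 (presumably because the TF checkpoint has to be
# converted on every load). Preferably convert once, store the result with
# model.save_pretrained('./dir_to_model'), and afterwards load it with
# from_pretrained('./dir_to_model').
# TODO: learn save_pretrained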

In [None]:
# Next Sentence Prediction Examples #
seq_A = 'I like cookies !'
seq_B = 'Mitochondia are the powerhouse of the cell .'

# Both sentences are combined into one input for the model.
encoded = tokenizer.encode_plus(seq_A, text_pair=seq_B, return_tensors='pt')
print(encoded)

# Where the next-sentence logits sit in the output depends on the model head:
#   BertForNextSentencePrediction -> index 0
#   BertForPreTraining            -> index 1 (index 0 holds the masked-LM
#                                    logits, since it outputs both)
# Selecting the index from the model type keeps this cell valid whichever
# loading method was used, and the forward pass is executed only once
# (previously it ran twice and the first result was thrown away).
nsp_index = 1 if isinstance(model, BertForPreTraining) else 0
seq_relationship_logits = model(**encoded)[nsp_index]
print(seq_relationship_logits)

In [None]:
# Turn the next-sentence logits into probabilities by applying softmax
# along dim 1. Meaning of the two columns:
#   0 -> sequence B is a continuation of sequence A
#   1 -> sequence B is a random sequence
probs = softmax(seq_relationship_logits, dim=1)

# Probability of "continuation" for the first (only) input pair.
print(probs[0, 0])

In [None]:
import numpy as np

# Predicted class per input pair: detach the probabilities from the autograd
# graph, convert them to a NumPy array, then take the arg-max along axis 1.
class_probs = probs.detach().numpy()
label = np.argmax(class_probs, axis=1)
print(label)

#### 2. Finetune attention weights ###

In [None]:
'''
https://github.com/google-research/bert/blob/master/README.md#pre-training-with-bert
'''

In [None]:
!python create_pretraining_data.py \
  --input_file=../training_text.txt \
  --output_file=../tmp/tf_examples.tfrecord \
  --vocab_file=../uncased_L-12_H-768_A-12/vocab.txt \
  --do_lower_case=True \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --masked_lm_prob=0.15 \
  --random_seed=12345 \
  --dupe_factor=5

In [None]:
!python run_pretraining.py \
  --input_file=/tmp/tf_examples.tfrecord \
  --output_dir=/tmp/pretraining_output \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=../uncased_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=../uncased_L-12_H-768_A-12/bert_model.ckpt \
  --train_batch_size=32 \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --num_train_steps=20 \
  --num_warmup_steps=10 \
  --learning_rate=2e-5

In [None]:
# Random note: if ".index" is specified in the file name,
# it will be removed

#### 3. Load finetuned attention weights for next-sentence-prediction ####

In [None]:
# Load the model as in section 1, method #3 (pointing it at the finetuned TF checkpoint)

#### 4. Finetuning Bert for Classification ####

In [None]:
# Use customized attention weights by specifying 
# the corresponding tf checkpoints for
# init_checkpoint

In [None]:
!python run_classifier.py \
  --task_name=MRPC \
  --do_train=true \
  --do_eval=true \
  --do_predict=true \
  --data_dir=../fold1 \
  --vocab_file=../uncased_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=../uncased_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=../tmp/model.ckpt-10000 \
  --max_seq_length=128 \
  --train_batch_size=32 \
  --learning_rate=2e-7 \
  --num_train_epochs=2.0 \
  --output_dir=../fold1_output