<a href="https://colab.research.google.com/github/jcha-ultra/data_toolkit/blob/master/bert_minimal_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is a minimal example of fine-tuning BERT to create a classifier for the [emotion dataset](https://huggingface.co/datasets/emotion).

Adapted from https://colab.research.google.com/drive/18Qqox_QxJkOs80XVYaoLsdum0dX-Ilxb

In [None]:
!pip install transformers
!pip install datasets

In [None]:
import numpy as np
import torch
import random
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from os.path import join
from google.colab import drive

In [None]:
# config info
model_name = "bert-base-uncased"  # pretrained checkpoint used for both tokenizer and model
max_length = 512                  # max token count passed to the tokenizer (longer inputs are truncated)
# Only target the GPU when one is actually present, so the cells that move the
# model to "cuda" do not fail on a CPU-only runtime.
is_gpu = torch.cuda.is_available()

# save info
model_save_path = '/content/drive/MyDrive/ml_models'  # directory under the Google Drive mount point
model_save_name = "emotion-bert-base-uncased"         # sub-directory name for this fine-tuned model

In [None]:
def set_seed(seed: int):
    """
    Seed every random number generator in use so that runs are repeatable.

    ``random`` and ``numpy`` are always seeded; ``torch`` and ``tf`` are seeded
    only when their availability check reports them as installed.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    if is_torch_available():
        # The CUDA variant is safe to call even if cuda is not available.
        for torch_seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
            torch_seed_fn(seed)

    if is_tf_available():
        # Imported lazily so TensorFlow is only required when it is installed.
        import tensorflow as tf

        tf.random.set_seed(seed)

# Seed all RNGs once, before the dataset and model are created, so reruns are comparable.
set_seed(1)

In [None]:
# create tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)


def tokenize_batch(batch):
    """
    Tokenize the ``text`` column of a batch of examples.

    Shared by the train and validation splits so both are preprocessed with
    exactly the same tokenizer settings (truncation and padding enabled,
    capped at ``max_length`` tokens).
    """
    return tokenizer(batch['text'], truncation=True, padding=True, max_length=max_length)


# load and preprocess dataset
emotion_dataset = load_dataset("emotion")
train_dataset = emotion_dataset['train'].map(tokenize_batch, batched=True)
valid_dataset = emotion_dataset['validation'].map(tokenize_batch, batched=True)

# set target names (the class names of the `label` feature)
target_names = train_dataset.features['label'].names

In [None]:
# create model
def mk_bert_pt_classifier(model_name, target_names, is_gpu):
  """
  Load a pretrained BERT sequence classifier with one output label per target name.

  Args:
    model_name: identifier handed to ``BertForSequenceClassification.from_pretrained``.
    target_names: sequence of class names; only its length is used.
    is_gpu: when truthy the model is moved to ``"cuda"``; otherwise it is returned as loaded.

  Returns:
    The instantiated classification model.
  """
  classifier = BertForSequenceClassification.from_pretrained(
      model_name,
      num_labels=len(target_names),
  )
  if is_gpu:
    return classifier.to("cuda")
  return classifier

# Build the classifier for the configured checkpoint, sized to the dataset's label set.
model = mk_bert_pt_classifier(model_name, target_names, is_gpu)

In [None]:
def compute_metrics(pred):
  """
  Metrics callback that is passed to the trainer constructor.

  The predicted label is the index of the highest score along the last axis
  of ``pred.predictions``; it is compared against ``pred.label_ids`` using
  ``accuracy_score`` from the `sklearn` package.

  Returns:
    A dict with a single ``'accuracy'`` entry.
  """
  predicted_labels = pred.predictions.argmax(-1)
  return {'accuracy': accuracy_score(pred.label_ids, predicted_labels)}

# Logging, evaluation and checkpointing all run on the same step interval.
# It is defined once so the values cannot drift apart: `load_best_model_at_end`
# needs checkpoints to line up with evaluations.
steps_per_checkpoint = 400

# training arguments for trainer
train_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
                                     # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=steps_per_checkpoint,  # log every `steps_per_checkpoint` steps
    save_steps=steps_per_checkpoint,     # save weights on the same interval
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

In [None]:
# Wire the model, data and metrics together, then run fine-tuning.
trainer = Trainer(
    # what to train and how
    model=model,
    args=train_args,
    tokenizer=tokenizer,
    # data splits
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # callback that computes the metrics reported at each evaluation
    compute_metrics=compute_metrics,
)

# Kept as the last expression so the returned training summary is displayed by the cell.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 16000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6000


Step,Training Loss,Validation Loss,Accuracy
400,1.2316,0.551792,0.8305
800,0.4692,0.370523,0.9125
1200,0.3229,0.318753,0.9245
1600,0.297,0.231478,0.9255
2000,0.2224,0.205199,0.9315
2400,0.168,0.208003,0.935
2800,0.1528,0.176558,0.94
3200,0.1615,0.203853,0.9395
3600,0.1654,0.172741,0.9385
4000,0.153,0.168449,0.9355


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-400
Configuration saved in ./results/checkpoint-400/config.json
Model weights saved in ./results/checkpoint-400/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-400/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-400/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-800
Configuration saved in ./results/checkpoint-800/config.json
Model weights saved in ./results/checkpoint-800/pytorch_model.bin
tokenizer config file saved in ./results

TrainOutput(global_step=6000, training_loss=0.2587536163330078, metrics={'train_runtime': 1127.9173, 'train_samples_per_second': 42.556, 'train_steps_per_second': 5.32, 'total_flos': 1972745984977920.0, 'train_loss': 0.2587536163330078, 'epoch': 3.0})

In [None]:
# Destination directory for the fine-tuned model; it lives under the Drive mount point.
save_path = join(model_save_path, model_save_name)

# mount gdrive
drive.mount('/content/drive')

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory (`save_path`),
# so that both can later be reloaded from that single location.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

In [None]:
# reload model and tokenizer
# Respect the `is_gpu` setting instead of always moving the model to "cuda",
# so reloading also works when no GPU is in use.
model = BertForSequenceClassification.from_pretrained(save_path, num_labels=len(target_names)).to("cuda" if is_gpu else "cpu")
tokenizer = BertTokenizerFast.from_pretrained(save_path)

In [None]:
def get_prediction(text):
    """
    Predict the emotion label for one text or for a list of texts.

    Args:
        text: a single string, or a list of strings to classify together.

    Returns:
        The predicted label name (an entry of ``target_names``) when ``text``
        is a string; a list of label names, one per input, when ``text`` is a
        list.
    """
    is_single_text = isinstance(text, str)

    # prepare text into tokenized sequence, placed on whatever device the model lives on
    # (avoids assuming a GPU is present)
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(model.device)

    # perform inference; no gradients are needed, so skip building the autograd graph
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax does not change which class scores highest, so the candidate label
    # is taken directly from the raw scores. The argmax is done per row so
    # every input in the batch gets its own label.
    predicted_ids = outputs[0].argmax(dim=-1).tolist()
    labels = [target_names[label_id] for label_id in predicted_ids]

    return labels[0] if is_single_text else labels

In [None]:
# Example: classify one sentence and print the predicted emotion.
text = "\nThis is amazing! I'm so happy.\n"
print(get_prediction(text))

joy
