<a href="https://colab.research.google.com/github/jcha-ultra/data_toolkit/blob/master/bert_regression_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is a minimal example of fine-tuning BERT to create a regressor for the vagueness of a sentence.

In [1]:
!pip install transformers
!pip install datasets

import torch
from transformers import Trainer, TrainingArguments, BertTokenizerFast, BertForSequenceClassification
from os.path import join
from google.colab import drive

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 12.0 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 18.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 31.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
 

In [18]:
# config info
model_name = "bert-base-uncased"
max_length = 512
is_gpu = True
# is_gpu = False

# save info
model_save_path = '/content/drive/MyDrive/ml_models'
model_save_name = "vagueness-bert-base-uncased"

In [3]:
# create tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# create model
def mk_bert_pt_regressor(model_name, is_gpu):
  cpu_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
  return cpu_model.to("cuda") if is_gpu else cpu_model
  # model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names)).to("cuda")
  # return model

model = mk_bert_pt_regressor(model_name, is_gpu)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
# loads vagueness data

from pickle import load

drive.mount('/content/drive')

# with open("/content/drive/MyDrive/ml_models/vagueness-bert-base-uncased/data/training_data", "rb") as training_data:
    # train_texts, valid_texts, train_labels, valid_labels = load(training_data)

class VaguenessDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

with open("/content/drive/MyDrive/ml_models/vagueness-bert-base-uncased/data/training_datasets", "rb") as training_datasets:
    train_dataset, valid_dataset = load(training_datasets)


In [16]:
# training arguments for trainer
train_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
                                     # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=100,               # log & save weights each logging_steps
    save_steps=100,
    # logging_steps=400,               # log & save weights each logging_steps
    # save_steps=400,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [17]:
# instantiate trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=train_args,                  # training arguments, defined above
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,          # evaluation dataset
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)
trainer.train()


# (train_dataset, valid_dataset, compute_metrics) <- (train_dataset) <- (train_dataset_shape) <- (rerun_old_example) <- {}

***** Running training *****
  Num examples = 3149
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1182


Step,Training Loss,Validation Loss
100,0.3299,0.31123
200,0.2547,0.324337
300,0.2964,0.335618
400,0.3569,0.363396
500,0.3045,0.509115
600,0.3456,0.342581
700,0.34,0.316853
800,0.3306,0.290784
900,0.2071,0.305337
1000,0.2023,0.292837


***** Running Evaluation *****
  Num examples = 1350
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-100
Configuration saved in ./results/checkpoint-100/config.json
Model weights saved in ./results/checkpoint-100/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-100/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1350
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-200
Configuration saved in ./results/checkpoint-200/config.json
Model weights saved in ./results/checkpoint-200/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-200/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-200/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1350
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-300
Configuration saved in ./results/checkpoint-300/config.json
Model w

TrainOutput(global_step=1182, training_loss=0.28241151561188416, metrics={'train_runtime': 1265.3665, 'train_samples_per_second': 7.466, 'train_steps_per_second': 0.934, 'total_flos': 1087444672429248.0, 'train_loss': 0.28241151561188416, 'epoch': 3.0})

In [19]:
# mount gdrive
# drive.mount('/content/drive')
save_path = join(model_save_path, model_save_name)

# saving the fine tuned model & tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

Configuration saved in /content/drive/MyDrive/ml_models/vagueness-bert-base-uncased/config.json
Model weights saved in /content/drive/MyDrive/ml_models/vagueness-bert-base-uncased/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/ml_models/vagueness-bert-base-uncased/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/ml_models/vagueness-bert-base-uncased/special_tokens_map.json


('/content/drive/MyDrive/ml_models/vagueness-bert-base-uncased/tokenizer_config.json',
 '/content/drive/MyDrive/ml_models/vagueness-bert-base-uncased/special_tokens_map.json',
 '/content/drive/MyDrive/ml_models/vagueness-bert-base-uncased/vocab.txt',
 '/content/drive/MyDrive/ml_models/vagueness-bert-base-uncased/added_tokens.json',
 '/content/drive/MyDrive/ml_models/vagueness-bert-base-uncased/tokenizer.json')

In [20]:
def get_prediction(text):
    # prepare text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    return outputs

In [32]:
# Example
text = """
The ESRB 's Privacy Online Program is designed to ensure that our pokemon.com website information disclosure practices are responsible and appropriate 
"""
print(get_prediction(text))

SequenceClassifierOutput(loss=None, logits=tensor([[1.8222]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [8]:
from sklearn.datasets import fetch_20newsgroups
dataset_ex = fetch_20newsgroups(subset="all", shuffle=True, remove=("headers", "footers", "quotes"))
dataset_ex.target

array([10,  3, 17, ...,  3,  1,  7])