## The purpose of this Colab notebook is to illustrate a bug in the "trainer.train()" method in the fine-tuning of the basic BERT model: "BertForPreTraining.from_pretrained()".  This notebook is run with GPU support and runtime shape = "standard" (not high RAM).

* In the first example, we show how the trainer works correctly in fine-tuning a model built from "BertLMHeadModel.from_pretrained()".  
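* In the second example, we will repeat all the same steps, but will initialize the model from "BertForPreTraining.from_pretrained()".  This fails when we call the "trainer.train()" method.  (Note: the saved output of the second example further down shows "trainer.train()" running to completion, so the failure may no longer reproduce with the code as it currently stands.)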

In [None]:
# Install Hugging Face transformers together with its "torch" extras.
# NOTE(review): no version is pinned, so a later re-run may resolve to a
# different release than the one that produced the saved outputs.
%pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
!pip uninstall --yes tensorflow
!pip uninstall --yes gast
!pip uninstall --yes tensorflow-probability


Found existing installation: tensorflow 2.15.0
Uninstalling tensorflow-2.15.0:
  Successfully uninstalled tensorflow-2.15.0
Found existing installation: gast 0.5.4
Uninstalling gast-0.5.4:
  Successfully uninstalled gast-0.5.4
Found existing installation: tensorflow-probability 0.23.0
Uninstalling tensorflow-probability-0.23.0:
  Successfully uninstalled tensorflow-probability-0.23.0


In [None]:
!pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!git clone https://github.com/huggingface/transformers
!cd transformers; pip install .
!pip install seqeval --no-deps
!pip install conllu    # required for run_ner.py

Looking in links: https://download.pytorch.org/whl/torch_stable.html
[31mERROR: Could not find a version that satisfies the requirement torch==1.6.0+cu101 (from versions: 1.11.0, 1.11.0+cpu, 1.11.0+cu102, 1.11.0+cu113, 1.11.0+cu115, 1.11.0+rocm4.3.1, 1.11.0+rocm4.5.2, 1.12.0, 1.12.0+cpu, 1.12.0+cu102, 1.12.0+cu113, 1.12.0+cu116, 1.12.0+rocm5.0, 1.12.0+rocm5.1.1, 1.12.1, 1.12.1+cpu, 1.12.1+cu102, 1.12.1+cu113, 1.12.1+cu116, 1.12.1+rocm5.0, 1.12.1+rocm5.1.1, 1.13.0, 1.13.0+cpu, 1.13.0+cu116, 1.13.0+cu117, 1.13.0+cu117.with.pypi.cudnn, 1.13.0+rocm5.1.1, 1.13.0+rocm5.2, 1.13.1, 1.13.1+cpu, 1.13.1+cu116, 1.13.1+cu117, 1.13.1+cu117.with.pypi.cudnn, 1.13.1+rocm5.1.1, 1.13.1+rocm5.2, 2.0.0, 2.0.0+cpu, 2.0.0+cpu.cxx11.abi, 2.0.0+cu117, 2.0.0+cu117.with.pypi.cudnn, 2.0.0+cu118, 2.0.0+rocm5.3, 2.0.0+rocm5.4.2, 2.0.1, 2.0.1+cpu, 2.0.1+cpu.cxx11.abi, 2.0.1+cu117, 2.0.1+cu117.with.pypi.cudnn, 2.0.1+cu118, 2.0.1+rocm5.3, 2.0.1+rocm5.4.2, 2.1.0, 2.1.0+cpu, 2.1.0+cpu.cxx11.abi, 2.1.0+cu118, 2.1.0+cu12

In [None]:
# Now the imports
import os
import sys
from google.colab import auth, drive
import torch
from torch.utils.data import DataLoader
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
from transformers import BertConfig, BertForPreTraining, BertLMHeadModel, BertTokenizer, logging
from transformers import pipeline, Trainer, TrainingArguments
from transformers import AutoConfig, AutoModel
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
# from nlp import load_dataset
# from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import seqeval
import json
import math
import logging
from datetime import datetime
import pytz     # for time zone
import gzip
import csv
import pickle
import re
import copy
# set the pathnames and authenticate for my google bucket
# NOTE(review): the visible code only points gcloud at the project; no explicit
# authentication call appears in this cell -- confirm whether one is needed.
project_id = 'serene-mender-286105'   # Google Cloud project id, interpolated into the gcloud command below
!gcloud config set project {project_id}
BUCKET_NAME = 'clinical_bert_bucket'  # presumably the GCS bucket name; not referenced in the visible cells -- TODO confirm


Updated property [core/project].


## First Example:
Build a BertLMHeadModel from the HuggingFace pre-trained model "bert-base-uncased" and then fine-tune the model with two additional training sentences.

In [None]:
PST = pytz.timezone('US/Pacific')
!mkdir "pytorch_finetuned_model"
!rm -rf "pytorch_finetuned_model/*"
!mkdir "pytorch_finetuned_log"
!rm -rf "pytorch_finetuned_log/*"

# --- Tokenizer and model ----------------------------------------------------
# Both are loaded from the pre-trained "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

print("start loading model=",datetime.now(PST))
model = BertLMHeadModel.from_pretrained("bert-base-uncased")

# --- Trainer arguments ------------------------------------------------------
# NOTE(review): the saved run finishes after 3 optimisation steps, far fewer
# than the 500 warmup steps requested here -- confirm this is intended.
training_args = TrainingArguments(
    # output locations
    output_dir='pytorch_finetuned_model',
    logging_dir='pytorch_finetuned_log',
    # schedule
    do_train=True,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # per-device batch sizes (training / evaluation)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # run evaluation every 2 steps
    evaluation_strategy="steps",
    eval_steps=2,
)

# --- Toy text files: one sentence per line ----------------------------------
train_lines = [
    "This is the first training sentence.\n",
    "This is the second training sentence.\n",
]
eval_lines = [
    "This is the first eval sentence.\n",
    "This is the second eval sentence.\n",
]
for text_path, text_lines in (("sent_train.txt", train_lines), ("sent_eval.txt", eval_lines)):
    with open(text_path, "w") as f_out:
        f_out.writelines(text_lines)

# --- Datasets ---------------------------------------------------------------
dataset_kwargs = dict(tokenizer=tokenizer, block_size=128)

print("start building train_dataset=",datetime.now(PST))
train_dataset = LineByLineTextDataset(file_path="sent_train.txt", **dataset_kwargs)

print("start building eval_dataset=",datetime.now(PST))
eval_dataset = LineByLineTextDataset(file_path="sent_eval.txt", **dataset_kwargs)

# Collator configured for masked-language-modelling with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# --- Trainer ----------------------------------------------------------------
print("start building trainer=",datetime.now(PST))
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

def compute_metrics(pred):
    """Score predictions with accuracy and binary precision / recall / F1.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores; the arg-max over the last axis is taken
            as the predicted label).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.

    NOTE(review): the Trainer constructed earlier in this cell is not given
    this function, so nothing in the visible code calls it.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth value returned by the sklearn helper is not used.
    prec, rec, fscore, _unused = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=fscore,
        precision=prec,
        recall=rec,
    )
# Timestamp (in the PST time zone object) marking the end of this setup cell.
print("finished=",datetime.now(PST))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

start loading model= 2024-04-18 06:43:42.862646-07:00


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


start building train_dataset= 2024-04-18 06:43:47.580221-07:00
start building eval_dataset= 2024-04-18 06:43:47.584426-07:00
start building trainer= 2024-04-18 06:43:47.586639-07:00




finished= 2024-04-18 06:43:48.488483-07:00


In [None]:
# now do training
# Runs the training loop on the trainer built in the previous cell; as the
# last expression of the cell, the returned value is shown as the cell output.
trainer.train()

Step,Training Loss,Validation Loss
2,No log,17.89506


TrainOutput(global_step=3, training_loss=7.066388448079427, metrics={'train_runtime': 2.9104, 'train_samples_per_second': 2.062, 'train_steps_per_second': 1.031, 'total_flos': 27759882600.0, 'train_loss': 7.066388448079427, 'epoch': 3.0})

## Second Example:
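Repeat the same steps, but build the model with "BertForPreTraining.from_pretrained" instead of "BertLMHeadModel.from_pretrained".  Note that this version is not an exact copy of the first example: the datasets are built with "TextDatasetForNextSentencePrediction", and the text files contain additional sentences separated by blank lines.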

In [None]:
from transformers.data.datasets.language_modeling import TextDatasetForNextSentencePrediction

In [None]:
!pip install evaluate

In [None]:
PST = pytz.timezone('US/Pacific')
!mkdir "pytorch_finetuned_model"
!rm -rf "pytorch_finetuned_model/*"
!mkdir "pytorch_finetuned_log"
!rm -rf "pytorch_finetuned_log/*"

# --- Tokenizer and model ----------------------------------------------------
# Both are loaded from the pre-trained "bert-base-uncased" checkpoint; this
# time the model class is BertForPreTraining.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

print("start loading model=",datetime.now(PST))
model = BertForPreTraining.from_pretrained("bert-base-uncased")

# --- Trainer arguments ------------------------------------------------------
# NOTE(review): the saved run finishes after 3 optimisation steps, far fewer
# than the 500 warmup steps requested here -- confirm this is intended.
training_args = TrainingArguments(
    # output locations
    output_dir='pytorch_finetuned_model',
    logging_dir='pytorch_finetuned_log',
    # schedule
    do_train=True,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # per-device batch sizes (training / evaluation)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # run evaluation every 2 steps
    evaluation_strategy="steps",
    eval_steps=2,
)

# --- Toy text files ---------------------------------------------------------
# Entries that start with "\n" put an empty line in front of the sentence.
train_lines = [
    "This is the first training sentence.\n",
    "This is the second training sentence.\n",
    "\n This is the 3 training sentence.\n",
    "This is the 4 training sentence.\n",
    "\nThis is the 5 training sentence.\n",
    "This is the 6 training sentence.\n",
]
eval_lines = [
    "This is the first eval sentence.\n",
    "This is the second eval sentence.\n",
    "\nThis is the 3 eval sentence.\n",
    "This is the 4 eval sentence.\n",
]
for text_path, text_lines in (("sent_train.txt", train_lines), ("sent_eval.txt", eval_lines)):
    with open(text_path, "w") as f_out:
        f_out.writelines(text_lines)

# --- Datasets ---------------------------------------------------------------
dataset_kwargs = dict(tokenizer=tokenizer, block_size=128)

print("start building train_dataset=",datetime.now(PST))
train_dataset = TextDatasetForNextSentencePrediction(file_path="sent_train.txt", **dataset_kwargs)

print("start building eval_dataset=",datetime.now(PST))
eval_dataset = TextDatasetForNextSentencePrediction(file_path="sent_eval.txt", **dataset_kwargs)

# Collator configured for masked-language-modelling with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# --- Trainer ----------------------------------------------------------------
print("start building trainer=",datetime.now(PST))
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Kept next to its use because `evaluate` is only installed a few cells above,
# after the main import cell has already run.
import evaluate
metric = evaluate.load("accuracy")

# A second, sklearn-based `compute_metrics` used to be defined just before this
# one; it was silently replaced by this definition and has been removed.
def compute_metrics(eval_preds):
    """Accuracy over the positions whose label is not -100.

    Args:
        eval_preds: pair ``(preds, labels)`` of arrays.  ``preds`` may hold
            predicted ids with the same shape as ``labels``, or raw scores
            with one extra trailing axis, in which case the arg-max over that
            axis is taken first.

    Returns:
        Whatever ``metric.compute`` returns for the accuracy metric loaded above.

    NOTE(review): the Trainer built earlier in this cell is not given this
    function (nor a ``preprocess_logits_for_metrics`` hook), so nothing in the
    visible code calls it.
    """
    preds, labels = eval_preds
    # Accept raw scores as well as ids, so the function does not depend on an
    # external pre-processing hook having reduced them already.
    if preds.ndim == labels.ndim + 1:
        preds = preds.argmax(-1)
    labels = labels.reshape(-1)
    preds = preds.reshape(-1)
    # Positions labelled -100 are excluded from the score (presumably the
    # ignore index of the masked-LM collator -- confirm).
    mask = labels != -100
    return metric.compute(predictions=preds[mask], references=labels[mask])

# Timestamp (in the PST time zone object) marking the end of this setup cell.
print("finished=",datetime.now(PST))

mkdir: cannot create directory ‘pytorch_finetuned_model’: File exists
mkdir: cannot create directory ‘pytorch_finetuned_log’: File exists
start loading model= 2024-04-18 07:07:45.084924-07:00
start building train_dataset= 2024-04-18 07:07:45.852552-07:00
start building eval_dataset= 2024-04-18 07:07:45.856914-07:00
start building trainer= 2024-04-18 07:07:45.859228-07:00




finished= 2024-04-18 07:07:46.946490-07:00


In [None]:
# now do training
# Runs the training loop on the trainer built in the previous cell; as the
# last expression of the cell, the returned value is shown as the cell output.
trainer.train()

Step,Training Loss,Validation Loss
2,No log,0.290111


TrainOutput(global_step=3, training_loss=0.030384081105391186, metrics={'train_runtime': 0.29, 'train_samples_per_second': 10.346, 'train_steps_per_second': 10.346, 'total_flos': 26398858680.0, 'train_loss': 0.030384081105391186, 'epoch': 3.0})