In [2]:
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install nltk
!pip install -U accelerate
!pip install -U transformers



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
%%capture
import datasets
import transformers
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import EncoderDecoderModel
import logging
import time
import datetime

import warnings
warnings.filterwarnings("ignore")

In [5]:
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#torch.cuda.empty_cache()

### Fine Tuning Pre Trained Model

In [6]:
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    filename='/content/drive/My Drive/logs/BERT_Fine_Tuning.log',force=True)

In [7]:
logging.info("==========================================================================================================")
logging.info("BERT Fine Tuning Start ")

In [8]:
def roundTS(startTime, endTime):
    return round((endTime -startTime),4)


In [9]:
logging.info("Loading CNN Dataset ...")
train_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:25%]", ignore_verifications=True)
val_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:3%]", ignore_verifications=True)
test_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="test[:1%]", ignore_verifications=True)

In [10]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

In [11]:
batch_size=12  # change to 16 for full training
encoder_max_length=512
decoder_max_length=128

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`.
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

train_data = train_data.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["article", "highlights", "id"]
)
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)


val_data = val_data.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["article", "highlights", "id"]
)
val_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

In [12]:
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

In [13]:
# set special tokens
bert2bert.config.decoder_start_token_id = tokenizer.bos_token_id
bert2bert.config.eos_token_id = tokenizer.eos_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
bert2bert.config.vocab_size = bert2bert.config.decoder.vocab_size
bert2bert.config.max_length = 60
bert2bert.config.min_length = 10
bert2bert.config.no_repeat_ngram_size = 3
bert2bert.config.early_stopping = True
bert2bert.config.length_penalty = 2.0
bert2bert.config.num_beams = 4

In [14]:
import evaluate
rouge = evaluate.load("rouge")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}



In [15]:
#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
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
st_bert_ft=time.time()

In [17]:
# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
    "bert-base-cnn-ft",
    #output_dir="/content/drive/My Drive/models/BERT",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=100,  # set to 1000 for full training
    save_steps=1000,  # set to 500 for full training
    eval_steps=100,  # set to 8000 for full training
    warmup_steps=1,  # set to 2000 for full training
    max_steps=2000, # delete for full training
    overwrite_output_dir=True,
    save_total_limit=1,
    fp16=True,
    weight_decay=0.01,
    optim = "adamw_torch",
    learning_rate=1e-5,
    push_to_hub=True,
    load_best_model_at_end=True,
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=bert2bert,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()
logging.info(f"BERT FT Model pushed to Huggingface Repo at location harish3742/bert-base-cnn-ft")


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,1.1034,0.002096,0.0337,0.0,0.031,0.0309,60.0
200,0.016,0.000848,0.0232,0.0,0.0205,0.0205,60.0
300,0.0028,0.000373,0.0359,0.0001,0.0323,0.0323,60.0
400,0.0019,0.000334,0.0328,0.0,0.0304,0.0304,60.0
500,0.0014,0.000288,0.0329,0.0,0.0305,0.0305,60.0
600,0.0011,0.000285,0.0332,0.0,0.0309,0.0309,60.0
700,0.001,0.000244,0.0333,0.0,0.0305,0.0306,60.0


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,1.1034,0.002096,0.0337,0.0,0.031,0.0309,60.0
200,0.016,0.000848,0.0232,0.0,0.0205,0.0205,60.0
300,0.0028,0.000373,0.0359,0.0001,0.0323,0.0323,60.0
400,0.0019,0.000334,0.0328,0.0,0.0304,0.0304,60.0
500,0.0014,0.000288,0.0329,0.0,0.0305,0.0305,60.0
600,0.0011,0.000285,0.0332,0.0,0.0309,0.0309,60.0
700,0.001,0.000244,0.0333,0.0,0.0305,0.0306,60.0
800,0.0017,0.000194,0.0335,0.0,0.0309,0.0308,60.0
900,0.0014,0.000213,0.0327,0.0,0.0303,0.0303,60.0
1000,0.0014,0.000165,0.0302,0.0,0.0274,0.0275,60.0


In [28]:
end_bert_ft=time.time()
logging.info(f"BERT FT Duration - {roundTS(st_bert_ft,end_bert_ft)} seconds")
print(f"BERT FT Duration - {roundTS(st_bert_ft,end_bert_ft)} seconds")

BERT FT Duration - 4653.696 seconds


In [29]:
trainer_state = trainer.state.log_history
logging.info(f"BERT Model Training Stats:\n" )

In [30]:
model = trainer.model

In [31]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    logging.info(
        f"trainable params: {trainable_params} \nall params: {all_param} \ntrainable %: {100 * trainable_params / all_param}"
    )
    print(
        f"trainable params: {trainable_params} \nall params: {all_param} \ntrainable %: {100 * trainable_params / all_param}"
    )

def count_trainable_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [32]:
print_trainable_parameters(model)
print(count_trainable_params(model))


trainable params: 247363386 
all params: 247363386 
trainable %: 100.0
247363386


In [33]:
import os
file_size = os.stat('/content/bert-base-cnn-ft/model.safetensors')
print("file size :", round((file_size.st_size/ (1024 * 1024)), 2), "mb")

file size : 943.68 mb


In [23]:
!pip install einops



In [24]:
params_dict = {
    'Name': [],
    'Shape': [],
    'Parameters': [],
    'RequiresGrad': []
}

for name, param in model.named_parameters():
    params_dict['Name'].append(name)
    params_dict['Shape'].append(param.shape)
    params_dict['Parameters'].append(param.numel())
    params_dict['RequiresGrad'].append(param.requires_grad)

df = pd.DataFrame(params_dict)
df.to_csv("/content/drive/My Drive/models/BERT/BERT_FT_Params.csv")
#logging.info("Model Parameters Table", df.head(20))
df.head(10)

Unnamed: 0,Name,Shape,Parameters,RequiresGrad
0,encoder.embeddings.word_embeddings.weight,"(30522, 768)",23440896,True
1,encoder.embeddings.position_embeddings.weight,"(512, 768)",393216,True
2,encoder.embeddings.token_type_embeddings.weight,"(2, 768)",1536,True
3,encoder.embeddings.LayerNorm.weight,"(768,)",768,True
4,encoder.embeddings.LayerNorm.bias,"(768,)",768,True
5,encoder.encoder.layer.0.attention.self.query.w...,"(768, 768)",589824,True
6,encoder.encoder.layer.0.attention.self.query.bias,"(768,)",768,True
7,encoder.encoder.layer.0.attention.self.key.weight,"(768, 768)",589824,True
8,encoder.encoder.layer.0.attention.self.key.bias,"(768,)",768,True
9,encoder.encoder.layer.0.attention.self.value.w...,"(768, 768)",589824,True


### Saving Model to  HuggingFace Repo

In [25]:
#trainer.push_to_hub("harish3742/bert-base-cnn-ft-1")

In [26]:
#tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("/content/bert-base-cnn-ft")
model = EncoderDecoderModel.from_pretrained("/content/bert-base-cnn-ft")
model.to("cuda")

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [27]:
logging.info("BERT Fine Tuning End ")