In [None]:
# Mount Google Drive at /content/drive; later cells write the log file and the
# parameter CSV under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Log in to the Hugging Face Hub through the interactive notebook widget.
# The training arguments further down set push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Install the third-party packages used by this notebook.
# NOTE(review): no versions are pinned and accelerate/transformers are upgraded
# to the latest release (-U), so a re-run may pick up different library
# versions than the recorded run — consider pinning.
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install nltk
!pip install -U accelerate
!pip install -U transformers




In [None]:
%%capture
import datasets
import transformers
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import EncoderDecoderModel
import logging
import time
import datetime
from transformers import DataCollatorForSeq2Seq

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Send log records (INFO and above) to a file on the mounted Google Drive.
import os

LOG_FILE = '/content/drive/My Drive/logs/T5_Fine_Tuning.log'

# basicConfig opens the log file immediately and fails with FileNotFoundError
# when the parent directory is missing, so create it first (no-op if present).
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    filename=LOG_FILE,
    # Replace any handlers already attached to the root logger; without this,
    # basicConfig does nothing when the root logger is already configured.
    force=True,
)


In [None]:
# Write a separator line and a start marker to the log file.
logging.info("==========================================================================================================")
logging.info("T5 Fine Tuning Start ")

In [None]:
logging.info("Loading CNN Dataset ...")

# Load the first 25% of the CNN/DailyMail (config "3.0.0") training split.
data = load_dataset("cnn_dailymail", "3.0.0", split="train[:25%]")

# Hold out 5% of those rows as the evaluation set. The split is random, so a
# fixed seed is passed to make the train/test membership (and therefore the
# reported metrics) reproducible across kernel restarts.
cnn = data.train_test_split(test_size=0.05, seed=42)

# Display the resulting DatasetDict (split names, features, row counts).
cnn

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 68189
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 3589
    })
})

In [None]:
def roundTS(startTime, endTime):
    """Return the difference ``endTime - startTime`` rounded to 4 decimal places.

    Args:
        startTime: Start timestamp (a number, e.g. from ``time.time()``).
        endTime: End timestamp on the same scale as ``startTime``.

    Returns:
        The elapsed amount, rounded with the built-in ``round(..., 4)``.
    """
    elapsed = endTime - startTime
    return round(elapsed, 4)

In [None]:
# Hugging Face checkpoint name; reused below for the tokenizer, the data
# collator and the model.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Task prefix prepended to every article before it is tokenized.
prefix = "summarize: "

def preprocess_function(examples):
    """Tokenize a batch of articles together with their reference highlights.

    Args:
        examples: Batch mapping with an "article" and a "highlights" entry,
            each an iterable of strings.

    Returns:
        The tokenizer output for the prefixed articles (truncated to 1024
        tokens) with an extra "labels" entry holding the ``input_ids`` of the
        tokenized highlights (truncated to 128 tokens).
    """
    prefixed_articles = list(map(lambda doc: prefix + doc, examples["article"]))
    model_inputs = tokenizer(prefixed_articles, truncation=True, max_length=1024)

    # Targets are tokenized separately via `text_target`.
    target_encoding = tokenizer(
        text_target=examples["highlights"],
        truncation=True,
        max_length=128,
    )
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [None]:
# Run preprocess_function over both splits; batched=True passes examples to it
# in batches rather than one at a time.
tokenized_cnn = cnn.map(preprocess_function, batched=True)

Map:   0%|          | 0/68189 [00:00<?, ? examples/s]

Map:   0%|          | 0/3589 [00:00<?, ? examples/s]

In [None]:
# Collator handed to the trainer for assembling batches.
# NOTE(review): `model` receives the checkpoint *name* (a str), not a model
# object — the model itself is only loaded in a later cell. Confirm this is
# intended; passing the loaded model would require reordering the cells.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
# Load the ROUGE metric; compute_metrics below calls rouge.compute(...).
import evaluate
rouge = evaluate.load("rouge")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays as handed
            over by the trainer. ``predictions`` may also arrive as a tuple,
            in which case its first element is used.

    Returns:
        dict: every entry returned by ``rouge.compute`` plus ``"gen_len"``
        (mean number of non-pad tokens per prediction), each rounded to
        4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index used as padding; it is not a valid token id, so
    # it must be swapped for the pad token before decoding. This applies to the
    # predictions as well as the labels, since gathered prediction batches can
    # be padded with -100 too.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length is measured on the cleaned predictions so padding is never counted.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Load the pretrained sequence-to-sequence model for the same checkpoint name
# that was used for the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Model name and a Drive-relative directory path derived from it.
# NOTE(review): neither name is referenced by the other cells shown here, and
# this path is relative and spelled "MyDrive" while the log/CSV paths use
# "/content/drive/My Drive" — verify which is intended.
model_name = "t5-small-cnn-ts"
model_dir = f"drive/MyDrive/models/{model_name}"

In [None]:
# Start timestamp for the fine-tuning run; the duration cell after training
# subtracts it from the end timestamp.
st_t5_ft=time.time()
# Per-device batch size, used for both training and evaluation below.
batch_size = 12

In [None]:
# Configuration for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # Local directory for checkpoints.
    output_dir="t5-small-cnn-ft",
    # Step-based schedule: evaluate and log every 100 steps, checkpoint every
    # 1000 steps, stop after 2000 steps.
    max_steps=2000,
    eval_steps=100,
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=1000,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Keep at most one checkpoint on disk.
    save_total_limit=1,
    # Evaluation produces generated sequences, which compute_metrics decodes.
    # NOTE(review): the recorded evaluation output shows "Gen Len" = 19.0 at
    # every step; presumably generation is capped by the model's default
    # maximum length, which would truncate the summaries being scored —
    # confirm, and consider setting generation_max_length.
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    # Best model is selected by the "rouge1" value returned by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="none"
)



In [None]:
# Assemble the trainer from the model, the training arguments, the tokenized
# train/test splits, the collator and the metric function defined above.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cnn["train"],
    eval_dataset=tokenized_cnn["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning. This is the long-running cell of the notebook (the recorded
# run reported roughly 6018 seconds in the duration cell).
trainer.train()

# NOTE(review): this message is logged unconditionally and hard-codes the repo
# id; it does not itself check that a push to the Hub happened.
logging.info(f"T5 FT Model pushed to Huggingface Repo at location harish3742/t5-small-cnn-ft")

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,2.3058,1.945862,0.2357,0.0946,0.1894,0.1893,19.0


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,2.3058,1.945862,0.2357,0.0946,0.1894,0.1893,19.0
200,2.122,1.861321,0.236,0.0975,0.1907,0.1905,19.0
300,2.0856,1.839131,0.2364,0.0981,0.191,0.1909,19.0
400,2.0626,1.826379,0.2369,0.0986,0.1913,0.1912,19.0
500,2.0713,1.81619,0.2381,0.0989,0.1915,0.1915,19.0
600,2.0507,1.811069,0.2383,0.0993,0.1923,0.1922,19.0
700,2.0924,1.805667,0.239,0.1001,0.1929,0.1928,19.0
800,2.0512,1.802865,0.2391,0.0996,0.1928,0.1927,19.0
900,2.0531,1.80171,0.2394,0.0997,0.1928,0.1927,19.0
1000,2.0377,1.800223,0.2391,0.0994,0.1923,0.1921,19.0


In [None]:
# Stop the timer and report the elapsed time to the log file and the cell output.
end_t5_ft = time.time()
t5_ft_duration_msg = f"T5 FT Duration - {roundTS(st_t5_ft,end_t5_ft)} seconds"
logging.info(t5_ft_duration_msg)
print(t5_ft_duration_msg)

T5 FT Duration - 6017.5768 seconds


In [None]:
# Records accumulated by the trainer during the run (trainer.state.log_history).
trainer_state = trainer.state.log_history

# Write the records themselves after the header, one per line, so the log file
# actually contains the training stats that the header announces.
logging.info(
    "T5 Model Training Stats:\n%s",
    "\n".join(str(record) for record in trainer_state),
)

In [None]:
# Rebind `model` to the model object held by the trainer so the inspection
# cells below operate on the trained instance.
model = trainer.model

In [None]:
def print_trainable_parameters(model):
    """
    Logs and prints the number of trainable parameters in the model.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        None. The summary (trainable count, total count, trainable percentage)
        is sent to ``logging.info`` and printed to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    # Build the summary once so the log entry and the printed text always match.
    summary = (
        f"trainable params: {trainable_params} \nall params: {all_param} \ntrainable %: {trainable_pct}"
    )
    logging.info(summary)
    print(summary)

def count_trainable_params(model):
    """Return the total element count of all parameters with ``requires_grad`` set.

    Args:
        model: Object exposing ``parameters()`` that yields parameters
            providing ``numel()`` and a ``requires_grad`` flag.

    Returns:
        int: Sum of ``numel()`` over the trainable parameters (0 if there are none).
    """
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total

In [None]:
# Report the parameter counts via both helpers; the trainable total printed by
# each should be the same number.
print_trainable_parameters(model)
print(count_trainable_params(model))

trainable params: 60506624 
all params: 60506624 
trainable %: 100.0
60506624


In [None]:
import os

# Size on disk of the weights file saved in the step-1000 checkpoint.
checkpoint_weights_path = '/content/t5-small-cnn-ft/checkpoint-1000/model.safetensors'
file_size = os.stat(checkpoint_weights_path)
size_in_mb = round(file_size.st_size / (1024 * 1024), 2)  # bytes -> MiB
print("file size :", size_in_mb, "mb")

file size : 230.83 mb


In [None]:
# NOTE(review): einops is not imported by any cell shown in this notebook —
# confirm it is still needed before keeping this install.
!pip install einops



In [None]:
# Tabulate every named parameter of the model: name, shape, element count and
# whether it is trainable.
named_params = list(model.named_parameters())
params_dict = {
    'Name': [name for name, _ in named_params],
    'Shape': [param.shape for _, param in named_params],
    'Parameters': [param.numel() for _, param in named_params],
    'RequiresGrad': [param.requires_grad for _, param in named_params],
}

df = pd.DataFrame(params_dict)
# Save the full table to Drive, then preview the first ten rows in the cell output.
df.to_csv("/content/drive/My Drive/models/T5_FT/T5_FT_Params.csv")
df.head(10)

Unnamed: 0,Name,Shape,Parameters,RequiresGrad
0,shared.weight,"(32128, 512)",16449536,True
1,encoder.block.0.layer.0.SelfAttention.q.weight,"(512, 512)",262144,True
2,encoder.block.0.layer.0.SelfAttention.k.weight,"(512, 512)",262144,True
3,encoder.block.0.layer.0.SelfAttention.v.weight,"(512, 512)",262144,True
4,encoder.block.0.layer.0.SelfAttention.o.weight,"(512, 512)",262144,True
5,encoder.block.0.layer.0.SelfAttention.relative...,"(32, 8)",256,True
6,encoder.block.0.layer.0.layer_norm.weight,"(512,)",512,True
7,encoder.block.0.layer.1.DenseReluDense.wi.weight,"(2048, 512)",1048576,True
8,encoder.block.0.layer.1.DenseReluDense.wo.weight,"(512, 2048)",1048576,True
9,encoder.block.0.layer.1.layer_norm.weight,"(512,)",512,True


In [None]:
# Write the end-of-run marker to the log file.
logging.info("T5 Fine Tuning End ")