In [None]:
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# Checkpoint to fine-tune. Swap in the commented alternative to start from
# the CNN/DailyMail checkpoint instead.
MODEL = 'google/pegasus-large'
# MODEL = 'google/pegasus-cnn_dailymail'

# Data location and the CSV columns holding the source text and its summary.
DRIVE_DATA_PATH = "/content/drive/MyDrive/processed/50k_samples_new"   # UPDATE PATH
CLEAN_TEXT_COLUMN = 'article'
SUMMARY_COLUMN = 'highlights'

# Where checkpoints and TensorBoard event files are written.
OUT_DIR = 'pegasus/50k_samples'

# Tunable run parameters.
# NOTE(review): the training setup visible in this notebook hard-codes its own
# batch size and epoch count and never reads BATCH_SIZE, NUM_PROCS, EPOCHS or
# MAX_LENGTH -- confirm whether they are used elsewhere.
BATCH_SIZE = 4
NUM_PROCS = 4
EPOCHS = 10
MAX_LENGTH = 1024 # Maximum context length to consider while preparing dataset.

# Accumulator for per-epoch metrics (starts empty).
epoch_metrics = []

In [None]:
from google.colab import auth
auth.authenticate_user()

# Install gcsfuse
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse


# Create a local directory for mounting
!mkdir pegasus
# Mount the GCS bucket
# Replace 'your-bucket-name' with the actual name of your GCS bucket
!gcsfuse --implicit-dirs pegasus_large_50k_3rd pegasus

In [None]:
# !mkdir pegasus_3rd
# # Mount the GCS bucket
# # 'pegasus_large_50k_3rd' below is the GCS bucket name; change it to mount a different bucket
# !gcsfuse --implicit-dirs pegasus_large_50k_3rd pegasus_3rd

In [None]:
# Mount Google Drive at /content/drive; DRIVE_DATA_PATH points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# !gcsfuse --implicit-dirs pegasus_large_50k_2nd pegasus

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')


In [None]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score
!pip install tqdm

In [None]:
import torch
import pprint
import evaluate
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter

from transformers import (
    PegasusForConditionalGeneration,  Trainer, TrainingArguments,
    PegasusTokenizer,EarlyStoppingCallback,T5ForConditionalGeneration, T5Tokenizer,
    PegasusXForConditionalGeneration,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
)

from datasets import load_dataset

# Shared pretty-printer instance for readable dumps of nested structures.
pp = pprint.PrettyPrinter()


In [None]:

def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
  """
  Load the base Pegasus model and build a configured Seq2SeqTrainer.

  Args:
    model_name: checkpoint name/path passed to
      PegasusForConditionalGeneration.from_pretrained.
    tokenizer: tokenizer handed to the trainer.
    train_dataset: dataset used for training.
    val_dataset: optional evaluation dataset. When given, the trainer
      evaluates and checkpoints once per epoch, reloads the best checkpoint
      (lowest eval_loss) at the end and stops early after 3 evaluations
      without improvement. When omitted, evaluation is disabled.
    freeze_encoder: if True, encoder parameters are excluded from training.
    output_dir: currently NOT used -- checkpoints are always written to the
      module-level OUT_DIR. Kept so existing call sites keep working.

  Returns:
    A Seq2SeqTrainer ready for `.train()`.
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  # Settings common to both modes, kept in one place so they cannot drift.
  # NOTE(review): bf16/tf32 need hardware support -- confirm the target GPU.
  shared_args = dict(
    output_dir=OUT_DIR,              # output directory (module-level constant)
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./pegasus/logs',    # directory for storing logs
    learning_rate=0.00003,
    warmup_ratio=0.1,                # fraction of steps used for LR warmup
    predict_with_generate=True,
    bf16=True,
    fp16=False,
    tf32=True,
    torch_compile=True,
    report_to='tensorboard',
  )

  # NOTE(review): recent transformers releases prefer `processing_class=` over
  # `tokenizer=` when constructing a trainer -- confirm against the installed version.
  if val_dataset is not None:
    # Logging, evaluation and checkpointing all run once per epoch. The
    # step-based values (save_steps / logging_steps / eval_steps) that used to
    # be passed here are dropped because the 'epoch' strategies override them.
    training_args = Seq2SeqTrainingArguments(
      num_train_epochs=6,               # total number of training epochs
      per_device_train_batch_size=16,   # batch size per device during training
      per_device_eval_batch_size=16,    # batch size per device during evaluation
      gradient_accumulation_steps=8,
      save_total_limit=10,              # keep at most this many checkpoints
      logging_strategy='epoch',
      eval_strategy='epoch',
      save_strategy='epoch',
      metric_for_best_model='eval_loss',
      greater_is_better=False,
      load_best_model_at_end=True,
      **shared_args,
    )

    trainer = Seq2SeqTrainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

  else:
    # Without an evaluation set there is nothing to evaluate on, so evaluation,
    # best-model tracking and early stopping are all switched off; requesting
    # them without eval data makes the trainer fail.
    training_args = Seq2SeqTrainingArguments(
      num_train_epochs=10,              # total number of training epochs
      per_device_train_batch_size=8,    # batch size per device during training
      save_steps=100,                   # checkpoint every 100 update steps
      save_total_limit=5,               # keep at most this many checkpoints
      logging_steps=100,
      eval_strategy='no',
      **shared_args,
    )

    trainer = Seq2SeqTrainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,  # only used if evaluate() is called manually with a dataset
    )

  return trainer

In [None]:
from torch.utils.tensorboard import SummaryWriter
from transformers import TrainerCallback
import time
# Module-level TensorBoard writer; event files go to OUT_DIR. It is shared by
# compute_metrics and by the GpuLoggerCallback instance created at training time.
writer = SummaryWriter(log_dir=OUT_DIR)

class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: mapping of feature name -> per-example sequences
            (indexed as ``encodings[key][idx]``).
        labels: mapping whose ``'input_ids'`` entry holds the per-example
            target token ids.

    A stray ``on_step_end`` method (a trainer-callback hook that only computed
    an unused local) used to live on this class; it was dead code on a dataset
    and has been removed.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, targets under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        """Number of examples, taken from the target sequences."""
        return len(self.labels['input_ids'])

class GpuLoggerCallback(TrainerCallback):
    """Trainer callback that records allocated CUDA memory after each step.

    The value is written to the supplied TensorBoard writer under the tag
    ``gpu_memory_gb``, keyed by the trainer's global step. Nothing is logged
    when CUDA is unavailable.
    """

    def __init__(self, writer):
        # Writer object exposing add_scalar(tag, value, step).
        self.writer = writer

    def on_step_end(self, args, state, control, **kwargs):
        # Guard clause: no GPU, nothing to report.
        if not torch.cuda.is_available():
            return control

        bytes_per_gib = 1024 ** 3
        allocated_gib = torch.cuda.memory_allocated() / bytes_per_gib
        self.writer.add_scalar("gpu_memory_gb", allocated_gib, state.global_step)
        return control


def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
  """
  Tokenize text/summary pairs and wrap them in PegasusDataset objects.

  Args:
    model_name: checkpoint name/path used to load the PegasusTokenizer.
    train_texts, train_labels: source documents and target summaries.
    val_texts, val_labels: optional validation pair; both must be given.
    test_texts, test_labels: optional test pair; both must be given.

  Returns:
    (train_dataset, val_dataset, test_dataset, tokenizer); a dataset is None
    when its texts or labels were not supplied.
  """
  tokenizer = PegasusTokenizer.from_pretrained(model_name)

  prepare_val = val_texts is not None and val_labels is not None
  prepare_test = test_texts is not None and test_labels is not None

  def tokenize_data(texts, labels):
    # Materialise as plain lists so list-like column objects are accepted too.
    encodings = tokenizer(list(texts), truncation=True, padding=True)
    decodings = tokenizer(list(labels), truncation=True, padding=True)

    # Mask target padding with -100 so the loss skips those positions;
    # otherwise the model is also trained to predict padding tokens.
    # compute_metrics maps -100 back to the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    decodings['input_ids'] = [
      [-100 if token_id == pad_id else token_id for token_id in sequence]
      for sequence in decodings['input_ids']
    ]
    return PegasusDataset(encodings, decodings)

  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset, tokenizer

def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for one evaluation pass.

    Relies on the module-level ``tokenizer``, ``writer`` and ``trainer``
    objects, so those must exist before evaluation runs.

    Args:
        eval_pred: object exposing ``predictions`` (generated token ids, or a
            tuple whose first element holds them) and ``label_ids``.

    Returns:
        dict with rouge1 / rouge2 / rougeL / rougeLsum plus ``gen_len``.
    """
    # Load the metric once and reuse it on later evaluations.
    rouge = getattr(compute_metrics, "_rouge", None)
    if rouge is None:
        rouge = evaluate.load("rouge")
        compute_metrics._rouge = rouge

    predictions, labels = eval_pred.predictions, eval_pred.label_ids
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a padding/ignore marker, not a vocabulary id, and cannot be
    # decoded. Swap it for the pad id in both predictions and labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE
    rouge_result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"]
    )

    # Mean number of non-padding tokens per generated sequence.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    rouge_result["gen_len"] = np.mean(prediction_lens)

    # Mirror every metric to TensorBoard at the current training step.
    for k, v in rouge_result.items():
        writer.add_scalar(f"eval/{k}", v, trainer.state.global_step)

    # Must return a *dict*, not a set
    return dict(rouge_result)

In [None]:
from datasets import Dataset
import pandas as pd
  # train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]

print("Loading data...")
# nrows caps how much of each CSV is parsed instead of loading the whole file
# and slicing afterwards.
train_df = pd.read_csv(f"{DRIVE_DATA_PATH}/train.csv", nrows=50000)
val_df = pd.read_csv(f"{DRIVE_DATA_PATH}/val.csv", nrows=10000)


dataset_train = Dataset.from_pandas(train_df)
dataset_valid = Dataset.from_pandas(val_df)

# list(...) guarantees plain Python lists: newer `datasets` releases return a
# lazy column object from dataset[column], which the tokenizer does not accept.
train_texts, train_labels = list(dataset_train[CLEAN_TEXT_COLUMN]), list(dataset_train[SUMMARY_COLUMN])
valid_texts, valid_labels = list(dataset_valid[CLEAN_TEXT_COLUMN]), list(dataset_valid[SUMMARY_COLUMN])

print("Train:", len(train_texts))
print("Val:", len(valid_texts))


# No test split is passed, so test_dataset comes back as None.
train_dataset, val_dataset, test_dataset, tokenizer = prepare_data(MODEL, train_texts, train_labels, valid_texts, valid_labels)



In [None]:

# Build the trainer, attach per-step GPU memory logging, then train and time the run.
trainer = prepare_fine_tuning(MODEL, tokenizer, train_dataset, val_dataset=val_dataset)
trainer.add_callback(GpuLoggerCallback(writer))
start = time.time()
trainer.train()
end = time.time()
# Elapsed wall-clock seconds (end - start; the reverse order printed a negative value).
print(f'time:  {end - start}')
# Push any buffered TensorBoard scalars to disk.
writer.flush()
