In [None]:
# --- Model and training hyper-parameters -----------------------------------
MODEL = 't5-large'
EPOCHS = 10
BATCH_SIZE = 2
NUM_PROCS = 4      # worker processes handed to `Dataset.map` during tokenization
MAX_LENGTH = 1024  # token cap applied when tokenizing both articles and summaries

# --- Input data --------------------------------------------------------------
DRIVE_DATA_PATH = "/content/drive/MyDrive/processed/50k_samples_new"  # UPDATE PATH
CLEAN_TEXT_COLUMN = 'article'
SUMMARY_COLUMN = 'highlights'

# --- Outputs -----------------------------------------------------------------
OUT_DIR = 'results_t5_large_regularized/50k_samples_fixed'
# NOTE(review): not referenced by any visible cell — confirm it is still needed.
epoch_metrics = []

In [None]:
# Mount Google Drive at /content/drive so that files under DRIVE_DATA_PATH are readable.
from google.colab import drive
drive.mount('/content/drive')

# DRIVE_DATA_PATH is defined in the configuration cell at the top of the notebook;
# update it there if the processed CSVs live in a different Drive folder.

In [None]:
from google.colab import auth
auth.authenticate_user()

# Install gcsfuse
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

# Create a local directory for mounting
!mkdir results_t5_large_regularized

# Mount the GCS bucket
# Replace 'your-bucket-name' with the actual name of your GCS bucket
!gcsfuse --implicit-dirs t5_large_10_run results_t5_large_regularized

In [None]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score
!pip install tqdm
!pip install tensorboard-data-server
!pip install tbparse

In [None]:
import torch
import pprint
import evaluate
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback,
    T5Config
)
from datasets import load_dataset

# Shared pretty-printer instance.
# NOTE(review): `pp` is not referenced in the visible cells — confirm it is still needed.
pp = pprint.PrettyPrinter()


In [None]:
# Read the train / validation CSVs from the mounted Drive folder.
print("Loading data...")

train_df = pd.read_csv(f"{DRIVE_DATA_PATH}/train.csv")

# Only the first 2000 validation rows are kept.
val_df = pd.read_csv(f"{DRIVE_DATA_PATH}/validation.csv").head(2000)

# Report the number of rows per split.
for split_label, split_frame in (("Train:", train_df), ("Val:", val_df)):
    print(split_label, len(split_frame))

In [None]:

from datasets import Dataset

# Wrap both pandas frames as Hugging Face `Dataset` objects.
dataset_train, dataset_valid = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

# Show the schema / row count of each split.
for split_dataset in (dataset_train, dataset_valid):
    print(split_dataset)

In [None]:
def find_longest_length(dataset):
    """
    Compute whitespace-token length statistics for an iterable of strings.

    Args:
        dataset: iterable of strings (e.g. one text column of the training set).

    Returns:
        A 6-tuple ``(max_length, n_4k, n_2k, n_1k, n_700, n_500)`` where
        ``max_length`` is the largest word count seen (0 for an empty iterable)
        and each ``n_*`` is the number of texts with strictly more than
        4000 / 2000 / 1000 / 700 / 500 words respectively.
    """
    # One pass over the data: word count (split on whitespace) of every text.
    word_counts = [len(text.split()) for text in dataset]

    def texts_longer_than(limit):
        # Number of texts whose word count strictly exceeds `limit`.
        return sum(1 for n_words in word_counts if n_words > limit)

    longest = max(word_counts, default=0)
    return (
        longest,
        texts_longer_than(4000),
        texts_longer_than(2000),
        texts_longer_than(1000),
        texts_longer_than(700),
        texts_longer_than(500),
    )

# Word-count statistics for the article column of the training set.
(
    longest_article_length,
    counter_4k, counter_2k, counter_1k, counter_700, counter_500,
) = find_longest_length(dataset_train[CLEAN_TEXT_COLUMN])
print(f"Longest article length: {longest_article_length} words")
for threshold, count in zip(
    (4000, 2000, 1000, 700, 500),
    (counter_4k, counter_2k, counter_1k, counter_700, counter_500),
):
    print(f"Articles larger than {threshold} words: {count}")

# Same statistics for the summary column. The counter_* names are reused, so
# from here on they hold the summary counts, not the article counts.
(
    longest_summary_length,
    counter_4k, counter_2k, counter_1k, counter_700, counter_500,
) = find_longest_length(dataset_train[SUMMARY_COLUMN])
print(f"Longest summary length: {longest_summary_length} words")
for threshold, count in zip(
    (4000, 2000, 1000, 700, 500),
    (counter_4k, counter_2k, counter_1k, counter_700, counter_500),
):
    print(f"Summaries larger than {threshold} words: {count}")

In [None]:
def find_avg_sentence_length(dataset):
    """
    Return the mean number of whitespace-separated words per text.

    Args:
        dataset: iterable of strings (e.g. one text column of the training set).

    Returns:
        The average word count as a float, or ``0.0`` when ``dataset`` is empty
        (instead of raising ``ZeroDivisionError``).
    """
    total_words = 0
    n_texts = 0
    # Running totals: no need to keep every individual length in memory.
    for text in dataset:
        total_words += len(text.split())
        n_texts += 1
    if n_texts == 0:
        return 0.0
    return total_words / n_texts

# Mean word count of the articles and of the reference summaries in the training set.
avg_article_length = find_avg_sentence_length(dataset_train[CLEAN_TEXT_COLUMN])
print(f"Average article length: {avg_article_length} words")
avg_summary_length = find_avg_sentence_length(dataset_train[SUMMARY_COLUMN])
print(f"Average summary length: {avg_summary_length} words")

In [None]:
tokenizer = T5Tokenizer.from_pretrained(MODEL)


def preprocess_function(examples):
    """
    Tokenize one batch of articles and summaries into model inputs and labels.

    Args:
        examples: a batch from ``Dataset.map(..., batched=True)``; the columns
            CLEAN_TEXT_COLUMN and SUMMARY_COLUMN are read from it.

    Returns:
        The tokenizer output for the "summarize: "-prefixed articles, with an
        extra ``"labels"`` entry holding the token ids of the summaries. Both
        sides are truncated and padded to MAX_LENGTH tokens.
    """
    inputs = [f"summarize: {article}" for article in examples[CLEAN_TEXT_COLUMN]]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length',
    )

    # Tokenize the targets via `text_target`, the supported replacement for the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=list(examples[SUMMARY_COLUMN]),
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length',
    )

    # NOTE(review): the labels keep the pad token id in padded positions (they
    # are not replaced by -100), so padded positions are not excluded from the
    # loss. Changing that also requires revisiting how the evaluation
    # predictions are decoded, so it is left as-is here.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Tokenize both splits, using NUM_PROCS worker processes.
tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load the pretrained configuration for MODEL.
config = T5Config.from_pretrained(MODEL)

# 2. Raise the dropout rate to regularize fine-tuning.
# `dropout_rate` is the only dropout field T5Config defines for the
# encoder/decoder stack, and T5's attention layers read it as well. There is no
# `attention_dropout_rate` field, so setting one has no effect on the model.
NEW_DROPOUT_RATE = 0.2
config.dropout_rate = NEW_DROPOUT_RATE

print(f"New dropout rate set to: {config.dropout_rate}")

# 3. Load the pretrained weights with the modified configuration.
model = T5ForConditionalGeneration.from_pretrained(
    MODEL,
    config=config
)

model.to(device)
# Keep the model's special-token ids in sync with the tokenizer.
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.pad_token_id

# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

# ROUGE scorer used by the evaluation metric function.
rouge = evaluate.load("rouge")


In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Reduce raw logits to predicted token ids before the Trainer accumulates them.

    Keeping only the arg-max ids (instead of the full vocabulary-sized logits)
    avoids storing tensors that the metric computation does not need.

    Args:
        logits: either the logits tensor itself, or a tuple/list of model
            outputs whose first element is the logits tensor.
        labels: the label ids for the batch; passed through unchanged.

    Returns:
        ``(pred_ids, labels)`` where ``pred_ids`` is the arg-max of the logits
        over the last dimension.
    """
    # Only unwrap when a tuple/list of outputs is given; indexing a bare tensor
    # with [0] would silently select the first example of the batch instead.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [None]:
from torch.utils.tensorboard import SummaryWriter
from transformers import TrainerCallback
import time
# TensorBoard writer shared by the GPU logger and the metric logging; it writes into OUT_DIR.
writer = SummaryWriter(log_dir=OUT_DIR)


class GpuLoggerCallback(TrainerCallback):
    """Trainer callback that records allocated CUDA memory after every training step."""

    def __init__(self, writer):
        # TensorBoard writer that receives the scalar.
        self.writer = writer

    def on_step_end(self, args, state, control, **kwargs):
        """Log `torch.cuda.memory_allocated()` in GiB under "gpu_memory_gb" at the current global step."""
        # Nothing to record without a CUDA device.
        if not torch.cuda.is_available():
            return control

        bytes_per_gib = 1024 ** 3
        allocated_gib = torch.cuda.memory_allocated() / bytes_per_gib
        self.writer.add_scalar("gpu_memory_gb", allocated_gib, state.global_step)
        return control



In [None]:
def compute_metrics(eval_pred):
    """
    Compute ROUGE-1/2/L and the mean prediction length for one evaluation pass.

    Relies on the module-level `tokenizer`, `rouge`, `writer`, `trainer` and
    OUT_DIR. Besides returning the metrics, it logs each one to TensorBoard
    under "eval/<name>" and writes them to
    "<OUT_DIR>/rouge_results_step_<global_step>.csv".

    Args:
        eval_pred: evaluation output whose `predictions[0]` holds the predicted
            token ids and whose `label_ids` holds the reference token ids.

    Returns:
        Dict of metric name -> value rounded to 4 decimals.
    """
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids
    pad_id = tokenizer.pad_token_id

    # Negative ids (e.g. a -100 fill value) are not valid vocabulary entries and
    # cannot be decoded, so map them to the pad token first.
    predictions = np.where(predictions >= 0, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=['rouge1', 'rouge2', 'rougeL']
    )

    # Mean number of non-pad tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    # Log to TensorBoard at the current training step.
    step = trainer.state.global_step
    for k, v in result.items():
        writer.add_scalar(f"eval/{k}", v, step)

    # Save as CSV for later plotting (one file per evaluation step).
    pd.DataFrame([result]).to_csv(f"{OUT_DIR}/rouge_results_step_{step}.csv")

    return {k: round(v, 4) for k, v in result.items()}


In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,  # TODO: tune; larger is faster if it fits in memory
    per_device_eval_batch_size=16,           # TODO: tune
    warmup_ratio=0.1,
    weight_decay=0.01,
    learning_rate=0.00005,
    gradient_accumulation_steps=8,

    # Logging / evaluation / checkpointing all happen once per epoch.
    logging_dir=OUT_DIR,
    logging_strategy="epoch",
    logging_steps=200,   # only used when logging_strategy="steps"
    eval_strategy='epoch',
    eval_steps=200,      # only used when eval_strategy="steps"
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',

    # NOTE(review): these two generation options are consumed by
    # `Seq2SeqTrainer`; the plain `Trainer` used below does not call
    # `generate()`, so they have no effect here.
    predict_with_generate=True,
    generation_num_beams=4,

    dataloader_num_workers=4,
    bf16=True,
    fp16=False,
    tf32=True,
    torch_compile=True,

    # Keep the checkpoint with the lowest evaluation loss.
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.add_callback(GpuLoggerCallback(writer))

start = time.time()
history = trainer.train()
end = time.time()

# Persist the log history only after training has run; before `train()` the
# history is empty, so writing it earlier produces an empty file.
pd.DataFrame(trainer.state.log_history).to_csv(f"{OUT_DIR}/training_history.csv")

writer.add_scalar("total_training_time_seconds", end - start, 0)
# Push pending scalars to disk now instead of waiting for the periodic flush.
writer.flush()