In [1]:
# Show the GPU, driver version and CUDA version visible to this runtime.
!nvidia-smi

Fri Nov 15 18:52:18 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0              32W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# !pip install transformers datasets evaluate rouge_score

In [3]:
# Upgrade `datasets` and install `evaluate`; neither version is pinned.
!pip install --upgrade datasets
!pip install evaluate



In [4]:
# Quietly (-q) install transformers with the sentencepiece extra, plus datasets,
# evaluate, rouge_score and py7zr.
!pip install transformers[sentencepiece] datasets evaluate rouge_score py7zr -q

In [5]:
# Upgrade accelerate, then uninstall and reinstall transformers and accelerate.
# NOTE(review): the recorded output shows the same versions (transformers 4.46.2,
# accelerate 1.1.1) being removed and installed again, so this looks like a
# clean-reinstall workaround rather than a version change — confirm it is still needed.
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Found existing installation: transformers 4.46.2
Uninstalling transformers-4.46.2:
  Successfully uninstalled transformers-4.46.2
Found existing installation: accelerate 1.1.1
Uninstalling accelerate-1.1.1:
  Successfully uninstalled accelerate-1.1.1
Collecting transformers
  Using cached transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
Collecting accelerate
  Using cached accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Using cached transformers-4.46.2-py3-none-any.whl (10.0 MB)
Using cached accelerate-1.1.1-py3-none-any.whl (333 kB)
Installing collected packages: accelerate, transformers
Successfully installed accelerate-1.1.1 transformers-4.46.2


In [6]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
from evaluate import load
import matplotlib.pyplot as plt
import pandas as pd
from datasets import concatenate_datasets

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

# Download the NLTK "punkt" package into the local nltk_data directory.
# NOTE(review): `sent_tokenize` is imported above but never called in the visible
# cells — confirm this download is still required.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [8]:

# Checkpoint name used for both the tokenizer and the model load.
model_ckpt = "google/pegasus-cnn_dailymail"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
# Load the seq2seq model for `model_ckpt` and move it to `device`.
# NOTE(review): the recorded warning lists the encoder/decoder
# `embed_positions.weight` tensors as newly initialized; presumably harmless for
# Pegasus, but confirm before relying on zero-shot output.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Load both corpora by name with `load_dataset`.
# SAMSum rows provide 'id', 'dialogue' and 'summary'; the DailyDialog rows
# contribute 'dialog', 'act' and 'emotion' (see the combined example printed
# further down).
dataset_samsum = load_dataset("samsum")
dataset_dailydialog = load_dataset("daily_dialog")

In [13]:

# Preprocess DailyDialog to match SAMSum's structure
def preprocess_dailydialog(batch):
    """Reshape one DailyDialog record into the ``dialogue``/``summary`` layout.

    Args:
        batch: A single record (the function is mapped with ``batched=False``).
            When it has a "dialog" entry, that entry is an iterable of strings.

    Returns:
        A dict with "dialogue" (the turns joined by single spaces) and
        "summary" (a fixed placeholder string). A record that has no "dialog"
        key is handed back unchanged.
    """
    # Guard clause: nothing to convert without a "dialog" field.
    if "dialog" not in batch:
        return batch

    return {
        "dialogue": " ".join(batch["dialog"]),
        # DailyDialog ships no reference summaries, so a constant stands in.
        "summary": "No summary available.",
    }

# Map preprocessing to DailyDialog
dailydialog_processed = dataset_dailydialog["train"].map(
    preprocess_dailydialog,
    batched=False,  # one record per call
)

# Stack the SAMSum training split on top of the reshaped DailyDialog rows, then
# shuffle with a fixed seed so the row order is reproducible.
# NOTE(review): every DailyDialog row carries the same placeholder summary, so
# the model is also trained to emit that placeholder — confirm this is intended.
combined_train = concatenate_datasets(
    [dataset_samsum["train"], dailydialog_processed]
).shuffle(seed=42)

# Sanity check: show the first row of the merged dataset.
print("Combined Dataset Example:", combined_train[0])

# Tokenization function
def convert_examples_to_features(example_batch):
    """Tokenize dialogues and summaries into model inputs and labels.

    Args:
        example_batch: Mapping with "dialogue" and "summary" entries, each
            either a single string or a list of strings.

    Returns:
        dict with three keys, each holding one list per example:
        "input_ids" and "attention_mask" come from the dialogues (truncated to
        1024 tokens) and "labels" holds the summary token ids (truncated to
        128 tokens). Sequences are NOT padded.
    """
    input_texts = example_batch["dialogue"]
    target_texts = example_batch["summary"]

    # Accept a single example as well as a batch of examples.
    if isinstance(input_texts, str):
        input_texts = [input_texts]
    if isinstance(target_texts, str):
        target_texts = [target_texts]

    # Padding is deliberately left to the batch collator. Padding here to
    # max_length would (a) make every example 1024 tokens wide and (b) fill the
    # labels with pad-token ids that the loss then treats as real targets.
    # DataCollatorForSeq2Seq pads each batch to its longest member and pads
    # labels with the ignore index instead.
    input_encodings = tokenizer(input_texts, max_length=1024, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    target_encodings = tokenizer(text_target=target_texts, max_length=128, truncation=True)

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }

# Apply tokenization to the combined dataset.
# batched=True hands the function lists of dialogues and summaries per call;
# the result is rebound to `combined_train`, so re-running this cell tokenizes
# the already-tokenized dataset again.
combined_train = combined_train.map(convert_examples_to_features, batched=True)



Combined Dataset Example: {'id': '13716967', 'dialogue': 'Maria: Just got a key to my new house!\r\nMakayla: 😍 awesome!\r\nKaitlyn: Aaaah, the key... the famous key! How exciting!\r\nJasmine: As my boys are in Manchester, I can help with moving your stuff this weekend. Huge boot in my estate and would love to help 😘 happy new Home x\r\nZachary: Great news Maria, congratulations 👏🎉👍\r\nMaria: Thank you everyone. Thank you Jasmine for the offer, will let you know. 😉\r\nMakayla: When’s the house warming party?\r\nMaria: There will be. No worries. But not now!', 'summary': 'Maria has just got a key to her new house. Jasmine offers to help with the moving this weekend, as her boys are in Manchester. ', 'dialog': None, 'act': None, 'emotion': None}


In [14]:
# Data collator: assembles tokenized examples into batches for the Trainer.
# It is given the model in addition to the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [15]:
training_args = TrainingArguments(
    output_dir="pegasus-combined-dataset",
    num_train_epochs=1,  # Single epoch for faster training
    per_device_train_batch_size=1,  # Smallest batch that fits the GPU
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=32,  # 1 x 32 -> effective batch size of 32
    logging_steps=500,  # Less frequent logging
    # NOTE(review): eval_steps is only honoured when a step-based evaluation
    # strategy is configured; none is set here, so no evaluation runs during
    # training. Call `trainer.evaluate()` explicitly or set the strategy.
    eval_steps=2000,
    save_steps=1_000_000,  # Integer step count: effectively no intermediate checkpoints
    fp16=True,  # Mixed precision for faster training
    warmup_steps=500,
    weight_decay=0.01,
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=combined_train,
    # The Trainer needs input_ids/attention_mask/labels, not raw text, so the
    # validation split is tokenized the same way as the training data.
    eval_dataset=dataset_samsum["validation"].map(convert_examples_to_features, batched=True),
)


  trainer = Trainer(


In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdishuhanwate007[0m ([33mdishuhanwate007-purdue-university-fort-wayne[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Step,Training Loss


In [None]:
# # Error analysis: Save failed summaries
# def save_failed_summaries(input_text, reference, model_output):
#     with open("failed_summaries.txt", "a") as f:
#         f.write(f"Dialogue: {input_text}\nReference: {reference}\nOutput: {model_output}\n\n")

# # Evaluation function with error logging
# def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, batch_size=16, device=device,
#                                 column_text="dialogue", column_summary="summary"):
#     article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
#     target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

#     for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
#         inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
#         summaries = model.generate(input_ids=inputs["input_ids"].to(device),
#                                    attention_mask=inputs["attention_mask"].to(device),
#                                    length_penalty=0.8, num_beams=8, max_length=128)
#         decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries]

#         for i, (pred, ref) in enumerate(zip(decoded_summaries, target_batch)):
#             if pred != ref:
#                 save_failed_summaries(article_batch[i], ref, pred)

#         metric.add_batch(predictions=decoded_summaries, references=target_batch)

#     # Return the ROUGE score
#     score = metric.compute()
#     return score

# Evaluation function
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, batch_size=16):
    """Generate summaries for ``dataset`` in batches and score them with ``metric``.

    Args:
        dataset: Object whose "dialogue" and "summary" entries are equally long,
            sliceable sequences of strings.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Seq2seq model exposing ``generate``, ``eval``, ``train``,
            ``training`` and ``device``.
        tokenizer: Tokenizer used to encode the dialogues and decode the
            generated ids.
        batch_size: Number of dialogues summarized per ``generate`` call.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    articles = dataset["dialogue"]
    references = dataset["summary"]
    # Follow the model instead of a notebook-level global so inputs always land
    # on the device the weights live on.
    target_device = model.device

    # A model that was just trained is still in training mode. Switch dropout
    # off for generation and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        for start in range(0, len(articles), batch_size):
            article_batch = articles[start : start + batch_size]
            target_batch = references[start : start + batch_size]

            # Pad only to the longest dialogue in the batch; padded positions
            # are masked out, so padding to 1024 just wastes compute.
            inputs = tokenizer(
                article_batch,
                max_length=1024,
                truncation=True,
                padding=True,
                return_tensors="pt",
            )
            summaries = model.generate(
                input_ids=inputs["input_ids"].to(target_device),
                attention_mask=inputs["attention_mask"].to(target_device),
                length_penalty=0.8,
                num_beams=8,
                max_length=128,
            )
            decoded_summaries = tokenizer.batch_decode(
                summaries,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            metric.add_batch(predictions=decoded_summaries, references=target_batch)
    finally:
        model.train(was_training)

    return metric.compute()

In [None]:
# Evaluate on test dataset
rouge_metric = load("rouge")
score = calculate_metric_on_test_ds(dataset_samsum["test"], rouge_metric, model, tokenizer)
# The `evaluate` ROUGE metric returns one aggregated F-measure per key as a
# plain float, so there is no `.mid.fmeasure` attribute to unpack.
rouge_scores = {k: float(score[k]) for k in ["rouge1", "rouge2", "rougeL", "rougeLsum"]}
print("ROUGE Scores:", pd.DataFrame(rouge_scores, index=["Pegasus"]))

# Save the fine-tuned model
# NOTE(review): model and tokenizer are written to two different directories, so
# each has to be loaded from its own path.
model.save_pretrained("pegasus-combined-model")
tokenizer.save_pretrained("pegasus-combined-tokenizer")

# Test inference
model.eval()  # make sure dropout is off for the sample summary
sample_text = dataset_samsum["test"][0]["dialogue"]
# Pin the pipeline to the device the model already lives on so inputs and
# weights cannot end up on different devices.
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device=model.device)
print("\nSample Dialogue:", sample_text)
print("\nModel Summary:", pipe(sample_text, num_beams=8, max_length=128, length_penalty=0.8)[0]["summary_text"])
