In [3]:
!pip install --quiet transformers datasets evaluate rouge_score accelerate huggingface_hub

In [None]:
import os
import shutil
from getpass import getpass

import evaluate
import numpy as np
from datasets import load_dataset
from huggingface_hub import login
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments, Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

In [7]:
# 2.1 Log in to the Hugging Face Hub
# The token is taken from the HF_TOKEN environment variable when it is already
# set; otherwise it is requested interactively, so the secret never appears in
# the notebook source.
# NOTE(review): a token was previously hard-coded in this cell. Treat it as
# leaked and revoke it in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
os.environ["HF_TOKEN"] = hf_token  # keep HF_TOKEN exported for the rest of the session
login(token=hf_token)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
# 2.2 Prepare model & tokenizer
model_name = "google/pegasus-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Gradient checkpointing is switched on and `use_cache` is switched off on the
# model config before training.
model.gradient_checkpointing_enable()
model.config.use_cache = False

# 2.3 Load & preprocess dataset
# The split is seeded with the same value (42) that the evaluation cell uses
# when it rebuilds the split. Without a fixed seed the two cells produce
# different partitions, so examples scored as "held out" later would mostly
# have been seen during training.
SPLIT_SEED = 42
raw = load_dataset("reddit_tifu", "long")
splits = raw["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_raw, eval_raw = splits["train"], splits["test"]

In [None]:
def preprocess_fn(examples):
    """Tokenize a batch of posts and their TL;DR summaries.

    Args:
        examples: Batch mapping with a "documents" entry (source texts) and a
            "tldr" entry (target summaries).

    Returns:
        The tokenizer output for the documents (truncated to 1024 tokens) with
        an added "labels" entry holding the token ids of the summaries
        (truncated to 128 tokens).
    """
    model_inputs = tokenizer(examples["documents"], max_length=1024, truncation=True)
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. The targets get their
    # own call so they keep the shorter 128-token limit.
    labels = tokenizer(text_target=examples["tldr"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the tokenization over both splits in batches; the original columns are
# removed so only the tokenizer outputs and labels remain.
train_dataset = train_raw.map(
    preprocess_fn,
    remove_columns=train_raw.column_names,
    batched=True,
)
eval_dataset = eval_raw.map(
    preprocess_fn,
    remove_columns=eval_raw.column_names,
    batched=True,
)


In [None]:
# 2.4 ROUGE metric function
# The metric object is loaded once at cell level; compute_metrics reads this
# module-level `rouge` name when it scores decoded predictions.
rouge = evaluate.load("rouge")
def compute_metrics(pred):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        pred: A (predictions, labels) pair of token-id arrays. The predictions
            may arrive wrapped in a tuple, in which case the first element is
            used.

    Returns:
        Dict mapping each ROUGE key returned by the metric to its score
        expressed as a percentage.
    """
    preds, labels = pred
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a real token id and cannot be decoded, so it is swapped for
    # the pad token in both arrays; skip_special_tokens then drops it.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # The `evaluate` ROUGE metric returns plain floats (the evaluation cell
    # further down multiplies them directly). Objects exposing
    # `.mid.fmeasure` are still accepted for older metric implementations.
    return {
        k: (v.mid.fmeasure if hasattr(v, "mid") else v) * 100
        for k, v in result.items()
    }

In [None]:
import time
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Directory for checkpoints and the final model.
# NOTE(review): `output_dir` is not assigned in any visible cell, so this cell
# failed with NameError on a fresh kernel. The path below is an assumption
# based on the /kaggle/working location used for the final archive; adjust it
# to your environment.
output_dir = "/kaggle/working/pegasus-reddit"

# NOTE(review): the Pegasus model documentation lists FP16 as unsupported.
# `fp16=True` is kept as configured, but watch for NaN/inf losses. TODO confirm.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    optim="adafactor",
    fp16=True,
    dataloader_num_workers=2,
    predict_with_generate=True,  # evaluation passes generated token ids (not logits) to compute_metrics
    logging_steps=100,          # log every 100 steps
    logging_first_step=True,    # also log the very first step
    disable_tqdm=False,         # make sure the progress bar isn’t disabled
    report_to="none",           # avoid errors if no logger is set up
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id="hmankar01/pegasus-reddit",
    hub_strategy="checkpoint",
)

# 2) Rebuild your trainer
# The collator is created first and given a name, then everything the trainer
# needs is gathered in one mapping and passed as keyword arguments.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "tokenizer": tokenizer,
    "data_collator": seq2seq_collator,
    "compute_metrics": compute_metrics,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)

In [None]:
# 3) Time your training
start_time = time.time()
train_result = trainer.train()
end_time = time.time()

# 4) Print out a summary
duration = end_time - start_time
print(f"\n Training finished in {duration/60:.2f} minutes")
print(f" Total epochs run: {trainer.state.epoch:.2f}")
print(" Final training metrics:")
for k, v in train_result.metrics.items():
    print(f"  • {k}: {v}")

# 5) Then go ahead and save/zip as before
# The directory is read back from the trainer's own arguments, so this cell
# writes to the location the trainer was configured with and does not depend
# on a separately defined `output_dir` variable.
output_dir = trainer.args.output_dir
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
shutil.make_archive("/kaggle/working/pegasus_final", "zip", output_dir)
print("Model saved and zipped to /kaggle/working/pegasus_final.zip")


In [None]:

# 1. Install necessary libraries (if not already installed)
!pip install -q transformers datasets evaluate rouge_score

In [None]:
from datasets import load_dataset
import evaluate
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm

# 2. Load your fine-tuned model & tokenizer from the Hugging Face Hub
model_repo = "hmankar01/pegasus-reddit"
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo)
tokenizer = AutoTokenizer.from_pretrained(model_repo)

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# 3. Prepare a small evaluation subset (50 examples)
# Rebuild the 90/10 split with a fixed seed and keep the first 50 test rows.
raw = load_dataset("reddit_tifu", "long")["train"]
splits = raw.train_test_split(seed=42, test_size=0.1)
eval_raw = splits["test"].select(range(50))

# 4. Generate predictions
# One example at a time: encode the post, beam-search a summary, and collect
# the decoded summary next to its reference TL;DR.
predictions, references = [], []
for sample in tqdm(eval_raw, desc="Generating summaries"):
    encoded = tokenizer(
        sample["documents"],
        truncation=True,
        max_length=1024,
        return_tensors="pt",
    ).to(device)
    generated = model.generate(**encoded, num_beams=4, max_length=128)
    predictions.append(tokenizer.decode(generated[0], skip_special_tokens=True))
    references.append(sample["tldr"])

# 5. Compute ROUGE scores
rouge = evaluate.load("rouge")
results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)

# Convert to percentages
results_pct = {}
for metric_name, score in results.items():
    results_pct[metric_name] = score * 100

# 6. Print rough ROUGE scores
print("▶ Rough ROUGE on 50 samples:")
print(f" • ROUGE-1: {results_pct['rouge1']:.2f}%")
print(f" • ROUGE-2: {results_pct['rouge2']:.2f}%")
print(f" • ROUGE-L: {results_pct['rougeL']:.2f}%")


In [None]:
import matplotlib.pyplot as plt

# Plot the ROUGE percentages held in `results_pct`
# (a dict with 'rouge1', 'rouge2' and 'rougeL' keys).
#
# The scores computed by the evaluation cell are plotted as they are. The
# hard-coded numbers below are example values kept only as a fallback for when
# that cell has not been run in this kernel; they no longer overwrite freshly
# computed results.
if "results_pct" not in globals():
    results_pct = {
        'rouge1': 19.31,   # example values; replace with your actual
        'rouge2': 6.32,
        'rougeL': 16.51
    }

# Prepare data for plotting
labels = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
scores = [results_pct['rouge1'], results_pct['rouge2'], results_pct['rougeL']]

# Create bar chart
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(labels, scores)
ax.set_ylabel('Score (%)')
ax.set_title('ROUGE Scores by Pegasus')
ax.set_ylim(0, max(scores) + 10)  # leave headroom above the tallest bar
fig.tight_layout()
plt.show()
