In [None]:
# Names of the dataset columns read by the inference loop:
# the article text to summarize and its reference summary.
CLEAN_TEXT_COLUMN = 'article'
SUMMARY_COLUMN = 'highlights'

In [None]:
# ============================================================================
# MOUNT GOOGLE DRIVE
# ============================================================================
# Mounts Google Drive at /content/drive so that dataset files stored there
# can be read by later cells.
# You'll be prompted to authorize access - follow the instructions.
# ============================================================================

from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install tensorboard
!pip install tensorboard-data-server
!pip install google-cloud-storage
!pip install tbparse matplotlib seaborn pandas numpy


In [None]:
# Authenticate the Colab session with the user's Google account
# (presumably so the gcsfuse mount in the next cell can access the bucket).
from google.colab import auth
auth.authenticate_user()

# Install gcsfuse from Google's apt repository: register the package source,
# add the repository signing key, refresh the package index, then install.
# NOTE(review): the source is hard-coded to "gcsfuse-bionic"; confirm that this
# matches the Ubuntu release of the runtime.
# NOTE(review): `apt-key` is deprecated on recent Ubuntu releases - confirm
# this step still works on the current Colab image.
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse


In [None]:
!mkdir models
!gcsfuse --implicit-dirs models models


In [None]:
!mkdir results

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PegasusForConditionalGeneration, PegasusTokenizer, T5Tokenizer, T5ForConditionalGeneration
import torch

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Directory of the fine-tuned checkpoint to load.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because the Pegasus loading cell below reads it.
dir = "/content/models/t5_base_10k"

# Load the T5 model and tokenizer from the checkpoint. The model is moved to
# `device` (as the Pegasus cell does) so that inference runs on the GPU when
# one is available instead of silently staying on the CPU.
model = T5ForConditionalGeneration.from_pretrained(dir).to(device)
tokenizer = T5Tokenizer.from_pretrained(dir)

In [None]:
# Alternative to the T5 cell: load a Pegasus tokenizer and model from the
# checkpoint directory `dir`, replacing `tokenizer` and `model`.
# NOTE(review): `dir` is assigned in the T5 cell and names "t5_base_10k";
# presumably it should point at a Pegasus checkpoint before this cell is run.
tokenizer = AutoTokenizer.from_pretrained(dir)
model = PegasusForConditionalGeneration.from_pretrained(dir)
model = model.to(device)


In [None]:
# ------------------------------------------------------------
# Text summarization helper function
# ------------------------------------------------------------
def summarize_text(text, model, tokenizer, max_length=512, num_beams=5,
                   prefix="summarize: ", max_summary_length=128):
    """
    Generate a summary for a single input text using a
    fine-tuned sequence-to-sequence model.

    Args:
        text: The document to summarize (a string).
        model: Model exposing a ``device`` attribute and a
            ``generate(**inputs, ...)`` method.
        tokenizer: Callable tokenizer returning a mapping of tensors, with a
            ``decode`` method for turning generated ids back into text.
        max_length: Maximum number of input tokens; longer inputs are
            truncated.
        num_beams: Beam-search width used during generation.
        prefix: Task prefix prepended to ``text``. T5-style models expect
            "summarize: " (the default); pass "" for Pegasus.
        max_summary_length: Maximum length of the generated summary, passed
            to ``generate`` as ``max_length``.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """

    # Tokenize the (optionally prefixed) input, truncating to max_length
    encoded = tokenizer(
        prefix + text,
        return_tensors="pt",       # Return PyTorch tensors
        max_length=max_length,     # Maximum input length
        truncation=True,           # Truncate long inputs
    )

    # Move inputs to the same device as the model
    device = model.device
    encoded = {
        key: value.to(device)
        for key, value in encoded.items()
    }

    # Summary generation
    summary_ids = model.generate(
        **encoded,
        max_length=max_summary_length,  # Maximum generated summary length
        num_beams=num_beams,            # Beam search size
        length_penalty=1.1,             # > 0 favours longer beams (not shorter ones)
        no_repeat_ngram_size=3,         # Avoid repeating phrases
        early_stopping=True,            # Stop when beams are finished
    )

    # Decode generated tokens to text
    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary_text


In [None]:
# Load the test split from Drive.
# pandas is imported here because no earlier cell imports it.
import pandas as pd

# NOTE(review): DRIVE_DATA_PATH is not defined in any visible cell; it must be
# set to the Drive folder containing test.csv before this cell is run.
test_df = pd.read_csv(f"{DRIVE_DATA_PATH}/test.csv")

In [None]:
from tqdm import tqdm

# Put the model in evaluation mode before generating summaries.
model.eval()
results = []

print(f"Processing {len(test_df)} articles...")

# Summarize every row of the test set, showing a progress bar, and collect one
# record per article (index, source text, generated and reference summary).
progress = tqdm(test_df.iterrows(), total=len(test_df))
for idx, row in progress:
    article_id = int(idx)
    text = str(row[CLEAN_TEXT_COLUMN])
    original_summary = str(row[SUMMARY_COLUMN])

    summary = summarize_text(text, model, tokenizer)
    print(summary)

    record = dict(
        article_id=article_id,
        original_text=text,
        summary=summary,
        original_summary=original_summary,
    )
    results.append(record)

print(f"Processed {len(results)} articles")

In [None]:
# Save the generated summaries as JSON under results_samples.
import json
import os

output_path = "/content/results_samples/t5_base_10k.json"

# Create the directory that actually receives the file. The previous
# os.makedirs(OUT_DIR, ...) relied on a name that no visible cell defines,
# and nothing created /content/results_samples.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w") as f:
    json.dump(results, f, indent=2)
print(f"Results saved to {output_path}")
