# T5 Fine-tuning and Inference in Google Colab

This notebook fine-tunes a T5 model on your dataset and then runs an inference pipeline.

## Setup
1. Upload your data to Google Drive
2. Update the `DRIVE_DATA_PATH` variable below
3. Run all cells


In [None]:
# Attach Google Drive to the Colab filesystem so the CSV files can be read
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Drive folder that holds train.csv / val.csv / test.csv -- update this path
DRIVE_DATA_PATH = '/content/drive/MyDrive/event-extraction-from-news/data/processed'


In [None]:
# Install required packages
!pip install transformers==4.40.0 torch datasets accelerate pandas numpy rouge-score bert-score tqdm


In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    TrainingArguments, Trainer, DataCollatorForSeq2Seq
)
from tqdm import tqdm
import json
from pathlib import Path

# Pick the compute device once: CUDA when a GPU is visible, otherwise CPU
has_cuda = torch.cuda.is_available()
device = torch.device("cuda") if has_cuda else torch.device("cpu")
print(f"Using device: {device}")

# Report which GPU was allocated to this runtime
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")


In [None]:
class SummarizationDataset(Dataset):
    """Dataset for summarization fine-tuning.

    Each item is built from one dataframe row: the ``clean_text`` column is
    the model input and the ``Summary`` column is the target.

    Args:
        dataframe: pandas DataFrame with ``clean_text`` and ``Summary`` columns.
        tokenizer: Tokenizer callable that accepts ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and exposes ``pad_token_id``.
        max_input_length: Inputs are truncated/padded to this many tokens.
        max_target_length: Targets are truncated/padded to this many tokens.
        input_prefix: Text prepended to every article before tokenization.
            Defaults to the same ``"summarize: "`` prompt that the inference
            cell of this notebook uses, so training and inference see inputs
            in the same format. Pass ``""`` to disable.
    """

    def __init__(self, dataframe, tokenizer, max_input_length=512,
                 max_target_length=128, input_prefix="summarize: "):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.input_prefix = input_prefix

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``.

        ``idx`` is positional (``DataFrame.iloc``), so the dataframe index does
        not need to be contiguous.
        """
        row = self.data.iloc[idx]

        # Use clean_text (with the task prefix) as input, Summary as target
        article = self.input_prefix + str(row['clean_text'])
        summary = str(row['Summary'])

        inputs = self._encode(article, self.max_input_length)
        targets = self._encode(summary, self.max_target_length)

        # Targets are padded to a fixed length; replace the padding ids with
        # -100 so the loss ignores those positions instead of training the
        # model to emit pad tokens.
        labels = targets['input_ids'].flatten().clone()
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is not None:
            labels[labels == pad_token_id] = -100

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': labels
        }


In [None]:
# Load the train / validation / test splits from Google Drive
print("Loading data from Google Drive...")
train_df = pd.read_csv(f"{DRIVE_DATA_PATH}/train.csv")
val_df = pd.read_csv(f"{DRIVE_DATA_PATH}/val.csv")
test_df = pd.read_csv(f"{DRIVE_DATA_PATH}/test.csv")

# Training and validation need both an article and a reference summary
train_df = train_df.dropna(subset=['Summary', 'clean_text'])
val_df = val_df.dropna(subset=['Summary', 'clean_text'])

# Inference only reads clean_text. Rows where it is missing are dropped here,
# because str(NaN) would otherwise feed the literal text "nan" to the model.
test_df = test_df.dropna(subset=['clean_text'])

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")


## Fine-tune T5 Model


In [None]:
# Load the pretrained checkpoint: tokenizer first, then the seq2seq model
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

model_device = next(model.parameters()).device
print(f"Model loaded on: {model_device}")

# Wrap the training and validation dataframes in torch datasets
train_dataset, val_dataset = (
    SummarizationDataset(frame, tokenizer) for frame in (train_df, val_df)
)

# Collator that batches examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)


In [None]:
# Mixed-precision policy.
# fp16 is deliberately not used: TrainingArguments rejects fp16=True on a
# CPU-only runtime, and T5 checkpoints are known to overflow to NaN losses
# under fp16 autocast. bf16 is enabled only on GPUs with native support
# (compute capability >= 8, i.e. Ampere or newer); otherwise training runs in
# full precision.
use_bf16 = (
    torch.cuda.is_available()
    and torch.cuda.get_device_capability(0)[0] >= 8
    and torch.cuda.is_bf16_supported()
)

# Training arguments
output_dir = '/content/t5-finetuned'
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Larger batch size for Colab GPU
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Effective batch size = 8 * 2 = 16
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'{output_dir}/logs',
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Must match evaluation_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=False,  # See the mixed-precision note above
    bf16=use_bf16,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

# Train
print(f"Training on {len(train_df)} samples...")
trainer.train()

# Save the model and tokenizer together so output_dir can be reloaded on its own
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")


## Run Inference Pipeline


In [None]:
# Put the in-memory fine-tuned model into evaluation mode
model.eval()

# Only the first `test_limit` test articles are summarized (demo-sized run)
test_limit = 50
test_subset = test_df.head(test_limit)

print(f"Processing {len(test_subset)} articles...")

results = []
row_iterator = tqdm(test_subset.iterrows(), total=len(test_subset))
for row_label, record in row_iterator:
    # The dataframe index doubles as the article identifier
    article_id = int(row_label)
    text = str(record["clean_text"])

    # Build the prompt and tokenize it, truncated to 512 tokens
    model_inputs = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    model_inputs = model_inputs.to(device)

    # Greedy decoding (single beam, no sampling) with gradients disabled
    with torch.inference_mode():
        generated = model.generate(
            **model_inputs,
            max_length=128,
            num_beams=1,
            do_sample=False,
        )

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    results.append(
        {"article_id": article_id, "original_text": text, "summary": decoded}
    )

print(f"Processed {len(results)} articles")


In [None]:
def _dump_results(path):
    """Create the parent directory of ``path`` and write ``results`` there as JSON."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as handle:
        json.dump(results, handle, indent=2)


# Local copy on the Colab VM
results_dir = '/content/results'
output_path = f"{results_dir}/t5_results.json"
_dump_results(output_path)

print(f"Results saved to {output_path}")

# Second copy in Google Drive, in a "results" folder next to the data folder
drive_output_path = f"{DRIVE_DATA_PATH}/../results/t5_results.json"
_dump_results(drive_output_path)

print(f"Results also saved to Google Drive: {drive_output_path}")
