# BERT Extractive Summarization in Google Colab

This notebook uses BERT for extractive summarization: the summary is built by selecting sentences from the original text rather than generating new ones.

**Note:** Unlike abstractive models such as T5/Pegasus, BERT extractive summarization does not need task-specific fine-tuning. This notebook uses a pretrained BERT model as-is and only demonstrates how to run it for summarization.

## Setup
1. Upload your data to Google Drive (the notebook reads a `test.csv` file that must contain a `clean_text` column)
2. Update the `DRIVE_DATA_PATH` variable below
3. Run all cells


In [None]:
# Attach Google Drive to the Colab filesystem so the dataset can be read from it
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Folder inside Drive that holds the processed data.
# Update this path to match your own Drive layout.
DRIVE_DATA_PATH = '/content/drive/MyDrive/event-extraction-from-news/data/processed'


In [None]:
# Install required packages
!pip install transformers torch bert-extractive-summarizer pandas numpy rouge-score bert-score tqdm nltk


In [None]:
import os
import pandas as pd
import torch
from summarizer import Summarizer
from transformers import AutoModel, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import json

# Download NLTK sentence-tokenizer data.
# Older NLTK releases load the "punkt" resource in sent_tokenize, recent releases
# load "punkt_tab" instead. Fetch both so the tokenizer works whichever version
# pip resolved; nltk.download() reports an unknown resource without raising.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource, quiet=True)

# Check GPU: use CUDA when a GPU runtime is attached, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")


In [None]:
# Read the test split from the Drive folder configured in DRIVE_DATA_PATH
print("Loading data from Google Drive...")
test_csv_path = f"{DRIVE_DATA_PATH}/test.csv"
test_df = pd.read_csv(test_csv_path)

# Report how many rows were loaded
num_test_samples = len(test_df)
print(f"Test samples: {num_test_samples}")


## Initialize BERT Summarizer


In [None]:
# Load the pretrained BERT checkpoint and its tokenizer.
# Hidden states are requested from the model because the summarizer is handed
# this model and tokenizer directly instead of loading its own.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

bert_model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
bert_model = bert_model.to(device)  # place weights on the GPU when available
bert_model.eval()  # inference mode: disables dropout

# Wrap the model and tokenizer in the extractive summarizer
summarizer = Summarizer(custom_model=bert_model, custom_tokenizer=tokenizer)

model_device = next(bert_model.parameters()).device
print(f"BERT model loaded on: {model_device}")


## Run Inference Pipeline


In [None]:
# Process test set (limit to first 50 for demo)
results = []
test_limit = 50
test_subset = test_df.head(test_limit)
fallback_count = 0  # articles whose summary came from the lead-3 fallback

print(f"Processing {len(test_subset)} articles...")

# Inference only: disabling autograd avoids tracking gradients on every forward pass.
with torch.no_grad():
    for idx, row in tqdm(test_subset.iterrows(), total=len(test_subset)):
        article_id = int(idx)

        # A missing CSV cell is read as NaN; str(NaN) would yield the literal
        # text "nan", so map missing values to an empty string instead.
        raw_text = row["clean_text"]
        text = "" if pd.isna(raw_text) else str(raw_text)

        summary = ""
        if text.strip():
            # Generate extractive summary.
            # NOTE(review): in bert-extractive-summarizer, min_length/max_length
            # appear to bound the character length of candidate *sentences*,
            # not the length of the summary -- confirm against the library docs.
            try:
                summary = summarizer(
                    text,
                    min_length=40,
                    max_length=180
                ).strip()
            except Exception as exc:
                # Best-effort: keep going, but make the failure visible instead
                # of swallowing it silently.
                tqdm.write(
                    f"Article {article_id}: summarizer failed "
                    f"({type(exc).__name__}: {exc}); using first 3 sentences"
                )

            # Fallback: use first 3 sentences when the summarizer failed or
            # returned nothing (e.g. no sentence passed the length filter).
            if not summary:
                sentences = sent_tokenize(text)
                summary = " ".join(sentences[:3])
                fallback_count += 1

        results.append({
            "article_id": article_id,
            "original_text": text,
            "summary": summary
        })

print(f"Processed {len(results)} articles")
if fallback_count:
    print(f"Fallback (first 3 sentences) used for {fallback_count} articles")


In [None]:
# Save results to the Colab VM's local disk
results_dir = '/content/results'
os.makedirs(results_dir, exist_ok=True)

output_path = f"{results_dir}/bert_results.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"Results saved to {output_path}")

# Also save a copy to Google Drive, in a "results" folder next to the data folder.
# The ".." element is collapsed with normpath because os.makedirs is documented
# to be unreliable when the path to create contains "..", and the printed path
# is easier to read.
drive_output_path = os.path.normpath(
    f"{DRIVE_DATA_PATH}/../results/bert_results.json"
)
os.makedirs(os.path.dirname(drive_output_path), exist_ok=True)
with open(drive_output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"Results also saved to Google Drive: {drive_output_path}")
