In [None]:
!pip install transformers datasets --quiet

In [None]:
import json
from transformers import pipeline

# JSON file that the loading cell opens and parses into `articles`.
input_path = "cleaned_AI.json"
# JSON file that the final cell writes the summarized articles to.
output_path = "cleaned_AI_summarized.json"

In [None]:
# Read the whole input file as UTF-8 text and decode it as JSON.
with open(input_path, mode="r", encoding="utf-8") as f:
    articles = json.loads(f.read())

# Quick sanity check on how many records were loaded.
print(f"Loaded {len(articles)} cleaned articles")

In [None]:
# Build the summarization pipeline once at notebook level; the
# `summarize_articles` function reads `summarizer` as a global.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    device_map="auto",
)

In [None]:
def summarize_articles(articles, batch_size=8, max_input_tokens=1024,
                       min_words=50, max_summary_length=130,
                       min_summary_length=30):
    """Summarize every sufficiently long article with the global ``summarizer``.

    Articles whose ``'content'`` is missing, is not a string, or has fewer
    than ``min_words`` whitespace-separated words are skipped. The remaining
    texts are limited to ``max_input_tokens`` tokenizer tokens and sent to
    ``summarizer`` in batches. A batch that raises is reported with ``print``
    and skipped; the other batches are still processed.

    Args:
        articles: Sequence of dicts; the text is read from the ``'content'`` key.
        batch_size: Number of texts passed to ``summarizer`` per call.
        max_input_tokens: Maximum number of tokens (as counted by
            ``summarizer.tokenizer``) kept from each article.
        min_words: Minimum whitespace-separated word count for an article
            to be summarized.
        max_summary_length: Forwarded to ``summarizer`` as ``max_length``.
        min_summary_length: Forwarded to ``summarizer`` as ``min_length``.

    Returns:
        A new list of shallow copies of the summarized articles, in input
        order, each with an added ``'summary'`` key. The input dicts are
        not modified.
    """
    tokenizer = summarizer.tokenizer

    valid_indices = []
    valid_texts = []
    for idx, article in enumerate(articles):
        text = article.get('content')
        # A missing or non-text body cannot be summarized; skip it instead
        # of letting one bad record abort the whole run.
        if not isinstance(text, str) or len(text.split()) < min_words:
            continue

        # Truncate by tokens, not characters: slicing the string would cap
        # the input at `max_input_tokens` *characters*, which is only a
        # fraction of the token budget.
        token_ids = tokenizer.encode(
            text, truncation=True, max_length=max_input_tokens
        )
        if len(token_ids) >= max_input_tokens:
            # The limit was reached, so rebuild the text from the kept tokens.
            text = tokenizer.decode(
                token_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )

        valid_indices.append(idx)
        valid_texts.append(text)

    summarized = []
    for i in range(0, len(valid_texts), batch_size):
        batch_texts = valid_texts[i:i + batch_size]
        batch_indices = valid_indices[i:i + batch_size]
        try:
            # truncation=True is a safety net in case re-tokenizing the
            # decoded text yields slightly more tokens than the model accepts.
            summaries = summarizer(
                batch_texts,
                max_length=max_summary_length,
                min_length=min_summary_length,
                do_sample=False,
                truncation=True,
            )
            for idx, summary in zip(batch_indices, summaries):
                article = articles[idx].copy()
                article['summary'] = summary['summary_text']
                summarized.append(article)
        except Exception as e:
            # Best effort: report the failing batch and continue with the rest.
            print(f"Skipping batch {i} due to error: {e}")

    return summarized

In [None]:
# Run the summarizer over every loaded article.
summarized_articles = summarize_articles(articles)

# Persist the results as pretty-printed JSON, keeping non-ASCII characters
# as-is rather than escaping them.
with open(output_path, mode="w", encoding="utf-8") as f:
    f.write(json.dumps(summarized_articles, indent=2, ensure_ascii=False))

print(f"Summarized {len(summarized_articles)} articles")