In [None]:
!pip install transformers datasets --quiet

In [None]:
import json
from transformers import pipeline

# Notebook I/O locations: the JSON file that is read, and the JSON file that
# receives the summarized articles.
input_path, output_path = "cleaned_AI.json", "cleaned_AI_summarized.json"

In [None]:
# Parse the input JSON file into `articles`.
with open(input_path, mode="r", encoding="utf-8") as source_file:
    articles = json.load(source_file)

# Report how many records were loaded.
print(f"Loaded {len(articles)} cleaned articles")

In [None]:
# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint;
# device placement is delegated to device_map="auto".
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    device_map="auto",
)

In [None]:
def _trim_to_token_budget(text, max_tokens, tokenizer=None):
    """Shorten ``text`` so that it fits within ``max_tokens`` tokens.

    Args:
        text: The string to shorten.
        max_tokens: Maximum number of tokens to keep.
        tokenizer: Optional tokenizer-like object exposing ``__call__`` and
            ``decode``. When ``None`` there is no way to count tokens, so the
            function falls back to a plain character cap.

    Returns:
        ``text`` unchanged when it already fits, otherwise a shortened string.
    """
    if tokenizer is None:
        # No tokenizer available: conservative character-level cap.
        return text[:max_tokens]

    # Ask for one token more than the budget so that "fits exactly" and
    # "was cut" can be told apart without encoding the full text.
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_tokens + 1,
        add_special_tokens=False,
    )
    token_ids = encoded["input_ids"]
    if len(token_ids) <= max_tokens:
        # Already within budget: return the original string untouched rather
        # than a decode round-trip that could alter whitespace.
        return text
    return tokenizer.decode(
        token_ids[:max_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )


def summarize_articles(articles, batch_size=8, max_input_tokens=1024, summarize_fn=None):
    """Summarize every sufficiently long article, batch by batch.

    Articles whose ``'content'`` is missing, is not a string, or has fewer
    than 50 whitespace-separated words are skipped. Each summarized article
    dict is mutated in place (a ``'summary'`` key is added) and also
    collected in the returned list.

    Args:
        articles: Sequence of dicts; the text is read from ``article['content']``.
        batch_size: Number of articles taken from ``articles`` per pipeline call.
        max_input_tokens: Token budget for each input text. Enforced with the
            pipeline's tokenizer when it exposes one (``.tokenizer``);
            otherwise applied as a character cap.
        summarize_fn: Optional callable used instead of the module-level
            ``summarizer``. It is called as
            ``summarize_fn(texts, max_length=..., min_length=..., do_sample=..., truncation=...)``
            and must return one ``{'summary_text': ...}`` dict per input text.

    Returns:
        List of the article dicts that received a summary, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    run_summarizer = summarizer if summarize_fn is None else summarize_fn
    tokenizer = getattr(run_summarizer, "tokenizer", None)

    summarized = []
    for start in range(0, len(articles), batch_size):
        # Keep only the articles of this batch that are long enough to summarize.
        pending = []
        for article in articles[start:start + batch_size]:
            content = article.get('content') if isinstance(article, dict) else None
            if isinstance(content, str) and len(content.split()) >= 50:
                pending.append(article)

        if not pending:
            # Nothing eligible in this batch: do not call the model with [].
            continue

        try:
            inputs = [
                _trim_to_token_budget(article['content'], max_input_tokens, tokenizer)
                for article in pending
            ]
            # truncation=True is the final guard: the pipeline's own
            # tokenization adds special tokens on top of the trimmed text.
            outputs = run_summarizer(
                inputs,
                max_length=130,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            if len(outputs) != len(inputs):
                raise RuntimeError(
                    f"expected {len(inputs)} summaries, got {len(outputs)}"
                )
            summary_texts = [output['summary_text'] for output in outputs]
        except Exception as e:
            # Best effort: one failing batch must not abort the whole run.
            print(f"Skipping batch {start} due to error: {e}")
            continue

        # Mutate only after the whole batch succeeded, so a failure never
        # leaves a batch half-annotated.
        for article, summary_text in zip(pending, summary_texts):
            article['summary'] = summary_text
            summarized.append(article)

    return summarized

In [None]:
# Run the summarizer over the loaded articles.
summarized_articles = summarize_articles(articles)

# Write the result as indented JSON, keeping non-ASCII characters unescaped.
with open(output_path, mode="w", encoding="utf-8") as sink:
    json.dump(summarized_articles, sink, ensure_ascii=False, indent=2)

print(f"Summarized {len(summarized_articles)} articles")