# Automated Topic Summary Page Generation

## 1. Project Introduction

In [1]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from datetime import datetime, timedelta, timezone
from tqdm import tqdm
import json
import os
import re

## 2. Crawl the news

In [2]:
# API keys
# NOTE(review): these credentials are committed in plain text. They should be
# rotated and then supplied only through the environment. Each key can be
# overridden with the environment variable named on its line; the literal is
# kept solely as a backward-compatible fallback for existing runs.
NewsAPI_Key = os.environ.get("NEWSAPI_KEY") or "8406ef98a8b24bec854801aa9f2c6a35"
GNews_Key = os.environ.get("GNEWS_KEY") or "9a6066514e3ca31d8ec6c184b2c33594"
TheNewsAPI_Key = os.environ.get("THENEWSAPI_KEY") or "wEj2kyyJhPKLICmZavDq2MeJgbOr1KcyLbU0X3Au"
CurrentsAPI_Key = os.environ.get("CURRENTSAPI_KEY") or "wMSLtPfn74YOMCOyIGv49vXAfIrD2bcXGVgEj_zN1AgA8b3G"
Mediastack_Key = os.environ.get("MEDIASTACK_KEY") or "465890a7953f6a540676c7c0fb86508a"

# Search endpoints of the news providers queried by the fetch_* functions.
NewsAPI_URL = "https://newsapi.org/v2/everything"
GNews_URL = "https://gnews.io/api/v4/search"
TheNewsAPI_URL = "https://api.thenewsapi.com/v1/news/all"
Mediastack_URL = "http://api.mediastack.com/v1/news"

# File names of the pipeline's intermediate JSON artifacts.
raw_json = "raw_news.json"          # written by the crawl step
cleaned_json = "cleaned_news.json"  # written by the cleaning step

In [None]:
def extract_article_content(url):
    """
    Extract the main text content from a news web page.

    The page is downloaded, boilerplate tags are removed, and a list of CSS
    selectors is tried in order; the first matching element whose text is
    longer than 200 characters is returned. If no selector yields enough
    text, all paragraphs longer than 50 characters are joined instead.

    Args:
        url: Address of the article page to download.

    Returns:
        The extracted text with whitespace collapsed to single spaces,
        "No valid content extracted" when nothing long enough was found, or
        a string starting with "Error: " when the download or parsing raised.
        This function never raises; failures are reported through the return
        value so that one bad page does not abort a crawl.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove unwanted tags
        for tag in ['script', 'style', 'nav', 'header', 'footer', 'aside']:
            for element in soup.find_all(tag):
                element.decompose()

        # Content selectors for news websites, tried in this order
        content_selectors = [
            # Main content area
            'article',
            'main',
            '.main-content',
            '.content-main',
            '#main-content',
            '#content-main',

            # News specific selector
            '.article',
            '.story',
            '.news-article',
            '.post',
            '.entry',

            # Main content
            '.article-body',
            '.story-body',
            '.post-body',
            '.entry-content',
            '.article-content',
            '.story-content',
            '.post-content',
            '.news-content',
            '.content-body',
            '.body-content',

            # text content
            '.text-content',
            '.article-text',
            '.story-text',
            '.post-text',

            # General Content
            '[class*="content"]',
            '[class*="article"]',
            '[class*="story"]',
            '[class*="post"]',
            '[class*="entry"]',
            '[class*="body"]',
            '[class*="text"]',

            # Specific news websites
            '.zn-body__paragraph',  # CNN
            '.caas-body',           # Yahoo News
            '.Article__Content',    # Bloomberg
            '.article-section',     # Reuters
            '.article-page',        # BBC
            '.story-wrapper',       # NBC
            '.article-wrapper',

            # Container selector
            '.container',
            '.wrapper',
            '.main',
            '#main',
            '#content',
            '.page-content'
        ]

        # Try selectors first
        for selector in content_selectors:
            for element in soup.select(selector):
                # separator=' ' keeps words from adjacent tags apart; with
                # strip=True alone the text nodes are concatenated directly
                # ("Hello" + "world" -> "Helloworld").
                text = element.get_text(separator=' ', strip=True)
                text = re.sub(r'\s+', ' ', text)
                if len(text) > 200:
                    return text

        # Fallback: combine paragraphs that are long enough to be body text
        paragraph_texts = (
            p.get_text(separator=' ', strip=True) for p in soup.find_all('p')
        )
        content = ' '.join(t for t in paragraph_texts if len(t) > 50)
        content = re.sub(r'\s+', ' ', content)
        if len(content) > 100:
            return content

        return "No valid content extracted"

    except Exception as e:
        # Deliberately broad: any failure is reported as text, not raised.
        return f"Error: {str(e)}"

In [None]:
def save_to_json(data, filename='raw_news.json'):
    """
    Serialize *data* as pretty-printed UTF-8 JSON into *filename*.

    Returns True when the file was written, False when any exception occurred
    (the error is printed rather than raised).
    """
    saved = False
    try:
        with open(filename, mode='w', encoding='utf-8') as out_file:
            json.dump(data, out_file, ensure_ascii=False, indent=2)
        print(f"The data has been saved to {filename}")
        saved = True
    except Exception as exc:
        print(f"Error saving file: {exc}")
    return saved

In [None]:
def fetch_news_from_newsapi(keyword, start_time, end_time, timeout=15):
    """
    Query NewsAPI for English articles matching *keyword* in a date range.

    Args:
        keyword: Search query sent as ``q``.
        start_time: Lower bound of the range, sent as ``from``.
        end_time: Upper bound of the range, sent as ``to``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list under the response's ``articles`` key, or an empty list when
        the request fails for any reason (the error is printed).
    """
    params = {
        'q': keyword,
        'from': start_time,
        'to': end_time,
        'sortBy': 'publishedAt',
        'pageSize': 100,
        'language': 'en',
        'apiKey': NewsAPI_Key
    }

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(NewsAPI_URL, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        articles = data.get('articles', [])
        print(f"Fetched {len(articles)} articles from NewsAPI")
        return articles
    except Exception as e:
        print(f"NewsAPI request failed: {e}")
        return []

def fetch_news_from_gnews(keyword, start_time, end_time, timeout=15):
    """
    Query GNews for English articles matching *keyword* in a date range.

    Args:
        keyword: Search query sent as ``q``.
        start_time: Lower bound of the range, sent as ``from``.
        end_time: Upper bound of the range, sent as ``to``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list under the response's ``articles`` key, or an empty list when
        the request fails for any reason (the error is printed).
    """
    params = {
        'q': keyword,
        'from': start_time,
        'to': end_time,
        'max': 100,
        'lang': 'en',
        'token': GNews_Key
    }

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(GNews_URL, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        articles = data.get('articles', [])
        print(f"Fetched {len(articles)} articles from GNews")
        return articles
    except Exception as e:
        print(f"GNews API request failed: {e}")
        return []

def fetch_news_from_thenewsapi(keyword, start_time, end_time, timeout=15):
    """
    Query The News API for English articles matching *keyword* in a date range.

    Args:
        keyword: Search query sent as ``search``.
        start_time: Lower bound of the range, sent as ``published_after``.
        end_time: Upper bound of the range, sent as ``published_before``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list under the response's ``data`` key, or an empty list when the
        request fails for any reason (the error is printed).
    """
    params = {
        'api_token': TheNewsAPI_Key,
        'search': keyword,
        'published_after': start_time,
        # end_time used to be accepted but never sent, so results were not
        # bounded at the upper end of the requested range.
        'published_before': end_time,
        'language': 'en',
        'limit': 100
    }

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(TheNewsAPI_URL, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        articles = data.get('data', [])
        print(f"Fetched {len(articles)} articles from The News API")
        return articles
    except Exception as e:
        print(f"The News API request failed: {e}")
        return []

def fetch_nobel_news_from_currentsapi(keyword, start_time, end_time, timeout=15):
    """
    Query the CurrentsAPI search endpoint for English articles on *keyword*.

    Args:
        keyword: Search terms sent as ``keywords``.
        start_time: Range start as a ``YYYY-MM-DD`` string.
        end_time: Range end as a ``YYYY-MM-DD`` string.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list under the response's ``news`` key when the response status is
        ``ok``; otherwise an empty list (the problem is printed).

    Raises:
        ValueError: If either date string is not in ``YYYY-MM-DD`` format.
    """
    # Parsed outside the try block on purpose: a malformed date is a caller
    # error and keeps raising ValueError, as it did before.
    start_dt = datetime.strptime(start_time, "%Y-%m-%d").replace(tzinfo=timezone.utc)
    end_dt = datetime.strptime(end_time, "%Y-%m-%d").replace(tzinfo=timezone.utc)

    # Let requests build and URL-encode the query string. The hand-built URL
    # was missing "=" after start_date/end_date, left the keyword unescaped,
    # and embedded str(datetime), which contains a space and a "+".
    params = {
        'keywords': keyword,
        'language': 'en',
        'apiKey': CurrentsAPI_Key,
        'start_date': start_dt.isoformat(),
        'end_date': end_dt.isoformat(),
    }

    try:
        response = requests.get(
            'https://api.currentsapi.services/v1/search',
            params=params,
            timeout=timeout,
        )
        response.raise_for_status()

        data = response.json()

        if data.get('status') == 'ok':
            articles = data.get('news', [])
            print(f"Fetched {len(articles)} articles from CurrentsAPI")
            return articles
        else:
            print(f"CurrentsAPI returned error: {data.get('message', 'Unknown error')}")
            return []

    except requests.exceptions.RequestException as e:
        print(f"CurrentsAPI request failed: {e}")
        return []
    except json.JSONDecodeError:
        print("Failed to parse CurrentsAPI response")
        return []
    except Exception as e:
        print(f"Unexpected error: {e}")
        return []

def fetch_news_from_mediastack(keyword, start_time, end_time, timeout=15):
    """
    Query Mediastack for English articles matching *keyword* in a date range.

    Args:
        keyword: Search terms sent as ``keywords``.
        start_time: Range start; combined with *end_time* into the ``date``
            parameter as ``"<start_time>,<end_time>"``.
        end_time: Range end.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list under the response's ``data`` key, or an empty list when the
        payload has no ``data`` key or the request fails (the problem is
        printed).
    """
    params = {
        'access_key': Mediastack_Key,
        'keywords': keyword,
        'languages': 'en',
        'limit': 100,
        'sort': 'published_desc',
        'date': f'{start_time},{end_time}'
    }

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        # The HTTP status is intentionally not checked here: the JSON body is
        # inspected instead so the provider's own error payload can be printed.
        response = requests.get(Mediastack_URL, params=params, timeout=timeout)
        data = response.json()

        if 'data' in data:
            articles = data.get('data', [])
            print(f"Fetched {len(articles)} articles from Mediastack")
            return articles
        else:
            print(f"Error: {data.get('error', 'Unknown error')}")
            return []

    except Exception as e:
        print(f"Mediastack API error: {e}")
        return []

In [None]:
def process_news_data(keyword, start_time, end_time):
    """
    Collect articles on *keyword* from every news API and fetch their text.

    NewsAPI is queried twice, over two sub-ranges split at a random day
    between *start_time* and *end_time*; the other providers are queried once
    over the whole range. For each returned article the page behind its URL
    is downloaded with ``extract_article_content``.

    Args:
        keyword: Search terms passed to every provider.
        start_time: Range start as a ``YYYY-MM-DD`` string.
        end_time: Range end as a ``YYYY-MM-DD`` string.

    Returns:
        A list of dicts with the keys ``title``, ``date``, ``link`` and
        ``text``, one per article returned by the providers.

    Raises:
        ValueError: If a date is not ``YYYY-MM-DD`` or *end_time* precedes
            *start_time*.
    """
    print("Start obtaining news data...")

    start_dt = datetime.strptime(start_time, "%Y-%m-%d")
    end_dt = datetime.strptime(end_time, "%Y-%m-%d")
    span_days = (end_dt - start_dt).days
    if span_days < 0:
        # random.randint would raise an opaque "empty range" ValueError here.
        raise ValueError(
            f"end_time {end_time!r} is earlier than start_time {start_time!r}"
        )
    random_days = random.randint(0, span_days)
    middle_time = (start_dt + timedelta(days=random_days)).strftime("%Y-%m-%d")

    # Get data from all APIs
    # Search twice
    newsapi_articles_partone = fetch_news_from_newsapi(keyword, start_time, middle_time)
    newsapi_articles_parttwo = fetch_news_from_newsapi(keyword, middle_time, end_time)
    gnews_articles = fetch_news_from_gnews(keyword, start_time, end_time)
    thenewsapi_articles = fetch_news_from_thenewsapi(keyword, start_time, end_time)
    currents_articles = fetch_nobel_news_from_currentsapi(keyword, start_time, end_time)
    mediastack_articles = fetch_news_from_mediastack(keyword, start_time, end_time)

    # Combine all articles
    all_articles = [
        *newsapi_articles_partone,
        *newsapi_articles_parttwo,
        *gnews_articles,
        *thenewsapi_articles,
        *currents_articles,
        *mediastack_articles,
    ]

    total = len(all_articles)
    print(f"Total articles: {total}")

    # The providers do not agree on the name of the publication-date field,
    # so every known spelling is tried in turn.
    # NOTE(review): key names taken from the providers' public docs - confirm.
    date_keys = ('publishedAt', 'published_at', 'published')

    raw_data = []
    for i, article in enumerate(all_articles, 1):
        # `or` also covers fields that are present but null in the payload,
        # which previously crashed the slice below or leaked None downstream.
        title = article.get('title') or 'No title'
        link = article.get('url') or ''
        date = next((article[k] for k in date_keys if article.get(k)), 'No date')

        print(f"Processing {i}/{total}: {title[:50]}...")

        if link:
            # Extract article content
            text_content = extract_article_content(link)
            # Add delay to avoid rate limiting
            time.sleep(1)
        else:
            # Nothing to download; use the extractor's "no content" marker.
            text_content = "No valid content extracted"

        # Build data structure
        raw_data.append({
            "title": title,
            "date": date,
            "link": link,
            "text": text_content
        })

    return raw_data

In [None]:
def crawl_the_news(keyword, start_time, end_time):
    """
    Run the crawl step: gather articles for *keyword* within the date range
    and write them to the raw JSON file, reporting the outcome on stdout.
    """
    articles = process_news_data(keyword, start_time, end_time)

    # Nothing came back: report it and stop before writing any file.
    if not articles:
        print("No data obtained")
        return

    # Persist the articles and report whether the write succeeded.
    if save_to_json(articles, raw_json):
        print(f"Successfully processed {len(articles)} articles")
    else:
        print("Failed to save file")

## 3. Clean the data

input: raw_news.json
<br/>output: cleaned_news.json

In [None]:
# Everything except tab/LF/CR, printable ASCII and CJK unified ideographs.
_UNWANTED_CHARS = re.compile(r"[^\x09\x0A\x0D\x20-\x7E\u4E00-\u9FFF]")
# Loose YYYY-MM-DD / YYYY/MM/DD matcher used when ISO parsing fails.
_DATE_FALLBACK = re.compile(r"(\d{4})[-/](\d{2})[-/](\d{2})")


def _text_field(item, key):
    """Return item[key] stripped, or "" when it is missing or not a string."""
    value = item.get(key)
    return value.strip() if isinstance(value, str) else ""


def _normalize_date(date_str):
    """Return *date_str* as YYYY-MM-DD, or "" when no date can be recognised."""
    if not date_str:
        return ""
    try:
        parsed = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except ValueError:
        match = _DATE_FALLBACK.search(date_str)
        return "-".join(match.groups()) if match else ""
    return parsed.strftime("%Y-%m-%d")


def cleaned_the_data(input_path=None, output_path=None):
    """
    Clean the crawled articles and write the result as JSON.

    Records are dropped when the title, link or text is empty, when the text
    is the extractor's "No valid content extracted" marker or one of its
    "Error: ..." failure messages, or when the same (title, link) pair was
    already seen. Remaining records have unsupported characters replaced by
    spaces, the text lower-cased, and the date normalised to YYYY-MM-DD.

    Args:
        input_path: JSON file to read; defaults to the module-level
            ``raw_json``.
        output_path: JSON file to write; defaults to the module-level
            ``cleaned_json``.
    """
    # Defaults are resolved at call time so the module-level constants are
    # only required when no explicit path is given.
    if input_path is None:
        input_path = raw_json
    if output_path is None:
        output_path = cleaned_json

    # Load the original file
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    cleaned = []
    seen = set()

    for item in data:
        # _text_field tolerates JSON nulls, which used to crash on .strip().
        title = _text_field(item, "title")
        link = _text_field(item, "link")
        text = _text_field(item, "text")
        date_str = _text_field(item, "date")

        # Skip empty records or invalid text
        if not title or not link or not text:
            continue
        if text.lower() == "no valid content extracted":
            continue
        # extract_article_content reports failures as "Error: <message>";
        # those are not article text and must not reach the cleaned set.
        if text.startswith("Error:"):
            continue

        # Skip duplicates
        if (title, link) in seen:
            continue
        seen.add((title, link))

        # Remove gibberish or control characters (keep printable English/Chinese chars)
        title = _UNWANTED_CHARS.sub(" ", title)
        text = _UNWANTED_CHARS.sub(" ", text).lower()  # convert all text to lowercase

        cleaned.append({
            "title": title,
            "date": _normalize_date(date_str),
            "link": link,
            "text": text.strip()
        })

    # Save cleaned data
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(cleaned, f, ensure_ascii=False, indent=2)

    print(f"✅ Cleaning completed. {len(cleaned)} valid news articles saved to {output_path}.")

In [None]:
# Cleaned data structure.
# `cleaned_data_list` is the container for the cleaned records and
# `cleaned_data` shows the schema of one record. The values are empty
# placeholders: the earlier sketch referenced undefined names and lacked the
# commas between entries, so the cell could not be executed.
cleaned_data_list = []
cleaned_data = {
    "title": "",  # article headline
    "date": "",   # publication date
    "link": "",   # URL of the source article
    "text": "",   # article body text
}

## 4. Extract the information

Input: cleaned_news.json
<br/>Output: timeline.json, entities.json
<br/>Method: NLP

In [None]:
# Timeline structure (timeline.json).
# `timeline` shows the schema of one timeline entry; values are empty
# placeholders because the earlier sketch referenced undefined names and
# lacked the commas between entries.
timeline_list = []
timeline = {
    "title": "",  # headline of the development
    "date": "",   # date of the development
}

# Entities structure (entities.json).
# NOTE(review): the plural key names suggest each value is a list - confirm
# once the extraction step is implemented.
entities_list = []

entity = {
    "names": [],
    "locations": [],
    "organizations": [],
}

## 5. Summarize

Input: cleaned_news.json
<br/>Output: summary (a text string)
<br/>Method: LLM API

## 6. Generate HTML Page
The page includes **a main summary**, **a list of key entities**, **a timeline of major developments**, and **links to the original source articles**.

input: summary,entities.json,timeline.json,cleaned_news.json(link)
<br/>output:summary_page.html

## 7. Run pipeline

In [None]:
# Pipeline inputs. These mirror the parameters of crawl_the_news(keyword,
# start_time, end_time).

# Event name to search for.
keyword = "Nobel Prize" # topic: 2025 Nobel Prize
# Start of the search window; process_news_data parses it as YYYY-MM-DD.
start_time = "2025-09-23" # format: YYYY-MM-DD
# End of the search window; process_news_data parses it as YYYY-MM-DD.
end_time = "2025-10-22" # format: YYYY-MM-DD


In [None]:
# Pipeline driver. The first two steps are currently disabled (commented
# out); uncomment a call to run that step. The remaining steps have no code yet.

# Crawl the news
# crawl_the_news(keyword, start_time, end_time)
# Clean the data
# cleaned_the_data()
# Extract information

# Summarize

# Generate HTML page
# Placeholder: starts as an empty string; nothing in this cell fills it in.
summary_page = ""