In [1]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time
from fake_useragent import UserAgent
from urllib.parse import urljoin, urlparse

class NewsCrawler:
    """Breadth-first crawler that collects news articles from a single domain.

    Starting from ``starting_url`` the crawler follows same-domain links up to
    ``depth`` levels deep and stops once ``no_of_articles`` articles have been
    collected. Collected articles are stored in ``self.articles`` as dicts
    with the keys ``url``, ``title``, ``date``, ``content`` and ``depth``.
    """

    # Path suffixes of resources that are never HTML pages and are skipped.
    SKIPPED_EXTENSIONS = ('.pdf', '.jpg', '.jpeg', '.png', '.gif', '.mp4')

    def __init__(self, starting_url, no_of_articles, depth, delay, timeout=10):
        """Set up crawl limits, bookkeeping containers and the HTTP session.

        Args:
            starting_url: URL the crawl starts from; its host defines the
                only domain that will be followed.
            no_of_articles: Stop once this many articles were collected.
            depth: Maximum link depth (the starting page is depth 0).
            delay: Seconds to sleep before every request.
            timeout: Seconds to wait for a server before giving up on a
                request, so a stalled connection cannot hang the crawl.
        """
        self.starting_url = starting_url
        self.no_of_articles = no_of_articles
        self.depth = depth
        self.delay = delay
        self.timeout = timeout
        self.urls_visited = set()
        # URLs that were ever put on the queue; prevents enqueueing the same
        # link once per page that references it.
        self.urls_queued = set()
        self.queue = deque()
        self.ua = UserAgent()
        self.articles = []
        self.session = requests.Session()
        self.domain = urlparse(self.starting_url).netloc

        # session headers
        # NOTE: 'Accept-Encoding' is deliberately not overridden. requests
        # advertises only the encodings it can actually decode; forcing 'br'
        # without a brotli package installed yields undecodable HTML.
        self.session.headers.update({
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
        })

    @staticmethod
    def _normalize_url(url):
        """Return ``url`` without its fragment so '#anchor' variants collapse."""
        return urlparse(url)._replace(fragment='').geturl()

    @staticmethod
    def _extract_date(soup):
        """Return the publication date found in ``soup`` or ``"Unknown"``."""
        node = (soup.find('div', class_='date') or
                soup.find('time') or
                soup.find('meta', property='article:published_time'))
        if node is None:
            return "Unknown"
        # <time> carries the value in 'datetime', <meta> in 'content'; a plain
        # <div class="date"> only has its text.
        return (node.get('datetime')
                or node.get('content')
                or node.get_text(strip=True)
                or "Unknown")

    def is_url_valid(self, url):
        """Return True if ``url`` is an unvisited http(s) page on the crawl domain.

        Media/document links are recognised by the suffix of the URL *path*,
        so a slug that merely contains e.g. "jpeg" is still accepted.
        """
        parsed_url = urlparse(url)
        if parsed_url.scheme not in ('http', 'https'):
            return False
        if parsed_url.netloc != self.domain:
            return False
        if url in self.urls_visited:
            return False
        return not parsed_url.path.lower().endswith(self.SKIPPED_EXTENSIONS)

    def is_page_an_article(self, url, soup):
        """Return True if ``soup`` has an article body, a headline and a date.

        ``url`` is accepted for interface compatibility but is not inspected.
        """
        # Specific checks for Hong Kong Business site
        article = soup.find('article') or soup.find('div', {'class': 'article-content'})
        headline = soup.find('h1', {'class': 'article-title'}) or soup.find('h1')

        # Check for date - Hong Kong Business uses 'date' class
        date_published = (soup.find('div', class_='date') or
                          soup.find('time') or
                          soup.find('meta', property='article:published_time'))

        return article is not None and headline is not None and date_published is not None

    def extract_found_article(self, soup):
        """Return the cleaned article text from ``soup`` or None if none is found.

        Note: this removes non-content elements from ``soup`` in place, so
        anything else needed from the page must be read before calling it.
        """
        # Try to find the main article content
        article = (soup.find('article') or
                   soup.find('div', {'class': 'article-content'}) or
                   soup.find('div', class_=lambda tag: tag and 'article' in tag.lower()) or
                   soup.find('div', class_=lambda tag: tag and 'content' in tag.lower()))

        if not article:
            return None

        # Remove unwanted elements
        for element in article.find_all(['script', 'style', 'nav', 'footer', 'aside',
                                         'figure', 'img', 'iframe', 'form']):
            element.decompose()

        # Clean up by removing divs with specific classes (common for ads, share buttons, etc.)
        unwanted = ['share', 'related', 'advert', 'comments', 'author']
        for div in article.find_all(
                'div', class_=lambda x: x and any(cls in x.lower() for cls in unwanted)):
            div.decompose()

        return article.get_text(separator='\n', strip=True)

    def process_page(self, url, depth):
        """Fetch ``url``, store it if it is an article, and enqueue its links.

        Errors for a single page are reported and swallowed so that one bad
        page does not abort the whole crawl.
        """
        # Mark as visited up front so a failing URL is not fetched again.
        self.urls_visited.add(url)
        try:
            time.sleep(self.delay)
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Collect links, title and date *before* content extraction,
            # because extract_found_article() strips elements from the soup.
            hrefs = [link['href'] for link in soup.find_all('a', href=True)]

            if self.is_page_an_article(url, soup):
                # Extract title - specific to Hong Kong Business
                title = (soup.find('h1', class_='article-title') or
                         soup.find('h1') or
                         soup.find('title'))
                title = title.get_text(strip=True) if title is not None else "No title"

                # Extract date - specific to Hong Kong Business
                date = self._extract_date(soup)

                content = self.extract_found_article(soup)
                if content:
                    self.articles.append({
                        'url': url,
                        'title': title,
                        'date': date,
                        'content': content[:5000] + "..." if len(content) > 5000 else content,
                        "depth": depth
                    })
                    print(f"Article found at depth {depth}: {title[:50]}...")

            if depth < self.depth and len(self.articles) < self.no_of_articles:
                for href in hrefs:
                    try:
                        # Relative links are relative to the page they were
                        # found on (after redirects), not to the start URL.
                        absolute_url = self._normalize_url(urljoin(response.url, href))
                    except ValueError:
                        continue  # malformed href; skip just this link
                    if absolute_url in self.urls_queued:
                        continue
                    if self.is_url_valid(absolute_url):
                        self.urls_queued.add(absolute_url)
                        self.queue.append((absolute_url, depth + 1))

        except Exception as e:
            # Per-page boundary: report and carry on with the next URL.
            print(f"Error processing {url}: {str(e)}")

    def crawl(self):
        """Run the breadth-first crawl until the queue is empty or the quota is met."""
        print(f"Crawler started crawling on {self.starting_url} (max depth {self.depth})")
        self.urls_queued.add(self.starting_url)
        self.queue.append((self.starting_url, 0))
        while self.queue and len(self.articles) < self.no_of_articles:
            url, depth = self.queue.popleft()
            if url not in self.urls_visited:
                self.process_page(url, depth)
        print(f"Crawling found {len(self.articles)} articles")

    def save_Results(self, filename="NewsArticles.json"):
        """Write the collected articles to ``filename`` as UTF-8 JSON."""
        import json
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.articles, f, ensure_ascii=False, indent=2)
        print(f"Results saved in {filename}")

    def print_summary(self):
        """Print title, depth, date, URL and a short preview of every article."""
        print("\nCollected article summary:")
        for idx, article in enumerate(self.articles, 1):
            print(f"\n{idx}. {article['title']}")
            print(f"Depth: {article['depth']} | Date: {article['date']}")
            print(f"URL: {article['url']}")
            print(f"Preview: {article['content'][:100]}...")

if __name__ == "__main__":
    # Interactive entry point: list the known sites, let the user pick one,
    # then crawl it, print a summary and save the results as JSON.
    sites_list = {
        'Reuters News': 'https://www.reuters.com/',
        'Hong Kong Business': 'https://hongkongbusiness.hk/financial-technology',
        'SCMP News': 'https://www.scmp.com/',
        'BBC News': 'https://www.bbc.com/news'
    }

    for idx, (name, url) in enumerate(sites_list.items(), 1):
        print(f"{idx}: {name} ({url})")

    site_urls = list(sites_list.values())

    # Re-prompt until the answer is a number inside the menu range. Without
    # this check "0" would silently select the last site (index -1) and any
    # other bad input would crash with ValueError/IndexError.
    while True:
        raw_choice = input(f"\nSelect a site to crawl (1-{len(site_urls)}): ")
        try:
            choice = int(raw_choice) - 1
        except ValueError:
            choice = -1
        if 0 <= choice < len(site_urls):
            break
        print(f"Please enter a number between 1 and {len(site_urls)}.")

    selected_url = site_urls[choice]

    crawler = NewsCrawler(
        starting_url=selected_url,
        no_of_articles=10,
        depth=3,  # Reduced depth as the site might have complex navigation
        delay=1  # Increased delay to be polite
    )

    crawler.crawl()
    crawler.print_summary()
    crawler.save_Results()

1: Reuters News (https://www.reuters.com/)
2: Hong Kong Business (https://hongkongbusiness.hk/financial-technology)
3: SCMP News (https://www.scmp.com/)
4: BBC News (https://www.bbc.com/news)
Crawler started crawling on https://hongkongbusiness.hk/financial-technology (max depth 3)


KeyboardInterrupt: 