In [1]:
import requests
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta
from dateutil import parser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
from common.db import SQLiteHook
from common.writer import Writer

class BaomoiCrawler:
    """Crawl article listings on baomoi.com and pass recent articles to a writer.

    Listing pages are rendered with a Selenium-driven Chrome instance (they
    need scrolling to load their content), while individual article pages
    are fetched with plain ``requests`` and parsed with BeautifulSoup.

    The instance owns a browser process and a cache handle; call
    :meth:`close` when done, or use the crawler as a context manager.
    """

    BASE_URL = "https://baomoi.com"
    # Fixed UTC+7 offset applied to every timestamp this crawler emits.
    LOCAL_TIMEZONE = timezone(timedelta(hours=7))

    def __init__(self, max_pages=200, topic="tin-moi", max_article_age_days=4):
        """Start the browser and open the cache and writer.

        Args:
            max_pages: Upper bound checked by :meth:`run` before rendering
                each listing page. NOTE: despite the name it is compared
                against the number of *articles* written (``self.count``),
                and is only checked between listing pages.
            topic: Listing slug; the start URL is ``{BASE_URL}/{topic}.epi``.
            max_article_age_days: Articles older than this many whole days
                (relative to crawl time) are skipped.
        """
        self.TOPIC_URL = f"{self.BASE_URL}/{topic}.epi"
        self.max_pages = max_pages
        self.max_article_age_days = max_article_age_days
        self.count = 0

        chrome_options = Options()
        # chrome_options.add_argument("--headless=new")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )

        try:
            # Presumably a persistent visited-URL store and an article sink;
            # their implementations live in the project's `common` package.
            self.cache = SQLiteHook()
            self.writer = Writer()
        except Exception:
            # Do not leave an orphaned Chrome process behind when the
            # remaining setup fails.
            self.driver.quit()
            raise

    def __enter__(self):
        """Return the crawler itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the browser and cache; exceptions are not suppressed."""
        self.close()
        return False

    def close(self):
        """Quit the browser and close the cache.

        The cache is closed even if quitting the browser raises.
        """
        try:
            self.driver.quit()
        finally:
            self.cache.close()

    @retry(
        wait=wait_exponential(multiplier=1, min=2, max=10),
        stop=stop_after_attempt(3),
        retry=retry_if_exception_type(requests.RequestException)
    )
    def fetch_html(self, url):
        """Download ``url`` with ``requests`` and return the response body.

        Retried up to three attempts with exponential back-off when a
        ``requests.RequestException`` (including HTTP error statuses raised
        by ``raise_for_status``) occurs.
        """
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text

    def fetch_html_js_rendering(self, url):
        """Render ``url`` in the browser, scroll to the bottom, return the HTML.

        Scrolling stops as soon as a ``div.load-more`` element is displayed
        or the document height stops growing after a scroll.
        """
        self.driver.get(url)
        time.sleep(2)  # give the initial page load time to settle

        last_height = self.driver.execute_script("return document.body.scrollHeight")
        scroll_pause_time = 1

        while True:
            try:
                load_more_div = self.driver.find_element(By.CSS_SELECTOR, "div.load-more")
                if load_more_div.is_displayed():
                    break
            except NoSuchElementException:
                # The marker is not in the DOM yet; keep scrolling.
                pass

            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)

            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Nothing new was loaded by the last scroll.
                break
            last_height = new_height

        return self.driver.page_source

    def extract_urls(self, html):
        """Return the unique absolute URLs of ``h3 > a[href]`` links in ``html``.

        Links are de-duplicated while keeping their order of appearance in
        the document, so the crawl order is deterministic between runs.
        """
        soup = BeautifulSoup(html, "html.parser")
        hrefs = (a_tag.get("href") for a_tag in soup.select("h3 > a[href]"))
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(
            urljoin(self.BASE_URL, href) for href in hrefs if href
        ))

    def extract_next_page_url(self, html):
        """Return the absolute URL of the last ``div.load-more a[href]`` link.

        Returns ``None`` when there is no such link or its href is empty.
        """
        soup = BeautifulSoup(html, "html.parser")
        load_more_links = soup.select("div.load-more a[href]")
        if not load_more_links:
            return None

        next_page_href = load_more_links[-1].get("href")
        if not next_page_href:
            return None
        return urljoin(self.BASE_URL, next_page_href)

    def parse_article_from_url(self, url) -> dict | None:
        """Fetch an article page and extract its fields.

        Returns:
            ``None`` when the page body is empty, otherwise a dict with the
            keys ``title``, ``published_time``, ``crawled_time``,
            ``content``, ``author`` and ``url``. Every field except
            ``crawled_time`` and ``url`` may be ``None`` when the matching
            element is missing. Both timestamps are ISO-8601 strings in
            ``LOCAL_TIMEZONE``.
        """
        html = self.fetch_html(url)
        if not html:
            return None

        soup = BeautifulSoup(html, "html.parser")

        title_tag = soup.find("h1")
        title = title_tag.text.strip() if title_tag else None

        crawled_time = datetime.now(self.LOCAL_TIMEZONE).isoformat()

        published_time = None
        time_tag = soup.find("time", datetime=True)
        if time_tag:
            try:
                dt = parser.isoparse(time_tag["datetime"])
                published_time = dt.astimezone(self.LOCAL_TIMEZONE).isoformat()
            except (ValueError, TypeError, OverflowError):
                # Malformed timestamp: treat the article as undated.
                published_time = None

        author_paragraphs = []
        content_paragraphs = []

        content_div = soup.find("div", class_="content-body")
        if content_div:
            for p in content_div.find_all("p"):
                # Paragraphs carrying the "body-author" class hold the byline;
                # everything else is article text.
                if "body-author" in p.get("class", []):
                    author_paragraphs.append(p.text.strip())
                else:
                    content_paragraphs.append(p.text.strip())

        author = " ".join(author_paragraphs) if author_paragraphs else None
        content = " ".join(content_paragraphs) if content_paragraphs else None

        return {
            "title": title,
            "published_time": published_time,
            "crawled_time": crawled_time,
            "content": content,
            "author": author,
            "url": url,
        }

    def process_article_url(self, url) -> None:
        """Crawl one article and write it if it is new, dated, fresh and non-empty.

        URLs the cache reports as visited are skipped. Articles without a
        publish time, older than ``max_article_age_days``, or without
        content are skipped and NOT marked as visited. Any exception is
        reported on stdout and swallowed so one bad article cannot abort
        the whole crawl.
        """
        try:
            if self.cache.has_visited(url):
                return

            print(f"📰 Crawling article: {url}")
            article = self.parse_article_from_url(url)
            if not article:
                return

            if not article.get("published_time"):
                return

            published_time = datetime.fromisoformat(article["published_time"])
            crawled_time = datetime.fromisoformat(article["crawled_time"])
            article_age_days = (crawled_time - published_time).days
            if article_age_days > self.max_article_age_days:
                print(f"Skipping {url}: Expired article publish date")
                return

            if not article.get("content"):
                # Fixed: the original literal lacked the f-prefix and printed
                # the text "{url}" instead of the actual URL.
                print(f"Skipping {url}: No content")
                return

            self.cache.mark_visited(url)
            self.writer.write_article(article)
            self.count += 1
        except Exception as e:
            print(f"Error scraping {url}: {e}")

    def run(self):
        """Walk listing pages from ``TOPIC_URL`` and process every article found.

        The loop ends when ``self.count`` reaches ``max_pages`` (checked
        between listing pages), when no next-page link is found, or when
        the next-page link points at a listing page already rendered in
        this run (which would otherwise loop forever if no new article is
        written).
        """
        next_url = self.TOPIC_URL
        rendered_pages = set()

        while self.count < self.max_pages and next_url:
            if next_url in rendered_pages:
                print(f"Stopping: listing page already rendered: {next_url}")
                break
            rendered_pages.add(next_url)

            print(f"🔎 Rendering page: {next_url}")
            html = self.fetch_html_js_rendering(next_url)
            article_urls = self.extract_urls(html)
            print(len(article_urls))
            next_url = self.extract_next_page_url(html)

            for article_url in article_urls:
                self.process_article_url(article_url)

            print(f"Count {self.count} articles processed")

        print(f"Finished. Total articles crawled: {self.count}")


if __name__ == "__main__":
    # Script entry point: crawl with the default settings.
    crawler = BaomoiCrawler()
    try:
        crawler.run()
    finally:
        # Always release the browser and the cache, even when the crawl
        # is interrupted or fails part-way through.
        crawler.close()


🔎 Rendering page: https://baomoi.com/tin-moi.epi
145
📰 Crawling article: https://baomoi.com/khap-noi-trong-tinh-ngay-28-6-2025-c52624522.epi
📰 Crawling article: https://baomoi.com/the-gioi-nua-dau-2025-dau-an-tong-thong-trump-c52581005.epi
📰 Crawling article: https://baomoi.com/vu-chay-xuong-tai-che-tai-hung-yen-it-nhat-hai-nguoi-nghi-da-tu-vong-c52624676.epi
📰 Crawling article: https://baomoi.com/tp-hcm-dieu-chinh-quy-hoach-tao-luc-hut-dau-tu-moi-c52624738.epi
📰 Crawling article: https://baomoi.com/gia-ca-phe-hom-nay-29-6-gia-ca-phe-robusta-va-arabica-tiep-tuc-giam-c52624710.epi
📰 Crawling article: https://baomoi.com/nghi-quyet-ve-thi-diem-mot-so-co-che-chinh-sach-dac-thu-phat-trien-thanh-pho-hai-phong-c52624742.epi
📰 Crawling article: https://baomoi.com/ban-tri-su-phat-giao-q-8-tong-ket-nhiem-ky-tri-an-dong-gop-cua-tang-ni-c52624697.epi
📰 Crawling article: https://baomoi.com/lu-doan-101-kiem-tra-ban-dan-that-6-thang-dau-nam-c52624681.epi
📰 Crawling article: https://baomoi.com/vo-si-n