In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import csv
import os
import re
import time
import random
import hashlib
from datetime import datetime, timezone
from typing import Optional, Dict, Any, List, Set, Tuple

import requests
from bs4 import BeautifulSoup
from dateutil import parser as dateparser
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Use a fixed UTC offset instead of zoneinfo:
from datetime import datetime, timezone, timedelta

# Fixed UTC+7 offset used as the Vietnam time zone when interpreting and
# comparing publication dates.
VN_TZ = timezone(timedelta(hours=7))

# ================== CONFIG ==================
# Category listing pages to crawl, in order. Commented-out entries are skipped.
CATEGORY_URLS = [
    # "https://vnexpress.net/the-gioi",
    # "https://vnexpress.net/thoi-su",
    # "https://vnexpress.net/kinh-doanh",
    # "https://vnexpress.net/khoa-hoc-cong-nghe",
    # "https://vnexpress.net/goc-nhin",
    # "https://vnexpress.net/bat-dong-san",
    # "https://vnexpress.net/suc-khoe",
    # "https://vnexpress.net/the-thao",
    # "https://vnexpress.net/giai-tri",
    # "https://vnexpress.net/phap-luat",
    "https://vnexpress.net/giao-duc",
    "https://vnexpress.net/du-lich",
    "https://vnexpress.net/oto-xe-may",
    "https://vnexpress.net/doi-song"
]

# Crawl from newest to oldest until an article's date is < END_DATE (Vietnam time).
END_DATE = "2025-11-01"  # YYYY-MM-DD
MAX_PAGES_PER_CATEGORY = 200  # safety stop

# Output file; rows are appended to it and it is re-read on start-up to skip
# articles that were already stored.
CSV_PATH = "vnexpress_html_categories_vi_v2.csv"

# Timeout passed to every HTTP request made through the shared session.
TIMEOUT = 25
# Minimum pause between requests; polite_sleep() adds a random 0-0.4 on top.
REQUEST_DELAY_BASE = 0.25
# Default headers installed on the shared session.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; VnExpressHTMLCrawler/1.0)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
# ===========================================

# Column order of the CSV file; append_row() writes values in exactly this order.
CSV_HEADER = [
    "id",
    "title",
    "published_at",
    "source.name",
    "url",
    "language",
    "category.primary",
    "keywords",
    "entities",
]

# Constant value written to the "source.name" column.
SOURCE_NAME = "VnExpress"
# Language used when the article HTML does not declare one.
DEFAULT_LANGUAGE = "vi"
# When True, log() prints diagnostic messages.
DEBUG = False



# ----- HTTP session with retry -----
# Single module-level session; fetch_text() sends every request through it,
# so the default HEADERS and the retry policy below apply to all fetches.
session = requests.Session()
session.headers.update(HEADERS)

# Retry policy handed to the adapter: GET/HEAD requests are retried on
# connection/read errors and on the listed status codes, with backoff.
retry = Retry(
    total=6,
    connect=6,
    read=6,
    backoff_factor=0.6,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET", "HEAD"],
    respect_retry_after_header=True,
    # NOTE(review): with raise_on_status=False the last response is expected
    # to be returned once retries are exhausted, leaving the error to
    # fetch_text()'s raise_for_status() - confirm against the urllib3 docs.
    raise_on_status=False,
)
# The same adapter instance (retry policy + pool sizes) serves both schemes.
adapter = HTTPAdapter(max_retries=retry, pool_connections=50, pool_maxsize=50)
session.mount("http://", adapter)
session.mount("https://", adapter)


def log(msg: str):
    """Print ``msg`` only when the module-level DEBUG flag is truthy."""
    if not DEBUG:
        return
    print(msg)


def polite_sleep():
    """Pause for REQUEST_DELAY_BASE seconds plus a random jitter of 0-0.4 s."""
    pause = REQUEST_DELAY_BASE + random.uniform(0, 0.4)
    time.sleep(pause)


def md5_id(text: str) -> str:
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8.

    The digest is used as a stable row identifier, not for security.
    """
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()


def is_english_site(url: str) -> bool:
    """Tell whether ``url`` points at the e.vnexpress.net host.

    Returns False for an empty or missing URL; otherwise True only when the
    URL starts with the http or https e.vnexpress.net prefix (trailing slash
    included).
    """
    if not url:
        return False
    english_prefixes = ("https://e.vnexpress.net/", "http://e.vnexpress.net/")
    return url.startswith(english_prefixes)


def fetch_text(url: str) -> str:
    """Download ``url`` through the shared session and return the decoded body.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` when the final response
            carries an error status. Other request errors propagate unchanged.
    """
    response = session.get(url, timeout=TIMEOUT)
    response.raise_for_status()
    body = response.text
    return body


def to_iso_utc(s: Optional[str]) -> Optional[str]:
    """Parse a date/time string and return it as an ISO-8601 string in UTC.

    A value without time-zone information is treated as Vietnam time (VN_TZ),
    falling back to UTC if VN_TZ is falsy. Returns None for empty input or
    when parsing/conversion fails for any reason.
    """
    if not s:
        return None
    try:
        parsed = dateparser.parse(s)
        if not parsed:
            return None
        if parsed.tzinfo is None:
            # Naive timestamps are assumed to be local Vietnam time.
            assumed_tz = VN_TZ if VN_TZ else timezone.utc
            parsed = parsed.replace(tzinfo=assumed_tz)
        as_utc = parsed.astimezone(timezone.utc)
        return as_utc.isoformat()
    except Exception:
        return None


def iso_to_local_date(iso_utc: str) -> Optional[str]:
    """Convert an ISO timestamp to a ``YYYY-MM-DD`` date in Vietnam time.

    The result is what gets compared against END_DATE. A timestamp without
    time-zone information is taken as UTC. Returns None for empty input or
    when parsing/conversion fails.
    """
    if not iso_utc:
        return None
    try:
        moment = dateparser.parse(iso_utc)
        if not moment:
            return None
        if moment.tzinfo is None:
            moment = moment.replace(tzinfo=timezone.utc)
        local_moment = moment.astimezone(VN_TZ) if VN_TZ else moment
        return local_moment.date().isoformat()
    except Exception:
        return None


def ensure_csv_header(csv_path: str):
    """Create ``csv_path`` with the CSV_HEADER row if it is missing or empty.

    A file that already has content is left untouched.
    """
    already_populated = os.path.exists(csv_path) and os.path.getsize(csv_path) != 0
    if already_populated:
        return
    with open(csv_path, "w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(CSV_HEADER)


def load_seen_from_csv(csv_path: str) -> Tuple[Set[str], Set[str]]:
    """Collect the URLs and ids already stored in ``csv_path``.

    The first row is treated as the header; the "id" and "url" columns are
    located by name, falling back to positions 0 and 4 when a name is absent.
    Values are stripped and empty values are ignored.

    Args:
        csv_path: Path of the CSV file written by previous runs.

    Returns:
        ``(seen_urls, seen_ids)``. Both sets are empty when the file does not
        exist or has no header row. If the file cannot be read completely,
        whatever was collected before the error is returned and a warning is
        printed, because an incomplete result means de-duplication is
        incomplete too.
    """
    seen_urls: Set[str] = set()
    seen_ids: Set[str] = set()
    try:
        with open(csv_path, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if not header:
                return seen_urls, seen_ids
            id_idx = header.index("id") if "id" in header else 0
            url_idx = header.index("url") if "url" in header else 4
            for row in reader:
                # Short or blank rows simply lack the column; skip that value.
                if len(row) > url_idx:
                    url_value = row[url_idx].strip()
                    if url_value:
                        seen_urls.add(url_value)
                if len(row) > id_idx:
                    id_value = row[id_idx].strip()
                    if id_value:
                        seen_ids.add(id_value)
    except FileNotFoundError:
        # First run: nothing has been stored yet.
        pass
    except (OSError, UnicodeError, csv.Error) as exc:
        # Stay best-effort, but do not hide the problem from the operator.
        print(f"[WARN] could not fully read {csv_path}: {exc}; de-duplication may be incomplete")
    return seen_urls, seen_ids


def append_row(csv_path: str, row: Dict[str, Any]):
    """Append one record to ``csv_path`` using the CSV_HEADER column order.

    Keys missing from ``row`` are written as empty strings. The file is
    flushed before it is closed.
    """
    with open(csv_path, "a", encoding="utf-8", newline="") as out:
        values = [row.get(column, "") for column in CSV_HEADER]
        csv.writer(out).writerow(values)
        out.flush()


def extract_language_from_html(soup: BeautifulSoup) -> str:
    """Read the document language from the ``<html>`` tag.

    Uses the ``lang`` attribute, then ``xml:lang``. Values starting with
    "vi" or "en" are normalised to that two-letter code; any other value is
    returned lower-cased and stripped. DEFAULT_LANGUAGE is returned when the
    tag or the attribute is missing.
    """
    root = soup.find("html")
    if not root:
        return DEFAULT_LANGUAGE

    declared = root.get("lang") or root.get("xml:lang")
    if not declared:
        return DEFAULT_LANGUAGE

    declared = declared.lower().strip()
    for code in ("vi", "en"):
        if declared.startswith(code):
            return code
    return declared


def extract_keywords_from_html(soup: BeautifulSoup) -> List[str]:
    """Return the comma-separated keywords declared in the page's meta tags.

    ``meta[name="keywords"]`` is tried first, then
    ``meta[name="news_keywords"]``; the first tag with non-empty content
    decides the result. Keywords are stripped, empty items are dropped and
    duplicates are removed while keeping first-seen order. Returns an empty
    list when neither tag has content.
    """
    for selector in ('meta[name="keywords"]', 'meta[name="news_keywords"]'):
        node = soup.select_one(selector)
        if not (node and node.get("content")):
            continue
        pieces = (piece.strip() for piece in node["content"].split(","))
        # dict.fromkeys keeps insertion order, which gives an ordered de-dup.
        return list(dict.fromkeys(piece for piece in pieces if piece))
    return []


def extract_article_meta(article_html: str) -> Dict[str, Any]:
    """Parse an article page and return the metadata needed for one CSV row.

    Returns a dict with the keys ``title``, ``published_at`` (ISO UTC string
    or ""), ``language``, ``keywords`` (list), ``category_from_article`` and
    ``entities`` (always an empty list here).
    """
    soup = BeautifulSoup(article_html, "lxml")

    # Title: Open Graph meta first, then the first matching <h1>.
    og_title = soup.select_one('meta[property="og:title"]')
    title = og_title["content"].strip() if og_title and og_title.get("content") else ""
    if not title:
        heading = soup.select_one("h1.title-detail, h1")
        if heading:
            title = heading.get_text(strip=True)

    # Publication time: try the meta tags in order, stop at the first one
    # that yields a parseable value.
    published = ""
    for selector in ('meta[property="article:published_time"]', 'meta[itemprop="datePublished"]'):
        node = soup.select_one(selector)
        if node and node.get("content"):
            published = to_iso_utc(node["content"].strip()) or ""
        if published:
            break
    if not published:
        # Last resort: a <time> element, by attribute and then by its text.
        time_tag = soup.select_one("time")
        if time_tag:
            raw_time = time_tag.get("datetime") or time_tag.get_text(strip=True)
            published = to_iso_utc(raw_time) or ""

    # Category as declared by the article itself; callers may fall back to
    # another value when this is empty.
    section = soup.select_one('meta[property="article:section"]')
    category_primary = section["content"].strip() if section and section.get("content") else ""

    language = extract_language_from_html(soup)
    keywords = extract_keywords_from_html(soup)

    return {
        "title": title,
        "published_at": published,
        "language": language,
        "keywords": keywords,
        "category_from_article": category_primary,
        "entities": [],
    }


def extract_article_urls_from_category_page(html: str) -> List[str]:
    """Return the candidate article URLs linked from a category listing page.

    Every ``<a href>`` is considered. Root-relative and protocol-relative
    hrefs are made absolute, only ``https://vnexpress.net/`` links are kept,
    and the query string and fragment are removed. Removing the fragment
    matters for de-duplication: two links to the same article that differ
    only by an in-page anchor would otherwise be treated as two articles.
    A link is kept only if the cleaned URL still contains ".html".

    Args:
        html: Raw HTML of the listing page.

    Returns:
        Unique URLs in the order they first appear on the page.
    """
    soup = BeautifulSoup(html, "lxml")

    urls = []
    for a in soup.select("a[href]"):
        href = a.get("href", "").strip()
        if not href:
            continue
        if href.startswith("//"):
            # Protocol-relative link: only the scheme is missing.
            href = "https:" + href
        elif href.startswith("/"):
            href = "https://vnexpress.net" + href
        if not href.startswith("https://vnexpress.net/"):
            continue
        if is_english_site(href):
            continue
        # Strip the fragment first, then the query, so both are gone no
        # matter which of them appears in the href.
        href = href.split("#", 1)[0].split("?", 1)[0]
        # Article pages carry ".html"; test the cleaned URL so a ".html" that
        # only occurs in the query or fragment does not count.
        if ".html" not in href:
            continue
        urls.append(href)

    # Ordered de-duplication.
    return list(dict.fromkeys(urls))


def find_next_page_url(category_url: str, html: str, current_page: int) -> Optional[str]:
    """
    Work out the URL of the listing page that follows ``current_page``.

    A ``<link rel="next">`` in the page is preferred (a root-relative href is
    made absolute). Detection through pagination anchors is not implemented.
    Without such a link the URL is built by convention: an existing ``?p=N``
    in ``category_url`` is replaced, otherwise ``?p=<current_page + 1>`` is
    appended.
    """
    soup = BeautifulSoup(html, "lxml")

    # Preferred source: rel=next declared in the document head.
    next_link = soup.select_one('link[rel="next"]')
    if next_link and next_link.get("href"):
        target = next_link["href"].strip()
        return "https://vnexpress.net" + target if target.startswith("/") else target

    # Fallback: the ?p= paging convention.
    next_query = f"?p={current_page+1}"
    if "?p=" in category_url:
        return re.sub(r"\?p=\d+", next_query, category_url)
    return f"{category_url}{next_query}"


def make_row(url: str, meta: Dict[str, Any], category_fallback: str) -> Dict[str, Any]:
    """Build the CSV record for one article.

    The id is the MD5 of the URL. The category declared by the article wins
    over ``category_fallback``. Keyword and entity lists are joined with "|".
    Missing or empty metadata values become empty strings (the language
    falls back to DEFAULT_LANGUAGE).
    """

    def joined(key: str) -> str:
        # A missing key or a None value counts as an empty list.
        return "|".join(meta.get(key) or [])

    category = meta.get("category_from_article") or category_fallback

    return {
        "id": md5_id(url),
        "title": meta.get("title") or "",
        "published_at": meta.get("published_at") or "",
        "source.name": SOURCE_NAME,
        "url": url,
        "language": meta.get("language") or DEFAULT_LANGUAGE,
        "category.primary": category or "",
        "keywords": joined("keywords"),
        "entities": joined("entities"),
    }


def crawl_category(category_url: str, end_date: str, seen_urls: Set[str], seen_ids: Set[str]) -> int:
    """
    Crawl one category from newest to oldest and append in-range articles to CSV_PATH.

    Listing pages are fetched one after another, at most
    MAX_PAGES_PER_CATEGORY of them. Every linked article that is not already
    known is downloaded; it is written when its publication date (Vietnam
    time) is >= ``end_date`` or cannot be determined. The crawl stops at the
    first listing page that has no article links or on which nothing was
    written (every newly examined article was older than ``end_date``).

    Args:
        category_url: URL of the first listing page of the category. Its last
            path segment is the fallback value for "category.primary".
        end_date: Cut-off date as "YYYY-MM-DD"; compared as a string with the
            article's local date.
        seen_urls: URLs already stored. Updated in place for each written row.
        seen_ids: Ids (MD5 of the URL) already stored. Updated in place.

    Returns:
        Number of rows appended by this call.

    Raises:
        Whatever ``fetch_text`` raises for a listing page. Failures on single
        articles are logged and skipped.
    """
    added = 0
    page = 1
    url_page = category_url

    # The category slug doubles as the fallback for category.primary.
    category_slug = category_url.rstrip("/").split("/")[-1]

    # Articles found to be older than end_date during this call. They are
    # never written, so without this set a link repeated on later listing
    # pages would be downloaded again every time.
    too_old_urls: Set[str] = set()

    while page <= MAX_PAGES_PER_CATEGORY and url_page:
        html = fetch_text(url_page)
        article_urls = extract_article_urls_from_category_page(html)

        if DEBUG:
            log(f"[{category_slug}] page {page} got {len(article_urls)} candidate urls: {url_page}")

        if not article_urls:
            # No article links at all: end of the listing.
            break

        wrote_any = False

        for aurl in article_urls:
            if aurl in seen_urls or aurl in too_old_urls:
                continue
            if is_english_site(aurl):
                continue

            aid = md5_id(aurl)
            if aid in seen_ids:
                continue

            try:
                ah = fetch_text(aurl)
                meta = extract_article_meta(ah)
            except Exception as e:
                # Best effort per article: one bad page must not end the crawl.
                log(f"[WARN] article fetch failed {aurl}: {e}")
                continue
            finally:
                # Throttle after every attempt, successful or not.
                polite_sleep()

            pub_iso = meta.get("published_at") or ""
            pub_local_date = iso_to_local_date(pub_iso) or ""

            if pub_local_date and pub_local_date < end_date:
                # Dated before the cut-off: skip it and remember the verdict.
                too_old_urls.add(aurl)
                continue

            # In range, or no usable date (kept rather than silently dropped).
            row = make_row(aurl, meta, category_fallback=category_slug)
            append_row(CSV_PATH, row)
            seen_urls.add(aurl)
            seen_ids.add(aid)
            added += 1
            wrote_any = True

        # Nothing written on this page: everything newly examined was older
        # than end_date (or nothing was new), so this category is done.
        if not wrote_any:
            if DEBUG:
                log(f"[{category_slug}] stop: page {page} all older than end_date={end_date}")
            break

        # Move on to the next listing page.
        next_url = find_next_page_url(category_url, html, current_page=page)
        if next_url == url_page:
            break
        url_page = next_url
        page += 1
        polite_sleep()

    return added


def main():
    """Crawl every configured category and report how many rows were appended.

    The CSV file is prepared first and its existing URLs/ids are loaded so
    stored articles are not written again. An error in one category is
    printed and the run continues with the next category.
    """
    ensure_csv_header(CSV_PATH)
    seen_urls, seen_ids = load_seen_from_csv(CSV_PATH)

    counts = []
    for category in CATEGORY_URLS:
        try:
            count = crawl_category(category, END_DATE, seen_urls, seen_ids)
            print(f"[{category}] added {count}")
            counts.append(count)
        except Exception as exc:
            # Top-level boundary: report and carry on with the next category.
            print(f"[{category}] ERROR: {exc}")

    print(f"Done. Total appended {sum(counts)} rows to {CSV_PATH}")


# Start the crawl only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

[https://vnexpress.net/giao-duc] added 766
[https://vnexpress.net/du-lich] added 834
[https://vnexpress.net/oto-xe-may] added 1036
[https://vnexpress.net/doi-song] added 822
Done. Total appended 3458 rows to vnexpress_html_categories_vi_v2.csv
