In [None]:
%pip install requests beautifulsoup4 lxml python-dateutil tqdm

Collecting lxml
  Using cached lxml-6.0.2-cp310-cp310-win_amd64.whl.metadata (3.7 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached lxml-6.0.2-cp310-cp310-win_amd64.whl (4.0 MB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, lxml

   ---------------------------------------- 0/2 [tqdm]
   -------------------- ------------------- 1/2 [lxml]
   -------------------- ------------------- 1/2 [lxml]
   -------------------- ------------------- 1/2 [lxml]
   -------------------- ------------------- 1/2 [lxml]
   ---------------------------------------- 2/2 [lxml]

Successfully installed lxml-6.0.2 tqdm-4.67.1



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import csv
import os
import re
import time
import random
import hashlib
from datetime import datetime, timezone, timedelta
from typing import Optional, Dict, Any, List, Set, Tuple
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Fixed UTC+7 offset used as Vietnam local time when parsing and comparing article dates.
VN_TZ = timezone(timedelta(hours=7))

# ================== CONFIG ==================
# Category listing pages to crawl, processed in this order by main().
# Commented-out entries are skipped.
CATEGORY_URLS = [
    # "https://dantri.com.vn/the-gioi.htm",
    # "https://dantri.com.vn/thoi-su.htm",
    # "https://dantri.com.vn/phap-luat.htm",
    "https://dantri.com.vn/suc-khoe.htm",
    "https://dantri.com.vn/doi-song.htm",
    "https://dantri.com.vn/du-lich.htm",
    "https://dantri.com.vn/kinh-doanh.htm",
    "https://dantri.com.vn/the-thao.htm",
    "https://dantri.com.vn/giai-tri.htm",
    "https://dantri.com.vn/giao-duc.htm",
    "https://dantri.com.vn/cong-nghe.htm",
]

# Crawl from newest to oldest until articles are dated before END_DATE (Vietnam local time).
# Compared as a string against ISO dates, so it must stay in YYYY-MM-DD form.
END_DATE = "2025-11-01"  # YYYY-MM-DD

# No page limit (None); the crawl stops only on END_DATE or when the pages run out.
MAX_PAGES_PER_CATEGORY = None

# Output file; rows are appended, and existing rows are used to skip already-crawled articles.
CSV_PATH = "dantri_html_categories_vi.csv"

# Per-request timeout passed to requests (seconds).
TIMEOUT = 25
# Minimum pause between requests in seconds; polite_sleep() adds random jitter on top.
REQUEST_DELAY_BASE = 0.25
# Default headers installed on the shared HTTP session.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DanTriHTMLCrawler/2.0; +https://dantri.com.vn/)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
# ===========================================

# Column order of the output CSV; append_row() looks up each name as a key of the row dict.
CSV_HEADER = [
    "id",
    "title",
    "published_at",        # ISO UTC
    "source.name",
    "url",
    "language",
    "category.primary",
    "keywords",
    "entities",
]

# Constant written to the "source.name" column of every row.
SOURCE_NAME = "DanTri"
# Language used when the article page does not declare one on its <html> tag.
DEFAULT_LANGUAGE = "vi"
# When True, log() prints diagnostic messages; otherwise they are dropped.
DEBUG = False

# Filter out links that are not articles: any URL path starting with one of
# these prefixes is rejected by is_allowed_article_url().
BLOCKED_PREFIXES = (
    "/event/",
    "/tag/",
    "/video/",
    "/photo/",
    "/infographic/",
    "/emagazine/",
    "/interactive/",
)

# An article URL is accepted only if its first path segment is one of these sections.
ALLOWED_SECTIONS = {
    "the-gioi",
    "thoi-su",
    "phap-luat",
    "suc-khoe",
    "doi-song",
    "du-lich",
    "kinh-doanh",
    "the-thao",
    "giai-tri",
    "giao-duc",
    "cong-nghe",
}

# ----- HTTP session with retry -----
# One module-level Session shared by every request, with HEADERS applied as defaults.
session = requests.Session()
session.headers.update(HEADERS)

# Retry policy: up to 6 attempts for connection errors, read errors and the listed
# HTTP statuses, with a backoff between attempts (backoff_factor=0.6) and honouring
# a server-sent Retry-After header. Only GET and HEAD requests are retried.
# NOTE(review): with raise_on_status=False the adapter should hand back the last
# response once retries are exhausted instead of raising, leaving the error to
# fetch_text()'s raise_for_status() - confirm against the urllib3 Retry docs.
retry = Retry(
    total=6,
    connect=6,
    read=6,
    backoff_factor=0.6,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET", "HEAD"],
    respect_retry_after_header=True,
    raise_on_status=False,
)
# The same adapter (retry policy + connection pool sizing) serves both URL schemes.
adapter = HTTPAdapter(max_retries=retry, pool_connections=50, pool_maxsize=50)
session.mount("http://", adapter)
session.mount("https://", adapter)


def log(msg: str):
    """Print ``msg`` only when the module-level DEBUG flag is enabled."""
    if not DEBUG:
        return
    print(msg)


def polite_sleep():
    """Pause for REQUEST_DELAY_BASE seconds plus a random jitter of up to 0.4 s."""
    jitter = random.uniform(0, 0.4)
    time.sleep(REQUEST_DELAY_BASE + jitter)


def md5_id(text: str) -> str:
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()


# def fetch_text(url: str) -> str:
#     r = session.get(url, timeout=TIMEOUT)
#     r.raise_for_status()
#     return r.text
def fetch_text(url: str) -> str:
    """
    GET ``url`` through the shared session and return the decoded response body.

    Redirects are followed and the request uses the module-level TIMEOUT.
    An error HTTP status is raised as an exception by ``raise_for_status()``.
    """
    response = session.get(url, allow_redirects=True, timeout=TIMEOUT)
    response.raise_for_status()
    return response.text


def ensure_csv_header(csv_path: str):
    """Create ``csv_path`` with the CSV_HEADER row unless it already has content."""
    already_populated = os.path.exists(csv_path) and os.path.getsize(csv_path) != 0
    if already_populated:
        return
    with open(csv_path, "w", encoding="utf-8", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(CSV_HEADER)


def load_seen_from_csv(csv_path: str) -> Tuple[Set[str], Set[str]]:
    """
    Collect the URLs and ids already stored in ``csv_path``.

    The "id" and "url" columns are located through the header row; when a name
    is missing the positions 0 (id) and 4 (url) are used instead. Values are
    stripped and empty ones are ignored. Rows too short to hold a column simply
    do not contribute to that set.

    Reading is best-effort: if the file cannot be read or parsed, a warning is
    printed and whatever was collected up to that point is returned. The
    warning matters because an unreadable file means previously crawled
    articles are no longer recognised and will be fetched and appended again.

    Returns:
        (seen_urls, seen_ids) - both empty when the file is missing or empty.
    """
    seen_urls: Set[str] = set()
    seen_ids: Set[str] = set()
    if not os.path.exists(csv_path):
        return seen_urls, seen_ids
    try:
        with open(csv_path, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if not header:
                return seen_urls, seen_ids
            id_idx = header.index("id") if "id" in header else 0
            url_idx = header.index("url") if "url" in header else 4
            for row in reader:
                if len(row) > url_idx:
                    u = row[url_idx].strip()
                    if u:
                        seen_urls.add(u)
                if len(row) > id_idx:
                    i = row[id_idx].strip()
                    if i:
                        seen_ids.add(i)
    except (OSError, UnicodeError, csv.Error) as exc:
        # Keep going with a partial result, but make the failure visible.
        print(
            f"[WARN] could not fully read {csv_path}: {exc!r}; "
            f"deduplication may be incomplete "
            f"({len(seen_urls)} urls / {len(seen_ids)} ids loaded)"
        )
    return seen_urls, seen_ids


def append_row(csv_path: str, row: Dict[str, Any]):
    """
    Append one record to ``csv_path`` in CSV_HEADER column order.

    Keys absent from ``row`` are written as empty strings. The file is flushed
    before it is closed.
    """
    with open(csv_path, "a", encoding="utf-8", newline="") as out:
        values = [row.get(column, "") for column in CSV_HEADER]
        csv.writer(out).writerow(values)
        out.flush()


def iso_to_local_date(iso_utc: str) -> Optional[str]:
    """
    Convert an ISO-8601 timestamp to its calendar date in VN_TZ (YYYY-MM-DD).

    A trailing "Z" is accepted as UTC and a timestamp without an offset is
    treated as UTC. Returns None for empty input or anything that cannot be
    parsed.
    """
    if not iso_utc:
        return None
    try:
        parsed = datetime.fromisoformat(iso_utc.replace("Z", "+00:00"))
        aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
        local = aware.astimezone(VN_TZ)
        return local.date().isoformat()
    except Exception:
        return None


def parse_author_time_to_iso_utc(dt_str: str) -> Optional[str]:
    """
    Convert a Dantri ``datetime`` attribute to an ISO-8601 UTC timestamp.

    Input:  "YYYY-MM-DD HH:MM" (Vietnam local time); a variant with seconds,
            "YYYY-MM-DD HH:MM:SS", is accepted as well.
    Output: ISO UTC (YYYY-MM-DDTHH:MM:SS+00:00), or None when the value is
            empty or matches neither format.
    """
    if not dt_str:
        return None
    cleaned = dt_str.strip()
    # Try the documented format first, then the one that carries seconds.
    for fmt in ("%Y-%m-%d %H:%M", "%Y-%m-%d %H:%M:%S"):
        try:
            local_dt = datetime.strptime(cleaned, fmt).replace(tzinfo=VN_TZ)
            return local_dt.astimezone(timezone.utc).isoformat()
        except Exception:
            continue
    return None


def extract_article_meta(article_html: str) -> Dict[str, Any]:
    """
    Pull the metadata needed for one CSV row out of an article page.

    Returned keys:
        title                 - og:title content, else the text of the first <h1>, else "".
        published_at          - ISO UTC string derived from <time class="author-time" datetime=...>,
                                or "" when the element is missing or unparseable.
        language              - lower-cased lang / xml:lang of <html>, else DEFAULT_LANGUAGE.
        keywords              - comma-separated <meta name="keywords"> split into a list.
        category_from_article - content of <meta property="article:section">, or "".
        entities              - always an empty list.
    """
    soup = BeautifulSoup(article_html, "lxml")

    def meta_content(selector: str) -> str:
        # Stripped "content" attribute of the first match, or "" if absent/empty.
        node = soup.select_one(selector)
        if node and node.get("content"):
            return node["content"].strip()
        return ""

    # Title: prefer Open Graph, fall back to the headline element.
    title = meta_content('meta[property="og:title"]')
    if not title:
        headline = soup.select_one("h1")
        if headline:
            title = headline.get_text(strip=True)

    # Publication time as exposed by the article's author-time element.
    published = ""
    time_node = soup.select_one("time.author-time[datetime]")
    if time_node and time_node.get("datetime"):
        published = parse_author_time_to_iso_utc(time_node["datetime"]) or ""

    # Language declared on the document root (Vietnamese in most cases).
    language = DEFAULT_LANGUAGE
    root = soup.find("html")
    if root:
        declared = root.get("lang") or root.get("xml:lang")
        if declared:
            language = declared.lower().strip()

    # Keywords, when the page provides them.
    raw_keywords = meta_content('meta[name="keywords"]')
    keywords = [part.strip() for part in raw_keywords.split(",") if part.strip()]

    return {
        "title": title,
        "published_at": published,
        "language": language,
        "keywords": keywords,
        "category_from_article": meta_content('meta[property="article:section"]'),
        "entities": [],
    }


def normalize_url(u: str) -> Optional[str]:
    """
    Turn a raw href into an absolute dantri.com.vn URL without query or fragment.

    Protocol-relative ("//host/...") and root-relative ("/path") links are made
    absolute. Anything that does not end up under http(s)://dantri.com.vn/
    yields None, as does an empty input.
    """
    if not u:
        return None
    candidate = u.strip()
    if candidate.startswith("//"):
        candidate = "https:" + candidate
    if candidate.startswith("/"):
        candidate = "https://dantri.com.vn" + candidate
    on_site = candidate.startswith(("https://dantri.com.vn/", "http://dantri.com.vn/"))
    if not on_site:
        return None
    try:
        return urlunparse(urlparse(candidate)._replace(query="", fragment=""))
    except Exception:
        # Parsing failed: cut the query and fragment off by hand.
        return candidate.split("?")[0].split("#")[0]


def is_category_url(u: str) -> bool:
    """
    Tell whether ``u`` looks like a category listing, e.g. /the-gioi.htm.

    A category URL has a non-empty path made of a single segment ending in
    ".htm". Any parsing problem counts as "not a category".
    """
    try:
        trimmed = (urlparse(u).path or "").strip("/")
    except Exception:
        return False
    if not trimmed:
        return False
    return trimmed.endswith(".htm") and "/" not in trimmed


def is_allowed_article_url(u: str) -> bool:
    """
    Decide whether ``u`` points at an article that should be crawled.

    Accepted URLs have a path that ends in ".htm", does not start with any of
    BLOCKED_PREFIXES, is not a category listing, contains at least two
    segments, and whose first segment is listed in ALLOWED_SECTIONS.
    Any exception raised while checking results in False.
    """
    try:
        path = urlparse(u).path or ""
        if not path.startswith("/"):
            return False
        if any(path.startswith(prefix) for prefix in BLOCKED_PREFIXES):
            return False
        if not path.endswith(".htm") or is_category_url(u):
            return False
        segments = list(filter(None, path.strip("/").split("/")))
        return len(segments) >= 2 and segments[0] in ALLOWED_SECTIONS
    except Exception:
        return False


def extract_article_urls_from_category_page(html: str) -> List[str]:
    """
    List the article URLs linked from a category page.

    Every <a href> is normalised, filtered through is_allowed_article_url()
    and de-duplicated; the first occurrence of each URL keeps its position.
    """
    soup = BeautifulSoup(html, "lxml")
    normalised = (normalize_url(anchor.get("href", "")) for anchor in soup.select("a[href]"))
    accepted = [link for link in normalised if link and is_allowed_article_url(link)]
    # dict keys keep insertion order, which removes duplicates while preserving order.
    return list(dict.fromkeys(accepted))


def set_page_param(url: str, page: int) -> str:
    """Return ``url`` with its ``page`` query parameter set (or replaced) to ``page``."""
    parts = urlparse(url)
    params = parse_qs(parts.query, keep_blank_values=True)
    params.update(page=[str(page)])
    new_query = urlencode(params, doseq=True)
    return urlunparse(parts._replace(query=new_query))


def _normalize_pagination_href(href: str) -> Optional[str]:
    """
    Make a pagination href absolute while KEEPING its query string.

    Same acceptance rules as normalize_url() (protocol-relative, root-relative
    or absolute dantri.com.vn links only), but only the fragment is dropped:
    a page number carried in the query (e.g. "?page=3") must survive.
    """
    candidate = (href or "").strip()
    if candidate.startswith("//"):
        candidate = "https:" + candidate
    elif candidate.startswith("/"):
        candidate = "https://dantri.com.vn" + candidate
    if not candidate.startswith(("https://dantri.com.vn/", "http://dantri.com.vn/")):
        return None
    try:
        return urlunparse(urlparse(candidate)._replace(fragment=""))
    except Exception:
        return candidate.split("#")[0]


def find_next_page_url(category_url: str, html: str, current_page: int) -> Optional[str]:
    """
    Work out the URL of the listing page that follows the current one.

    Lookup order:
      1. <link rel="next"> in the document,
      2. an anchor matching a[rel="next"], a.next or .pagination a.next,
      3. fallback: ``category_url`` with ``page=current_page + 1`` in its query.

    Links found in the page keep their query string. Passing them through
    normalize_url() would strip it, so a query-based "next" link would collapse
    back to the first page of the category and stop (or loop) the crawl.

    Args:
        category_url: first page of the category, used for the fallback.
        html: markup of the page currently being processed.
        current_page: 1-based number of the current page.

    Returns:
        The next page URL; because of the fallback this is never None in practice.
    """
    soup = BeautifulSoup(html, "lxml")

    for selector in ('link[rel="next"]', 'a[rel="next"], a.next, .pagination a.next'):
        node = soup.select_one(selector)
        if not (node and node.get("href")):
            continue
        resolved = _normalize_pagination_href(node["href"])
        if resolved:
            return resolved

    return set_page_param(category_url, current_page + 1)


def make_row(url: str, meta: Dict[str, Any], category_fallback: str) -> Dict[str, Any]:
    """
    Build the CSV record for one article.

    The id is the MD5 of ``url``; the category comes from the article metadata
    and falls back to ``category_fallback``; keyword and entity lists are
    joined with "|". Missing or empty metadata values become "" (or
    DEFAULT_LANGUAGE for the language).
    """
    keyword_list = meta.get("keywords") or []
    entity_list = meta.get("entities") or []
    primary_category = meta.get("category_from_article") or category_fallback or ""

    row: Dict[str, Any] = {}
    row["id"] = md5_id(url)
    row["title"] = meta.get("title") or ""
    row["published_at"] = meta.get("published_at") or ""
    row["source.name"] = SOURCE_NAME
    row["url"] = url
    row["language"] = meta.get("language") or DEFAULT_LANGUAGE
    row["category.primary"] = primary_category
    row["keywords"] = "|".join(keyword_list)
    row["entities"] = "|".join(entity_list)
    return row


def category_slug_from_url(category_url: str) -> str:
    """Return the last path segment of ``category_url`` with a trailing ".htm" removed."""
    trimmed_path = urlparse(category_url).path.rstrip("/")
    last_segment = trimmed_path.rpartition("/")[2]
    return re.sub(r"\.htm$", "", last_segment)


def crawl_category(category_url: str, end_date: str, seen_urls: Set[str], seen_ids: Set[str]) -> int:
    """
    Crawl one category from newest to oldest and append new articles to CSV_PATH.

    For every listing page, each article that is not already known is fetched,
    its metadata extracted and - when its Vietnam-local publication date is
    on/after ``end_date`` or unknown - written to the CSV.

    The crawl of the category ends when one of these holds:
      * a page yields no article links,
      * at least one article on the page was dated and every dated article is
        older than ``end_date`` (none was kept),
      * the next page URL is missing or was already visited, or a page lists
        exactly the same article links as an earlier page,
      * MAX_PAGES_PER_CATEGORY is reached.

    A page on which nothing was evaluated (every article already recorded, or
    every fetch failed) does NOT count as "older than end_date". Treating it
    that way made a resumed run stop on the first fully-recorded page and never
    reach the older articles that were still missing.

    Args:
        category_url: first listing page of the category.
        end_date: inclusive lower bound, "YYYY-MM-DD", compared as a string.
        seen_urls: URLs already recorded; updated in place.
        seen_ids: ids already recorded; updated in place.

    Returns:
        Number of rows appended for this category.

    Raises:
        Whatever fetch_text() raises for a listing page; failures on individual
        articles are logged and skipped.
    """
    added = 0
    page = 1
    url_page = category_url
    category_slug = category_slug_from_url(category_url)

    # Loop guards: listing URLs already fetched and link lists already processed.
    visited_pages: Set[str] = set()
    page_signatures: Set[Tuple[str, ...]] = set()

    while url_page and (MAX_PAGES_PER_CATEGORY is None or page <= MAX_PAGES_PER_CATEGORY):
        visited_pages.add(url_page)
        html = fetch_text(url_page)
        article_urls = extract_article_urls_from_category_page(html)

        if not article_urls:
            break

        # The same list of links again means pagination is repeating itself.
        signature = tuple(article_urls)
        if signature in page_signatures:
            break
        page_signatures.add(signature)

        checked = 0  # articles whose date was evaluated on this page
        kept = 0     # of those, articles on/after end_date or without a date

        for aurl in article_urls:
            if aurl in seen_urls:
                continue
            aid = md5_id(aurl)
            if aid in seen_ids:
                continue

            try:
                ah = fetch_text(aurl)
                meta = extract_article_meta(ah)
            except Exception as e:
                log(f"[WARN] article fetch failed {aurl}: {e}")
                continue
            finally:
                # Pause after every article request, successful or not.
                polite_sleep()

            pub_iso = meta.get("published_at") or ""
            pub_local_date = iso_to_local_date(pub_iso) or ""
            checked += 1

            # Only write articles dated on/after END_DATE; undated articles are
            # written too and never trigger the date-based stop.
            if (not pub_local_date) or (pub_local_date >= end_date):
                kept += 1
                row = make_row(aurl, meta, category_fallback=category_slug)
                append_row(CSV_PATH, row)
                seen_urls.add(aurl)
                seen_ids.add(aid)
                added += 1

        # Stop on END_DATE only when the page gave real evidence of it.
        if checked and not kept:
            break

        next_url = find_next_page_url(category_url, html, current_page=page)
        if not next_url or next_url in visited_pages:
            break

        url_page = next_url
        page += 1
        polite_sleep()

    return added


def main():
    """
    Crawl every configured category and report how many rows were appended.

    The CSV header is created when needed and previously stored URLs/ids are
    loaded so known articles are skipped. An exception raised while crawling a
    category is printed and the remaining categories are still processed.
    """
    ensure_csv_header(CSV_PATH)
    known_urls, known_ids = load_seen_from_csv(CSV_PATH)

    per_category = []
    for cat in CATEGORY_URLS:
        try:
            added = crawl_category(cat, END_DATE, known_urls, known_ids)
            print(f"[{cat}] added {added}")
            per_category.append(added)
        except Exception as e:
            print(f"[{cat}] ERROR: {e}")

    total = sum(per_category)
    print(f"Done. Total appended {total} rows to {CSV_PATH}")


# Start the crawl when this code is executed directly (as a script or as a
# notebook cell); merely importing the module does not start a crawl.
if __name__ == "__main__":
    main()

[https://dantri.com.vn/suc-khoe.htm] added 687
[https://dantri.com.vn/doi-song.htm] added 443
[https://dantri.com.vn/du-lich.htm] added 384
[https://dantri.com.vn/kinh-doanh.htm] added 746
[https://dantri.com.vn/the-thao.htm] added 718
[https://dantri.com.vn/giai-tri.htm] added 753
[https://dantri.com.vn/giao-duc.htm] added 611


KeyboardInterrupt: 