In [None]:
%pip install requests beautifulsoup4 lxml python-dateutil tqdm

In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import csv
import os
import re
import time
import random
import hashlib
from datetime import datetime, timezone, timedelta
from typing import Optional, Dict, Any, List, Set, Tuple
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Fixed UTC+7 offset used throughout as "Vietnam local time".
VN_TZ = timezone(timedelta(hours=7))

# ================== CONFIG ==================
# Category listing pages that crawl_category() walks, one after another.
CATEGORY_URLS = [
    "https://thanhnien.vn/thoi-su.htm",
    "https://thanhnien.vn/the-gioi.htm",
    "https://thanhnien.vn/doi-song.htm",
    "https://thanhnien.vn/phap-luat.htm",
    "https://thanhnien.vn/giao-duc.htm",
    "https://thanhnien.vn/suc-khoe.htm",
    "https://thanhnien.vn/kinh-doanh.htm",
    "https://thanhnien.vn/ban-can-biet.htm",
    "https://thanhnien.vn/the-thao.htm",
    "https://thanhnien.vn/giai-tri.htm",
    "https://thanhnien.vn/xe.htm",
    "https://thanhnien.vn/cong-nghe.htm",
    "https://thanhnien.vn/du-lich.htm",
]

# Crawl from newest to oldest until articles are dated before END_DATE (Vietnam time)
END_DATE = "2026-02-01"  # YYYY-MM-DD

# No limit on the number of pages; stop only on END_DATE or when pages run out
MAX_PAGES_PER_CATEGORY = None

# Output file; rows are appended, and existing rows are used to skip known articles.
CSV_PATH = "thanhnien_html_categories_vi.csv"

TIMEOUT = 25  # passed as the requests timeout, in seconds
REQUEST_DELAY_BASE = 0.25  # minimum pause used by polite_sleep(), in seconds
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ThanhNienHTMLCrawler/1.0; +https://thanhnien.vn/)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
# ===========================================

# Column order of the output CSV; append_row() writes values in this order.
CSV_HEADER = [
    "id",
    "title",
    "published_at",        # ISO UTC
    "source.name",
    "url",
    "language",
    "category.primary",
    "keywords",
    "entities",
    "content.text",
]

SOURCE_NAME = "ThanhNien"
DEFAULT_LANGUAGE = "vi"
DEBUG = False  # when True, log() prints its messages

# Filter out links that are not articles
BLOCKED_PREFIXES = (
    "/video/",
    "/tag/",
    "/photo/",
    "/multimedia/",
)

# Substrings that mark a URL as a non-article page (matched anywhere in the URL).
BLOCKED_PATTERNS = (
    "tin-nhanh-360.htm",
    "thong-tin-toa-soan.html",
    "rss.html",
    "policy.html",
)

# ----- HTTP session with retry -----
# One shared Session so every request carries HEADERS and reuses pooled connections.
session = requests.Session()
session.headers.update(HEADERS)

# Retry policy for GET/HEAD: up to 6 attempts with backoff, retrying on the
# listed status codes and honouring a server-sent Retry-After header.
retry = Retry(
    total=6,
    connect=6,
    read=6,
    backoff_factor=0.6,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET", "HEAD"],
    respect_retry_after_header=True,
    # NOTE(review): with raise_on_status=False the final bad response is presumably
    # returned rather than raised; fetch_text() then calls raise_for_status() itself.
    raise_on_status=False,
)
# The same adapter (and therefore the same retry policy) serves both schemes.
adapter = HTTPAdapter(max_retries=retry, pool_connections=50, pool_maxsize=50)
session.mount("http://", adapter)
session.mount("https://", adapter)


def log(msg: str):
    """Print ``msg`` only when the module-level DEBUG flag is truthy."""
    if not DEBUG:
        return
    print(msg)


def polite_sleep():
    """Sleep for REQUEST_DELAY_BASE seconds plus a random jitter of 0 to 0.4 s."""
    base_delay = REQUEST_DELAY_BASE
    jitter = random.uniform(0, 0.4)
    time.sleep(base_delay + jitter)


def md5_id(text: str) -> str:
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()


def fetch_text(url: str) -> str:
    """GET ``url`` through the shared session and return the response body as text.

    Redirects are followed and TIMEOUT is applied. Raises the exception from
    ``raise_for_status()`` when the final response has an error status.
    """
    response = session.get(url, allow_redirects=True, timeout=TIMEOUT)
    response.raise_for_status()
    body = response.text
    return body


def ensure_csv_header(csv_path: str):
    """Create ``csv_path`` with the CSV_HEADER row unless it already has content."""
    already_populated = os.path.exists(csv_path) and os.path.getsize(csv_path) != 0
    if already_populated:
        return
    with open(csv_path, "w", encoding="utf-8", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(CSV_HEADER)


def load_seen_from_csv(csv_path: str) -> Tuple[Set[str], Set[str]]:
    """Collect the article URLs and ids already stored in ``csv_path``.

    The first row is treated as the header. The ``id`` and ``url`` columns are
    located by name, falling back to positions 0 and 4 when the header does
    not contain them. Values are stripped and empty values are ignored.

    Reading is best-effort: if the file cannot be opened, decoded or parsed,
    a warning is printed and whatever was collected before the failure is
    returned. A damaged file is therefore reported instead of silently
    resetting the de-duplication state.

    Returns:
        A ``(seen_urls, seen_ids)`` tuple of sets; both empty when the file
        does not exist or has no header row.
    """
    seen_urls: Set[str] = set()
    seen_ids: Set[str] = set()
    if not os.path.exists(csv_path):
        return seen_urls, seen_ids
    try:
        with open(csv_path, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if not header:
                return seen_urls, seen_ids
            id_idx = header.index("id") if "id" in header else 0
            url_idx = header.index("url") if "url" in header else 4
            for row in reader:
                # Short rows are tolerated: each column is read only if present.
                if len(row) > url_idx:
                    url_value = row[url_idx].strip()
                    if url_value:
                        seen_urls.add(url_value)
                if len(row) > id_idx:
                    id_value = row[id_idx].strip()
                    if id_value:
                        seen_ids.add(id_value)
    except (OSError, UnicodeDecodeError, csv.Error) as exc:
        # Keep the crawl running, but make the problem visible: rows after
        # the failure point are unknown and may be crawled again.
        print(
            f"[WARN] Could not fully read {csv_path}: {exc} "
            f"(loaded {len(seen_urls)} urls, {len(seen_ids)} ids)"
        )
    return seen_urls, seen_ids


def append_row(csv_path: str, row: Dict[str, Any]):
    """Append ``row`` to ``csv_path`` as one CSV line ordered by CSV_HEADER.

    Keys missing from ``row`` are written as empty strings.
    """
    with open(csv_path, "a", encoding="utf-8", newline="") as handle:
        values = [row.get(column, "") for column in CSV_HEADER]
        csv.writer(handle).writerow(values)
        handle.flush()


def iso_to_local_date(iso_utc: str) -> Optional[str]:
    """Convert an ISO-8601 timestamp to its calendar date in VN_TZ.

    A trailing ``Z`` is accepted and a naive timestamp is assumed to be UTC.
    Returns the date as ``YYYY-MM-DD``, or None for empty or unparsable input.
    """
    if not iso_utc:
        return None
    try:
        parsed = datetime.fromisoformat(iso_utc.replace("Z", "+00:00"))
        aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
        local_day = aware.astimezone(VN_TZ).date()
        return local_day.isoformat()
    except Exception:
        return None


def parse_time_to_iso_utc(dt_str: str) -> Optional[str]:
    """
    Parse a Thanh Niên timestamp, which may be ISO or another format.

    Tried in order: ISO-8601 (a trailing ``Z`` is accepted), ``DD/MM/YYYY HH:MM``
    and ``YYYY-MM-DD HH:MM``. Values without an offset are taken as VN_TZ.
    Output: ISO UTC (YYYY-MM-DDTHH:MM:SS+00:00), or None if nothing matches.
    """
    if not dt_str:
        return None
    raw = dt_str.strip()

    def _as_utc_iso(moment):
        return moment.astimezone(timezone.utc).isoformat()

    # ISO format first
    try:
        parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=VN_TZ)
        return _as_utc_iso(parsed)
    except Exception:
        pass

    # Then the explicit local-time layouts, in the original order
    for layout in ("%d/%m/%Y %H:%M", "%Y-%m-%d %H:%M"):
        try:
            return _as_utc_iso(datetime.strptime(raw, layout).replace(tzinfo=VN_TZ))
        except Exception:
            continue

    return None


def _node_text(node) -> str:
    """Return the text of a parsed HTML node with whitespace runs collapsed.

    ``get_text(strip=True)`` strips every text fragment before joining them,
    so a space next to an inline tag is lost and words get glued together.
    Taking the raw text and collapsing whitespace keeps the original spacing.
    """
    return " ".join(node.get_text().split())


def extract_article_meta(article_html: str) -> Dict[str, Any]:
    """Extract metadata and body text from one article page.

    Args:
        article_html: Raw HTML of the article page.

    Returns:
        A dict with keys ``title``, ``published_at`` (ISO UTC string or ""),
        ``language``, ``keywords`` (list of str), ``category_from_article``,
        ``entities`` (always an empty list) and ``content_text``.
        Fields that cannot be found are left as empty strings or lists.
    """
    soup = BeautifulSoup(article_html, "lxml")

    # title: prefer og:title, fall back to the first <h1>
    title = ""
    og = soup.select_one('meta[property="og:title"]')
    if og and og.get("content"):
        title = og["content"].strip()
    if not title:
        h1 = soup.select_one("h1")
        if h1:
            title = _node_text(h1)

    # published_at: meta tag, then <time datetime>, then a visible date element
    pub = ""
    m_pub = soup.select_one('meta[property="article:published_time"]')
    if m_pub and m_pub.get("content"):
        pub = parse_time_to_iso_utc(m_pub["content"]) or ""

    if not pub:
        t_tag = soup.select_one("time[datetime]")
        if t_tag and t_tag.get("datetime"):
            pub = parse_time_to_iso_utc(t_tag["datetime"]) or ""

    if not pub:
        date_elem = soup.select_one(".time, .date, .cms-date")
        if date_elem:
            pub = parse_time_to_iso_utc(_node_text(date_elem)) or ""

    # category.primary
    category_primary = ""
    sec = soup.select_one('meta[property="article:section"]')
    if sec and sec.get("content"):
        category_primary = sec["content"].strip()

    # language (mostly "vi"): taken from the <html> tag when declared
    language = DEFAULT_LANGUAGE
    html_tag = soup.find("html")
    if html_tag:
        lang = html_tag.get("lang") or html_tag.get("xml:lang")
        if lang:
            language = lang.lower().strip()

    # keywords (if present): comma-separated meta value
    keywords = []
    kw = soup.select_one('meta[name="keywords"]')
    if kw and kw.get("content"):
        keywords = [x.strip() for x in kw["content"].split(",") if x.strip()]

    # content.text: the first matching container wins, most specific first
    article_body = None
    for selector in (
        "article .detail-cmain",
        ".detail-cmain",
        ".detail-content",
        "article .content",
        "article",
    ):
        article_body = soup.select_one(selector)
        if article_body:
            break

    content_text = ""
    if article_body:
        # One entry per non-empty <p>, joined with single spaces
        paragraph_texts = (_node_text(p) for p in article_body.find_all("p"))
        content_text = " ".join(text for text in paragraph_texts if text)

    return {
        "title": title,
        "published_at": pub,
        "language": language,
        "keywords": keywords,
        "category_from_article": category_primary,
        "entities": [],
        "content_text": content_text,
    }


def normalize_url(u: str) -> Optional[str]:
    """Turn a raw href into an absolute thanhnien.vn URL without query or fragment.

    Protocol-relative (``//host/...``) and root-relative (``/path``) links are
    made absolute. Returns None for empty input or for any URL that does not
    start with ``http(s)://thanhnien.vn/``.
    """
    if not u:
        return None
    candidate = u.strip()
    if candidate.startswith("//"):
        candidate = "https:" + candidate
    if candidate.startswith("/"):
        candidate = "https://thanhnien.vn" + candidate
    if not candidate.startswith(("https://thanhnien.vn/", "http://thanhnien.vn/")):
        return None
    try:
        parts = urlparse(candidate)
        return urlunparse(parts._replace(query="", fragment=""))
    except Exception:
        # Parsing failed: cut the query and fragment off by hand
        without_query = candidate.split("?")[0]
        return without_query.split("#")[0]


def is_category_url(u: str) -> bool:
    """Return True for a category listing URL such as ``/thoi-su.htm``.

    A category path is a single ``.htm`` segment that does not end in
    ``-<digits>.htm``. Any parsing error yields False.
    """
    try:
        slug = (urlparse(u).path or "").strip("/")
    except Exception:
        return False
    if not slug or "/" in slug or not slug.endswith(".htm"):
        return False
    return re.search(r'\-\d+\.htm$', slug) is None


def is_allowed_article_url(u: str) -> bool:
    """Return True when ``u`` looks like a Thanh Niên article URL.

    Articles use the form ``/slug-{digits}.htm`` or
    ``/category/slug-{digits}.htm``, for example
    ``/bat-chu-hui-o-ca-mau-185260202003431028.htm``.

    Rejected: paths not ending in ``.htm``, pagination files
    (``trang-{n}.htm``, which would otherwise match the ``-{digits}.htm``
    rule), paths under BLOCKED_PREFIXES, URLs containing any of
    BLOCKED_PATTERNS, and category URLs. Any error yields False.
    """
    try:
        path = urlparse(u).path or ""
        if not path.startswith("/"):
            return False
        if not path.endswith(".htm"):
            return False

        filename = path.rsplit("/", 1)[-1]
        # Listing pages like /thoi-su/trang-2.htm are pagination, not articles
        if re.fullmatch(r"trang-\d+\.htm", filename):
            return False

        if path.startswith(tuple(BLOCKED_PREFIXES)):
            return False
        if any(pattern in u for pattern in BLOCKED_PATTERNS):
            return False
        if is_category_url(u):
            return False

        # The numeric id must be the last part of the file name
        return re.search(r'\-\d+\.htm$', filename) is not None
    except Exception:
        return False


def extract_article_urls_from_category_page(html: str) -> List[str]:
    """Return the unique article URLs linked from a category page, in page order.

    Each ``<a href>`` is normalised with normalize_url() and kept only if
    is_allowed_article_url() accepts it; later duplicates are dropped.
    """
    soup = BeautifulSoup(html, "lxml")
    normalized = (normalize_url(anchor.get("href", "")) for anchor in soup.select("a[href]"))
    accepted = [link for link in normalized if link and is_allowed_article_url(link)]
    # dict keys keep first-seen order, which removes duplicates stably
    return list(dict.fromkeys(accepted))


def set_page_param(url: str, page: int) -> str:
    """Return ``url`` rewritten to point at listing page ``page``.

    Uses the ``trang-{page}.htm`` layout that this crawler assumes for
    Thanh Niên:

    * ``.../thoi-su/trang-2.htm`` -> ``.../thoi-su/trang-{page}.htm``
    * ``.../thoi-su.htm``         -> ``.../thoi-su/trang-{page}.htm``

    Only the ``.htm`` that ends the path is rewritten (end of string or just
    before ``?`` / ``#``), so ``.html`` URLs and ``.htm`` text inside a query
    string are left alone. A URL with no such suffix is returned unchanged.
    """
    path_end = r"(?=$|[?#])"
    paged = re.compile(r"trang-\d+\.htm" + path_end)
    # Already paginated: swap the page number
    if paged.search(url):
        return paged.sub(f"trang-{page}.htm", url, count=1)
    # Page 1 has no page segment yet: add one
    return re.sub(r"\.htm" + path_end, f"/trang-{page}.htm", url, count=1)


def find_next_page_url(category_url: str, html: str, current_page: int) -> Optional[str]:
    """Work out the URL of the listing page that follows ``current_page``.

    Looks for ``<link rel="next">`` first, then a "next" pagination anchor.
    If neither gives a usable thanhnien.vn URL, falls back to
    ``set_page_param(category_url, current_page + 1)``.
    """
    soup = BeautifulSoup(html, "lxml")

    next_selectors = (
        'link[rel="next"]',
        'a[rel="next"], a.next, .pagination a.next',
    )
    for selector in next_selectors:
        tag = soup.select_one(selector)
        if not (tag and tag.get("href")):
            continue
        resolved = normalize_url(tag["href"])
        if resolved:
            return resolved

    # Fallback: build the next page URL ourselves
    following_page = current_page + 1
    return set_page_param(category_url, following_page)


def make_row(url: str, meta: Dict[str, Any], category_fallback: str) -> Dict[str, Any]:
    """Build the CSV row dict for one article.

    ``id`` is the MD5 of ``url``. Missing metadata becomes an empty string,
    the language defaults to DEFAULT_LANGUAGE, and ``category_fallback`` is
    used when the article declares no category. Keyword and entity lists
    are joined with ``|``.
    """
    category = (meta.get("category_from_article") or category_fallback) or ""
    keyword_field = "|".join(meta.get("keywords") or [])
    entity_field = "|".join(meta.get("entities") or [])

    row: Dict[str, Any] = {}
    row["id"] = md5_id(url)
    row["title"] = meta.get("title") or ""
    row["published_at"] = meta.get("published_at") or ""
    row["source.name"] = SOURCE_NAME
    row["url"] = url
    row["language"] = meta.get("language") or DEFAULT_LANGUAGE
    row["category.primary"] = category
    row["keywords"] = keyword_field
    row["entities"] = entity_field
    row["content.text"] = meta.get("content_text") or ""
    return row


def category_slug_from_url(category_url: str) -> str:
    """Return the last path segment of ``category_url`` without its ``.htm`` suffix."""
    trimmed_path = urlparse(category_url).path.rstrip("/")
    last_segment = trimmed_path.rsplit("/", 1)[-1]
    return re.sub(r"\.htm$", "", last_segment)


def crawl_category(category_url: str, end_date: str, seen_urls: Set[str], seen_ids: Set[str]) -> int:
    """Crawl one category page by page and append in-range articles to CSV_PATH.

    Args:
        category_url: First listing page of the category.
        end_date: ``YYYY-MM-DD`` cut-off in Vietnam local time. Articles dated
            before it are not written.
        seen_urls: URLs already stored; updated in place as rows are written.
        seen_ids: Article ids already stored; updated in place likewise.

    Returns:
        The number of rows appended during this call.

    Paging stops when:
      * a listing page cannot be fetched or contains no article links,
      * a listing page brings no link that was not already handled in this
        call (guards against a site that repeats the same page forever),
      * the new links on a page include at least one article older than
        ``end_date`` and none that is in range, undated or already stored,
      * no distinct next page URL can be determined, or
      * MAX_PAGES_PER_CATEGORY is reached.

    Already-stored articles count as in range, so a resumed run pages past
    what an earlier run collected instead of stopping on the first page.
    """
    added = 0
    page = 1
    url_page = category_url
    category_slug = category_slug_from_url(category_url)
    # Links handled during this call. Links repeated on every listing page
    # are then processed once instead of once per page.
    visited: Set[str] = set()

    while url_page and (MAX_PAGES_PER_CATEGORY is None or page <= MAX_PAGES_PER_CATEGORY):
        try:
            html = fetch_text(url_page)
        except Exception as e:
            log(f"[WARN] Failed to fetch category page {url_page}: {e}")
            break

        article_urls = extract_article_urls_from_category_page(html)
        if not article_urls:
            break

        fresh_urls = [u for u in article_urls if u not in visited]
        if not fresh_urls:
            # Nothing new on this page: the listing is repeating itself
            break
        visited.update(fresh_urls)

        saw_older = False
        saw_current_or_undated = False

        for aurl in fresh_urls:
            aid = md5_id(aurl)
            if aurl in seen_urls or aid in seen_ids:
                # Stored rows were in range (or undated) when they were written
                saw_current_or_undated = True
                continue

            try:
                ah = fetch_text(aurl)
                meta = extract_article_meta(ah)
            except Exception as e:
                # A failed article says nothing about dates; ignore it for stopping
                log(f"[WARN] article fetch failed {aurl}: {e}")
                continue
            finally:
                polite_sleep()

            pub_iso = meta.get("published_at") or ""
            pub_local_date = iso_to_local_date(pub_iso) or ""

            if pub_local_date and pub_local_date < end_date:
                saw_older = True
                continue

            # Only write articles >= END_DATE, or with no publication date;
            # undated articles also keep the crawl from stopping early
            saw_current_or_undated = True
            row = make_row(aurl, meta, category_fallback=category_slug)
            append_row(CSV_PATH, row)
            seen_urls.add(aurl)
            seen_ids.add(aid)
            added += 1

        # Stop on END_DATE: everything new and dated on this page was too old
        if saw_older and not saw_current_or_undated:
            break

        next_url = find_next_page_url(category_url, html, current_page=page)
        if not next_url or next_url == url_page:
            break

        url_page = next_url
        page += 1
        polite_sleep()

    return added


def main():
    """Crawl every category in CATEGORY_URLS and report how many rows were appended.

    Ensures the CSV header exists, loads the already-stored URLs and ids, then
    crawls each category in turn. A failing category is reported and skipped.
    """
    ensure_csv_header(CSV_PATH)
    seen_urls, seen_ids = load_seen_from_csv(CSV_PATH)

    per_category_counts = []
    for cat in CATEGORY_URLS:
        try:
            count = crawl_category(cat, END_DATE, seen_urls, seen_ids)
            print(f"[{cat}] added {count}")
            per_category_counts.append(count)
        except Exception as e:
            print(f"[{cat}] ERROR: {e}")

    total = sum(per_category_counts)
    print(f"Done. Total appended {total} rows to {CSV_PATH}")

# Run the crawl only when executed as the main module, not on import.
if __name__ == "__main__":
    main()


[https://thanhnien.vn/thoi-su.htm] added 31


KeyboardInterrupt: 

In [2]:
# TEST: Inspect the structure of a Thanh Niên category page
import requests
from bs4 import BeautifulSoup

test_url = "https://thanhnien.vn/thoi-su.htm"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

try:
    r = requests.get(test_url, headers=headers, timeout=15)
    print(f"Status: {r.status_code}")
    print(f"URL: {r.url}")
    print(f"Content length: {len(r.text)}")
    
    soup = BeautifulSoup(r.text, "lxml")
    
    # Collect every anchor that carries an href
    all_links = soup.select("a[href]")
    print(f"\nTotal links found: {len(all_links)}")
    
    # Keep only the links whose href contains ".html"
    html_links = [a.get("href") for a in all_links if a.get("href") and ".html" in a.get("href")]
    print(f"Links with .html: {len(html_links)}")
    
    # Print the first 10 links
    print("\nFirst 10 .html links:")
    for i, link in enumerate(html_links[:10], 1):
        print(f"{i}. {link}")
    
    # Check whether any of them could be articles
    article_links = []
    for link in html_links[:20]:
        # Make root-relative links absolute
        if link.startswith("/"):
            full_link = "https://thanhnien.vn" + link
        else:
            full_link = link
        
        # Substring test only, so subdomains such as my.thanhnien.vn also pass
        if "thanhnien.vn" in full_link and ".html" in full_link:
            article_links.append(full_link)
    
    print(f"\nPotential article links: {len(article_links)}")
    for i, link in enumerate(article_links[:5], 1):
        print(f"{i}. {link}")
        
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()


Status: 200
URL: https://thanhnien.vn/thoi-su.htm
Content length: 307915

Total links found: 264
Links with .html: 5

First 10 .html links:
1. https://my.thanhnien.vn/page/login.html?redirect_url=https://thanhnien.vn/thoi-su.htm
2. /thong-tin-toa-soan.html
3. https://thanhnien.vn/rss.html
4. https://thanhnien.vn/thong-tin-toa-soan.html
5. https://thanhnien.vn/policy.html

Potential article links: 5
1. https://my.thanhnien.vn/page/login.html?redirect_url=https://thanhnien.vn/thoi-su.htm
2. https://thanhnien.vn/thong-tin-toa-soan.html
3. https://thanhnien.vn/rss.html
4. https://thanhnien.vn/thong-tin-toa-soan.html
5. https://thanhnien.vn/policy.html


In [3]:
# TEST 2: List all links and look at their URL patterns
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

test_url = "https://thanhnien.vn/thoi-su.htm"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

try:
    r = requests.get(test_url, headers=headers, timeout=15)
    soup = BeautifulSoup(r.text, "lxml")
    
    all_links = soup.select("a[href]")
    
    # Classify the links: keep internal ones, skip empty and in-page anchors
    internal_links = []
    for a in all_links:
        href = a.get("href", "").strip()
        if not href or href.startswith("#"):
            continue
        
        if href.startswith("/"):
            href = "https://thanhnien.vn" + href
        
        if "thanhnien.vn" in href:
            internal_links.append(href)
    
    print(f"Total internal links: {len(internal_links)}")
    
    # Look for the article URL pattern
    # Thanh Niên may use a different pattern
    article_patterns = {}
    for link in internal_links:
        parsed = urlparse(link)
        path = parsed.path
        
        # Skip the home page and category pages
        # (this also skips every path ending in ".htm")
        if path in ["/", "/thoi-su.htm", "/the-gioi.htm"] or path.endswith(".htm"):
            continue
        
        # Analyse the pattern: group by first path segment, keep up to 3 examples
        parts = path.strip("/").split("/")
        if len(parts) >= 2:
            pattern = f"/{parts[0]}/..."
            if pattern not in article_patterns:
                article_patterns[pattern] = []
            if len(article_patterns[pattern]) < 3:
                article_patterns[pattern].append(link)
    
    print("\nArticle URL patterns found:")
    for pattern, examples in article_patterns.items():
        print(f"\n{pattern} ({len(examples)} examples):")
        for ex in examples[:3]:
            print(f"  - {ex}")
    
    # Find links that contain a numeric ID (usually articles)
    print("\n\nLinks with numeric IDs:")
    id_links = []
    for link in internal_links[:50]:
        # Any link containing ".htm" is excluded by this filter as well
        if any(char.isdigit() for char in link) and not any(x in link for x in [".htm", "login", "page", "trang-"]):
            id_links.append(link)
    
    for i, link in enumerate(id_links[:10], 1):
        print(f"{i}. {link}")
        
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()


Total internal links: 232

Article URL patterns found:

/page/... (1 examples):
  - https://my.thanhnien.vn/page/login.html?redirect_url=https://thanhnien.vn/thoi-su.htm


Links with numeric IDs:


In [4]:
# TEST 3: Look at the raw HTML and search for JSON/API data
import requests
from bs4 import BeautifulSoup
import re
import json

test_url = "https://thanhnien.vn/thoi-su.htm"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

try:
    r = requests.get(test_url, headers=headers, timeout=15)
    html = r.text
    
    # Search the HTML for article links (the pattern may be different)
    # Thanh Niên may use /-12345.htm or /slug-12345.htm
    patterns_to_check = [
        r'https://thanhnien\.vn/[^"\'>\s]+\-\d+\.htm',  # /slug-123456.htm
        r'/[^"\'>\s]+\-\d+\.htm',  # /slug-123456.htm
        r'"url":"([^"]+)"',  # JSON data
    ]
    
    print("Searching for article URL patterns in HTML...")
    for pattern in patterns_to_check:
        matches = re.findall(pattern, html)
        if matches:
            print(f"\nPattern: {pattern}")
            print(f"Found {len(matches)} matches")
            # Show up to 5 distinct matches (set order is arbitrary)
            unique_matches = list(set(matches))[:5]
            for match in unique_matches:
                print(f"  - {match}")
    
    # Find script tags that may contain data
    soup = BeautifulSoup(html, "lxml")
    scripts = soup.find_all("script")
    
    print(f"\n\nFound {len(scripts)} script tags")
    
    # Look for scripts that hold JSON data
    for i, script in enumerate(scripts):
        content = script.string or ""
        if "article" in content.lower() or "post" in content.lower():
            # Only preview scripts between 100 and 5000 characters long
            if len(content) > 100 and len(content) < 5000:
                print(f"\n--- Script {i+1} (first 500 chars) ---")
                print(content[:500])
                
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()


Searching for article URL patterns in HTML...

Pattern: https://thanhnien\.vn/[^"\'>\s]+\-\d+\.htm
Found 29 matches
  - https://thanhnien.vn/ca-sau-khung-xuat-hien-o-kenh-xang-nang-mau-la-thong-tin-sai-su-that-185251212113345486.htm
  - https://thanhnien.vn/tang-giay-khen-nam-sinh-ho-tro-phi-cong-trong-vu-may-bay-roi-o-dak-lak-185260129140610757.htm
  - https://thanhnien.vn/bat-chu-hui-o-ca-mau-lua-dao-chiem-doat-gan-900-trieu-dong-185260202003431028.htm
  - https://thanhnien.vn/tphcm-nhieu-vu-ngo-doc-thuc-pham-khong-xac-dinh-duoc-can-nguyen-185260202152935604.htm
  - https://thanhnien.vn/xuc-dong-gap-lai-ong-noi-trong-khu-luu-giu-thi-hai-o-tphcm-1852601302318233.htm

Pattern: /[^"\'>\s]+\-\d+\.htm
Found 258 matches
  - /nguoi-dan-khong-nen-hoang-mang-185260131200624933.htm
  - /xuc-dong-gap-lai-ong-noi-trong-khu-luu-giu-thi-hai-o-tphcm-1852601302318233.htm
  - /khanh-hoa-khanh-thanh-tru-so-hanh-chinh-tap-trung-hon-544-ti-dong-185260128123250169.htm
  - /xa-cu-chi-phoi-hop-cac-don-vi-g

In [5]:
# TEST 4: Re-test with the corrected code
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse
import re

def normalize_url(u: str):
    """Return ``u`` as an absolute thanhnien.vn URL without query or fragment.

    Protocol-relative and root-relative links are made absolute. Returns
    None for empty input or for URLs outside ``http(s)://thanhnien.vn/``.
    """
    if not u:
        return None
    link = u.strip()
    if link.startswith("//"):
        link = "https:" + link
    elif link.startswith("/"):
        link = "https://thanhnien.vn" + link
    on_site = link.startswith("https://thanhnien.vn/") or link.startswith("http://thanhnien.vn/")
    if not on_site:
        return None
    try:
        cleaned = urlparse(link)._replace(query="", fragment="")
        return urlunparse(cleaned)
    except Exception:
        head = link.split("?")[0]
        return head.split("#")[0]

def is_category_url(u: str) -> bool:
    """Return True when ``u`` is a one-segment ``.htm`` path without a numeric id suffix."""
    try:
        segment = (urlparse(u).path or "").strip("/")
        if not segment:
            return False
        if not segment.endswith(".htm"):
            return False
        if segment.count("/") != 0:
            return False
        return not re.search(r'\-\d+\.htm$', segment)
    except Exception:
        return False

def is_allowed_article_url(u: str) -> bool:
    """Return True when ``u`` has the ``/slug-{digits}.htm`` article form.

    Rejects blocked sections (video, tag, photo, multimedia), paths that do
    not end in ``.htm`` and category URLs. Any error yields False.
    """
    try:
        path = urlparse(u).path or ""
        if not path.startswith("/"):
            return False

        blocked = ("/video/", "/tag/", "/photo/", "/multimedia/")
        if any(path.startswith(prefix) for prefix in blocked):
            return False

        if not path.endswith(".htm") or is_category_url(u):
            return False

        filename = path.split("/")[-1]
        return bool(re.search(r'\-\d+\.htm$', filename))
    except Exception:
        return False

# Test: fetch one category page and list the article URLs the filters accept
test_url = "https://thanhnien.vn/thoi-su.htm"
headers = {"User-Agent": "Mozilla/5.0"}

r = requests.get(test_url, headers=headers, timeout=15)
soup = BeautifulSoup(r.text, "lxml")

# Normalise every href and keep the ones that pass the article filter
urls = []
for a in soup.select("a[href]"):
    nu = normalize_url(a.get("href", ""))
    if not nu:
        continue
    if not is_allowed_article_url(nu):
        continue
    urls.append(nu)

# De-duplicate while keeping the order of first appearance
seen = set()
out = []
for u in urls:
    if u not in seen:
        seen.add(u)
        out.append(u)

print(f"Found {len(out)} article URLs")
print("\nFirst 10 articles:")
for i, url in enumerate(out[:10], 1):
    print(f"{i}. {url}")


Found 69 article URLs

First 10 articles:
1. https://thanhnien.vn/tin-nhanh-360.htm
2. https://thanhnien.vn/tham-my-chui-bua-vay-den-tan-nha-khach-de-hanh-nghe-185260201201830897.htm
3. https://thanhnien.vn/tuyen-an-100-bi-cao-vu-cap-220000-phieu-thu-nghiem-gia-185260202110100969.htm
4. https://thanhnien.vn/tphcm-thanh-lap-37-ban-quan-ly-du-an-truc-thuoc-phuong-xa-1852602020857329.htm
5. https://thanhnien.vn/gia-lai-nguoi-tong-xe-vao-csgt-bi-khoi-to-toi-giet-nguoi-185260202102947406.htm
6. https://thanhnien.vn/pho-di-bo-ben-bo-bien-dep-nhat-quang-ngai-185260202080808302.htm
7. https://thanhnien.vn/dai-tuong-luong-tam-quang-ra-soat-danh-sach-cu-tri-bao-dam-quyen-bau-cu-185260202164814066.htm
8. https://thanhnien.vn/tphcm-ra-soat-cong-tac-chuan-bi-bau-cu-185260127171552695.htm
9. https://thanhnien.vn/vi-sao-phai-dua-gau-ngua-misa-nang-120-kg-ve-vuon-quoc-gia-bach-ma-185260202153331996.htm
10. https://thanhnien.vn/vuon-quoc-gia-chu-mom-ray-gau-ngua-ga-tien-mat-do-quy-hiem-sap-bay-anh-1852

In [6]:
# TEST 5: Try crawling one real article
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta

# Fixed UTC+7 offset, used as the zone for timestamps that carry no offset
VN_TZ = timezone(timedelta(hours=7))

def parse_time_to_iso_utc(dt_str: str):
    """Parse an ISO-8601 timestamp and return it as an ISO string in UTC.

    A trailing ``Z`` is accepted and a value without an offset is taken as
    VN_TZ. Returns None for empty or unparsable input.
    """
    if not dt_str:
        return None
    cleaned = dt_str.strip()

    # Only the ISO format is attempted here
    try:
        moment = datetime.fromisoformat(cleaned.replace("Z", "+00:00"))
        if moment.tzinfo is None:
            moment = moment.replace(tzinfo=VN_TZ)
        as_utc = moment.astimezone(timezone.utc)
        return as_utc.isoformat()
    except Exception:
        return None

# Test against one real article
test_article = "https://thanhnien.vn/tham-my-chui-bua-vay-den-tan-nha-khach-de-hanh-nghe-185260201201830897.htm"
headers = {"User-Agent": "Mozilla/5.0"}

r = requests.get(test_article, headers=headers, timeout=15)
soup = BeautifulSoup(r.text, "lxml")

# Get the title from og:title
title = ""
og = soup.select_one('meta[property="og:title"]')
if og and og.get("content"):
    title = og["content"].strip()

# Get published_at from the article:published_time meta tag
pub = ""
m_pub = soup.select_one('meta[property="article:published_time"]')
if m_pub and m_pub.get("content"):
    pub = parse_time_to_iso_utc(m_pub["content"]) or ""

# Get the content: try the body containers from most to least specific
content_text = ""
article_body = soup.select_one("article .detail-cmain")
if not article_body:
    article_body = soup.select_one(".detail-cmain")
if not article_body:
    article_body = soup.select_one(".detail-content")

if article_body:
    paragraphs = article_body.find_all("p")
    text_parts = []
    for p in paragraphs:
        # NOTE(review): strip=True strips each text fragment before joining, so
        # words next to inline tags come out glued together in the result.
        text = p.get_text(strip=True)
        if text:
            text_parts.append(text)
    content_text = " ".join(text_parts)

print(f"Title: {title}")
print(f"Published: {pub}")
print(f"Content length: {len(content_text)} chars")
print(f"\nFirst 500 chars of content:\n{content_text[:500]}")


Title: Thẩm mỹ 'chui' bủa vây: Đến tận nhà khách để hành nghề
Published: 2026-02-02T00:22:00+00:00
Content length: 7031 chars

First 500 chars of content:
Không đăng ký kinh doanh, không có giấy phép hoạt động khám chữa bệnh, không có giấy phép hành nghề chuyên môn, nhưng một số người vẫn ngang nhiên hành nghềthẩm mỹ. Hầu hết đều quảng cáo bản thân là bác sĩ, mời chào làm đẹp trên các trang mạng xã hội hay qua hội nhóm, thực hiện các thủ thuật từtiêm fillerđến nâng mũi, cắt mí, độn cằm. Hộp thuốc mà cô gái tên L. dự định tiêm cho PV Ảnh: Thanh Niên Chỉ cần một tin nhắn hay cuộc điện thoại hẹn, họ sẽ đến nhà khách hàng cùng chiếc vali hoặc túi xách
