In [None]:
%pip install requests beautifulsoup4 lxml python-dateutil feedparser

In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import csv
import os
import re
import time
import random
import hashlib
from datetime import datetime, timezone, timedelta
from typing import Optional, Dict, Any, List, Set, Tuple

import requests
import feedparser
from bs4 import BeautifulSoup
from dateutil import parser as dateparser
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Fixed UTC+7 offset. Naive timestamps are interpreted in this zone and article
# dates are converted to it before being compared against END_DATE.
VN_TZ = timezone(timedelta(hours=7))

# ================== CONFIG ==================
# (feed URL, category slug) pairs. The slug is used as the category of a row
# when the article page itself does not declare a section.
# NOTE(review): two feeds map to "the-thao"; the asiancup2019 feed looks stale
# (a single old entry in the recorded run) - confirm it is still wanted.
RSS_FEEDS = [
    ("https://cdn.24h.com.vn/upload/rss/trangchu24h.rss", "trang-chu"),
    ("https://cdn.24h.com.vn/upload/rss/tintuctrongngay.rss", "tin-tuc-trong-ngay"),
    ("https://cdn.24h.com.vn/upload/rss/bongda.rss", "bong-da"),
    ("https://cdn.24h.com.vn/upload/rss/asiancup2019.rss", "the-thao"),
    ("https://cdn.24h.com.vn/upload/rss/thoitrang.rss", "thoi-trang"),
    ("https://cdn.24h.com.vn/upload/rss/thoitranghitech.rss", "hi-tech"),
    ("https://cdn.24h.com.vn/upload/rss/taichinhbatdongsan.rss", "tai-chinh-bat-dong-san"),
    ("https://cdn.24h.com.vn/upload/rss/phim.rss", "phim"),
    ("https://cdn.24h.com.vn/upload/rss/giaoducduhoc.rss", "giao-duc-du-hoc"),
    ("https://cdn.24h.com.vn/upload/rss/bantrecuocsong.rss", "ban-tre-cuoc-song"),
    ("https://cdn.24h.com.vn/upload/rss/thethao.rss", "the-thao"),
]

# Entries whose publication date (in Vietnam local time) is earlier than
# END_DATE are skipped by the crawler.
# Note: the 24h RSS feeds only expose roughly the 5 most recent days of data.
END_DATE = "2026-01-30"  # YYYY-MM-DD (adjust to fit the RSS limitation)

# Output file; also read back at start-up to find already-crawled articles.
CSV_PATH = "24h_html_categories_vi.csv"

# Timeout passed to every session.get() call.
TIMEOUT = 25
# Minimum pause between article fetches; a random jitter is added on top.
REQUEST_DELAY_BASE = 0.25
# Default headers installed on the shared HTTP session.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; 24hHTMLCrawler/1.0)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
# ===========================================

# Column order of the output CSV; rows are written in exactly this order.
CSV_HEADER = [
    "id",
    "title",
    "published_at",
    "source.name",
    "url",
    "language",
    "category.primary",
    "keywords",
    "entities",
    "content.text",
]

# Constant value of the "source.name" column.
SOURCE_NAME = "24h"
# Language used when a page does not declare one.
DEFAULT_LANGUAGE = "vi"
# When True, log() prints its diagnostic messages; otherwise they are dropped.
DEBUG = False

# ----- HTTP session with retry -----
# One shared session so every request reuses the default headers and the
# retry-enabled connection pool configured below.
session = requests.Session()
session.headers.update(HEADERS)

# Retry policy for GET/HEAD requests: connection errors, read errors and the
# listed HTTP status codes are retried with a backoff between attempts.
# raise_on_status=False - per the urllib3 documentation this returns the last
# response once retries are exhausted instead of raising, which leaves the
# status check to the callers' raise_for_status().
retry = Retry(
    total=6,
    connect=6,
    read=6,
    backoff_factor=0.6,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET", "HEAD"],
    respect_retry_after_header=True,
    raise_on_status=False,
)
adapter = HTTPAdapter(max_retries=retry, pool_connections=50, pool_maxsize=50)
# Install the same adapter for both URL schemes.
session.mount("http://", adapter)
session.mount("https://", adapter)


def log(msg: str):
    """Print *msg* when the module-level DEBUG flag is set; otherwise do nothing."""
    if not DEBUG:
        return
    print(msg)


def polite_sleep():
    """Sleep for REQUEST_DELAY_BASE seconds plus a random jitter of 0 to 0.4 s."""
    jitter = random.uniform(0, 0.4)
    time.sleep(REQUEST_DELAY_BASE + jitter)


def md5_id(text: str) -> str:
    """Return the hexadecimal MD5 digest of *text* encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()


def fetch_text(url: str) -> str:
    """GET *url* through the shared session and return the decoded body.

    An HTTP error status is turned into an exception by ``raise_for_status``.
    """
    response = session.get(url, timeout=TIMEOUT)
    response.raise_for_status()
    # Handle 24h's encoding quirks: decode with the detected charset and fall
    # back to UTF-8 when detection yields nothing.
    detected = response.apparent_encoding
    response.encoding = detected if detected else 'utf-8'
    return response.text


def fetch_rss(rss_url: str) -> feedparser.FeedParserDict:
    """Download the RSS feed at *rss_url* and return it parsed by feedparser.

    An HTTP error status is turned into an exception by ``raise_for_status``.
    """
    response = session.get(rss_url, timeout=TIMEOUT)
    response.raise_for_status()
    # Pass the raw bytes so feedparser does its own encoding handling.
    return feedparser.parse(response.content)


def to_iso_utc(s: Optional[str]) -> Optional[str]:
    """Parse the date string *s* and return it as an ISO-8601 timestamp in UTC.

    A timestamp without timezone information is interpreted in VN_TZ (or UTC
    when VN_TZ is falsy). Returns None for empty input or when parsing or
    conversion fails.
    """
    if not s:
        return None
    try:
        parsed = dateparser.parse(s)
        if not parsed:
            return None
        if parsed.tzinfo is None:
            assumed_zone = VN_TZ if VN_TZ else timezone.utc
            parsed = parsed.replace(tzinfo=assumed_zone)
        as_utc = parsed.astimezone(timezone.utc)
        return as_utc.isoformat()
    except Exception:
        return None


def iso_to_local_date(iso_utc: str) -> Optional[str]:
    """Return the calendar date (YYYY-MM-DD) of *iso_utc* in VN_TZ.

    A timestamp without timezone information is treated as UTC. When VN_TZ is
    falsy the date is taken without converting. Returns None for empty input
    or when parsing or conversion fails.
    """
    if not iso_utc:
        return None
    try:
        moment = dateparser.parse(iso_utc)
        if not moment:
            return None
        if moment.tzinfo is None:
            moment = moment.replace(tzinfo=timezone.utc)
        local_moment = moment.astimezone(VN_TZ) if VN_TZ else moment
        return local_moment.date().isoformat()
    except Exception:
        return None


def ensure_csv_header(csv_path: str):
    """Write the CSV_HEADER row to *csv_path* if the file is missing or empty.

    An existing non-empty file is left untouched.
    """
    if os.path.exists(csv_path) and os.path.getsize(csv_path) != 0:
        return
    with open(csv_path, "w", encoding="utf-8", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(CSV_HEADER)


def load_seen_from_csv(csv_path: str) -> Tuple[Set[str], Set[str]]:
    """Collect the URLs and ids already stored in the CSV at *csv_path*.

    The "id" and "url" columns are located through the header row; when a name
    is absent from the header the positional defaults 0 (id) and 4 (url) are
    used. Values are stripped and empty values are ignored.

    Reading is best-effort: if the file cannot be opened, decoded or parsed,
    whatever was collected so far is returned. Unlike a silent failure, the
    problem is reported on stdout, because an incomplete result means articles
    already in the CSV will be crawled and written again.

    Returns:
        A ``(seen_urls, seen_ids)`` tuple of sets; both are empty when the
        file does not exist or has no header row.
    """
    seen_urls: Set[str] = set()
    seen_ids: Set[str] = set()
    if not os.path.exists(csv_path):
        return seen_urls, seen_ids
    try:
        with open(csv_path, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if not header:
                return seen_urls, seen_ids
            id_idx = header.index("id") if "id" in header else 0
            url_idx = header.index("url") if "url" in header else 4
            for row in reader:
                # Short rows simply lack the column; skip that column only.
                if len(row) > url_idx:
                    url = row[url_idx].strip()
                    if url:
                        seen_urls.add(url)
                if len(row) > id_idx:
                    article_id = row[id_idx].strip()
                    if article_id:
                        seen_ids.add(article_id)
    except (OSError, UnicodeDecodeError, csv.Error) as exc:
        # Keep the partial result, but make the degraded dedup state visible.
        print(
            f"[WARN] could not fully read {csv_path}: {exc} "
            f"(loaded {len(seen_urls)} URLs; duplicates may be re-crawled)"
        )
    return seen_urls, seen_ids


def append_row(csv_path: str, row: Dict[str, Any]):
    """Append *row* to *csv_path* as one CSV line and flush it.

    Values are emitted in CSV_HEADER order; a key missing from *row* is
    written as an empty string.
    """
    with open(csv_path, "a", encoding="utf-8", newline="") as out:
        ordered_values = []
        for column in CSV_HEADER:
            ordered_values.append(row.get(column, ""))
        csv.writer(out).writerow(ordered_values)
        out.flush()


def extract_language_from_html(soup: BeautifulSoup) -> str:
    """Return the language declared on the ``<html>`` element of *soup*.

    Reads ``lang`` (then ``xml:lang``), lower-cases and strips it, and
    collapses values starting with "vi" or "en" to those two-letter codes.
    Any other declared value is returned as normalised; DEFAULT_LANGUAGE is
    returned when nothing is declared.
    """
    root = soup.find("html")
    if not root:
        return DEFAULT_LANGUAGE
    declared = root.get("lang") or root.get("xml:lang")
    if not declared:
        return DEFAULT_LANGUAGE
    declared = declared.lower().strip()
    for code in ("vi", "en"):
        if declared.startswith(code):
            return code
    return declared


def extract_keywords_from_html(soup: BeautifulSoup) -> List[str]:
    """Return the page keywords from the first keyword meta tag with content.

    ``meta[name="keywords"]`` is tried before ``meta[name="news_keywords"]``.
    The content is split on commas, each piece is stripped, empty pieces are
    dropped and duplicates are removed while keeping first-seen order.
    Returns an empty list when neither tag carries content.
    """
    for selector in ('meta[name="keywords"]', 'meta[name="news_keywords"]'):
        meta_tag = soup.select_one(selector)
        if not (meta_tag and meta_tag.get("content")):
            continue
        pieces = (part.strip() for part in meta_tag["content"].split(","))
        # dict keys keep insertion order, giving an ordered de-duplication.
        return list(dict.fromkeys(piece for piece in pieces if piece))
    return []


def extract_article_meta(article_html: str) -> Dict[str, Any]:
    """Parse an article page and return the fields needed for a CSV row.

    Returns a dict with the keys ``title``, ``published_at`` (ISO UTC string or
    ""), ``language``, ``keywords`` (list), ``category_from_article``,
    ``entities`` (always an empty list) and ``content_text``.
    """
    soup = BeautifulSoup(article_html, "lxml")

    def meta_content(selector: str) -> str:
        """Return the stripped ``content`` of the first match, or ""."""
        tag = soup.select_one(selector)
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    # Title: Open Graph title first, then the first matching <h1>.
    title = meta_content('meta[property="og:title"]')
    if not title:
        heading = soup.select_one("h1.title-detail, h1.cate-24h-title-detail, h1")
        if heading:
            title = heading.get_text(strip=True)

    # Publication time: two meta tags, then <time>, then a 24h-specific class.
    published = ""
    for selector in (
        'meta[property="article:published_time"]',
        'meta[itemprop="datePublished"]',
    ):
        published = to_iso_utc(meta_content(selector)) or ""
        if published:
            break
    if not published:
        time_tag = soup.select_one("time")
        if time_tag:
            raw_time = time_tag.get("datetime") or time_tag.get_text(strip=True)
            published = to_iso_utc(raw_time) or ""
    if not published:
        # 24h may use the class .cate-24h-date-published
        date_tag = soup.select_one(".cate-24h-date-published")
        if date_tag:
            published = to_iso_utc(date_tag.get_text(strip=True)) or ""

    # Primary category as declared by the page itself.
    section = meta_content('meta[property="article:section"]')

    language = extract_language_from_html(soup)
    keywords = extract_keywords_from_html(soup)

    # Body container: 24h usually uses .cate-24h-content-text; the remaining
    # selectors are progressively more generic fallbacks.
    body = None
    for selector in (
        ".cate-24h-content-text",
        "article .content-text",
        ".content-text",
        ".article-content",
        "article",
    ):
        body = soup.select_one(selector)
        if body:
            break

    # Body text: non-empty paragraph texts joined by single spaces.
    content_text = ""
    if body:
        paragraph_texts = (p.get_text(strip=True) for p in body.find_all("p"))
        content_text = " ".join(text for text in paragraph_texts if text)

    return {
        "title": title,
        "published_at": published,
        "language": language,
        "keywords": keywords,
        "category_from_article": section,
        "entities": [],
        "content_text": content_text,
    }


def make_row(url: str, meta: Dict[str, Any], category_fallback: str) -> Dict[str, Any]:
    """Build the CSV row dict for the article at *url* from its *meta*.

    The id is the MD5 of the URL. The category declared by the article wins
    over *category_fallback*. Keyword and entity lists are joined with "|".
    Missing or falsy text fields become empty strings (DEFAULT_LANGUAGE for
    the language).
    """

    def text_field(key: str, default: str = "") -> Any:
        """Return meta[key], or *default* when it is missing or falsy."""
        return meta.get(key) or default

    def pipe_joined(key: str) -> str:
        """Join the list stored under *key* with "|" (empty when absent)."""
        return "|".join(meta.get(key) or [])

    category = meta.get("category_from_article") or category_fallback

    return {
        "id": md5_id(url),
        "title": text_field("title"),
        "published_at": text_field("published_at"),
        "source.name": SOURCE_NAME,
        "url": url,
        "language": text_field("language", DEFAULT_LANGUAGE),
        "category.primary": category or "",
        "keywords": pipe_joined("keywords"),
        "entities": pipe_joined("entities"),
        "content.text": text_field("content_text"),
    }


def crawl_rss_feed(rss_url: str, category_slug: str, end_date: str, 
                   seen_urls: Set[str], seen_ids: Set[str]) -> Tuple[int, int, int]:
    """
    Crawl the articles listed in one RSS feed and append new ones to CSV_PATH.

    Args:
        rss_url: URL of the RSS feed to read.
        category_slug: Category used when the article page declares none.
        end_date: "YYYY-MM-DD"; entries whose local date is earlier are skipped.
        seen_urls: URLs already stored; updated in place for every added row.
        seen_ids: Ids already stored; updated in place for every added row.

    Returns: (added, skipped_duplicate, skipped_old). (0, 0, 0) is returned
    when the feed cannot be fetched or has no entries.
    """
    added = 0
    skipped_old = 0
    skipped_duplicate = 0

    try:
        feed = fetch_rss(rss_url)
    except Exception as e:
        log(f"[WARN] RSS fetch failed {rss_url}: {e}")
        return (0, 0, 0)

    if not feed.entries:
        log(f"[WARN] No entries in RSS feed {rss_url}")
        return (0, 0, 0)

    for entry in feed.entries:
        article_url = entry.get("link", "").strip()
        if not article_url:
            continue

        # Normalize URL: anything not starting with "http" is treated as a
        # site-relative path.
        if not article_url.startswith("http"):
            article_url = "https://www.24h.com.vn" + article_url

        # Take the published date from the RSS entry so it can be checked
        # before any page is downloaded.
        pub_date_rss = entry.get("published") or entry.get("updated")
        pub_iso_rss = to_iso_utc(pub_date_rss) if pub_date_rss else ""
        pub_local_date = iso_to_local_date(pub_iso_rss) or ""

        # Skip articles older than end_date before the duplicate check.
        # ISO dates compare correctly as plain strings.
        # NOTE(review): the original comment said an early exit is possible
        # because the feed is time-ordered, but the loop only skips the entry
        # and keeps scanning. Entries without a parseable date are not skipped.
        if pub_local_date and pub_local_date < end_date:
            skipped_old += 1
            continue

        # Duplicate check - IMPORTANT: skip if the article was already crawled.
        # On daily runs most entries are skipped here.
        if article_url in seen_urls:
            skipped_duplicate += 1
            continue

        aid = md5_id(article_url)
        if aid in seen_ids:
            skipped_duplicate += 1
            continue

        # Fetch the full article content (only for new articles)
        try:
            article_html = fetch_text(article_url)
            meta = extract_article_meta(article_html)
        except Exception as e:
            log(f"[WARN] article fetch failed {article_url}: {e}")
            # Fallback: use RSS data. The row is still written and the URL is
            # marked as seen, so it is not retried on a later run.
            meta = {
                "title": entry.get("title", ""),
                "published_at": pub_iso_rss,
                "language": DEFAULT_LANGUAGE,
                "keywords": [],
                "category_from_article": "",
                "entities": [],
                "content_text": BeautifulSoup(entry.get("summary", ""), "lxml").get_text(strip=True),
            }
        finally:
            # Pause after every fetch attempt, successful or not.
            polite_sleep()

        # Use RSS published date if article doesn't have one
        if not meta.get("published_at") and pub_iso_rss:
            meta["published_at"] = pub_iso_rss

        row = make_row(article_url, meta, category_fallback=category_slug)
        # Writes to the module-level CSV_PATH.
        append_row(CSV_PATH, row)
        seen_urls.add(article_url)
        seen_ids.add(aid)
        added += 1

    # Always show summary for transparency
    print(f"  [{category_slug}] RSS entries: {len(feed.entries)} | Added: {added} | Duplicates: {skipped_duplicate} | Old: {skipped_old}")

    return (added, skipped_duplicate, skipped_old)


def main():
    """Run one crawl over every feed in RSS_FEEDS and print a summary.

    Ensures the CSV at CSV_PATH has a header, loads the URLs/ids already
    stored in it, crawls each feed (a failure in one feed is reported and does
    not stop the others) and prints per-run totals.
    """
    print("="*80)
    print("24H.COM.VN CRAWLER - Duplicate-Safe Daily Crawling")
    print("="*80)

    ensure_csv_header(CSV_PATH)
    seen_urls, seen_ids = load_seen_from_csv(CSV_PATH)

    print("\nüìä Initial state:")
    print(f"  - Already crawled: {len(seen_urls)} URLs, {len(seen_ids)} IDs")
    print(f"  - Date filter: Articles >= {END_DATE}")
    print(f"  - Total feeds: {len(RSS_FEEDS)}")
    print()

    total_added = 0
    total_duplicates = 0
    total_old = 0

    for rss_url, category_slug in RSS_FEEDS:
        try:
            added, duplicates, old = crawl_rss_feed(rss_url, category_slug, END_DATE, seen_urls, seen_ids)
            total_added += added
            total_duplicates += duplicates
            total_old += old
        except Exception as e:
            # Top-level boundary: report the feed and continue with the rest.
            print(f"  [{category_slug}] ERROR: {e}")

    total_processed = total_added + total_duplicates + total_old

    print()
    print("="*80)
    print("‚úÖ CRAWL SUMMARY")
    print("="*80)
    print(f"üìù New articles added: {total_added}")
    print(f"üîÅ Duplicates skipped: {total_duplicates} (already in CSV)")
    print(f"‚è∞ Old articles skipped: {total_old} (before {END_DATE})")
    print(f"üìä Total processed: {total_processed}")
    print(f"üíæ Output: {CSV_PATH}")
    # crawl_rss_feed() adds every new URL to seen_urls, so the set already
    # includes this run's additions; adding total_added again would count the
    # new articles twice.
    print(f"üìà Total in CSV now: {len(seen_urls)} articles")
    print("="*80)

    if total_duplicates > 0:
        # total_processed >= total_duplicates > 0 here, so the division is safe.
        efficiency = total_duplicates / total_processed * 100
        print(f"\nüí° Duplicate rate: {efficiency:.1f}% - Perfect for daily runs!")
        print("   (High rate = most articles already crawled = efficient)")
    print()


if __name__ == "__main__":
    main()

24H.COM.VN CRAWLER - Duplicate-Safe Daily Crawling

üìä Initial state:
  - Already crawled: 358 URLs, 358 IDs
  - Date filter: Articles >= 2026-01-30
  - Total feeds: 11

  [trang-chu] RSS entries: 108 | Added: 0 | Duplicates: 108 | Old: 0
  [tin-tuc-trong-ngay] RSS entries: 24 | Added: 0 | Duplicates: 24 | Old: 0
  [bong-da] RSS entries: 24 | Added: 0 | Duplicates: 24 | Old: 0
  [the-thao] RSS entries: 1 | Added: 0 | Duplicates: 0 | Old: 1
  [thoi-trang] RSS entries: 24 | Added: 0 | Duplicates: 24 | Old: 0
  [hi-tech] RSS entries: 24 | Added: 0 | Duplicates: 24 | Old: 0
  [tai-chinh-bat-dong-san] RSS entries: 24 | Added: 0 | Duplicates: 0 | Old: 24
  [phim] RSS entries: 24 | Added: 0 | Duplicates: 9 | Old: 15
  [giao-duc-du-hoc] RSS entries: 24 | Added: 0 | Duplicates: 24 | Old: 0
  [ban-tre-cuoc-song] RSS entries: 24 | Added: 0 | Duplicates: 24 | Old: 0
  [the-thao] RSS entries: 24 | Added: 0 | Duplicates: 24 | Old: 0

‚úÖ CRAWL SUMMARY
üìù New articles added: 0
üîÅ Duplicates ski

## Giải thích về xử lý Duplicate khi chạy hàng ngày

**Vấn đề**: Khi chạy crawler hàng ngày, RSS feed sẽ có nhiều articles đã crawl ngày hôm trước

**Ví dụ**:
- Ngày 1 (4/2): RSS có 109 articles từ 30/1 → 4/2, crawl được 228 articles
- Ngày 2 (5/2): RSS có 109 articles từ 31/1 → 5/2, trong đó ~90 articles đã có
- → Cần skip ~90 articles duplicate, chỉ crawl ~19 articles mới

**Cách xử lý trong code**:

1. **Load seen data**: `load_seen_from_csv()` đọc tất cả URLs đã crawl từ CSV
2. **Check duplicate TRƯỚC fetch**:
   - Check `if article_url in seen_urls` → skip ngay
   - Không fetch HTML của articles đã có → tiết kiệm bandwidth
3. **Chỉ crawl articles mới**: Fetch và parse HTML chỉ với articles chưa có

**Kết quả**:
- Lần chạy đầu: Crawl 200+ articles (~3 phút)
- Lần chạy sau: Chỉ crawl 10-20 articles mới (~30 giây)
- An toàn: Không bao giờ duplicate data trong CSV