In [4]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import json


In [5]:

def get_article_links(page_url, timeout=10):
    """Collect the article URLs listed on one category page.

    Args:
        page_url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        A list with the ``href`` of the first ``<a>`` inside every
        ``<h3 class="title-news">`` element, in document order. Headings
        without a link, or with an empty ``href``, are skipped.

    Raises:
        Whatever ``requests.get`` raises on connection failure or timeout.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    html = requests.get(page_url, headers=headers, timeout=timeout).text
    soup = BeautifulSoup(html, "html.parser")

    links = []
    for h3 in soup.find_all("h3", class_="title-news"):
        a = h3.find("a")
        href = a.get("href") if a else None
        if href:
            links.append(href)
    return links


In [6]:
def crawl_article(url):
    """Download one article page and extract its summary and body text.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A dict with the keys ``"url"``, ``"summary"`` and ``"article"``. A
        page element that is absent yields an empty string for its key.
        Returns ``None`` when the request fails or times out, when the server
        answers with an HTTP error status, or when parsing raises.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=5)
        # Do not parse 4xx/5xx error pages as if they were articles.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Summary
        desc = soup.find("p", class_="description")
        summary = desc.get_text(strip=True) if desc else ""

        # Body (new page layout)
        content = []
        article_tag = soup.find("article", class_="fck_detail")
        if article_tag:
            texts = (p.get_text(strip=True) for p in article_tag.find_all("p"))
            content = [text for text in texts if text]

        return {
            "url": url,
            "summary": summary,
            "article": " ".join(content)
        }
    except Exception:
        # Best-effort crawl: one broken page must not abort the whole batch.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the crawl can still be stopped.
        return None

In [7]:
def crawl_many(links, max_workers=10, min_length=200):
    """Crawl several article URLs concurrently and keep the substantial ones.

    Args:
        links: Iterable of article URLs, each passed to ``crawl_article``.
        max_workers: Size of the thread pool used for the downloads.
        min_length: An article is kept only if its body text is strictly
            longer than this many characters.

    Returns:
        A list of the dicts returned by ``crawl_article``, in the same order
        as ``links``. Failed crawls (``None``) and articles that are too
        short are dropped.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Every download is submitted up front, so they still run in parallel.
        futures = [executor.submit(crawl_article, link) for link in links]
        # Reading results in submission order (not completion order) makes
        # the output order reproducible from run to run.
        fetched = [future.result() for future in futures]

    return [item for item in fetched if item and len(item["article"]) > min_length]

In [8]:
# Walk the listing pages in order, crawling each page's articles as we go.
all_data = []

for page in range(1, 501):
    page_url = f"https://vnexpress.net/thoi-su-p{page}"
    links = get_article_links(page_url)
    print(f"Trang {page}: {len(links)} links")
    if not links:
        # A listing page without articles means we ran past the last page.
        print("Hết trang, dừng crawl.")
        break
    data = crawl_many(links, max_workers=20)
    all_data += data
print("Tổng số bài lấy được:", len(all_data))

Trang 1: 47 links
Trang 2: 29 links
Trang 3: 29 links
Trang 4: 29 links
Trang 5: 29 links
Trang 6: 29 links
Trang 7: 29 links
Trang 8: 29 links
Trang 9: 29 links
Trang 10: 29 links
Trang 11: 29 links
Trang 12: 29 links
Trang 13: 29 links
Trang 14: 29 links
Trang 15: 29 links
Trang 16: 29 links
Trang 17: 29 links
Trang 18: 29 links
Trang 19: 29 links
Trang 20: 29 links
Trang 21: 0 links
Hết trang, dừng crawl.
Tổng số bài lấy được: 577


In [None]:
# Persist the crawled articles as pretty-printed UTF-8 JSON
# (ensure_ascii=False keeps the Vietnamese text readable in the file).
with open("vnexpress_dataset.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(all_data, ensure_ascii=False, indent=2))