# In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time

# Index listing of Detik news; the scrape loop requests it with a `page` query parameter.
BASE_IDX = "https://news.detik.com/indeks"
# Sent with every HTTP request made by this script.
HEADERS  = {"User-Agent": "Mozilla/5.0"}
# CSV file the scraped rows are written to.
# NOTE(review): the name says "2-300" but the scrape loop covers pages 2..400 — confirm which is intended.
OUTPUT   = "detik_full_2-300.csv"

# Matches relative Indonesian timestamps, case-insensitively:
# "<number> detik|menit|jam|hari yang lalu" (seconds / minutes / hours / days ago),
# e.g. "5 menit yang lalu". Absolute dates are not matched.
TS_RE = re.compile(r"\d+\s+(detik|menit|jam|hari)\s+yang\s+lalu", re.I)

def clean(text: str) -> str:
    """Trim surrounding whitespace and turn each CR or LF into a single space."""
    trimmed = text.strip()
    # One pass over the string instead of two chained replace() calls.
    return trimmed.translate({ord("\r"): " ", ord("\n"): " "})

def extract_article(link: str) -> dict:
    """Download one Detik article page and pull out its key fields.

    Returns a dict with keys ``title``, ``url``, ``timestamp`` and ``content``.
    Elements that are not found yield empty strings. HTTP errors raised by
    ``raise_for_status`` propagate to the caller.
    """
    response = requests.get(link, headers=HEADERS, timeout=10)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # Headline: first element matching either supported heading selector.
    heading = page.select_one("h1.detail__title, h1.entry-title")
    if heading:
        title = clean(heading.get_text())
    else:
        title = ""

    # Timestamp: only a relative "... yang lalu" phrase inside the date element is kept.
    date_node = page.select_one("div.detail__date, span.date")
    found = TS_RE.search(date_node.get_text()) if date_node else None
    timestamp = found.group(0) if found else ""

    # Article body: prefer the specific body-text container, fall back to the generic one.
    container = (
        page.select_one("div.detail__body-text.itp_bodycontent")
        or page.select_one("div.detail__body")
    )
    kept = []
    for para in (container.find_all("p") if container else ()):
        text = clean(para.get_text())
        lowered = text.lower()
        # Drop empty paragraphs, "baca juga" cross-links, and anything mentioning "klik".
        if not text or lowered.startswith("baca juga") or "klik" in lowered:
            continue
        kept.append(text)

    return {
        "title": title,
        "url": link,
        "timestamp": timestamp,
        "content": "\n".join(kept),
    }

# Column order shared by the header row and every per-page append.
CSV_FIELDS = ["page", "title", "url", "timestamp", "content"]

# Prepare CSV: start a fresh file containing only the header row.
with open(OUTPUT, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
    writer.writeheader()

# NOTE(review): OUTPUT's file name says "2-300" but this range covers index pages 2..400 — confirm.
for page in range(2, 401):
    print(f"→ Scraping index page {page}")
    # A single unreachable index page must not abort the whole run:
    # report it and carry on with the next page.
    try:
        idx_r = requests.get(BASE_IDX, params={"page": page}, headers=HEADERS, timeout=10)
        idx_r.raise_for_status()
    except requests.RequestException as e:
        print(f"   [WARN] Could not load index page {page}: {e}")
        continue
    idx_soup = BeautifulSoup(idx_r.text, "html.parser")

    # Each index entry: headline links that point at news.detik.com articles.
    links = [a["href"] for a in idx_soup.select("h3 > a[href^='https://news.detik.com/']")]

    page_rows = []
    for i, link in enumerate(links, 1):
        # Best effort per article: any failure is logged and the article skipped.
        try:
            rec = extract_article(link)
            rec["page"] = page
            page_rows.append(rec)
            print(f"   • [{i}/{len(links)}] {rec['title']!r}")
        except Exception as e:
            print(f"   [WARN] Failed {link}: {e}")
        # Small pause between article requests.
        time.sleep(0.2)

    # Append after every page so finished pages are on disk if a later one fails.
    with open(OUTPUT, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
        writer.writerows(page_rows)

print(f"Finished! All data written to {OUTPUT}")


# In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import sys
import time

# Site root; index pages are fetched from "<BASE_URL>/page/<n>/".
BASE_URL = "https://turnbackhoax.id"
# Sent with every HTTP request made by this script.
HEADERS  = {"User-Agent": "Mozilla/5.0"}
# Output path handed to scrape_legacy_range by the __main__ entry point.
CSV_FILE = "hoax_final.csv"

def clean(text: str) -> str:
    """Strip surrounding whitespace, then drop every carriage return."""
    stripped = text.strip()
    # Splitting on "\r" and re-joining removes each occurrence; newlines are kept.
    return "".join(stripped.split("\r"))

def parse_article_legacy(soup: BeautifulSoup) -> dict:
    """
    Extract the fields of one article laid out in the legacy template.

      - Title: text of the first <h1.entry-title> element ("" if absent).
      - Narasi / Penjelasan: the text following a line-leading 'Narasi:' /
        'Penjelasan:' label (any case, optional whitespace before the colon),
        up to the next such label. When a label occurs more than once, the
        last occurrence wins.

    Returns a dict with keys "Title", "Narasi" and "Penjelasan".
    """
    heading = soup.select_one("h1.entry-title")
    if heading:
        title = clean(heading.get_text())
    else:
        title = ""

    # Article text, one line per text node; the older container class is the fallback.
    container = soup.select_one("div.post-content") or soup.select_one("div.entry-content")
    raw_text = container.get_text("\n") if container else ""

    # The single capture group makes re.split return
    # [preamble, label, body, label, body, ...].
    pieces = re.split(
        r'(?mi)^(NARASI\s*:|Narasi\s*:|PENJELASAN\s*:|Penjelasan\s*:)\s*',
        raw_text, flags=re.MULTILINE
    )

    sections = {"narasi": "", "penjelasan": ""}
    for label, body in zip(pieces[1::2], pieces[2::2]):
        lowered = label.lower()
        if "narasi" in lowered:
            sections["narasi"] = clean(body.strip())
        elif "penjelasan" in lowered:
            sections["penjelasan"] = clean(body.strip())

    return {
        "Title": title,
        "Narasi": sections["narasi"],
        "Penjelasan": sections["penjelasan"],
    }

def scrape_legacy_range(start_page: int, end_page: int, csv_file: str):
    """Scrape index pages ``start_page``..``end_page`` (inclusive) into ``csv_file``.

    The file is first truncated to just the header row; after each index page
    the articles parsed from it are appended. An index page that fails to load
    is reported on stderr and skipped, as is any individual article that fails.
    """
    columns = ["Title", "Narasi", "Penjelasan"]

    # Truncate the output and emit the header a single time.
    with open(csv_file, "w", newline="", encoding="utf-8") as out:
        csv.DictWriter(out, fieldnames=columns).writeheader()

    for page in range(start_page, end_page + 1):
        try:
            index_resp = requests.get(f"{BASE_URL}/page/{page}/", headers=HEADERS, timeout=10)
            index_resp.raise_for_status()
        except Exception as err:
            print(f"[ERROR] Could not load index page {page}: {err}", file=sys.stderr)
            continue

        index_soup = BeautifulSoup(index_resp.text, "html.parser")
        anchors = index_soup.select("h3.entry-title a[href^='http']")
        article_urls = [anchor["href"] for anchor in anchors]
        total = len(article_urls)
        print(f"→ Page {page}: {total} articles found")

        collected = []
        for position, url in enumerate(article_urls, start=1):
            try:
                article_resp = requests.get(url, headers=HEADERS, timeout=10)
                article_resp.raise_for_status()
                record = parse_article_legacy(BeautifulSoup(article_resp.text, "html.parser"))
                # Keep the record as long as at least one field is non-empty.
                if any(record[key] for key in ("Narasi", "Penjelasan", "Title")):
                    collected.append(record)
                print(f"   • [{position}/{total}] {record['Title']!r}")
            except Exception as err:
                print(f"   [WARN] Skipped {url}: {err}", file=sys.stderr)
            # Pause after every article, whether it succeeded or not.
            time.sleep(0.5)

        # Append this page's rows so finished pages are already on disk.
        with open(csv_file, "a", newline="", encoding="utf-8") as out:
            csv.DictWriter(out, fieldnames=columns).writerows(collected)

        print(f"Page {page} done — {len(collected)} articles written\n")

# Script entry point: scrape index pages 100 through 800 into CSV_FILE.
if __name__ == "__main__":
    scrape_legacy_range(start_page=100, end_page=800, csv_file=CSV_FILE)
    print("All done — legacy template scraped to", CSV_FILE)
