In [9]:
# Cell 1 - Install dependencies
!pip install --quiet requests beautifulsoup4 dateparser tqdm pytz pandas

In [10]:
# Cell 2 - Konfigurasi & imports
import re
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import dateparser
from datetime import datetime, timedelta
import pytz
import pandas as pd
import json

# ---------------------------
# CONFIG
# ---------------------------
CHANNELS = [
    "health",
    "sport"
    "hot",
]

BASE_INDEX_PATH = "/indeks"
BASE_DOMAIN_TEMPLATE = "https://{channel}.detik.com"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; scraping-bot/1.0; +https://example.com/bot)"
}

DELAY_SEC = 0.6                 # dipercepat tapi masih aman
MAX_PAGES_PER_CHANNEL = 500
TIMEZONE = "Asia/Jakarta"

START_DATE_STR = "2025-1-1"
END_DATE_STR   = "2025-12-20"
# ---------------------------

tz = pytz.timezone(TIMEZONE)
start_date = tz.localize(datetime.strptime(START_DATE_STR, "%Y-%m-%d"))
end_date   = tz.localize(datetime.strptime(END_DATE_STR, "%Y-%m-%d") + timedelta(days=1))

print(f"Rentang tanggal: {start_date.isoformat()} → {end_date.isoformat()}")

Rentang tanggal: 2025-12-18T00:00:00+07:00 → 2025-12-21T00:00:00+07:00


In [11]:
# Cell 3 - Helper functions
def absolutize(href, base_domain):
    if not href:
        return None
    if href.startswith("http"):
        return href
    return urljoin(base_domain, href)


def fetch(url, headers=HEADERS, max_retries=3, timeout=15):
    for attempt in range(1, max_retries + 1):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
            return r.text
        except Exception as e:
            print(f"[fetch] gagal {url} (attempt {attempt}): {e}")
            time.sleep(0.4 + attempt * 0.4)
    return None


def extract_article_links(index_html, base_domain):
    soup = BeautifulSoup(index_html, "html.parser")
    anchors = soup.find_all("a", href=True)

    links = set()
    for a in anchors:
        href = a["href"].strip()

        # ❌ SKIP video / 20.detik.com
        if "20.detik.com" in href:
            continue

        if re.search(r"/d-\d+", href):
            abs_url = absolutize(href, base_domain)
            if abs_url and "20.detik.com" not in abs_url:
                links.add(abs_url)

    return sorted(links)


def parse_article(article_html, url=None):
    soup = BeautifulSoup(article_html, "html.parser")

    title_tag = soup.find("h1", class_="detail__title")
    title = title_tag.get_text(strip=True) if title_tag else None

    date_tag = soup.find("div", class_="detail__date")
    date_raw = date_tag.get_text(strip=True) if date_tag else None

    lead = None
    for cont in [
        soup.find("div", class_="detail__body"),
        soup.find("div", class_="detail__body-text"),
        soup.find("article"),
    ]:
        if cont:
            p = cont.find("p")
            if p and p.get_text(strip=True):
                lead = p.get_text(" ", strip=True)
                break

    parsed_dt = None
    if date_raw:
        parsed_dt = dateparser.parse(
            date_raw,
            languages=["id"],
            settings={
                "TIMEZONE": TIMEZONE,
                "RETURN_AS_TIMEZONE_AWARE": True
            }
        )

    return {
        "url": url,
        "title": title,
        "date_raw": date_raw,
        "date_parsed": parsed_dt,
        "lead": lead
    }


def find_next_index_page(index_html, base_domain):
    soup = BeautifulSoup(index_html, "html.parser")
    a = soup.find("a", class_="pagination__item", string=lambda t: t and "Next" in t)
    if a and a.get("href"):
        return absolutize(a["href"], base_domain)
    return None

In [12]:
# Cell 4 - Main loop: loop tiap channel dan kumpulkan artikel 5 hari terakhir
results = []
summary = {}

for channel in CHANNELS:
    print(f"\n=== CHANNEL: {channel} ===")

    base_domain = BASE_DOMAIN_TEMPLATE.format(channel=channel)
    current_url = base_domain + BASE_INDEX_PATH

    seen_urls = set()
    collected_count = 0
    page_no = 0

    pbar = tqdm(desc=f"pages {channel}", unit="page", leave=False)

    while current_url and page_no < MAX_PAGES_PER_CHANNEL:
        page_no += 1
        print(f"[{channel}] page {page_no} -> {current_url}")

        idx_html = fetch(current_url)
        if not idx_html:
            print(f"[{channel}] gagal ambil indeks -> stop")
            break

        article_links = extract_article_links(idx_html, base_domain)
        print(f"  ditemukan {len(article_links)} artikel")

        any_within_range = False

        for link in article_links:
            if link in seen_urls:
                continue
            seen_urls.add(link)

            time.sleep(DELAY_SEC)

            art_html = fetch(link)
            if not art_html:
                continue

            info = parse_article(art_html, url=link)
            parsed_dt = info.get("date_parsed")

            if not parsed_dt:
                continue

            if parsed_dt.tzinfo is None:
                parsed_dt = tz.localize(parsed_dt)

            if start_date <= parsed_dt < end_date:
                any_within_range = True
                collected_count += 1

                results.append({
                    "channel": channel,
                    "url": link,
                    "title": info["title"],
                    "date_raw": info["date_raw"],
                    "date_iso": parsed_dt.isoformat(),
                    "lead": info["lead"]
                })

                print(f"   + [{parsed_dt.isoformat()}] {info['title']}")

        pbar.update(1)

        if not any_within_range:
            print(f"[{channel}] tidak ada artikel dalam rentang → STOP")
            break

        next_page = find_next_index_page(idx_html, base_domain)
        if not next_page or next_page == current_url:
            print(f"[{channel}] tidak ada halaman berikutnya → STOP")
            break

        current_url = next_page

    summary[channel] = collected_count
    pbar.close()



=== CHANNEL: health ===


pages health: 0page [00:00, ?page/s]

[health] page 1 -> https://health.detik.com/indeks
  ditemukan 20 artikel
   + [2025-12-19T17:01:00+07:00] Jangan Ngaku Jago Matematika kalau Tak Bisa Jawab Soal Ini Hitungan Detik!
   + [2025-12-19T17:30:00+07:00] Tren Imunisasi di RI Ngedrop, Banyak Ortu Takut Anaknya Kena Efek Samping
   + [2025-12-19T18:00:00+07:00] Kronologi Wanita Diet Cuma Makan Ayam-Brokoli Rebus 6 Bulan, Berujung Masalah Pankreas
   + [2025-12-19T19:01:00+07:00] Punya Hipertensi? Sebaiknya Hindari Menu Sarapan Ini Agar Tekanan Darah Aman
   + [2025-12-19T20:01:00+07:00] Kondisi yang Tak Disadari Rentan Picu Serangan Stroke di Usia Muda
   + [2025-12-20T05:02:00+07:00] 7 Tanda Kolesterol Tinggi yang Bisa Muncul di Kaki, Termasuk Perubahan Warna
   + [2025-12-20T06:01:00+07:00] Benarkah Kusta Mudah Menular dan Bagaimana Mencegahnya? Simak Panduan Ini
   + [2025-12-20T07:01:00+07:00] Cara Mudah Mencegah Lonjakan Gula Darah usai Santap Nasi
   + [2025-12-20T09:30:00+07:00] Gejala SePeLe Mata Kering Jangan Disepele

pages health: 1page [00:43, 43.50s/page]

   + [2025-12-20T16:03:00+07:00] Langka! Ibu Ini Lahirkan Bayi dari Kehamilan di Luar Rahim dengan Kista Hampir 10 Kg
[health] page 2 -> https://health.detik.com/indeks?page=2
  ditemukan 20 artikel
   + [2025-12-19T13:01:00+07:00] Coba Hitung Ada Berapa Hewan yang Sembunyi! Kalau Salah Bisa Jadi Harus Cek Mata
   + [2025-12-18T20:00:00+07:00] Terungkap Lewat Studi, Kebiasaan Malam Hari Ini Bisa Bantu Turunkan Tekanan Darah
   + [2025-12-18T19:24:00+07:00] Indonesia-Tiongkok Perkuat Kerja Sama Kesehatan, BPOM Bidik Rp 10 Triliun
   + [2025-12-19T06:00:00+07:00] Dokter Harvard Beberkan Alasan Main Ponsel di Toilet Bisa Picu Ambeien
   + [2025-12-18T19:32:00+07:00] Kepala BPOM Curhat Sulitnya Berantas Produk Pangan Ilegal di RI, Ini Sebabnya
   + [2025-12-19T05:00:00+07:00] 4 Kebiasaan yang Bisa Merusak Otak, Sebaiknya Dihindari!
   + [2025-12-19T07:30:00+07:00] Dokter Ungkap Tanda Tubuh Kelebihan Gula yang Perlu Diperhatikan
   + [2025-12-19T07:02:00+07:00] Terungkap Sumber Penularan Ku

pages health: 2page [01:26, 43.49s/page]

   + [2025-12-19T09:03:00+07:00] Penampakan Pangan Ilegal yang Diamankan BPOM, Ada Kopi yang Bisa Rusak Ginjal
[health] page 3 -> https://health.detik.com/indeks?page=3
  ditemukan 19 artikel
   + [2025-12-18T17:03:00+07:00] Tes Gambar Ini Nggak Sesimpel yang Kamu Kira, Hanya yang Jeli Bisa Menjawab
   + [2025-12-18T07:32:00+07:00] Kemenkes Kirim 600 Tenaga Medis Relawan ke Wilayah Bencana Aceh
   + [2025-12-18T08:07:00+07:00] Kronologi WNI di Rumania Terkena Kusta, Jadi Kasus Pertama usai 44 Tahun
   + [2025-12-18T08:39:00+07:00] Skandal Praktik Medis Ilegal Korsel, Sederet Selebriti Terseret Termasuk Key SHINee
   + [2025-12-18T09:03:00+07:00] Ramai gegara Muncul Lagi di Rumania, Kusta di RI Ada 10.450 Kasus Sepanjang 2025
   + [2025-12-18T10:05:00+07:00] Wanita Ini Diet Cuma Makan Ayam-Brokoli Rebus dalam 6 Bulan, Endingnya Begini
   + [2025-12-18T11:07:00+07:00] Daftar 10 Provinsi Terbanyak Catat Kasus Kusta di RI, Jabar Nomor 1
   + [2025-12-18T11:16:00+07:00] BPOM Tegaskan Keaman

pages health: 3page [02:08, 42.49s/page]

   + [2025-12-18T18:32:00+07:00] BPOM RI Ungkap Produk Pangan Ilegal Jelang Nataru, Total Nilainya Sampai Rp 42 M
[health] page 4 -> https://health.detik.com/indeks?page=4
  ditemukan 19 artikel
   + [2025-12-18T05:05:00+07:00] Kabar Baik Buat Pecinta Kopi, Sering Minum Kafein Tunda Penuaan Tubuh
   + [2025-12-18T07:03:00+07:00] 5 Kebiasaan Simpel yang Bisa 'Restart' Otak dari Nol Versi Neurolog
   + [2025-12-18T06:02:00+07:00] Cerita Psikolog soal Pasien 'Capek Jadi WNI', Begini Rasanya Stres Mikir Negara
   + [2025-12-18T06:38:00+07:00] Benarkah Tubuh Manusia Perlu Berhari-hari untuk Mencerna Mi Instan? Ini Faktanya


pages health: 4page [02:49, 42.04s/page]

[health] page 5 -> https://health.detik.com/indeks?page=5
  ditemukan 19 artikel




[health] tidak ada artikel dalam rentang → STOP

=== CHANNEL: sport ===


pages sport: 0page [00:00, ?page/s]

[sport] page 1 -> https://sport.detik.com/indeks
  ditemukan 18 artikel
   + [2025-12-19T22:34:00+07:00] Vicky Tahumil Junior Sumbang Emas Tinju Indonesia di SEA Games 2025
   + [2025-12-20T14:40:00+07:00] Foto: Anthony Joshua Pukul KO Jake Paul
   + [2025-12-19T18:43:00+07:00] BWF World Tour Finals 2025: Jonatan Christie Kalah dari Popov
   + [2025-12-19T19:32:00+07:00] BWF World Tour Finals 2025: Fajar/Fikri Dikalahkan Liang/Wang
   + [2025-12-19T21:25:00+07:00] BWF World Tour Finals 2025: Sabar/Reza Tembus Semifinal
   + [2025-12-20T10:40:00+07:00] Jadwal BWF World Tour Finals 2025: Sabar/Reza Jumpa Unggulan Pertama
   + [2025-12-19T20:25:00+07:00] SEA Games 2025: Voli Putra Indonesia Gagal Raih Medali Emas
   + [2025-12-20T01:23:00+07:00] Kejurnas Panahan Antarklub 2025 Perkuat Fondasi Pembinaan
   + [2025-12-19T23:50:00+07:00] Senangnya Silafanus Ndiken Bawa Pulang Perak SEA Games 2025
   + [2025-12-19T22:18:00+07:00] Klasemen Medali SEA Games 2025: Hoki Es dan Futsal Tambah Emas 

pages sport: 1page [00:46, 46.28s/page]

   + [2025-12-20T15:52:00+07:00] Raih Peringkat Kedua SEA Games, Indonesia Ulangi Capaian 30 Tahun Lalu
[sport] page 2 -> https://sport.detik.com/indeks?page=2
  ditemukan 20 artikel
   + [2025-12-19T16:01:00+07:00] SEA Games 2025: Indonesia Tambah Dua Perunggu dari Basket
   + [2025-12-19T14:19:00+07:00] Bungkam Tuan Rumah, Aldila/Janice Raih Emas Ganda Putri Tenis SEA Games 2025
   + [2025-12-19T09:00:00+07:00] Jadwal BWF World Tour Finals 2025 Hari Ini: Laga Terakhir Babak Grup
   + [2025-12-19T09:27:00+07:00] Hasil BWF World Tour Finals 2025: Putri Menang di Laga Terakhir
   + [2025-12-19T11:17:00+07:00] Hasil BWF World Tour Finals 2025: Jafar/Felisha Atasi Ganda Malaysia
   + [2025-12-19T12:10:00+07:00] Ganda Putri Tenis Raih Emas SEA Games, RI Kini Koleksi 84 Keping
   + [2025-12-19T14:31:00+07:00] Putri KW Puas Main di World Tour Finals 2025
   + [2025-12-18T23:30:00+07:00] Kala Perenang Liquor Harrison Pecahkan Rekornas di SEA Games 2025
   + [2025-12-19T00:30:00+07:00] Akyko d

pages sport: 2page [01:29, 44.64s/page]

   + [2025-12-19T18:24:00+07:00] SEA Games 2025: Tinju dan Hoki Indoor Kompak Sumbang Emas
[sport] page 3 -> https://sport.detik.com/indeks?page=3
  ditemukan 19 artikel
   + [2025-12-18T09:03:00+07:00] Desa Olimpiade Cortina Siap Sambut Atlet Jelang Olimpiade Musim Dingin 2026
   + [2025-12-18T09:50:00+07:00] Hasil World Tour Finals: Putri KW Dikalahkan Yamaguchi
   + [2025-12-18T10:52:00+07:00] Hasil BWF World Tour Finals 2025: Jafar/Felisha Dikalahkan Juara Dunia
   + [2025-12-18T12:58:00+07:00] Hasil BWF World Tour Finals 2025: Sabar/Reza Takluk dari Kim/Seo
   + [2025-12-18T14:03:00+07:00] Tekad Putri KW & Jafar/Feli di Partai Ketiga World Tour Finals 2025
   + [2025-12-18T21:20:00+07:00] Hasil BWF World Tour Finals 2025: Jonatan Dikalahkan Antonsen
   + [2025-12-18T21:33:00+07:00] Hasil BWF World Tour Finals 2025: Fajar/Fikri Kalah dari Wakil India
   + [2025-12-18T22:30:00+07:00] Alwi dan Sabar Raih Emas SEA Games, Buah Kontribusi Vital PB Exist
   + [2025-12-18T07:15:00+07:00] 

pages sport: 3page [02:11, 43.39s/page]

   + [2025-12-18T07:57:00+07:00] Lari Bareng Kesayangan, Valentine Kian Berkesan di Love Run Yogyakarta 2026
[sport] page 4 -> https://sport.detik.com/indeks?page=4
  ditemukan 18 artikel
   + [2025-12-18T03:30:00+07:00] Pencak Silat Capai Target SEA Games, Menpora Dorong Lanjut ke Asian Games
   + [2025-12-18T06:15:00+07:00] Robi Syianturi: Bonus Emas SEA Games untuk Istri dan Calon Buah Hati


pages sport: 4page [02:53, 42.94s/page]

   + [2025-12-18T00:55:00+07:00] Atlet Soft Tennis RI Sabet Gelar di Thailand
[sport] page 5 -> https://sport.detik.com/indeks?page=5
  ditemukan 19 artikel


                                       

[sport] tidak ada artikel dalam rentang → STOP




In [13]:
# Cell 5 - Simpan hasil & preview
df = pd.DataFrame(results)

if not df.empty:
    df["date_iso_parsed"] = pd.to_datetime(df["date_iso"])
    df.sort_values(by="date_iso_parsed", ascending=False, inplace=True)
    df.drop(columns=["date_iso_parsed"], inplace=True)

csv_path = "/content/detik_articles_range.csv"
json_path = "/content/detik_articles_range.json"

df.to_csv(csv_path, index=False)
df.to_json(json_path, orient="records", force_ascii=False)

print("\n--- SUMMARY ---")
for ch, cnt in summary.items():
    print(f"{ch}: {cnt} artikel")

print(f"\nCSV  -> {csv_path}")
print(f"JSON -> {json_path}")
display(df.head(10))




--- SUMMARY ---
health: 63 artikel
sport: 60 artikel

CSV  -> /content/detik_articles_range.csv
JSON -> /content/detik_articles_range.json


Unnamed: 0,channel,url,title,date_raw,date_iso,lead
19,health,https://health.detik.com/true-story/d-8269491/...,Langka! Ibu Ini Lahirkan Bayi dari Kehamilan d...,"Sabtu, 20 Des 2025 16:03 WIB",2025-12-20T16:03:00+07:00,"Seorang wanita hamil di California, Amerika Se..."
80,sport,https://sport.detik.com/sport-lain/d-8269478/r...,"Raih Peringkat Kedua SEA Games, Indonesia Ulan...","Sabtu, 20 Des 2025 15:52 WIB",2025-12-20T15:52:00+07:00,Indonesia memastikan peringkat kedua di SEA Ga...
16,health,https://health.detik.com/berita-detikhealth/d-...,"Kondisi Puskesmas Pascabencana di Aceh, Masih ...","Sabtu, 20 Des 2025 15:03 WIB",2025-12-20T15:03:00+07:00,Menteri Kesehatan Budi Gunadi Sadikin menyebut...
64,sport,https://sport.detik.com/fotosport/d-8269369/fo...,Foto: Anthony Joshua Pukul KO Jake Paul,"Sabtu, 20 Des 2025 14:40 WIB",2025-12-20T14:40:00+07:00,Jakarta - Anthony Joshua menang KO atas Jake P...
15,health,https://health.detik.com/berita-detikhealth/d-...,Cerita Wanita Surabaya Idap Diabetes di Usia 2...,"Sabtu, 20 Des 2025 14:03 WIB",2025-12-20T14:03:00+07:00,"Bagi Lilla Syifa (29), perempuan asal Surabaya..."
79,sport,https://sport.detik.com/sport-lain/d-8269251/s...,SEA Games 2025: Sepak Takraw Tambah 2 Medali P...,"Sabtu, 20 Des 2025 12:58 WIB",2025-12-20T12:58:00+07:00,Sepak takraw menambah perolehan medali bagi In...
14,health,https://health.detik.com/berita-detikhealth/d-...,"Punya Kebiasaan Ini Tiap Pagi? Hati-hati, Kemu...","Sabtu, 20 Des 2025 12:57 WIB",2025-12-20T12:57:00+07:00,Serangan jantung masih menjadi penyebab kemati...
78,sport,https://sport.detik.com/sport-lain/d-8269234/c...,Canda Presiden Prabowo soal 91 Emas SEA Games:...,"Sabtu, 20 Des 2025 12:45 WIB",2025-12-20T12:45:00+07:00,Indonesia sudah raih 91 medali emas di SEA Gam...
77,sport,https://sport.detik.com/sport-lain/d-8269175/h...,Hasil Tinju: Anthony Joshua Menang KO Atas Jak...,"Sabtu, 20 Des 2025 12:00 WIB",2025-12-20T12:00:00+07:00,Laga Jake Paul vs Anthony Joshua digelar di AS...
13,health,https://health.detik.com/berita-detikhealth/d-...,"Akses Kesehatan Terputus, Pasien Stroke-Hipert...","Sabtu, 20 Des 2025 11:33 WIB",2025-12-20T11:33:00+07:00,"Pasien dengan penyakit kronis seperti stroke, ..."
