In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
from tqdm import tqdm
from datetime import datetime

# Konfigurasi awal
kata_kunci = ['politik', 'hukum', 'nasional']
headers = {"User-Agent": "Mozilla/5.0"}
berita_data = []
maks_halaman = 100

# Rentang waktu (format YYYY-MM-DD) - updated to current month
start_date = datetime(2024, 7, 1)
end_date = datetime(2025, 7, 31)

def ambil_kalimat_pertama(text):
    """Extract first sentence from text"""
    kalimat = re.split(r'[.!?]', text)
    return kalimat[0].strip() if kalimat else text.strip()

def parse_indonesian_date(date_str):
    """Parse Indonesian date format like 'Rabu, 30 Jul 2025 13:45 WIB'"""
    try:
        # Remove day name and time info
        date_str = re.sub(r'^[A-Za-z]+,\s*', '', date_str)  # Remove "Rabu, "
        date_str = re.sub(r'\s+\d{2}:\d{2}.*$', '', date_str)  # Remove time part
        
        # Month mapping
        months = {
            'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
            'Mei': '05', 'Jun': '06', 'Jul': '07', 'Agu': '08',
            'Sep': '09', 'Okt': '10', 'Nov': '11', 'Des': '12'
        }
        
        # Parse format like "30 Jul 2025"
        parts = date_str.strip().split()
        if len(parts) >= 3:
            day = parts[0].zfill(2)
            month = months.get(parts[1], '01')
            year = parts[2]
            
            return datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d")
    except Exception:
        return None
    return None

def ekstrak_tanggal(bs):
    """Extract and parse date from various formats"""
    try:
        # Try meta tag first
        meta = bs.find("meta", {"property": "article:published_time"})
        if meta and meta.get("content"):
            date_str = meta["content"][:10]  # Get YYYY-MM-DD part
            return datetime.strptime(date_str, "%Y-%m-%d")
        
        # Try time tag
        time_tag = bs.find("time")
        if time_tag:
            date_text = time_tag.text.strip()
            return parse_indonesian_date(date_text)
        
        # Try detail date
        detail_date = bs.find("div", {"class": "detail__date"})
        if detail_date:
            date_text = detail_date.text.strip().split(" - ")[-1]
            return parse_indonesian_date(date_text)
            
    except Exception:
        return None
    return None

# Main scraping process
url_terambil = set()

for keyword in kata_kunci:
    print(f"\nScraping keyword: {keyword}")
    for halaman in range(1, maks_halaman + 1):
        url = f"https://www.detik.com/search/searchall?query={keyword}&siteid=2&sortby=time&page={halaman}"
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        links = soup.select('article a')
        urls = list(set([a['href'] for a in links if 'https://news.detik.com' in a['href']]))
        urls = [u for u in urls if u not in url_terambil]

        if not urls:
            print(f"Halaman {halaman} kosong untuk keyword '{keyword}', stop.")
            continue

        for link in tqdm(urls, desc=f'Scraping {keyword}, halaman {halaman}'):
            try:
                r = requests.get(link, headers=headers)
                bs = BeautifulSoup(r.text, 'html.parser')

                title = bs.find('h1').text.strip()
                content_tags = bs.select('div.detail__body-text p')
                if not content_tags:
                    continue
                full_text = ' '.join([p.text.strip() for p in content_tags])
                kalimat = ambil_kalimat_pertama(full_text)

                if not kalimat or '[Gambas:' in kalimat or len(kalimat.split()) < 5:
                    continue

                # Extract date using improved function
                tanggal_dt = ekstrak_tanggal(bs)
                if not tanggal_dt:
                    continue

                if not (start_date <= tanggal_dt <= end_date):
                    continue

                berita_data.append({
                    'tanggal': tanggal_dt.strftime('%Y-%m-%d'),
                    'sumber (url)': link,
                    'judul berita': title,
                    'konten': kalimat
                })

                url_terambil.add(link)
                time.sleep(1)  # Reduced delay
            except Exception:
                continue
        time.sleep(0.5)  # Reduced delay between pages

# Save results to CSV
df = pd.DataFrame(berita_data)
df.drop_duplicates(subset='sumber (url)', inplace=True)
df = df.sample(frac=1, random_state=42)
df.to_csv('berita_mentah.csv', index=False, encoding='utf-8-sig')

print(f"\nSelesai! Disimpan sebagai 'berita_mentah.csv' dengan {len(df)} entri.")
if len(df) > 0:
    print(f"\nSample data:")
    print(df.head())


Scraping keyword: politik


Scraping politik, halaman 1: 100%|██████████| 8/8 [00:10<00:00,  1.30s/it]
Scraping politik, halaman 2: 100%|██████████| 7/7 [00:08<00:00,  1.26s/it]
Scraping politik, halaman 3: 100%|██████████| 7/7 [00:09<00:00,  1.34s/it]
Scraping politik, halaman 4: 100%|██████████| 8/8 [00:10<00:00,  1.27s/it]
Scraping politik, halaman 5: 100%|██████████| 4/4 [00:04<00:00,  1.24s/it]
Scraping politik, halaman 6: 100%|██████████| 7/7 [00:09<00:00,  1.38s/it]
Scraping politik, halaman 7: 100%|██████████| 8/8 [00:12<00:00,  1.57s/it]
Scraping politik, halaman 8: 100%|██████████| 7/7 [00:09<00:00,  1.37s/it]
Scraping politik, halaman 9: 100%|██████████| 5/5 [00:09<00:00,  1.88s/it]
Scraping politik, halaman 10: 100%|██████████| 6/6 [00:08<00:00,  1.42s/it]
Scraping politik, halaman 11: 100%|██████████| 4/4 [00:05<00:00,  1.26s/it]
Scraping politik, halaman 12: 100%|██████████| 8/8 [00:10<00:00,  1.30s/it]
Scraping politik, halaman 13: 100%|██████████| 5/5 [00:06<00:00,  1.23s/it]
Scraping politik, hal


Scraping keyword: hukum


Scraping hukum, halaman 1: 100%|██████████| 5/5 [00:06<00:00,  1.35s/it]
Scraping hukum, halaman 2: 100%|██████████| 7/7 [00:11<00:00,  1.65s/it]
Scraping hukum, halaman 3: 100%|██████████| 6/6 [00:08<00:00,  1.37s/it]
Scraping hukum, halaman 4: 100%|██████████| 3/3 [00:05<00:00,  1.79s/it]
Scraping hukum, halaman 5: 100%|██████████| 6/6 [00:10<00:00,  1.77s/it]
Scraping hukum, halaman 6: 100%|██████████| 5/5 [00:06<00:00,  1.33s/it]
Scraping hukum, halaman 7: 100%|██████████| 3/3 [00:04<00:00,  1.44s/it]
Scraping hukum, halaman 8: 100%|██████████| 5/5 [00:06<00:00,  1.30s/it]
Scraping hukum, halaman 9: 100%|██████████| 8/8 [00:09<00:00,  1.20s/it]
Scraping hukum, halaman 10: 100%|██████████| 4/4 [00:06<00:00,  1.54s/it]
Scraping hukum, halaman 11: 100%|██████████| 7/7 [00:08<00:00,  1.29s/it]
Scraping hukum, halaman 12: 100%|██████████| 5/5 [00:06<00:00,  1.30s/it]
Scraping hukum, halaman 13: 100%|██████████| 5/5 [00:06<00:00,  1.37s/it]
Scraping hukum, halaman 14: 100%|██████████| 5/


Scraping keyword: nasional


Scraping nasional, halaman 1: 100%|██████████| 4/4 [00:04<00:00,  1.24s/it]
Scraping nasional, halaman 2: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]
Scraping nasional, halaman 3: 100%|██████████| 2/2 [00:02<00:00,  1.47s/it]


Halaman 4 kosong untuk keyword 'nasional', stop.


Scraping nasional, halaman 5: 100%|██████████| 3/3 [00:04<00:00,  1.42s/it]
Scraping nasional, halaman 6: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


Halaman 7 kosong untuk keyword 'nasional', stop.


Scraping nasional, halaman 8: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
Scraping nasional, halaman 9: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]
Scraping nasional, halaman 10: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]
Scraping nasional, halaman 11: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]
Scraping nasional, halaman 12: 100%|██████████| 3/3 [00:03<00:00,  1.23s/it]
Scraping nasional, halaman 13: 100%|██████████| 4/4 [00:05<00:00,  1.38s/it]
Scraping nasional, halaman 14: 100%|██████████| 4/4 [00:05<00:00,  1.44s/it]
Scraping nasional, halaman 15: 100%|██████████| 2/2 [00:02<00:00,  1.46s/it]
Scraping nasional, halaman 16: 100%|██████████| 3/3 [00:04<00:00,  1.35s/it]
Scraping nasional, halaman 17: 100%|██████████| 3/3 [00:04<00:00,  1.60s/it]
Scraping nasional, halaman 18: 100%|██████████| 5/5 [00:07<00:00,  1.59s/it]
Scraping nasional, halaman 19: 100%|██████████| 6/6 [00:08<00:00,  1.41s/it]
Scraping nasional, halaman 20: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]
S

Halaman 24 kosong untuk keyword 'nasional', stop.
Halaman 25 kosong untuk keyword 'nasional', stop.


Scraping nasional, halaman 26: 100%|██████████| 3/3 [00:04<00:00,  1.35s/it]
Scraping nasional, halaman 27: 100%|██████████| 1/1 [00:01<00:00,  1.19s/it]
Scraping nasional, halaman 28: 100%|██████████| 4/4 [00:05<00:00,  1.25s/it]
Scraping nasional, halaman 29: 100%|██████████| 4/4 [00:05<00:00,  1.30s/it]
Scraping nasional, halaman 30: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it]
Scraping nasional, halaman 31: 100%|██████████| 1/1 [00:01<00:00,  1.32s/it]
Scraping nasional, halaman 32: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
Scraping nasional, halaman 33: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it]
Scraping nasional, halaman 34: 100%|██████████| 6/6 [00:08<00:00,  1.33s/it]
Scraping nasional, halaman 35: 100%|██████████| 4/4 [00:05<00:00,  1.28s/it]
Scraping nasional, halaman 36: 100%|██████████| 2/2 [00:03<00:00,  1.93s/it]
Scraping nasional, halaman 37: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]
Scraping nasional, halaman 38: 100%|██████████| 2/2 [00:03<00:00,  1.66s/it]

Halaman 39 kosong untuk keyword 'nasional', stop.


Scraping nasional, halaman 40: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]
Scraping nasional, halaman 41: 100%|██████████| 1/1 [00:01<00:00,  1.14s/it]
Scraping nasional, halaman 42: 100%|██████████| 5/5 [00:06<00:00,  1.24s/it]


Halaman 43 kosong untuk keyword 'nasional', stop.


Scraping nasional, halaman 44: 100%|██████████| 4/4 [00:04<00:00,  1.23s/it]
Scraping nasional, halaman 45: 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]
Scraping nasional, halaman 46: 100%|██████████| 4/4 [00:05<00:00,  1.33s/it]
Scraping nasional, halaman 47: 100%|██████████| 5/5 [00:07<00:00,  1.42s/it]
Scraping nasional, halaman 48: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]
Scraping nasional, halaman 49: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]
Scraping nasional, halaman 50: 100%|██████████| 4/4 [00:04<00:00,  1.08s/it]
Scraping nasional, halaman 51: 100%|██████████| 5/5 [00:06<00:00,  1.27s/it]
Scraping nasional, halaman 52: 100%|██████████| 5/5 [00:07<00:00,  1.41s/it]
Scraping nasional, halaman 53: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]
Scraping nasional, halaman 54: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
Scraping nasional, halaman 55: 100%|██████████| 5/5 [00:06<00:00,  1.30s/it]
Scraping nasional, halaman 56: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]

Halaman 64 kosong untuk keyword 'nasional', stop.


Scraping nasional, halaman 65: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]
Scraping nasional, halaman 66: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]


Halaman 67 kosong untuk keyword 'nasional', stop.


Scraping nasional, halaman 68: 100%|██████████| 3/3 [00:03<00:00,  1.26s/it]
Scraping nasional, halaman 69: 100%|██████████| 3/3 [00:04<00:00,  1.36s/it]
Scraping nasional, halaman 70: 100%|██████████| 2/2 [00:02<00:00,  1.37s/it]
Scraping nasional, halaman 71: 100%|██████████| 4/4 [00:05<00:00,  1.28s/it]
Scraping nasional, halaman 72: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]
Scraping nasional, halaman 73: 100%|██████████| 4/4 [00:05<00:00,  1.34s/it]
Scraping nasional, halaman 74: 100%|██████████| 1/1 [00:01<00:00,  1.32s/it]
Scraping nasional, halaman 75: 100%|██████████| 6/6 [00:08<00:00,  1.44s/it]
Scraping nasional, halaman 76: 100%|██████████| 2/2 [00:04<00:00,  2.22s/it]
Scraping nasional, halaman 77: 100%|██████████| 4/4 [00:04<00:00,  1.14s/it]
Scraping nasional, halaman 78: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]


Halaman 79 kosong untuk keyword 'nasional', stop.


Scraping nasional, halaman 80: 100%|██████████| 2/2 [00:02<00:00,  1.40s/it]


Halaman 81 kosong untuk keyword 'nasional', stop.


Scraping nasional, halaman 82: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]
Scraping nasional, halaman 83: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]
Scraping nasional, halaman 84: 100%|██████████| 1/1 [00:01<00:00,  1.67s/it]
Scraping nasional, halaman 85: 100%|██████████| 3/3 [00:03<00:00,  1.29s/it]
Scraping nasional, halaman 86: 100%|██████████| 4/4 [00:06<00:00,  1.59s/it]
Scraping nasional, halaman 87: 100%|██████████| 3/3 [00:03<00:00,  1.23s/it]
Scraping nasional, halaman 88: 100%|██████████| 1/1 [00:01<00:00,  1.75s/it]


Halaman 89 kosong untuk keyword 'nasional', stop.


Scraping nasional, halaman 90: 100%|██████████| 4/4 [00:05<00:00,  1.39s/it]
Scraping nasional, halaman 91: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


Halaman 92 kosong untuk keyword 'nasional', stop.


Scraping nasional, halaman 93: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]
Scraping nasional, halaman 94: 100%|██████████| 4/4 [00:05<00:00,  1.30s/it]
Scraping nasional, halaman 95: 100%|██████████| 3/3 [00:03<00:00,  1.26s/it]
Scraping nasional, halaman 96: 100%|██████████| 4/4 [00:06<00:00,  1.51s/it]
Scraping nasional, halaman 97: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]
Scraping nasional, halaman 98: 100%|██████████| 5/5 [00:09<00:00,  1.97s/it]
Scraping nasional, halaman 99: 100%|██████████| 4/4 [00:06<00:00,  1.50s/it]
Scraping nasional, halaman 100: 100%|██████████| 3/3 [00:03<00:00,  1.14s/it]



Selesai! Disimpan sebagai 'berita_mentah.csv' dengan 1004 entri.

Sample data:
        tanggal                                       sumber (url)  \
926  2025-06-10  https://news.detik.com/internasional/d-7956572...   
630  2025-02-18  https://news.detik.com/berita/d-7784363/tim-hu...   
682  2025-05-31  https://news.detik.com/berita/d-7941342/proses...   
514  2025-07-21  https://news.detik.com/berita/d-8021057/vonis-...   
365  2024-10-14  https://news.detik.com/pilkada/d-7587161/pramo...   

                                          judul berita  \
926  Trump Kirim Lagi 2.000 Pasukan Garda Nasional ...   
630  Tim Hukum PDIP Ngotot Minta KPK Tunda Periksa ...   
682  Proses Hukum 16 Mahasiswa Tersangka Demo Tetap...   
514  Vonis Bui dan Denda Tom Lembong Meski Disebut ...   
365  Pramono Temui Pendeta GKPS: Saya Tak Mau Berma...   

                                                konten  
926  Presiden Amerika Serikat (AS), Donald Trump, a...  
630  Tim Hukum PDIP meminta KPK menu

In [6]:
import pandas as pd

# Daftar keyword
keyword_dict = {
    'politik': [
        'presiden', 'wakil presiden', 'dpr', 'dpd', 'partai', 'politik', 'pemilu', 'kampanye',
        'caleg', 'kpu', 'bawaslu', 'koalisi', 'oposisi', 'menteri', 'kabinet', 'reshuffle',
        'parlemen', 'pemerintah pusat', 'isu politik', 'sikap politik', 'pemilihan umum',
        'politik identitas', 'politik uang', 'lobi politik', 'politik praktis', 'fraksi',
        'pencalonan', 'debat capres', 'pemilihan legislatif', 'birokrasi', 'pemerintahan',
        'visi misi', 'politik luar negeri', 'misi politik', 'calon independen', 'pendukung capres',
        'sosialisasi pemilu', 'agenda politik', 'strategi politik', 'kampanye hitam',
        'politik dinasti', 'suara terbanyak', 'kursi parlemen', 'partai oposisi',
        'komunikasi politik', 'masa kampanye', 'aturan pemilu', 'politik transaksional',
        'manuver politik', 'golongan politik', 'elite politik'
    ],
    'hukum': [
        'pengadilan', 'hakim', 'jaksa', 'kuasa hukum', 'vonis', 'hukuman', 'tuntutan', 'tersangka',
        'korupsi', 'penjara', 'pidana', 'perdata', 'mahkamah', 'konstitusi', 'putusan', 'pasal',
        'kuhp', 'gugatan', 'perkara', 'persidangan', 'putusan mk', 'hukum acara', 'tahanan',
        'penyidikan', 'penuntutan', 'barang bukti', 'putusan pengadilan', 'banding', 'kasasi',
        'putusan inkrah', 'grasi', 'amnesti', 'peradilan', 'advokat', 'notaris', 'legalitas',
        'pengacara', 'domisili hukum', 'pelanggaran hukum', 'kode etik', 'perundang-undangan',
        'hukum pidana', 'hukum perdata', 'hukum tata negara', 'peraturan pemerintah',
        'peraturan daerah', 'putusan hakim', 'putusan final', 'putusan tetap'
    ],
    'nasional': [
        'nasional', 'indonesia', 'negara', 'rakyat', 'pemerintah', 'istana', 'kenegaraan',
        'hari besar nasional', 'peringatan nasional', 'kebangsaan', 'isu nasional',
        'masyarakat indonesia', 'pembangunan nasional', 'program nasional', 'stabilitas nasional',
        'isu dalam negeri', 'kerja sama nasional', 'regulasi nasional', 'wilayah indonesia',
        'bencana nasional', 'anggaran nasional', 'kedaulatan negara', 'pertahanan negara',
        'identitas nasional', 'persatuan nasional', 'konflik sosial', 'nasionalisme',
        'pendidikan nasional', 'kesehatan nasional', 'ekonomi nasional', 'mobilisasi nasional',
        'proyek nasional', 'pemerataan pembangunan', 'infrastruktur nasional',
        'penanggulangan bencana', 'ketahanan nasional', 'sensus nasional', 'simbol negara',
        'warga negara', 'perayaan nasional', 'isu strategis', 'keamanan nasional', 'urusan dalam negeri',
        'kewarganegaraan', 'integrasi nasional', 'perubahan sosial', 'kebijakan nasional',
        'dampak nasional'
    ]
}

def label_konten(kalimat):
    label = {'politik': 0, 'hukum': 0, 'nasional': 0}
    teks = kalimat.lower()
    for kategori, keywords in keyword_dict.items():
        for k in keywords:
            if k.lower() in teks:
                label[kategori] = 1
                break
    return label

# Baca data mentah
df = pd.read_csv('berita_mentah.csv')

# Pelabelan
labels = df['konten'].apply(label_konten)
labels_df = pd.DataFrame(labels.tolist())
df = pd.concat([df, labels_df], axis=1)

# Hitung jumlah label kombinasi
df['jumlah label'] = (
    df['nasional'] * 1 +
    df['politik'] * 2 +
    df['hukum'] * 4
)

# Simpan hasil
df.to_csv('dataset_berita_multilabel.csv', index=False, encoding='utf-8-sig')
print(f"Labeling selesai. Disimpan sebagai 'dataset_berita_multilabel.csv' dengan {len(df)} entri.")


Labeling selesai. Disimpan sebagai 'dataset_berita_multilabel.csv' dengan 1004 entri.
