In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
from tqdm import tqdm
from datetime import datetime

# Konfigurasi scraping
kata_kunci = ['politik', 'hukum', 'ekonomi']
headers = {"User-Agent": "Mozilla/5.0"}
berita_data = []
maks_halaman = 100

# Rentang waktu (format YYYY-MM-DD) - updated to current month
start_date = datetime(2024, 7, 1)
end_date = datetime(2025, 7, 31)

def ambil_kalimat_pertama(text):
    """Extract first sentence from text"""
    kalimat = re.split(r'[.!?]', text)
    return kalimat[0].strip() if kalimat else text.strip()

def parse_indonesian_date(date_str):
    """Parse Indonesian date format like 'Rabu, 30 Jul 2025 13:45 WIB'"""
    try:
        # Remove day name and time info
        date_str = re.sub(r'^[A-Za-z]+,\s*', '', date_str)  # Remove "Rabu, "
        date_str = re.sub(r'\s+\d{2}:\d{2}.*$', '', date_str)  # Remove time part
        
        # Month mapping
        months = {
            'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
            'Mei': '05', 'Jun': '06', 'Jul': '07', 'Agu': '08',
            'Sep': '09', 'Okt': '10', 'Nov': '11', 'Des': '12'
        }
        
        # Parse format like "30 Jul 2025"
        parts = date_str.strip().split()
        if len(parts) >= 3:
            day = parts[0].zfill(2)
            month = months.get(parts[1], '01')
            year = parts[2]
            
            return datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d")
    except Exception:
        return None
    return None

def ekstrak_tanggal(bs):
    """Extract and parse date from various formats"""
    try:
        # Try meta tag first
        meta = bs.find("meta", {"property": "article:published_time"})
        if meta and meta.get("content"):
            date_str = meta["content"][:10]  # Get YYYY-MM-DD part
            return datetime.strptime(date_str, "%Y-%m-%d")
        
        # Try time tag
        time_tag = bs.find("time")
        if time_tag:
            date_text = time_tag.text.strip()
            return parse_indonesian_date(date_text)
        
        # Try detail date
        detail_date = bs.find("div", {"class": "detail__date"})
        if detail_date:
            date_text = detail_date.text.strip().split(" - ")[-1]
            return parse_indonesian_date(date_text)
            
    except Exception:
        return None
    return None

# Main scraping process
url_terambil = set()

for keyword in kata_kunci:
    print(f"\nScraping keyword: {keyword}")
    for halaman in range(1, maks_halaman + 1):
        url = f"https://www.detik.com/search/searchall?query={keyword}&siteid=2&sortby=time&page={halaman}"
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        links = soup.select('article a')
        urls = list(set([a['href'] for a in links if 'https://news.detik.com' in a['href']]))
        urls = [u for u in urls if u not in url_terambil]

        if not urls:
            print(f"Halaman {halaman} kosong untuk keyword '{keyword}', stop.")
            continue

        for link in tqdm(urls, desc=f'Scraping {keyword}, halaman {halaman}'):
            try:
                r = requests.get(link, headers=headers)
                bs = BeautifulSoup(r.text, 'html.parser')

                title = bs.find('h1').text.strip()
                content_tags = bs.select('div.detail__body-text p')
                if not content_tags:
                    continue
                full_text = ' '.join([p.text.strip() for p in content_tags])
                kalimat = ambil_kalimat_pertama(full_text)

                if not kalimat or '[Gambas:' in kalimat or len(kalimat.split()) < 5:
                    continue

                # Extract date using improved function
                tanggal_dt = ekstrak_tanggal(bs)
                if not tanggal_dt:
                    continue

                if not (start_date <= tanggal_dt <= end_date):
                    continue

                berita_data.append({
                    'tanggal': tanggal_dt.strftime('%Y-%m-%d'),
                    'sumber (url)': link,
                    'judul berita': title,
                    'konten': kalimat
                })

                url_terambil.add(link)
                time.sleep(1)  # Reduced delay
            except Exception:
                continue
        time.sleep(0.5)  # Reduced delay between pages

# Save results to CSV
df = pd.DataFrame(berita_data)
df.drop_duplicates(subset='sumber (url)', inplace=True)
df = df.sample(frac=1, random_state=42)
df.to_csv('berita_mentah2.csv', index=False, encoding='utf-8-sig')

print(f"\nSelesai! Disimpan sebagai 'berita_mentah.csv' dengan {len(df)} entri.")
if len(df) > 0:
    print(f"\nSample data:")
    print(df.head())


Scraping keyword: politik


Scraping politik, halaman 1: 100%|██████████| 8/8 [00:09<00:00,  1.14s/it]
Scraping politik, halaman 2: 100%|██████████| 7/7 [00:08<00:00,  1.28s/it]
Scraping politik, halaman 3: 100%|██████████| 8/8 [00:10<00:00,  1.30s/it]
Scraping politik, halaman 4: 100%|██████████| 8/8 [00:11<00:00,  1.39s/it]
Scraping politik, halaman 5: 100%|██████████| 7/7 [00:09<00:00,  1.29s/it]
Scraping politik, halaman 6: 100%|██████████| 6/6 [00:08<00:00,  1.49s/it]
Scraping politik, halaman 7: 100%|██████████| 7/7 [00:10<00:00,  1.55s/it]
Scraping politik, halaman 8: 100%|██████████| 9/9 [00:12<00:00,  1.44s/it]
Scraping politik, halaman 9: 100%|██████████| 6/6 [00:08<00:00,  1.37s/it]
Scraping politik, halaman 10: 100%|██████████| 6/6 [00:07<00:00,  1.32s/it]
Scraping politik, halaman 11: 100%|██████████| 4/4 [00:05<00:00,  1.35s/it]
Scraping politik, halaman 12: 100%|██████████| 7/7 [00:09<00:00,  1.29s/it]
Scraping politik, halaman 13: 100%|██████████| 6/6 [00:07<00:00,  1.29s/it]
Scraping politik, hal


Scraping keyword: hukum


Scraping hukum, halaman 1: 100%|██████████| 6/6 [00:07<00:00,  1.27s/it]
Scraping hukum, halaman 2: 100%|██████████| 7/7 [00:08<00:00,  1.28s/it]
Scraping hukum, halaman 3: 100%|██████████| 6/6 [00:07<00:00,  1.28s/it]
Scraping hukum, halaman 4: 100%|██████████| 3/3 [00:03<00:00,  1.26s/it]
Scraping hukum, halaman 5: 100%|██████████| 6/6 [00:07<00:00,  1.29s/it]
Scraping hukum, halaman 6: 100%|██████████| 6/6 [00:07<00:00,  1.32s/it]
Scraping hukum, halaman 7: 100%|██████████| 3/3 [00:06<00:00,  2.01s/it]
Scraping hukum, halaman 8: 100%|██████████| 6/6 [00:07<00:00,  1.33s/it]
Scraping hukum, halaman 9: 100%|██████████| 8/8 [00:09<00:00,  1.18s/it]
Scraping hukum, halaman 10: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]
Scraping hukum, halaman 11: 100%|██████████| 7/7 [00:09<00:00,  1.30s/it]
Scraping hukum, halaman 12: 100%|██████████| 5/5 [00:06<00:00,  1.30s/it]
Scraping hukum, halaman 13: 100%|██████████| 6/6 [00:08<00:00,  1.46s/it]
Scraping hukum, halaman 14: 100%|██████████| 6/


Scraping keyword: ekonomi


Scraping ekonomi, halaman 1: 100%|██████████| 1/1 [00:00<00:00,  4.02it/s]


Halaman 2 kosong untuk keyword 'ekonomi', stop.


Scraping ekonomi, halaman 3: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Scraping ekonomi, halaman 4: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
Scraping ekonomi, halaman 5: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]
Scraping ekonomi, halaman 6: 100%|██████████| 3/3 [00:03<00:00,  1.29s/it]
Scraping ekonomi, halaman 7: 100%|██████████| 6/6 [00:07<00:00,  1.26s/it]
Scraping ekonomi, halaman 8: 100%|██████████| 3/3 [00:03<00:00,  1.28s/it]
Scraping ekonomi, halaman 9: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]
Scraping ekonomi, halaman 10: 100%|██████████| 3/3 [00:03<00:00,  1.28s/it]
Scraping ekonomi, halaman 11: 100%|██████████| 3/3 [00:03<00:00,  1.29s/it]
Scraping ekonomi, halaman 12: 100%|██████████| 3/3 [00:02<00:00,  1.04it/s]
Scraping ekonomi, halaman 13: 100%|██████████| 2/2 [00:02<00:00,  1.41s/it]
Scraping ekonomi, halaman 14: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
Scraping ekonomi, halaman 15: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
Scraping ekonomi, h

Halaman 31 kosong untuk keyword 'ekonomi', stop.


Scraping ekonomi, halaman 32: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
Scraping ekonomi, halaman 33: 100%|██████████| 4/4 [00:05<00:00,  1.32s/it]
Scraping ekonomi, halaman 34: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
Scraping ekonomi, halaman 35: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it]


Halaman 36 kosong untuk keyword 'ekonomi', stop.


Scraping ekonomi, halaman 37: 100%|██████████| 1/1 [00:01<00:00,  1.28s/it]
Scraping ekonomi, halaman 38: 100%|██████████| 3/3 [00:04<00:00,  1.37s/it]
Scraping ekonomi, halaman 39: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it]
Scraping ekonomi, halaman 40: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]
Scraping ekonomi, halaman 41: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]
Scraping ekonomi, halaman 42: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Scraping ekonomi, halaman 43: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]
Scraping ekonomi, halaman 44: 100%|██████████| 4/4 [00:05<00:00,  1.30s/it]
Scraping ekonomi, halaman 45: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]


Halaman 46 kosong untuk keyword 'ekonomi', stop.


Scraping ekonomi, halaman 47: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]


Halaman 48 kosong untuk keyword 'ekonomi', stop.


Scraping ekonomi, halaman 49: 100%|██████████| 2/2 [00:02<00:00,  1.40s/it]
Scraping ekonomi, halaman 50: 100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
Scraping ekonomi, halaman 51: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
Scraping ekonomi, halaman 52: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]
Scraping ekonomi, halaman 53: 100%|██████████| 1/1 [00:01<00:00,  1.32s/it]
Scraping ekonomi, halaman 54: 100%|██████████| 2/2 [00:03<00:00,  1.74s/it]
Scraping ekonomi, halaman 55: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]
Scraping ekonomi, halaman 56: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
Scraping ekonomi, halaman 57: 100%|██████████| 2/2 [00:02<00:00,  1.45s/it]
Scraping ekonomi, halaman 58: 100%|██████████| 3/3 [00:03<00:00,  1.29s/it]
Scraping ekonomi, halaman 59: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]
Scraping ekonomi, halaman 60: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it]
Scraping ekonomi, halaman 61: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]
Scraping eko

Halaman 79 kosong untuk keyword 'ekonomi', stop.


Scraping ekonomi, halaman 80: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
Scraping ekonomi, halaman 81: 100%|██████████| 1/1 [00:01<00:00,  1.28s/it]


Halaman 82 kosong untuk keyword 'ekonomi', stop.


Scraping ekonomi, halaman 83: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it]
Scraping ekonomi, halaman 84: 100%|██████████| 3/3 [00:03<00:00,  1.33s/it]
Scraping ekonomi, halaman 85: 100%|██████████| 3/3 [00:08<00:00,  2.84s/it]
Scraping ekonomi, halaman 86: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Scraping ekonomi, halaman 87: 100%|██████████| 4/4 [00:05<00:00,  1.49s/it]
Scraping ekonomi, halaman 88: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]
Scraping ekonomi, halaman 89: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]
Scraping ekonomi, halaman 90: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
Scraping ekonomi, halaman 91: 100%|██████████| 4/4 [00:04<00:00,  1.10s/it]
Scraping ekonomi, halaman 92: 100%|██████████| 4/4 [00:05<00:00,  1.31s/it]


Halaman 93 kosong untuk keyword 'ekonomi', stop.


Scraping ekonomi, halaman 94: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]


Halaman 95 kosong untuk keyword 'ekonomi', stop.


Scraping ekonomi, halaman 96: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Scraping ekonomi, halaman 97: 100%|██████████| 3/3 [00:02<00:00,  1.05it/s]
Scraping ekonomi, halaman 98: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]
Scraping ekonomi, halaman 99: 100%|██████████| 4/4 [00:05<00:00,  1.34s/it]
Scraping ekonomi, halaman 100: 100%|██████████| 5/5 [00:06<00:00,  1.30s/it]



Selesai! Disimpan sebagai 'berita_mentah.csv' dengan 968 entri.

Sample data:
        tanggal                                       sumber (url)  \
575  2025-04-24  https://news.detik.com/kolom/d-7883182/kualita...   
259  2025-01-17  https://news.detik.com/pilkada/d-7737155/jeje-...   
388  2024-09-05  https://news.detik.com/berita/d-7526786/pengac...   
495  2025-05-21  https://news.detik.com/berita/d-7924924/kejagu...   
70   2025-03-22  https://news.detik.com/foto-news/d-7835198/bin...   

                                          judul berita  \
575  Kualitas Ketertiban Umum Memburuk, Penegak Huk...   
259  Jeje Govinda Tepis Tudingan Hengky Kurniawan s...   
388  Pengacara Klaim Alasan Alice Guo ke Indonesia ...   
495   Kejagung Segera Tentukan Status Hukum Bos Sritex   
70   Bingkai Sepekan: Aksi Tolak RUU TNI hingga Tim...   

                                                konten  
575  Premanisme yang terus merajalela akhir-akhir i...  
259  KPU Kabupaten Bandung Barat dan 

In [5]:
import pandas as pd

# Daftar keyword
keyword_dict = {
    'politik': [
        'presiden', 'wakil presiden', 'dpr', 'dpd', 'partai', 'politik', 'pemilu', 'kampanye',
        'caleg', 'kpu', 'bawaslu', 'koalisi', 'oposisi', 'menteri', 'kabinet', 'reshuffle',
        'parlemen', 'pemerintah pusat', 'isu politik', 'sikap politik', 'pemilihan umum',
        'politik identitas', 'politik uang', 'lobi politik', 'politik praktis', 'fraksi',
        'pencalonan', 'debat capres', 'pemilihan legislatif', 'birokrasi', 'pemerintahan',
        'visi misi', 'politik luar negeri', 'misi politik', 'calon independen', 'pendukung capres',
        'sosialisasi pemilu', 'agenda politik', 'strategi politik', 'kampanye hitam',
        'politik dinasti', 'suara terbanyak', 'kursi parlemen', 'partai oposisi',
        'komunikasi politik', 'masa kampanye', 'aturan pemilu', 'politik transaksional',
        'manuver politik', 'golongan politik', 'elite politik'
    ],
    'hukum': [
        'pengadilan', 'hakim', 'jaksa', 'kuasa hukum', 'vonis', 'hukuman', 'tuntutan', 'tersangka',
        'korupsi', 'penjara', 'pidana', 'perdata', 'mahkamah', 'konstitusi', 'putusan', 'pasal',
        'kuhp', 'gugatan', 'perkara', 'persidangan', 'putusan mk', 'hukum acara', 'tahanan',
        'penyidikan', 'penuntutan', 'barang bukti', 'putusan pengadilan', 'banding', 'kasasi',
        'putusan inkrah', 'grasi', 'amnesti', 'peradilan', 'advokat', 'notaris', 'legalitas',
        'pengacara', 'domisili hukum', 'pelanggaran hukum', 'kode etik', 'perundang-undangan',
        'hukum pidana', 'hukum perdata', 'hukum tata negara', 'peraturan pemerintah',
        'peraturan daerah', 'putusan hakim', 'putusan final', 'putusan tetap'
    ],
    'ekonomi': [
    'ekonomi', 'ekonomi nasional', 'ekonomi global', 'perekonomian', 'sektor ekonomi',
    'inflasi', 'deflasi', 'pdb', 'pertumbuhan ekonomi', 'pendapatan per kapita',
    'daya beli', 'nilai tukar', 'kurs', 'utang luar negeri', 'cadangan devisa',
    'defisit anggaran', 'neraca perdagangan', 'neraca pembayaran',
    'anggaran', 'apbn', 'apbd', 'belanja negara', 'pendapatan negara',
    'kebijakan fiskal', 'kebijakan moneter', 'bank indonesia', 'bi rate', 'suku bunga',
    'subsidi', 'dana desa', 'pajak', 'retribusi', 'utang negara', 'obligasi negara',
    'bursa saham', 'ihsg', 'saham', 'indeks harga', 'perdagangan', 'ekspor', 'impor',
    'perdagangan internasional', 'free trade', 'perdagangan bebas', 'perjanjian dagang',
    'lapangan kerja', 'pengangguran', 'kemiskinan', 'upah minimum', 'gaji',
    'jaminan sosial', 'kesejahteraan', 'asuransi sosial', 'bantuan sosial',
    'industri', 'manufaktur', 'pertanian', 'perikanan', 'pertambangan',
    'pariwisata', 'energi', 'ekonomi maritim', 'ekonomi agraria', 'ekonomi sektor riil',
    'ekonomi digital', 'ekonomi kreatif', 'ekonomi hijau', 'fintech',
    'startup', 'blockchain', 'cryptocurrency', 'e-commerce', 'ekonomi sirkular',
    'ekonomi mikro', 'ekonomi makro', 'permintaan', 'penawaran', 'elastisitas',
    'konsumen', 'produsen', 'harga pasar', 'kompetisi pasar',
    'ojk', 'lembaga keuangan', 'perbankan', 'bank umum', 'koperasi',
    'badan usaha', 'bumn', 'bumn strategis', 'bursa berjangka',
    'investasi', 'modal asing', 'dana investasi', 'devisa', 'kapitalisasi pasar',
    'kredit usaha rakyat', 'umkm', 'ekonomi kerakyatan', 'ekonomi syariah',
    'ekonomi berbasis komunitas'
]
}

def label_konten(kalimat):
    label = {'politik': 0, 'hukum': 0, 'ekonomi': 0}
    teks = kalimat.lower()
    for kategori, keywords in keyword_dict.items():
        for k in keywords:
            if k.lower() in teks:
                label[kategori] = 1
                break
    return label

# Baca data mentah
df = pd.read_csv('berita_mentah2.csv')

# Pelabelan
labels = df['konten'].apply(label_konten)
labels_df = pd.DataFrame(labels.tolist())
df = pd.concat([df, labels_df], axis=1)

# Hitung jumlah label kombinasi
df['jumlah label'] = (
    df['ekonomi'] * 1 +
    df['politik'] * 2 +
    df['hukum'] * 4
)

# Simpan hasil
df.to_csv('dataset_berita_multilabel2.csv', index=False, encoding='utf-8-sig')
print(f"Labeling selesai. Disimpan sebagai 'dataset_berita_multilabel2.csv' dengan {len(df)} entri.")


Labeling selesai. Disimpan sebagai 'dataset_berita_multilabel2.csv' dengan 968 entri.
