# Analisis Sentimen pada Berita Nasional Google News

Baca API key Serper dari environment variable `API_KEY` (set variabel ini sebelum menjalankan notebook).

In [None]:
import os

# The Serper API key is taken from the API_KEY environment variable.
# The value is None when the variable is not set.
API_KEY = os.getenv("API_KEY")

## Data Collection Script
This notebook downloads news data from the Serper.dev API and saves it to the `raw` folder.

In [1]:
# Install necessary libraries
!pip install requests

Collecting requests
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.4.4-cp313-cp313-win_amd64.whl.metadata (38 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Using cached certifi-2025.11.12-py3-none-any.whl.metadata (2.5 kB)
Using cached requests-2.32.5-py3-none-any.whl (64 kB)
Using cached charset_normalizer-3.4.4-cp313-cp313-win_amd64.whl (107 kB)
Using cached idna-3.11-py3-none-any.whl (71 kB)
Using cached urllib3-2.5.0-py3-none-any.whl (129 kB)
Using cached certifi-2025.11.12-py3-none-any.whl (159 kB)
Installing collected packages: urllib3, idna, charset_normalizer, certifi, requests

   ---------------------------------------- 0/5 [urllib3]
   ----------------------

In [2]:
import requests
import json
import os
import time

In [None]:
# Configuration
# The API key is taken from the API_KEY environment variable (None when unset).
API_KEY = os.getenv("API_KEY")
BASE_URL = "https://google.serper.dev/news"
OUTPUT_FOLDER = "raw"

# Search phrases; each one is collected page by page.
QUERIES = [
    "djp",
    "direktorat jenderal pajak",
    "pajak",
    "coretax",
    "dirjen pajak",
]

# Values sent as the "gl" and "hl" fields of every request.
GL = HL = "id"

# Ensure output directory exists (safe to re-run).
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [4]:
def fetch_news(query, page, timeout=30):
    """Fetch one page of news results for ``query``.

    Sends a POST request to ``BASE_URL`` carrying the query, the ``GL`` and
    ``HL`` settings and the page number, authenticated with ``API_KEY``.

    Args:
        query: Search phrase sent as the ``q`` field.
        page: Page number sent as the ``page`` field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection and hang the whole collection loop.

    Returns:
        The decoded JSON response, or ``None`` when the request failed, the
        server answered with an error status, or the body could not be
        decoded. The error is printed in that case.
    """
    payload = json.dumps({
        "q": query,
        "gl": GL,
        "hl": HL,
        "page": page
    })
    headers = {
        'X-API-KEY': API_KEY,
        'Content-Type': 'application/json'
    }

    try:
        response = requests.post(
            BASE_URL, headers=headers, data=payload, timeout=timeout
        )
        # Turn HTTP 4xx/5xx answers into exceptions so they are reported below.
        response.raise_for_status()
        return response.json()
    except Exception as e:
        # Best effort: the caller treats None as "stop paginating this query".
        print(f"Error fetching page {page} for query {query}: {e}")
        return None

In [5]:
# Main loop: for every query, request successive result pages until the API
# fails or returns an empty "news" list, writing each page to its own file.
for query in QUERIES:
    print(f"Starting collection for query: {query}")

    # Sanitize query for filename (done once per query, not once per page)
    safe_query = query.replace(" ", "_")

    page = 0
    while True:
        page += 1
        print(f"  Fetching page {page}...")
        data = fetch_news(query, page)

        if not data:
            print("  Failed to retrieve data or API error. Stopping current query.")
            break

        news_items = data.get("news", [])
        if not news_items:
            print(f"  No more news found at page {page}. Moving to next query.")
            break

        # Save the complete response for this page
        filename = f"{safe_query}_page_{page}.json"
        filepath = os.path.join(OUTPUT_FOLDER, filename)
        serialized = json.dumps(data, ensure_ascii=False, indent=4)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(serialized)

        print(f"  Saved {len(news_items)} items to {filepath}")

        # Be nice to the API to avoid rate limits
        time.sleep(1)

print("All done.")

Starting collection for query: djp
  Fetching page 1...
  Saved 14 items to raw\djp_page_1.json
  Fetching page 2...
  Saved 10 items to raw\djp_page_2.json
  Fetching page 3...
  Saved 10 items to raw\djp_page_3.json
  Fetching page 4...
  Saved 10 items to raw\djp_page_4.json
  Fetching page 5...
  Saved 10 items to raw\djp_page_5.json
  Fetching page 6...
  Saved 10 items to raw\djp_page_6.json
  Fetching page 7...
  Saved 10 items to raw\djp_page_7.json
  Fetching page 8...
  Saved 10 items to raw\djp_page_8.json
  Fetching page 9...
  Saved 10 items to raw\djp_page_9.json
  Fetching page 10...
  Saved 10 items to raw\djp_page_10.json
  Fetching page 11...
  Saved 10 items to raw\djp_page_11.json
  Fetching page 12...
  Saved 8 items to raw\djp_page_12.json
  Fetching page 13...
  Saved 10 items to raw\djp_page_13.json
  Fetching page 14...
  Saved 10 items to raw\djp_page_14.json
  Fetching page 15...
  Saved 10 items to raw\djp_page_15.json
  Fetching page 16...
  Saved 10 items 

## Data Cleaning
Process the raw data to normalize dates and save to `raw_clean_date`.

In [6]:
import re
from datetime import datetime, timedelta

# Folder that receives the date-normalized copy of each raw result file.
CLEAN_FOLDER = "raw_clean_date"
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(CLEAN_FOLDER, exist_ok=True)

def parse_relative_date(date_str, base_time=None):
    """Convert an Indonesian relative-time string into a ``"MM-YYYY"`` label.

    The first ``<number> <unit>`` pair found in ``date_str`` is used, where
    the unit is one of ``detik``, ``menit``, ``jam``, ``hari``, ``minggu``,
    ``bulan`` or ``tahun`` (matched case-insensitively). That amount of time
    is subtracted from ``base_time``.

    Months and years are subtracted on the calendar rather than as 30/365
    days. Only the month and year are returned, so a day-based approximation
    gives the wrong label near month ends (30 days before 31 March is still
    March).

    Args:
        date_str: Text such as ``"2 hari lalu"``.
        base_time: ``datetime`` the relative text refers to. Defaults to
            ``datetime.now()``. Pass the time the data was collected when
            cleaning runs later than collection.

    Returns:
        The resulting month and year formatted as ``"%m-%Y"``, or ``None``
        when ``date_str`` is empty, not a string, holds no recognised
        ``<number> <unit>`` pair, or reaches before the earliest date
        ``datetime`` can represent.
    """
    if not date_str or not isinstance(date_str, str):
        return None

    if base_time is None:
        base_time = datetime.now()

    match = re.search(
        r'(\d+)\s+(detik|menit|jam|hari|minggu|bulan|tahun)',
        date_str.lower(),
    )
    if not match:
        return None

    value = int(match.group(1))
    unit = match.group(2)

    if unit in ('bulan', 'tahun'):
        # Count months since year 0 so going back is a plain subtraction.
        months_back = value if unit == 'bulan' else value * 12
        month_count = base_time.year * 12 + (base_time.month - 1) - months_back
        year, month_index = divmod(month_count, 12)
        if year < 1:
            return None
        # Day 1 exists in every month, so this never needs day clamping.
        return datetime(year, month_index + 1, 1).strftime("%m-%Y")

    timedelta_keyword = {
        'detik': 'seconds',
        'menit': 'minutes',
        'jam': 'hours',
        'hari': 'days',
        'minggu': 'weeks',
    }[unit]

    try:
        absolute_date = base_time - timedelta(**{timedelta_keyword: value})
    except OverflowError:
        # Absurdly large values cannot be represented; treat as unparseable.
        return None
    return absolute_date.strftime("%m-%Y")

Clean each raw file: drop items whose link points to pajak.go.id and add a normalized `date_clean` field.

In [7]:
# Copy every raw result file into CLEAN_FOLDER, dropping pajak.go.id items and
# adding a "date_clean" field wherever the relative date can be parsed.
processed_count = 0

for filename in os.listdir(OUTPUT_FOLDER):
    if not filename.endswith(".json"):
        continue

    raw_path = os.path.join(OUTPUT_FOLDER, filename)
    clean_path = os.path.join(CLEAN_FOLDER, filename)

    try:
        with open(raw_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # Report the file instead of dropping it silently, so a truncated
        # download is noticed.
        print(f"Skipping {raw_path}: invalid JSON ({e})")
        continue

    news_items = data.get("news", [])
    cleaned_items = []

    for item in news_items:
        # Filter out links containing pajak.go.id.
        # `or ""` also covers an explicit null link, which would otherwise
        # make the `in` test raise TypeError.
        link = item.get("link") or ""
        if "pajak.go.id" in link:
            continue

        raw_date = item.get("date")
        if raw_date:
            clean_date = parse_relative_date(raw_date)
            if clean_date:
                item["date_clean"] = clean_date

        cleaned_items.append(item)

    data["news"] = cleaned_items

    with open(clean_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    processed_count += 1

print(f"Processed {processed_count} files. Saved to {CLEAN_FOLDER}")

Processed 326 files. Saved to raw_clean_date


## Data Aggregation and Deduplication
Combine all cleaned files and remove duplicates based on the link.

In [None]:
# Merge the news items of every cleaned file, keeping the first item seen for
# each link and dropping items that have no link.
all_news = []
seen_links = set()

# os.listdir() returns names in arbitrary order. Sorting makes the
# "first occurrence wins" deduplication and the output order reproducible.
for filename in sorted(os.listdir(CLEAN_FOLDER)):
    if not filename.endswith(".json"):
        continue

    filepath = os.path.join(CLEAN_FOLDER, filename)
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # Report the file instead of dropping it silently.
        print(f"Skipping {filepath}: invalid JSON ({e})")
        continue

    for item in data.get("news", []):
        link = item.get("link")
        if link and link not in seen_links:
            seen_links.add(link)
            all_news.append(item)

print(f"Total unique news items: {len(all_news)}")

# Save aggregated data
AGGREGATED_FILE = "aggregated_news.json"
with open(AGGREGATED_FILE, "w", encoding="utf-8") as f:
    json.dump(all_news, f, ensure_ascii=False, indent=4)
print(f"Aggregated data saved to {AGGREGATED_FILE}")

## Content Extraction to `raw_content`
Fetch full content and save individual JSON files to `raw_content` folder.

In [None]:
import hashlib
import os
import json
import time
import requests

# Endpoint that receives the scrape requests.
SCRAPE_URL = "https://scrape.serper.dev"
# The API key is taken from the API_KEY environment variable (None when unset).
API_KEY = os.getenv("API_KEY")

# Folder that receives one JSON file per processed link (safe to re-run).
RAW_CONTENT_FOLDER = "raw_content"
os.makedirs(RAW_CONTENT_FOLDER, exist_ok=True)

def scrape_content(url, timeout=60):
    """Request the content of ``url`` from the scrape endpoint.

    Sends a POST request to ``SCRAPE_URL`` with the target URL, authenticated
    with ``API_KEY``.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response when the server answers with status 200,
        otherwise ``None``. The failure is printed in that case.
    """
    payload = json.dumps({
        "url": url
    })
    headers = {
        # This cell defines API_KEY; the previously referenced SCRAPE_API_KEY
        # is not defined anywhere and raised NameError on the first call.
        'X-API-KEY': API_KEY,
        'Content-Type': 'application/json'
    }

    try:
        response = requests.post(
            SCRAPE_URL, headers=headers, data=payload, timeout=timeout
        )
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Error scraping {url}: {response.status_code} - {response.text}")
            return None
    except Exception as e:
        # Best effort: the caller records the failure and moves on.
        print(f"Exception scraping {url}: {e}")
        return None

def _has_successful_scrape(path):
    """Return True when ``path`` holds a saved record without a scrape error."""
    if not os.path.exists(path):
        return False
    try:
        with open(path, "r", encoding="utf-8") as f:
            saved = json.load(f)
    except (OSError, json.JSONDecodeError):
        # Unreadable or truncated file (e.g. interrupted run): scrape again.
        return False
    return isinstance(saved, dict) and not saved.get("scrape_error")


# Load aggregated news if not in memory
if 'all_news' not in locals():
    with open("aggregated_news.json", "r", encoding="utf-8") as f:
        all_news = json.load(f)

for i, item in enumerate(all_news):
    link = item.get("link")
    if not link:
        continue

    # Unique filename using hash (MD5 serves only as a stable file name here)
    link_hash = hashlib.md5(link.encode("utf-8")).hexdigest()
    filename = f"{link_hash}.json"
    filepath = os.path.join(RAW_CONTENT_FOLDER, filename)

    # Skip only links that were scraped successfully. A file written after a
    # failed attempt (for example a temporary network error) is retried on the
    # next run instead of being kept as a failure forever.
    if _has_successful_scrape(filepath):
        continue

    print(f"Processing {i+1}/{len(all_news)}: {link}")

    content_data = scrape_content(link)

    if content_data:
        item["full_content"] = content_data.get("text", "")
        item["scrape_metadata"] = content_data.get("metadata", {})
        # Clear a failure flag left on the in-memory item by an earlier attempt.
        item.pop("scrape_error", None)
    else:
        item["full_content"] = ""
        item["scrape_error"] = True

    # Save individual file (failures too, so every processed link has a record)
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(item, f, ensure_ascii=False, indent=4)

    time.sleep(1)

print("Content extraction to raw_content complete.")

Processing 690/1174: https://www.viva.co.id/berita/nasional/1862721-geger-bos-djarum-hingga-eks-dirjen-pajak-dicegah-ke-luar-negeri-ternyata-terkait-kasus
Exception scraping https://www.viva.co.id/berita/nasional/1862721-geger-bos-djarum-hingga-eks-dirjen-pajak-dicegah-ke-luar-negeri-ternyata-terkait-kasus: HTTPSConnectionPool(host='scrape.serper.dev', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000022577C22490>: Failed to resolve 'scrape.serper.dev' ([Errno 11001] getaddrinfo failed)"))
Processing 691/1174: https://katadata.co.id/finansial/makro/6925bb2edd01a/anak-buah-purbaya-ungkap-banyak-eksportir-mainkan-data-demi-bebas-pajak
Processing 692/1174: https://ikpi.or.id/tak-ada-ruang-bagi-pegawai-pajak-yang-bermain-kotor-dirjen-pajak-saya-pecat/
Processing 693/1174: https://ortax.org/apa-saja-data-konkret-yang-bisa-menjadi-dasar-pemeriksaan-pajak
Processing 694/1174: https://ikpi.or.id/dirjen-pajak-tegaska