In [2]:
import logging
import os
import time

from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

# Logging configuration: emit INFO and above on the root logger, each line
# prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

def initialize_driver(driver_path=r"D:\driver\msedgedriver.exe", headless=True):
    """Create a Microsoft Edge WebDriver configured for scraping.

    Args:
        driver_path: Path to the ``msedgedriver`` executable. Defaults to the
            location this script was originally written for. Pass ``None``
            (or an empty string) to build the service without an explicit
            path, in which case Selenium is left to locate the driver itself
            (Selenium Manager, available in Selenium 4.6+).
        headless: When true (the default), start Edge with ``--headless=new``
            so no browser window is shown.

    Returns:
        A started ``webdriver.Edge`` instance. The caller owns it and must
        call ``quit()`` when done.
    """
    logging.info("Initializing WebDriver")
    edge_options = EdgeOptions()
    if headless:
        edge_options.add_argument("--headless=new")  # newest headless mode
    edge_options.add_argument("--window-size=1920,1080")
    edge_options.add_argument("--disable-gpu")
    # Turn off the Blink "AutomationControlled" feature flag.
    edge_options.add_argument("--disable-blink-features=AutomationControlled")
    # Present a regular desktop Chrome user-agent string.
    edge_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    )

    if driver_path:
        service = EdgeService(executable_path=driver_path)
    else:
        service = EdgeService()
    return webdriver.Edge(service=service, options=edge_options)

def get_mongo_collection(mongo_uri, db_name, collection_name):
    """Return the collection ``collection_name`` from database ``db_name``.

    A new ``MongoClient`` is created for ``mongo_uri`` on every call; the
    returned collection is looked up through that client.
    """
    database = MongoClient(mongo_uri)[db_name]
    return database[collection_name]

def _parse_company_row(row):
    """Turn one table-row element into a company record dict.

    Returns ``None`` when the row has fewer than three ``<td>`` cells.
    Raises whatever Selenium raises if the second cell has no ``<a>`` link.
    """
    cells = row.find_elements(By.TAG_NAME, "td")
    if len(cells) < 3:
        return None

    name_link = cells[1].find_element(By.TAG_NAME, "a")
    return {
        "Kode": cells[0].text.strip(),
        "Nama": name_link.text.strip(),
        "Tanggal Pencatatan": cells[2].text.strip(),
        "profil_url": name_link.get_attribute("href"),
        # Both flags start out False on every scraped record.
        "fl_get_profil": False,
        "fl_get_financial_report": False,
    }


def scrape_idx_list(driver, url, timeout=30):
    """Scrape the listed-company table at ``url``.

    Loads the page, switches the page-size dropdown to its "-1" ("All")
    option, waits for the table to reload, and reads every row of
    ``#vgt-table``.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the page containing the table.
        timeout: Maximum number of seconds for each explicit wait.

    Returns:
        A list of dicts with keys ``Kode``, ``Nama``, ``Tanggal Pencatatan``,
        ``profil_url``, ``fl_get_profil`` and ``fl_get_financial_report``.
        Rows that cannot be parsed are skipped with a warning. Returns
        ``None`` if waiting for or locating the table fails; the error is
        logged with its traceback instead of being raised.
    """
    row_selector = "#vgt-table tbody tr"

    logging.info("Navigating to URL: %s", url)
    driver.get(url)

    try:
        wait = WebDriverWait(driver, timeout)

        # Wait for the table element to exist in the DOM.
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#vgt-table"))
        )
        logging.info("Page loaded successfully")

        # Scroll a little to make sure the dropdown gets rendered.
        driver.execute_script("window.scrollBy(0, 300);")

        # Wait for the page-size dropdown and choose "All" (value -1).
        logging.info("Selecting 'All' in dropdown")
        select_element = wait.until(
            EC.element_to_be_clickable((By.NAME, "perPageSelect"))
        )
        Select(select_element).select_by_value("-1")

        # Wait for the reload to finish (loading indicator gone).
        # NOTE(review): if the indicator has not appeared yet, this wait and
        # the row-count wait below can both pass against the rows of the
        # initial page - confirm against the site's behaviour.
        wait.until(
            EC.invisibility_of_element_located((By.CSS_SELECTOR, ".vgt-loading"))
        )
        logging.info("Table reloaded after selecting all rows")

        # Wait until at least 3 data rows are present.
        wait.until(
            lambda d: len(d.find_elements(By.CSS_SELECTOR, row_selector)) >= 3
        )

        rows = driver.find_elements(By.CSS_SELECTOR, row_selector)
        logging.info("Found %d rows in the table", len(rows))

        data = []
        for i, row in enumerate(rows, 1):
            # Best effort per row: one bad row must not abort the whole run.
            try:
                company_data = _parse_company_row(row)
            except Exception as exc:
                logging.warning("Error processing row %d: %s", i, exc)
                continue

            if company_data is None:
                logging.warning("Skipping row %d: insufficient columns", i)
                continue

            data.append(company_data)
            logging.debug("Processed row %d: %s", i, company_data["Kode"])

        return data

    except Exception as exc:
        # Top-level boundary: keep the traceback so failures can be diagnosed.
        logging.exception("Scraping failed: %s", exc)
        return None

def process():
    """Run the full pipeline: scrape the IDX company list and store it.

    Steps:
        1. Start a WebDriver and scrape the listed-company table.
        2. Always quit the WebDriver, even if scraping raises.
        3. Insert the scraped records into MongoDB and close the client.

    The MongoDB connection string is read from the ``MONGO_URI`` environment
    variable when it is set and non-empty; otherwise the original built-in
    URI is used. Failures while saving are logged, not raised.

    Returns:
        None.
    """
    start_time = time.time()
    # Prefer a URI from the environment so credentials need not live in the
    # source file; the literal below is kept only as a fallback.
    mongo_uri = os.environ.get("MONGO_URI") or (
        "mongodb://admin:%40dm1n%40123@localhost:27017/?authSource=admin"
    )
    db_name = "scraping"
    collection_name = "idx_list_perusahaan_tercatat"
    url = "https://www.idx.co.id/id/perusahaan-tercatat/profil-perusahaan-tercatat"

    logging.info("Starting scraping process")

    driver = initialize_driver()
    try:
        data = scrape_idx_list(driver, url)
    finally:
        driver.quit()
        logging.info("WebDriver closed")

    if data:
        logging.info("Successfully scraped %d records", len(data))

        client = None
        try:
            collection = get_mongo_collection(mongo_uri, db_name, collection_name)
            # Keep a handle on the underlying client so it can be closed.
            client = collection.database.client
            # NOTE(review): insert_many appends; re-running the script will
            # insert the same companies again - confirm whether that is wanted.
            result = collection.insert_many(data)
            logging.info(
                "Inserted %d records into MongoDB", len(result.inserted_ids)
            )
        except Exception as exc:
            logging.exception("Failed to save data to MongoDB: %s", exc)
        finally:
            if client is not None:
                client.close()
    else:
        logging.error("No data scraped, exiting")

    elapsed = time.time() - start_time
    logging.info("Process completed in %.2f seconds", elapsed)

# Run the scraping pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    process()

2025-08-14 11:22:04 - INFO - Starting scraping process
2025-08-14 11:22:04 - INFO - Initializing WebDriver
2025-08-14 11:22:05 - INFO - Navigating to URL: https://www.idx.co.id/id/perusahaan-tercatat/profil-perusahaan-tercatat
2025-08-14 11:22:06 - INFO - Page loaded successfully
2025-08-14 11:22:06 - INFO - Selecting 'All' in dropdown
2025-08-14 11:22:10 - INFO - Table reloaded after selecting all rows
2025-08-14 11:22:10 - INFO - Found 954 rows in the table
2025-08-14 11:22:53 - INFO - WebDriver closed
2025-08-14 11:22:53 - INFO - Successfully scraped 954 records
2025-08-14 11:22:53 - INFO - Inserted 954 records into MongoDB
2025-08-14 11:22:53 - INFO - Process completed in 49.60 seconds
