In [7]:
from bson import ObjectId
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.edge.options import Options as EdgeOptions
import polars as pl
from pymongo import MongoClient
import time
import logging

# Logging configuration: emit INFO and above on the root logger, with each
# record formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def scrape_idx_list_profile(driver, url, timeout=10):
    """Load a profile page and parse its "label: value" lines into a dict.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the profile page to open.
        timeout: Maximum number of seconds to wait for the profile
            container element to be present in the DOM.

    Returns:
        A dict mapping each non-empty label (the text before the first
        colon on a line) to the stripped text after that colon. The dict
        is empty when no line of the container text contains a colon.
        Returns ``None`` when navigation, the wait, or parsing raises.
    """
    logging.info(f"Scraping data from: {url}")
    parent_locator = (By.CSS_SELECTOR, 'div.tab-content section.container.mb-48 div.bzg')

    data_dict = {}
    try:
        # Navigation is inside the try so that one bad URL is logged and
        # reported as None instead of aborting the caller's whole batch.
        driver.get(url)

        # Explicit wait instead of time.sleep. `until` returns the element
        # found by the condition, so no second find_element call is needed.
        parent_div = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(parent_locator)
        )

        for line in parent_div.text.splitlines():
            # Split on the first colon only, so values that themselves
            # contain colons (URLs, times) stay intact.
            key, separator, value = line.partition(':')
            key = key.strip()
            if separator and key:  # skip lines without a colon or label
                data_dict[key] = value.strip()

        logging.info("Data scraped successfully")
        return data_dict

    except Exception as e:
        # Deliberately broad: this is the per-URL boundary, and any failure
        # is reported to the caller as None.
        logging.error(f"Error scraping {url}: {str(e)}")
        return None

def initialize_driver(driver_path=r"D:\driver\msedgedriver.exe"):
    """Create a headless Microsoft Edge WebDriver.

    Args:
        driver_path: Filesystem path of the msedgedriver executable. The
            default is the location this script was originally written
            for; pass another path to run on a different machine.

    Returns:
        A ``webdriver.Edge`` instance configured for headless scraping.
        This function does not quit the driver; the caller owns it
        (``process_data`` uses it in a ``with`` block).
    """
    logging.info("Initializing WebDriver")
    edge_options = EdgeOptions()
    edge_options.add_argument("--headless=new")  # newer headless mode
    edge_options.add_argument("--window-size=1920,1080")
    edge_options.add_argument("--disable-gpu")
    edge_options.add_argument("--disable-blink-features=AutomationControlled")
    edge_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    )

    # The driver is returned still open; resource management (context
    # manager / quit) is left to the caller.
    service = EdgeService(executable_path=driver_path)
    return webdriver.Edge(service=service, options=edge_options)

# MongoClient instances keyed by connection URI. Each client owns its own
# connection pool, so repeated calls reuse one client per URI instead of
# opening (and never closing) a new one every time.
_MONGO_CLIENTS = {}


def get_mongo_collection(mongo_uri, db_name, collection_name):
    """Return a collection handle, reusing one client per connection URI.

    Args:
        mongo_uri: MongoDB connection string.
        db_name: Name of the database holding the collection.
        collection_name: Name of the collection to return.

    Returns:
        The ``client[db_name][collection_name]`` collection object.
    """
    client = _MONGO_CLIENTS.get(mongo_uri)
    if client is None:
        client = MongoClient(mongo_uri)
        _MONGO_CLIENTS[mongo_uri] = client
    return client[db_name][collection_name]

def process_data():
    """Scrape the profile page of every pending company into MongoDB.

    Reads up to 1000 documents from the source collection whose
    ``fl_get_profil`` flag is ``False``, scrapes each document's
    ``profil_url`` with one shared browser session, stores the parsed
    profile in the target collection tagged with ``ref_id`` (the source
    ``_id``), and then sets ``fl_get_profil`` to ``True`` on the source
    document.

    Documents without a URL, documents whose scrape returns nothing, and
    documents that hit a database error are logged and skipped. Their flag
    stays ``False``, so they are selected again on the next run.
    """
    # MongoDB configuration
    # NOTE(review): credentials are embedded in source; consider loading the
    # URI from the environment or a secret store instead.
    MONGO_URI = 'mongodb://admin:%40dm1n%40123@localhost:27017/?authSource=admin'
    DB_NAME = 'scraping'
    SOURCE_COLLECTION = 'idx_list_perusahaan_tercatat'
    TARGET_COLLECTION = 'idx_list_perusahaan_tercatat_profile'

    logging.info("Connecting to MongoDB")
    # A single client serves both collections and is closed when the block
    # exits, instead of two clients that are never closed.
    with MongoClient(MONGO_URI) as client:
        database = client[DB_NAME]
        source_coll = database[SOURCE_COLLECTION]
        target_coll = database[TARGET_COLLECTION]

        # Fetch the pending documents straight from MongoDB (no Polars),
        # capped so a single run stays bounded.
        pending_docs = list(source_coll.find(
            {"fl_get_profil": False},
            {"_id": 1, "profil_url": 1}
        ).limit(1000))

        if not pending_docs:
            logging.info("No pending documents found")
            return

        logging.info(f"Found {len(pending_docs)} documents to process")

        with initialize_driver() as driver:
            for doc in pending_docs:
                doc_id = doc["_id"]
                logging.info(f"Processing document ID: {doc_id}")

                # A document without a URL must not abort the whole batch.
                url = doc.get("profil_url")
                if not url:
                    logging.warning(f"Document {doc_id} has no profil_url; skipping")
                    continue

                data = scrape_idx_list_profile(driver, url)
                if not data:
                    continue

                try:
                    # Upsert keyed on ref_id: if an earlier run stored the
                    # profile but failed before flagging the source document,
                    # the retry overwrites that profile instead of adding a
                    # duplicate.
                    data['ref_id'] = doc_id
                    target_coll.replace_one({"ref_id": doc_id}, data, upsert=True)
                    logging.info(f"Inserted data for document {doc_id}")

                    # Mark the source document as done.
                    source_coll.update_one(
                        {"_id": doc_id},
                        {"$set": {"fl_get_profil": True}}
                    )
                    logging.info(f"Updated status for document {doc_id}")

                except Exception as e:
                    # Per-document boundary: log and move on to the next one.
                    logging.error(f"Database operation failed for {doc_id}: {str(e)}")

if __name__ == "__main__":
    # Run the whole pipeline once and report how long it took.
    started_at = time.time()
    process_data()
    elapsed_seconds = time.time() - started_at
    logging.info(f"Process completed in {elapsed_seconds:.2f} seconds")

2025-08-13 13:26:06,927 - INFO - Connecting to MongoDB
2025-08-13 13:26:06,959 - INFO - Found 844 documents to process
2025-08-13 13:26:06,960 - INFO - Initializing WebDriver
2025-08-13 13:26:08,173 - INFO - Processing document ID: 689c13c2e15b2737135bb6e6
2025-08-13 13:26:08,174 - INFO - Scraping data from: https://www.idx.co.id/id/perusahaan-tercatat/profil-perusahaan-tercatat/BCIP
2025-08-13 13:26:09,164 - INFO - Data scraped successfully
2025-08-13 13:26:09,184 - INFO - Inserted data for document 689c13c2e15b2737135bb6e6
2025-08-13 13:26:09,185 - INFO - Updated status for document 689c13c2e15b2737135bb6e6
2025-08-13 13:26:09,186 - INFO - Processing document ID: 689c13c2e15b2737135bb6e7
2025-08-13 13:26:09,187 - INFO - Scraping data from: https://www.idx.co.id/id/perusahaan-tercatat/profil-perusahaan-tercatat/BDKR
2025-08-13 13:26:09,720 - INFO - Data scraped successfully
2025-08-13 13:26:09,721 - INFO - Inserted data for document 689c13c2e15b2737135bb6e7
2025-08-13 13:26:09,723 - I