Web Scraping using Selenium and BeautifulSoup

In [10]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from openpyxl import Workbook
from bs4 import BeautifulSoup


# Function to scrape all text from a webpage
def scrape_webpage(url):
    """Render ``url`` in headless Chrome and collect the text found in the page.

    The page is opened with Selenium so that client-side rendering can run,
    then the resulting HTML is parsed with BeautifulSoup.

    Args:
        url: Address of the page to open.

    Returns:
        A list of strings: every non-empty text node (whitespace-stripped, in
        document order) whose parent is not a non-visible tag, followed by the
        ``placeholder`` attribute of each ``<input>`` tag that has one.
    """
    # Local import so this block does not rely on a changed import cell.
    from bs4 import Comment

    # Parents whose text is never rendered as page content.
    non_visible_parents = {"style", "script", "head", "title", "meta", "[document]"}

    # Set up Selenium WebDriver
    options = Options()
    # `options.headless = True` was deprecated in Selenium 4.8 and removed in
    # later 4.x releases, where the assignment is silently ignored and a
    # visible browser window opens. The command-line switch works everywhere
    # (the "new" headless mode needs Chrome 109 or later).
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options
    )

    try:
        # Open the webpage
        driver.get(url)
        time.sleep(5)  # Fixed pause for the page to load, adjust the time as necessary

        soup = BeautifulSoup(driver.page_source, "html.parser")

        text_list = []
        for element in soup.find_all(string=True):
            # find_all(string=True) also yields HTML comments; they are markup,
            # not page text, so leave them out.
            if isinstance(element, Comment):
                continue
            if element.parent.name in non_visible_parents:
                continue
            text = element.strip()
            if text:  # drop empty and whitespace-only nodes
                text_list.append(text)

        # Get placeholder text from input tags (kept as written in the HTML)
        for input_tag in soup.find_all("input"):
            placeholder = input_tag.get("placeholder")
            if placeholder and placeholder.strip():
                text_list.append(placeholder)

        return text_list
    finally:
        # Always shut the browser down, even when loading or parsing fails.
        driver.quit()


# Main function
def main(url="https://css.jne.co.id/beranda", output_file="beranda.xlsx"):
    """Scrape ``url`` and write the collected text to an Excel workbook.

    Args:
        url: Page to scrape; defaults to the JNE "beranda" page.
        output_file: Path of the ``.xlsx`` file to create or overwrite.

    Each scraped string is written to its own row of column A on a sheet
    named "Beranda". Strings are written in the order returned by
    ``scrape_webpage``; duplicates are kept.
    """
    # Local import so this block does not rely on a changed import cell.
    from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

    # Scrape the webpage to get all text content
    scraped_text = scrape_webpage(url)

    # Write to Excel file
    wb = Workbook()
    ws = wb.active
    ws.title = "Beranda"

    # Write each piece of text to a new cell in column A
    for idx, text in enumerate(scraped_text, start=1):
        # openpyxl raises IllegalCharacterError on control characters, which
        # scraped HTML can contain, so remove them before writing.
        cell = ws.cell(row=idx, column=1, value=ILLEGAL_CHARACTERS_RE.sub("", text))
        # openpyxl stores a string starting with "=" as a formula; scraped
        # text must always be stored as a plain string.
        cell.data_type = "s"

    # Save the workbook
    wb.save(output_file)
    print(f"Scraped data saved to {output_file}")


# Call main() only when this code is executed directly (module name is
# "__main__"), not when it is imported from another module.
if __name__ == "__main__":
    main()

Scraped data saved to beranda.xlsx


Web Scraping using Selenium and BeautifulSoup (Updated Script with Login)

In [14]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from openpyxl import Workbook
from bs4 import BeautifulSoup


# Function to scrape all text from a webpage
def scrape_webpage(url_login, url, username, password):
    """Log in through ``url_login``, then collect the text found at ``url``.

    Both pages are opened in headless Chrome with Selenium; the HTML of the
    target page is parsed with BeautifulSoup.

    Args:
        url_login: Address of the login page. It must contain inputs named
            ``login_username`` and ``login_password``.
        url: Address of the page to scrape after logging in.
        username: Value typed into the ``login_username`` field.
        password: Value typed into the ``login_password`` field.

    Returns:
        A list of strings: every non-empty text node (whitespace-stripped, in
        document order) whose parent is not a non-visible tag, followed by the
        ``placeholder`` attribute of each ``<input>`` tag that has one.
    """
    # Local import so this block does not rely on a changed import cell.
    from bs4 import Comment

    # Parents whose text is never rendered as page content.
    non_visible_parents = {"style", "script", "head", "title", "meta", "[document]"}

    # Set up Selenium WebDriver
    options = Options()
    # `options.headless = True` was deprecated in Selenium 4.8 and removed in
    # later 4.x releases, where the assignment is silently ignored and a
    # visible browser window opens. The command-line switch works everywhere
    # (the "new" headless mode needs Chrome 109 or later).
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options
    )

    try:
        # Open the login page and give it a fixed pause to load
        driver.get(url_login)
        time.sleep(3)

        # Find the username and password input fields and log in
        username_field = driver.find_element(
            By.NAME, "login_username"
        )  # Adjust selector as needed
        password_field = driver.find_element(
            By.NAME, "login_password"
        )  # Adjust selector as needed
        username_field.send_keys(username)
        password_field.send_keys(password)
        password_field.send_keys(Keys.RETURN)  # submit the form with Enter

        # Wait for the login to complete and the page to load
        time.sleep(5)
        # NOTE(review): nothing here verifies that the login succeeded; with
        # bad credentials the site presumably serves a different page for
        # `url` and that page is scraped instead - confirm and add a check.

        # Navigate to the desired page after login
        driver.get(url)
        time.sleep(3)  # Wait for the page to load

        soup = BeautifulSoup(driver.page_source, "html.parser")

        text_list = []
        for element in soup.find_all(string=True):
            # find_all(string=True) also yields HTML comments; they are markup,
            # not page text, so leave them out.
            if isinstance(element, Comment):
                continue
            if element.parent.name in non_visible_parents:
                continue
            text = element.strip()
            if text:  # drop empty and whitespace-only nodes
                text_list.append(text)

        # Get placeholder text from input tags (kept as written in the HTML)
        for input_tag in soup.find_all("input"):
            placeholder = input_tag.get("placeholder")
            if placeholder and placeholder.strip():
                text_list.append(placeholder)

        return text_list
    finally:
        # Always shut the browser down, even when login or parsing fails.
        driver.quit()


# Main function
def main(username=None, password=None):
    """Log in to JNE CSS, scrape the order page and save its text to Excel.

    Credentials are never stored in the notebook. Each one is resolved in
    this order: the explicit argument, then the ``JNE_CSS_USERNAME`` /
    ``JNE_CSS_PASSWORD`` environment variable, then an interactive prompt
    (the password prompt does not echo).

    Args:
        username: Login name; ``None`` means look it up as described above.
        password: Login password; ``None`` means look it up as described above.

    Each scraped string is written to its own row of column A on a sheet
    named "Input Kirimanmu" in ``input_kirimanmu.xlsx``. Strings are written
    in the order returned by ``scrape_webpage``; duplicates are kept.
    """
    # Local imports so this block does not rely on a changed import cell.
    import getpass
    import os

    from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

    url_login = "https://css.jne.co.id/login"  # Replace with the URL of the login page
    url = "https://css.jne.co.id/pesanan/kirim"  # Replace with the URL of the webpage you want to scrape
    output_file = "input_kirimanmu.xlsx"

    # Keep secrets out of the saved notebook: environment first, prompt last.
    if username is None:
        username = os.environ.get("JNE_CSS_USERNAME") or input("JNE CSS username: ")
    if password is None:
        password = os.environ.get("JNE_CSS_PASSWORD") or getpass.getpass(
            "JNE CSS password: "
        )

    # Scrape the webpage to get all text content
    scraped_text = scrape_webpage(url_login, url, username, password)

    # Write to Excel file
    wb = Workbook()
    ws = wb.active
    ws.title = "Input Kirimanmu"

    # Write each piece of text to a new cell in column A
    for idx, text in enumerate(scraped_text, start=1):
        # openpyxl raises IllegalCharacterError on control characters, which
        # scraped HTML can contain, so remove them before writing.
        cell = ws.cell(row=idx, column=1, value=ILLEGAL_CHARACTERS_RE.sub("", text))
        # openpyxl stores a string starting with "=" as a formula; scraped
        # text must always be stored as a plain string.
        cell.data_type = "s"

    # Save the workbook
    wb.save(output_file)
    print(f"Scraped data saved to {output_file}")


# Call main() only when this code is executed directly (module name is
# "__main__"), not when it is imported from another module.
if __name__ == "__main__":
    main()

Scraped data saved to input_kirimanmu.xlsx
