Web Scraping version 1

In [11]:
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook


# Function to scrape text from specific HTML tags on a webpage
def scrape_webpage(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Parse the HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Specify the HTML tags to scrape from
    tags_to_scrape = [
        "p",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "li",
    ]  # Example tags (you can add/remove as needed)

    # Initialize list to store sentences
    sentences = []

    # Extract text from each specified tag
    for tag in tags_to_scrape:
        elements = soup.find_all(tag)
        for element in elements:
            # Split text into sentences and add to sentences list
            sentences.extend(element.get_text().strip().split(". "))

    return sentences


# Main function
def main():
    url = "https://css.jne.co.id/"  # Replace with the URL of the webpage you want to scrape
    output_file = "scraped_data.xlsx"

    # Scrape the webpage
    scraped_sentences = scrape_webpage(url)

    # Write to Excel file
    wb = Workbook()
    ws = wb.active
    ws.title = "Scraped Data"

    # Write each sentence to a new cell in column A
    for idx, sentence in enumerate(scraped_sentences, start=1):
        ws.cell(row=idx, column=1, value=sentence)

    # Save the workbook
    wb.save(output_file)
    print(f"Scraped data saved to {output_file}")


if __name__ == "__main__":
    main()

Scraped data saved to scraped_data.xlsx


Web Scraping version 2

In [22]:
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook


# Function to scrape all text from a webpage
def scrape_webpage(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Parse the HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all text elements
    text_elements = soup.find_all(string=True)

    # Get placeholder text from input tags
    for input_tag in soup.find_all("input"):
        placeholder = input_tag.get("placeholder")
        if placeholder:
            text_elements.append(placeholder)

    # Find the span element with class 'select2-selection__placeholder'
    for span_tag in soup.find_all("span", class_="select2-selection__placeholder"):
        if span_tag:
            # Extract text from the span tag and strip whitespace
            text = span_tag.get_text(strip=True)
            text_elements.append(text)

    # Filter out empty and whitespace-only strings
    text_list = [text.strip() for text in text_elements if text.strip()]

    return text_list


# Main function
def main():
    url = "https://css.jne.co.id/"  # Replace with the URL of the webpage you want to scrape
    output_file = "scraped_data.xlsx"

    # Scrape the webpage to get all text content
    scraped_text = scrape_webpage(url)

    # Write to Excel file
    wb = Workbook()
    ws = wb.active
    ws.title = "Scraped Data"

    # Write each distinct piece of text to a new cell in column A
    for idx, text in enumerate(scraped_text, start=1):
        ws.cell(row=idx, column=1, value=text)

    # Save the workbook
    wb.save(output_file)
    print(f"Scraped data saved to {output_file}")


if __name__ == "__main__":
    main()

Scraped data saved to scraped_data.xlsx


Web Scraping version 3

In [25]:
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook


# Function to scrape all text from a webpage
def scrape_webpage(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Parse the HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Initialize a list to store extracted text elements
    text_elements = []

    # Find all text elements
    for element in soup.find_all(text=True):
        if element.parent.name not in [
            "style",
            "script",
            "head",
            "title",
            "meta",
            "[document]",
        ]:
            text_elements.append(element.strip())

    # Get placeholder text from input tags
    for input_tag in soup.find_all("input"):
        placeholder = input_tag.get("placeholder")
        if placeholder:
            text_elements.append(placeholder)

    # Find the span element with class 'select2-selection__placeholder'
    span_tags = soup.find_all("span", class_="select2-selection__placeholder")
    if span_tags:
        for span_tag in span_tags:
            # Extract text from the span tag and strip whitespace
            text = span_tag.get_text(strip=True)
            text_elements.append(text)
    else:
        print("No span tags found with class 'select2-selection__placeholder'")

    # Filter out empty and whitespace-only strings
    text_list = [text for text in text_elements if text.strip()]

    return text_list


# Main function
def main():
    url = "https://css.jne.co.id/"  # Replace with the URL of the webpage you want to scrape
    output_file = "scraped_data.xlsx"

    # Scrape the webpage to get all text content
    scraped_text = scrape_webpage(url)

    # Write to Excel file
    wb = Workbook()
    ws = wb.active
    ws.title = "Scraped Data"

    # Write each distinct piece of text to a new cell in column A
    for idx, text in enumerate(scraped_text, start=1):
        ws.cell(row=idx, column=1, value=text)

    # Save the workbook
    wb.save(output_file)
    print(f"Scraped data saved to {output_file}")


if __name__ == "__main__":
    main()

No span tags found with class 'select2-selection__placeholder'
Scraped data saved to scraped_data.xlsx


  for element in soup.find_all(text=True):


Web Scraping using Selenium

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from openpyxl import Workbook


# Function to scrape all text from a webpage
def scrape_webpage(url):
    # Set up Selenium WebDriver
    options = Options()
    options.headless = True  # Run in headless mode
    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options
    )

    try:
        # Open the webpage
        driver.get(url)
        time.sleep(5)  # Wait for the page to load, adjust the time as necessary

        # Initialize a list to store extracted text elements
        text_elements = []

        # Find all text elements
        soup = BeautifulSoup(driver.page_source, "html.parser")
        for element in soup.find_all(text=True):
            if element.parent.name not in [
                "style",
                "script",
                "head",
                "title",
                "meta",
                "[document]",
            ]:
                text_elements.append(element.strip())

        # Get placeholder text from input tags
        for input_tag in soup.find_all("input"):
            placeholder = input_tag.get("placeholder")
            if placeholder:
                text_elements.append(placeholder)

        # Find the span element with class 'select2-selection__placeholder'
        span_tags = soup.find_all("span", class_="select2-selection__placeholder")
        if span_tags:
            for span_tag in span_tags:
                text = span_tag.get_text(strip=True)
                text_elements.append(text)
        else:
            print("No span tags found with class 'select2-selection__placeholder'")

        # Filter out empty and whitespace-only strings
        text_list = [text for text in text_elements if text.strip()]

        return text_list
    finally:
        driver.quit()


# Main function
def main():
    url = "https://css.jne.co.id/"  # Replace with the URL of the webpage you want to scrape
    output_file = "scraped_data.xlsx"

    # Scrape the webpage to get all text content
    scraped_text = scrape_webpage(url)

    # Write to Excel file
    wb = Workbook()
    ws = wb.active
    ws.title = "Scraped Data"

    # Write each distinct piece of text to a new cell in column A
    for idx, text in enumerate(scraped_text, start=1):
        ws.cell(row=idx, column=1, value=text)

    # Save the workbook
    wb.save(output_file)
    print(f"Scraped data saved to {output_file}")


if __name__ == "__main__":
    main()