In [6]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from urllib.parse import urljoin, urlparse, urldefrag
import pandas as pd

def scrape_images(url, driver_path="/kashif/local/bin/chromedriver", load_wait=5):
    """Collect the ``src`` of every ``<img>`` element on a page using headless Chrome.

    Args:
        url: Address of the page to open.
        driver_path: Location of a chromedriver binary. It is used only when
            the file actually exists; otherwise Selenium is left to locate a
            driver on its own, which avoids ``NoSuchDriverException`` when the
            configured path is wrong for the current machine.
        load_wait: Seconds to sleep after opening the page and again after
            scrolling, giving the page time to load its images.

    Returns:
        A list with the non-empty ``src`` attribute values of the ``<img>``
        elements found on the page, in document order.
    """
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Ensure GUI is off
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Only hand an explicit driver path to Selenium when it points at a real
    # file; a stale path makes driver start-up fail outright.
    if driver_path and os.path.isfile(driver_path):
        webdriver_service = Service(driver_path)
    else:
        webdriver_service = Service()
    driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)

    try:
        # Open the webpage and allow it to load
        driver.get(url)
        time.sleep(load_wait)

        # Scroll to the bottom to load all images
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(load_wait)

        # Read each element's src once and keep only the non-empty ones
        image_urls = []
        for img in driver.find_elements(By.TAG_NAME, "img"):
            src = img.get_attribute('src')
            if src:
                image_urls.append(src)
        return image_urls
    finally:
        # Always shut the browser down, even if navigation or lookup failed,
        # so no orphaned Chrome/chromedriver processes are left behind.
        driver.quit()

def is_valid_url(url):
    """Return True when ``url`` is an absolute http(s) URL that names a host.

    Args:
        url: The URL string to check.

    Returns:
        ``True`` if the scheme is ``http`` or ``https`` and a network location
        (host) is present; ``False`` otherwise, including for input that
        ``urlparse`` rejects as malformed.
    """
    try:
        parsed_url = urlparse(url)
    except ValueError:
        # urlparse raises on malformed authorities such as an unclosed "[".
        return False
    # A bare "https://" has a valid scheme but nothing to connect to.
    return parsed_url.scheme in ('http', 'https') and bool(parsed_url.netloc)

def download_images(image_urls, folder_name, timeout=30):
    """Download each image URL into ``folder_name`` as ``image_<n>.jpg``.

    Files are numbered by the 1-based position of the URL in ``image_urls``.
    A failure for one URL is reported on stdout and does not stop the
    remaining downloads.

    Args:
        image_urls: Iterable of image URLs to fetch.
        folder_name: Destination directory; created (with parents) if missing.
        timeout: Seconds to wait for the server before giving up on a request,
            so a stalled connection cannot hang the whole run.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_name, exist_ok=True)

    for idx, image_url in enumerate(image_urls, start=1):
        try:
            # The context manager releases the streamed connection even when
            # the status is not 200 or writing the file fails.
            with requests.get(image_url, stream=True, timeout=timeout) as image_response:
                if image_response.status_code != 200:
                    print(f"Failed to download image {image_url}: Status code {image_response.status_code}")
                    continue
                image_path = os.path.join(folder_name, f"image_{idx}.jpg")
                with open(image_path, 'wb') as f:
                    for chunk in image_response.iter_content(1024):
                        f.write(chunk)
            print(f"Image {idx} downloaded to {folder_name}")
        except Exception as e:
            # Deliberately broad: downloads are best-effort, so any failure
            # (network, unsupported scheme, disk) is reported and skipped.
            print(f"Failed to download image {image_url}: {e}")

def save_data_to_excel(data, filename):
    """Write ``data`` to an Excel file and print where it was saved.

    Args:
        data: Anything accepted by the ``pandas.DataFrame`` constructor.
        filename: Path of the Excel file to write.
    """
    # Build the frame and write it in one step, leaving out the index column.
    pd.DataFrame(data).to_excel(filename, index=False)
    print(f"Data saved to {filename}")

def normalize_url(url):
    """Return ``url`` without its ``#fragment`` and without trailing slashes.

    Args:
        url: The URL string to clean up.

    Returns:
        The URL with the fragment removed and every trailing ``/`` stripped.
    """
    without_fragment = urldefrag(url)[0]
    return without_fragment.rstrip('/')

def main():
    """Prompt for a page URL, scrape its images, download them and log them to Excel.

    Steps:
        1. Read a URL from the user and normalise it.
        2. Stop with a message if it is not a valid http(s) URL.
        3. Collect the page's image URLs with ``scrape_images``.
        4. Download the images into a per-page folder under ``scraped_images``.
        5. Write the page URL and its image URLs to ``extracted_data.xlsx``.
    """
    import re  # local import: only needed here to sanitise the folder name

    website_url = input("Please enter the website URL: ").strip()
    website_url = normalize_url(website_url)

    if not is_valid_url(website_url):
        print(f"Invalid URL: {website_url}")
        return

    print(f"Scraping images from: {website_url}")

    image_urls = scrape_images(website_url)

    # One row per page; all image URLs share a single newline-separated cell.
    # Joining an empty list already yields "", so no special case is needed.
    data_for_excel = [{'URL': website_url, 'Image URLs': '\n'.join(image_urls)}]

    # Folder name: "<host with dots as underscores>_<url without scheme, slashes as underscores>".
    host_part = urlparse(website_url).netloc.replace('.', '_')
    url_part = website_url.replace('http://', '').replace('https://', '').replace('/', '_')
    # Query strings and ports bring in characters (? : * " < > | \) that are
    # not allowed in Windows directory names; replace them so makedirs succeeds.
    safe_name = re.sub(r'[<>:"\\|?*]', '_', host_part + "_" + url_part)
    folder_name = os.path.join("scraped_images", safe_name)
    download_images(image_urls, folder_name)

    save_data_to_excel(data_for_excel, "extracted_data.xlsx")

# Entry point: start the interactive scraper when this code is executed
# directly (running the notebook cell satisfies this check, as the prompt
# in the cell output shows).
if __name__ == "__main__":
    main()


Please enter the website URL:  https://www.schrack.com/shop/distribution-board-equipment-mcb-rccb-spd-mccb-terminals-fuse-material.html


Scraping images from: https://www.schrack.com/shop/distribution-board-equipment-mcb-rccb-spd-mccb-terminals-fuse-material.html


NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


In [3]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip
