# Download all relevant images from product pages on Pas Normal Studios

## Setup

In [8]:
%pip install selenium beautifulsoup4 requests


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://datascience:****@pkgs.dev.azure.com/dlimi/datascience/_packaging/datascience/pypi/simple/
Note: you may need to restart the kernel to use updated packages.


In [9]:
import os
from tqdm import tqdm
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

In [None]:
# --- Target site ---
# A product page URL is BASE_URL + PRODUCT_URL + <product slug>.
BASE_URL = "https://pasnormalstudios.com/dk/"
PRODUCT_URL = "products/"

# Slugs of the product pages to scrape.
PRODUCTS = [
    "off-race-logo-t-shirt-grape",
    "off-race-pants-light-brown",
    "off-race-bandana-classic-red",
    "off-race-cotton-twill-pants-limestone",
    "off-race-cap-dusty-orange",
    "off-race-cap",
    "off-race-logo-sweatshirt",
    "pns-x-diemme-movida92-sand",
    "off-race-logo-hoodie",
    "off-race-logo-t-shirt-smoke-green",
    "off-race-pants-classic-blue",
    "off-race-pants-black",
    "off-race-pants-off-white",
    "off-race-pants-beige",
    "off-race-pants-deep-green",
]

# --- Image library ---
# Company name = first label of the host name (here "pasnormalstudios").
company_name = urlparse(BASE_URL).netloc.partition(".")[0]

# Images are stored under ../images/<company>/<PRODUCT_URL>; create the
# directory up front so later cells can write into it.
save_directory = os.path.join("../images", company_name, PRODUCT_URL)
os.makedirs(save_directory, exist_ok=True)

## Download images

### Start selenium

In [11]:
# Set up Selenium with headless Chrome.
chrome_options = Options()
chrome_options.add_argument("--headless")
# NOTE(review): the next two flags are commonly needed when Chrome runs inside
# a container — presumably the reason they are set here; confirm.
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# The chromedriver location differs between machines, so allow overriding it
# with the CHROMEDRIVER_PATH environment variable; the previous hard-coded
# path stays the default.
chromedriver_path = os.environ.get("CHROMEDRIVER_PATH", "/usr/local/bin/chromedriver")
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

### Get images

In [12]:
def extract_image_urls_from_carousel(soup, URL):
    """Collect absolute image URLs from the page's swiper carousel.

    Args:
        soup: Parsed page (BeautifulSoup object) to search.
        URL: Page URL, used as the base for resolving relative ``src`` values.

    Returns:
        list[str]: One absolute URL per ``<img>`` that has a ``src`` inside the
        first ``div.swiper-wrapper``, with the query string removed, in the
        order the images are found. Empty if the page has no such container.
    """
    # Find the carousel container (adjust the selector based on the actual HTML structure)
    carousel = soup.find("div", class_="swiper-wrapper")

    # find() returns None when the container is missing; report it and return
    # an empty list instead of crashing on None.find_all.
    if carousel is None:
        print("Carousel container not found.")
        return []

    image_urls = []
    for img in carousel.find_all("img"):
        img_url = img.get("src")
        if img_url:
            # Drop the query string, then resolve against the page URL.
            image_urls.append(urljoin(URL, img_url.split("?")[0]))

    return image_urls


def extract_image_urls_from_lazy_selector(soup, URL):
    """Collect absolute image URLs from the lazy-loaded image selector.

    Args:
        soup: Parsed page (BeautifulSoup object) to search.
        URL: Page URL, used as the base for resolving relative ``src`` values.

    Returns:
        list[str]: De-duplicated absolute URLs with the query string removed.
        The main ``figure img`` comes first (when present), followed by the
        thumbnails in the order they are found. Empty if the selector div is
        not on the page.
    """
    # Locate the specific div that contains the lazy-loaded images
    lazy_selector_div = soup.select_one("div.order-1.flex.flex-row-reverse.items-end.justify-end.gap-2.md\\:order-2.md\\:flex-row")

    if not lazy_selector_div:
        print("Lazy selector div not found.")
        return []  # Nothing to extract if the div isn't found

    # Main image first, then every thumbnail (adjust selectors if necessary).
    candidates = [lazy_selector_div.select_one("figure img")]
    candidates.extend(lazy_selector_div.select(".cursor-pointer img"))

    # A dict keeps insertion order, so duplicates are removed without the
    # run-to-run ordering changes that list(set(...)) produces.
    unique_urls = {}
    for img in candidates:
        img_url = img.get("src") if img is not None else None
        if img_url:
            unique_urls[urljoin(URL, img_url.split("?")[0])] = None

    return list(unique_urls)


def save_image_from_url(url, product_directory, key, timeout=30):
    """Download one image into ``<product_directory>/<key>/``.

    Args:
        url: Absolute URL of the image; its last path component becomes the
            file name.
        product_directory: Directory of the product the image belongs to.
        key: Sub-directory name (the extraction source, e.g. "carousel").
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the whole run.

    Returns:
        None. Progress and failures are reported with ``print``; network errors
        are caught so one bad image does not abort the caller's loop.
    """
    img_name = os.path.basename(url)

    # Make directory
    image_location_directory = os.path.join(product_directory, key)
    os.makedirs(image_location_directory, exist_ok=True)

    file_path = os.path.join(image_location_directory, img_name)

    try:
        # The context manager releases the streamed connection on exit.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download: {url}")
                return

            with open(file_path, "wb") as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return

    print(f"Downloaded: {img_name}")

In [13]:
# For every product: load the page, wait for its images, then download them
# into <save_directory>/<product>/<source>/.
for product in tqdm(PRODUCTS, desc="Processing Products"):
    URL = urljoin(urljoin(BASE_URL, PRODUCT_URL), product)

    # Open the webpage. Navigation sits inside the try as well, so a failing
    # driver.get() skips this product instead of aborting the whole run.
    print(f'Opening: {URL}')
    try:
        driver.get(URL)
        # Wait (up to 10 s) until at least one product image is in the DOM.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "figure img"))
        )
    except Exception as e:
        # Deliberately broad: best-effort scraping, report and move on.
        print(f"Error loading {URL}: {e}")
        continue  # Skip to the next URL if loading fails

    # Make product directory
    product_directory = os.path.join(save_directory, product)
    os.makedirs(product_directory, exist_ok=True)

    # Parse the rendered page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Keys double as sub-directory names for the downloaded files.
    image_urls = {
        "carousel": extract_image_urls_from_carousel(soup, URL),
        "lazy_selector": extract_image_urls_from_lazy_selector(soup, URL)
    }

    for key, urls in image_urls.items():
        for url in urls:
            save_image_from_url(url, product_directory, key)


Processing Products:   0%|          | 0/2 [00:00<?, ?it/s]

Opening: https://pasnormalstudios.com/dk/products/off-race-pants-beige
Downloaded: 3428d29fd62ba02207bbe21768cad66da0ef5baf-3000x3750.png
Downloaded: 4ae4dee0dcf31a92419783d72d78fa0762bfd2e8-3200x4000.jpg
Downloaded: 0fea28883bce14020acf602ea151527aea477059-3200x4000.jpg
Downloaded: 8b9e1d8037c03fdd6fad1638ef7567cbf3a248d8-3000x3750.png
Downloaded: 235d62ab9f1ee46b8196a7bca725e7662cbe479a-3000x3750.png
Downloaded: e0d8d9f27f5bcc8f10247eda6657ec6fb3cc409d-2048x2560.jpg


Processing Products:  50%|█████     | 1/2 [00:32<00:32, 32.43s/it]

Downloaded: 4d42faf97ea5d99ca3fdec453a8415cca2f0b040-2048x2560.jpg
Opening: https://pasnormalstudios.com/dk/products/off-race-pants-deep-green
Downloaded: 590b861999ee01c385c9475c7aa772da946e8244-3000x3750.png
Downloaded: 001dc77bec1d3631f31a215cedb5df0c1107e967-3200x4000.jpg
Downloaded: d298ee74f8552859f5153c1be9a990cd08501ecd-3000x3750.png
Downloaded: c0fb70fe20a62f7b6433c0c8230d4761ec54ee2b-3000x3750.png
Downloaded: 25b3ccc6367e0c7ad8643d52a3a277c010979b58-2048x2560.jpg


Processing Products: 100%|██████████| 2/2 [00:39<00:00, 19.62s/it]

Downloaded: 5ba7f75fd0a9680ebaa669819b3cd3edaa8c21d9-2048x2560.jpg





In [14]:
# Close the Selenium browser started in the "Start selenium" cell.
# Run this once all downloads are finished; `driver` cannot be used afterwards.
driver.quit()