In [2]:
import requests
from bs4 import BeautifulSoup as bs
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver import ActionChains
import selenium
import pandas as pd
import numpy as np
import time
import os

In [3]:
#Define links in website to scrape, split by clothing type
# This assumes each link is in the same format and can be scraped using the same variables defined later.
saint_laurent = {
    'Mens Shirts': 'https://www.ysl.com/en-hk/shop-men/ready-to-wear/by-category/shirts',
    'Mens Knitwear': 'https://www.ysl.com/en-hk/shop-men/ready-to-wear/by-category/knitwear',
    'Mens Denim':'https://www.ysl.com/en-hk/shop-men/ready-to-wear/by-category/denim',
    'Mens T-shirt_and_Sweatshirts':'https://www.ysl.com/en-hk/shop-men/ready-to-wear/by-category/t-shirts-and-sweatshirts',
    'Mens Leather_and_Fur': 'https://www.ysl.com/en-hk/shop-men/ready-to-wear/by-category/leather-and-fur',
    'Mens Outerwear':'https://www.ysl.com/en-hk/shop-men/ready-to-wear/by-category/leather-and-fur',
    'Mens Jackets_and_Pants': 'https://www.ysl.com/en-hk/shop-men/ready-to-wear/by-category/jackets-and-pants',
    'Mens Jewelry':'https://www.ysl.com/en-hk/search?cgid=view-all-jewellry-men',
    'Mens Shoes':'https://www.ysl.com/en-hk/shop-men/shoes/view-all',
    'Mens Hats':'https://www.ysl.com/en-hk/shop-men/accessories/hats',
    'Mens Sunglasses':'https://www.ysl.com/en-hk/shop-men/sunglasses',
    'Mens Bags':'https://www.ysl.com/en-hk/shop-men/bags/view-all',

    'Womens T-Shirts_and_Sweatshirts': 'https://www.ysl.com/en-hk/shop-women/ready-to-wear/by-category/t-shirts-and-sweatshirts',
    'Womens Knitwear': 'https://www.ysl.com/en-hk/shop-women/ready-to-wear/by-category/knitwear',
    'Womens Leather_and_Fur':'https://www.ysl.com/en-hk/shop-women/ready-to-wear/by-category/leather-and-fur',
    'Womens Jackets':'https://www.ysl.com/en-hk/shop-women/ready-to-wear/by-category/jackets',
    'Womens Outerwear': 'https://www.ysl.com/en-hk/shop-women/ready-to-wear/by-category/outerwear',
    'Womens Trousers_and_Shorts':'https://www.ysl.com/en-hk/shop-women/ready-to-wear/by-category/trousers-and-shorts',
    'Womens Shirts_and_Blouses':'https://www.ysl.com/en-hk/shop-women/ready-to-wear/by-category/shirts-and-blouses',
    'Womens Denim': 'https://www.ysl.com/en-hk/shop-women/ready-to-wear/by-category/denim',
    'Womens Lingerie':'https://www.ysl.com/en-hk/shop-women/ready-to-wear/by-category/lingerie',
    'Womens Dresses_and_Skirts':'https://www.ysl.com/en-hk/shop-women/ready-to-wear/by-category/dresses-and-skirts',
    'Womens Shoes':'https://www.ysl.com/en-hk/shop-women/shoes/view-all',
    'Womens Handbags':'https://www.ysl.com/en-hk/shop-women/handbags/view-all',
    'Womens Hats_and_Gloves':'https://www.ysl.com/en-hk/shop-women/accessories/hats-and-gloves',
    'Womens Sunglasses':'https://www.ysl.com/en-hk/shop-women/sunglasses',
    'Womens Jewellry':'https://www.ysl.com/en-hk/shop-women/jewellery/view-all'
    }

#Set destination folder to save subfolder of images
#Additional sub folders for each clothing category will be automatically created to save images
folder = 'C:/Users/Yoon Hwan Kim/Documents/Shop_Recommendation_Project/Scrap_1'

#Create empty df to save all information to
df = pd.DataFrame(columns = ['ID', 'Clothing Category','Brand', 'Price', 'Image URL'])

#Loop through links in the dictionary 
for category in saint_laurent.keys():
    #Create a new folder for each clothing category, and change to this folder
    try:
        os.mkdir(os.path.join(folder, str(category)))
        os.chdir(os.path.join(folder, str(category)))
    except:
        pass
    #Use Selenium to begin scrape
    url = saint_laurent[category]
    driver = webdriver.Chrome()
    driver.get(url)
    subhtml = driver.page_source
    soup = bs(subhtml, "html.parser")
    
    #Scroll down and rescrape, stop when last_height = new_height, since no new images
    while True:
        last_height = driver.execute_script("return document.body.scrollHeight")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            element=driver.find_element_by_css_selector('button.c-loadmore__btn.c-button--animation')
            ActionChains(driver).click(element).perform()
        except selenium.common.exceptions.NoSuchElementException:
            pass

        time.sleep(3)
        new_height = driver.execute_script("return document.body.scrollHeight")

        
        if new_height == last_height:
            subhtml = driver.page_source
            soup = bs(subhtml, "html.parser")
            break
        
        last_height = new_height
        subhtml = driver.page_source
        soup = bs(subhtml, "html.parser")

    # except TimeoutException

    #Begin scrape once we have scrolled to the bottom of the page
    id_no = len(df)

    # Define the variables 'brand', 'product_category', 'price' and 'url / src' locations in the webpage
    # 'brand' = defined by shop to categorise groups of similar products, e.g. "Mens Sports Performance Training", 'HeatTech catalogue', 'Dry-Fit', 'ColdTech', 'Nike Yoga' etc
    # 'product category' = defined by shop for product types, e.g. 'long sleeved shirt', 'Men's Perfomance shorts', 'Women's Running Tights', 'Men's Basketball Hoodie' 
    # 'src' = url for link to the picture/image. This is used by the requests.get() function to download the image, check there is a 'http:' at the start of the string
    
    for product in soup.find_all(class_="c-product__inner"): # change 'class_ = ...' to how the each image / product is split in the website 
        try:
            brand = product.find(class_='c-product__name').text
            price = product.find(class_='c-price__value--current').text.strip()
            src = product.find(class_='c-product__image')['src']

            # Create a dummy dataframe and append to main df
            df1 = pd.DataFrame({'ID': int(id_no), 
                'Clothing Category':str(category),
                'Brand':[brand],
                'Price':[price], 
                'Image URL':[src]
                })
            df = df.append(df1)
            
            #Save image into folder
            image_name = str(id_no) + str(" ") + str(brand) + str(".jpg")
            image_link = src
            with open(str(image_name),'wb') as f:
                im = requests.get(image_link)
                f.write(im.content)
            id_no+=1
        #some will fail... this is expected
        except:
            print('Failed')
            id_no+=1
            pass

Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed


In [5]:
df.shape

(55, 5)

In [6]:
os.chdir(folder)
df.to_csv('Saint_Laurent.csv')