In [65]:
# I aim to scrape catalog pictures of throw pillows from Overstock
# filters: "solid color", "Square" or "rectangular", " Accent" pillows
# url is "https://www.overstock.com/Home-Garden/Throw-Pillows/Solid-Color,Rectangle~Square,Accent,
# /pattern,pillow-shape,pillow-type,/2011/subcat.html?page=2"  
# where page ranges between 1 to 32
# credit to "https://medium.com/the-andela-way/introduction-to-web-scraping-using-selenium-7ec377a8cf72"
# credit to "https://intoli.com/blog/running-selenium-with-headless-chrome/"
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import os
import pandas as pd

In [96]:
# Launch your browser
options = webdriver.ChromeOptions()

# Initialize the driver
driver = webdriver.Chrome(options = options)

# experiment with page 1
page = 1
url_prefix = 'https://www.overstock.com/Home-Garden/Throw-Pillows/Solid-Color,Rectangle~Square,Accent,/pattern,pillow-shape,pillow-type,/2011/subcat.html?page='
url = url_prefix + str(page)
driver.get(url)

# wait up to 60 sec for the elements to become available
driver.implicitly_wait(60)

In [97]:
height = driver.execute_script("return document.documentElement.scrollHeight")
scrolls = int(height/1080)

# scroll window step by step
for i in range(scrolls):
    driver.execute_script("window.scrollBy(0, 1080/3*i)") 
    time.sleep(1)
 

In [98]:
# find_elements_by_xpath returns an array of selenium objects
img_url_element=driver.find_elements_by_xpath("//img[@class='productCardFrontImage']")

# use list comprehension to get the actual palette titles and not the selenium objects
img_url = [x.get_attribute('src') for x in img_url_element]

# print out all the titles
print ('image urls:')
print ( img_url, '\n')
print (len(img_url))

image urls:
['https://ak1.ostkcdn.com/images/products/9412909/L16600539.jpg?imwidth=480&impolicy=medium', 'https://ak1.ostkcdn.com/images/products/13218518/L19936654.jpg?imwidth=480&impolicy=medium', 'https://ak1.ostkcdn.com/images/products/8816874/L16050950.jpg?imwidth=480&impolicy=medium', 'https://ak1.ostkcdn.com/images/products/8817153/L16051170.jpg?imwidth=480&impolicy=medium', 'https://ak1.ostkcdn.com/images/products/10694023/L17755976.jpg?imwidth=480&impolicy=medium', 'https://ak1.ostkcdn.com/images/products/11381991/L18350445.jpg?imwidth=480&impolicy=medium', 'https://ak1.ostkcdn.com/images/products/6624571/L14191181.jpg?imwidth=480&impolicy=medium', 'https://ak1.ostkcdn.com/images/products/25776704/L31070776.jpg?imwidth=480&impolicy=medium', 'https://ak1.ostkcdn.com/images/products/9937110/L17092429.jpg?imwidth=480&impolicy=medium', 'https://ak1.ostkcdn.com/images/products/12038041/L18909705.jpg?imwidth=480&impolicy=medium', 'https://ak1.ostkcdn.com/images/products/10438315/L1

In [99]:
# find_elements_by_xpath returns an array of selenium objects
wishlist_element=driver.find_elements_by_xpath("//div[@class='wishlistCount']")

# use list comprehension to get the actual palette titles and not the selenium objects
wishlist = [x.text for x in wishlist_element]

# print out all the wishlist counts
print ('wishlist counts:')
print ( wishlist, '\n')
print (len(wishlist))

wishlist counts:
['7.8K', '3.4K', '687', '774', '2.7K', '1.4K', '1K', '96', '173', '3K', '1.3K', '1.1K', '875', '2.3K', '552', '891', '61', '306', '249', '55', '117', '733', '45', '157', '126', '51', '1.1K', '43', '75', '283', '2.5K', '41', '360', '463', '554', '1.6K', '37', '577', '182', '141', '594', '580', '757', '456', '74', '81', '500', '51', '428', '617', '427', '1K', '1.7K', '172', '1.2K', '254', '479', '1.5K', '237', '203'] 

60


In [100]:
def OS_scraping(PageNumber) :
    
    # Launch your browser
    options = webdriver.ChromeOptions()
    
    # Initialize the driver and open the url
    driver = webdriver.Chrome(options = options)
    
    url_prefix = 'https://www.overstock.com/Home-Garden/Throw-Pillows/Solid-Color,Rectangle~Square,Accent,/pattern,pillow-shape,pillow-type,/2011/subcat.html?page='
    url = url_prefix + str(PageNumber)
    driver.get(url)

    # wait up to 60 sec for the elements to become available
    driver.implicitly_wait(60)   
    # prevent loading images
    # prefs = {'profile.managed_default_content_settings.images':2}
    # options.add_experimental_option("prefs", prefs)
    
    # scroll window step by step
    height = driver.execute_script("return document.documentElement.scrollHeight")
    scrolls = int(height/1080)
    for i in range(scrolls):
        driver.execute_script("window.scrollBy(0, 1080/3*i)") 
        time.sleep(2)

    # find_elements_by_xpath returns an array of selenium objects
    img_url_element=driver.find_elements_by_xpath("//img[@class='productCardFrontImage']")

    # use list comprehension to get the actual palette titles and not the selenium objects
    img_url = [x.get_attribute('src') for x in img_url_element]
    
    # find_elements_by_xpath returns an array of selenium objects
    wishlist_element=driver.find_elements_by_xpath("//div[@class='wishlistCount']")

    # use list comprehension to get the actual palette titles and not the selenium objects
    wishlist = [x.text for x in wishlist_element]
    
    # close the window after a scratch 
    driver.close()
    
    zipped_list = list(zip(img_url,wishlist))
    output = pd.DataFrame(zipped_list, columns = ['url','wishlist_count'])
    
    return output

In [101]:
# scrape through page 1-32
# first, initiate data with page 1; lists taken from the trial run above
zipped_list = list(zip(img_url,wishlist))
pd_data = pd.DataFrame(zipped_list, columns = ['url','wishlist_count'])
print(pd_data)

                                                  url wishlist_count
0   https://ak1.ostkcdn.com/images/products/941290...           7.8K
1   https://ak1.ostkcdn.com/images/products/132185...           3.4K
2   https://ak1.ostkcdn.com/images/products/881687...            687
3   https://ak1.ostkcdn.com/images/products/881715...            774
4   https://ak1.ostkcdn.com/images/products/106940...           2.7K
5   https://ak1.ostkcdn.com/images/products/113819...           1.4K
6   https://ak1.ostkcdn.com/images/products/662457...             1K
7   https://ak1.ostkcdn.com/images/products/257767...             96
8   https://ak1.ostkcdn.com/images/products/993711...            173
9   https://ak1.ostkcdn.com/images/products/120380...             3K
10  https://ak1.ostkcdn.com/images/products/104383...           1.3K
11  https://ak1.ostkcdn.com/images/products/859524...           1.1K
12  https://ak1.ostkcdn.com/images/products/103565...            875
13  https://ak1.ostkcdn.com/images

In [102]:
for i in range (2,33):
    pd_data = pd_data.append(OS_scraping(PageNumber=i))
    time.sleep(5)
print (pd_data.shape)

dir = os.getcwd ()
export_csv = pd_data.to_csv( os.path.join (dir, r'image_url_061119update2nd.csv'), index = True, header = True)

(1426, 2)


Only 1426/1888 image urls are scraped. Further slow down the scrolling to let pages get fully loaded.