# Webscrape Zappos.com

This notebook contains code to webscrape products off of [Zappos](https://www.zappos.com/) store.

#### Install Dependencies

In [31]:
!pip install -q beautifulsoup4
!pip install -q selenium 
!pip install -q webdriver-manager

#### Import Dependencies

In [54]:
from bs4 import BeautifulSoup
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import requests
from tqdm import tqdm

#### Define Functions

In [46]:
def setup():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    service=Service(ChromeDriverManager().install())
    return options, service

def getPage(options, service, url, wait=1.5):
    browser = webdriver.Chrome(options=options, service=service)
    browser.get(url)
    
    # Give time for page to load
    time.sleep(wait)
    
    html = browser.page_source

    browser.quit()
    return html

def retrieve_all_products(html, elem='article', class_='tB-z'):
    soup = BeautifulSoup(html, 'lxml')
    # print(soup.prettify())

    # soup.find_all('section', class_='raven-main-app')
    return soup.find_all(elem, class_=class_)



def decodeProducts(products, opts):
    productsJson = []
    
    for index, p in enumerate(products):
        
        numRatings = p.find("span", class_="Qi-z")
        if numRatings is None:
            numRatings = '0'
            rating = None
        else:
            numRatings = numRatings.find("meta", itemprop="reviewCount ratingCount")["content"] 
            rating = p.find("span", class_="Qi-z")["data-star-rating"]
        cur_href = p.a["href"]
        if 'https' in cur_href:
            full_product_url = cur_href
        else:
            full_product_url = baseUrl + p.a["href"]
            
        productsJson.append({
            "Label": p.find('dd', class_="ml-z").get_text(), # Did not choose to add brand..but that was an option
            "Sex": opts['sex'],
            "Price": p.find('span', class_="ns-z").get_text(),  # Does not get the MSRP. Some have MSRP next to them...
            "ImageURL": p.find('meta', itemprop="image")["content"],
            "ProductURL": full_product_url, 
            "Rating": rating,
            "RatingCount": numRatings,
            "Store": opts['store'],
        })
        
    return productsJson

#### Define URLs

In [47]:
baseUrl = 'https://www.zappos.com'

productUrls = {
    "Men": [
        '/men-coats-outerwear/CKvXARDH1wHAAQLiAgMBAhg.zso',
        '/men-shirts-tops/CKvXARDL1wHAAQLiAgMBAhg.zso',
        '/men-pants/CKvXARDK1wHAAQLiAgMBAhg.zso',
        '/men-hoodies-sweatshirts/CKvXARDF1wHAAQLiAgMBAhg.zso',
        '/men-jeans/CKvXARDI1wHAAQLiAgMBAhg.zso',
        '/men-shorts/CKvXARDM1wHAAQLiAgMBAhg.zso',
    ],
    "Women": [
        '/women-coats-outerwear/CKvXARDH1wHAAQHiAgMBAhg.zso',
        '/women-pants/CKvXARDK1wHAAQHiAgMBAhg.zso'
        '/women-shirts-tops/CKvXARDL1wHAAQHiAgMBAhg.zso',
        '/women-jeans/CKvXARDI1wHAAQHiAgMBAhg.zso',
        '/women-sweaters/CKvXARDQ1wHAAQHiAgMBAhg.zso',
        '/women-dresses/CKvXARDE1wHAAQHiAgMBAhg.zso',
    ]
}

#### Scrape

In [48]:
options, service = setup()

# Load pages
catalog = []
for sex, categoryUrls in productUrls.items():
    for catUrl in categoryUrls:
        url = baseUrl + catUrl

        html = getPage(options, service, url)

        products = retrieve_all_products(html)

        print(f'Scraped {len(products)} products from {url}')

        catalog += decodeProducts(products, opts={'sex': sex, 'store': 'Zappos'})
        
len(catalog)



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/98.0.4758.102/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\avento\.wdm\drivers\chromedriver\win32\98.0.4758.102]


Scraped 104 products from https://www.zappos.com/men-coats-outerwear/CKvXARDH1wHAAQLiAgMBAhg.zso
Scraped 104 products from https://www.zappos.com/men-shirts-tops/CKvXARDL1wHAAQLiAgMBAhg.zso
Scraped 104 products from https://www.zappos.com/men-pants/CKvXARDK1wHAAQLiAgMBAhg.zso
Scraped 104 products from https://www.zappos.com/men-hoodies-sweatshirts/CKvXARDF1wHAAQLiAgMBAhg.zso
Scraped 104 products from https://www.zappos.com/men-jeans/CKvXARDI1wHAAQLiAgMBAhg.zso
Scraped 104 products from https://www.zappos.com/men-shorts/CKvXARDM1wHAAQLiAgMBAhg.zso
Scraped 104 products from https://www.zappos.com/women-coats-outerwear/CKvXARDH1wHAAQHiAgMBAhg.zso
Scraped 104 products from https://www.zappos.com/women-pants/CKvXARDK1wHAAQHiAgMBAhg.zso/women-shirts-tops/CKvXARDL1wHAAQHiAgMBAhg.zso
Scraped 104 products from https://www.zappos.com/women-jeans/CKvXARDI1wHAAQHiAgMBAhg.zso
Scraped 104 products from https://www.zappos.com/women-sweaters/CKvXARDQ1wHAAQHiAgMBAhg.zso
Scraped 104 products from https:

1144

In [62]:
# Ensure All the Links Are Valid
for item in tqdm(catalog):
    link = item["ProductURL"]
    try:
        response = requests.get(link)
        if response.status_code != 200:
            print(f'ProductURL {link} does not exist')
    except:
        print(f'An error happened with ProductURL {link}')

    link = item["ImageURL"]
    try:
        response = requests.get(link)
        if response.status_code != 200:
            print(f'ImageURL {link} does not exist')
    except:
        print(f'An error happened with ImageURL {link}')

100%|██████████████████████████████████████████████████████████████████████████████| 1144/1144 [06:35<00:00,  2.89it/s]


In [60]:
catalog[777]

{'Label': 'Short Sleeve Peasant Top',
 'Sex': 'Women',
 'Price': '$138.00',
 'ImageURL': 'https://m.media-amazon.com/images/I/81AUmjIbo+L._AC_SR255,340_.jpg',
 'ProductURL': 'https://www.zappos.com/p/karen-kane-short-sleeve-peasant-top-print/product/9733179/color/17493',
 'Rating': None,
 'RatingCount': '0',
 'Store': 'Zappos'}

#### Save catalog to JSON

In [30]:
with open('catalog_zappos.json', 'w') as outfile:
    json.dump(catalog, outfile, indent=4)