# Webscrape Express.com

This notebook contains code to webscrape products off of [Express](https://www.express.com/) store.

#### Install Dependencies

In [1]:
!pip install -U -q beautifulsoup4
!pip install -U -q selenium
!pip install -U -q webdriver-manager

You should consider upgrading via the '/Users/kevinlee/anaconda3/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/kevinlee/anaconda3/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/kevinlee/anaconda3/bin/python3 -m pip install --upgrade pip' command.[0m


#### Import Dependencies

In [2]:
from bs4 import BeautifulSoup
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

#### Define Functions

In [3]:
def setup():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    service=Service(ChromeDriverManager().install())
    return options, service

def getPage(options, service, url, wait=1.5):
    browser = webdriver.Chrome(options=options, service=service)
    browser.get(url)
    
    # Give time for page to load
    time.sleep(wait)
    
    html = browser.page_source

    browser.quit()
    return html

def retrieve_all_products(html, elem='div', class_='dNZkhrDUNNoiB2qFg8Mujw=='):
    soup = BeautifulSoup(html, 'lxml')
    # print(soup.prettify())

    # soup.find_all('section', class_='raven-main-app')
    return soup.find_all(elem, class_=class_)



def decodeProducts(products, opts):
    productsJson = []
    
    for p in products:
        
        ratingElem = p.find('div', class_='_5rBR8ccNqs5TPS5Lwd9qbQ==').find('div', class_='xGO2JVAzdcc47FDaU3I+kQ==')
        if ratingElem is not None:
            rating = ratingElem.get_text().split()
        else:
            rating = (None, '(0)')
        
        productsJson.append({
            "Label": p.img['alt'],
            "Sex": opts['sex'],
            "Price": p.find('div', class_='price').span.get_text(),
            "ImageURL": p.img['src'],
            "ProductURL": baseUrl + p.a["href"], 
            "Rating": rating[0],
            "RatingCount": rating[1],
            "Store": opts['store'],
        })
        
    return productsJson

#### Define URLs

In [4]:
baseUrl = 'https://www.express.com'

productUrls = {
    "Men": [
        '/mens-clothing/suiting/suit-ensembles/cat2430045',
        '/mens-clothing/shirts/button-down-shirts/cat4500002?ICID=MLP_BUTTONDOWNSHIRTS',
        '/mens-clothing/shirts/tees-henleys/cat430030',
        '/mens-clothing/shirts/polos/cat1006?ICID=MLP_POLOS',
        '/mens-clothing/shirts/hoodies-sweatshirts/cat1490006'
        '/mens-clothing/Sweaters/cat1490005',
        '/mens-clothing/jeans/cat400003',
        '/mens-clothing/pants/cat1005',
        '/mens-clothing/pants/dress-pants/cat280012',
        '/mens-clothing/pants/chinos/cat1730039',
        '/mens-clothing/labels-we-love/upwest/cat4900006?ICID=MLP_UPWEST',
        '/mens-clothing/labels-we-love/fourlaps/cat4970022?ICID=MLP_FOURLAPS',
        '/mens-clothing/whats-hot/collection/cat5040040',
    ],
    "Women": [
        '/womens-clothing/tops/cat430028',
        '/womens-clothing/tops/Sweaters/cat2012',
        '/womens-clothing/dresses/cat550007',
        '/womens-clothing/dresses/jumpsuits-rompers/cat320051',
        '/womens-clothing/jackets/cat320022',
        '/womens-clothing/jeans/cat2005',
        '/womens-clothing/dress-pants/cat2008',
        '/womens-clothing/bottoms/curvy-bottoms/cat5050018',
        '/womens-clothing/bottoms/leggings/cat1620001',
        '/womens-clothing/bottoms/skirts/cat2011',
        '/womens-clothing/bottoms/skirts/cat2011',
        '/womens-clothing/petites/cat3340001',
        '/womens-clothing/whats-hot/lounge-pajama-sets/cat4770016',
        '/womens-clothing/whats-hot/sustainable-clothes/cat5050153',
        '/womens-clothing/whats-hot/Fashion-trends/cat770013',
    ]
}

#### Scrape

In [5]:
options, service = setup()

# Load pages
catalog = []
for sex, categoryUrls in productUrls.items():
    for catUrl in categoryUrls:
        url = baseUrl + catUrl

        html = getPage(options, service, url)

        products = retrieve_all_products(html)

        print(f'Scraped {len(products)} products from {url}')

        catalog += decodeProducts(products, opts={'sex': sex, 'store': 'Express'})
        
len(catalog)



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/kevinlee/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


Scraped 56 products from https://www.express.com/mens-clothing/suiting/suit-ensembles/cat2430045
Scraped 72 products from https://www.express.com/mens-clothing/shirts/button-down-shirts/cat4500002?ICID=MLP_BUTTONDOWNSHIRTS
Scraped 56 products from https://www.express.com/mens-clothing/shirts/tees-henleys/cat430030
Scraped 56 products from https://www.express.com/mens-clothing/shirts/polos/cat1006?ICID=MLP_POLOS
Scraped 56 products from https://www.express.com/mens-clothing/shirts/hoodies-sweatshirts/cat1490006/mens-clothing/Sweaters/cat1490005
Scraped 56 products from https://www.express.com/mens-clothing/jeans/cat400003
Scraped 56 products from https://www.express.com/mens-clothing/pants/cat1005
Scraped 56 products from https://www.express.com/mens-clothing/pants/dress-pants/cat280012
Scraped 35 products from https://www.express.com/mens-clothing/pants/chinos/cat1730039
Scraped 43 products from https://www.express.com/mens-clothing/labels-we-love/upwest/cat4900006?ICID=MLP_UPWEST
Scra

1313

In [6]:
catalog[42]

{'Label': 'Slim Navy Washable Wool Blend Suit',
 'Sex': 'Men',
 'Price': '$426.00',
 'ImageURL': 'https://images.express.com/is/image/expressfashion/0039_03252539_1378_2_fb?cache=on&wid=361&fmt=jpeg&qlt=85,1&resmode=sharp2&op_usm=1,1,5,0&defaultImage=Photo-Coming-Soon',
 'ProductURL': 'https://www.express.com/clothing/men/slim-navy-washable-wool-blend-suit-pant/pro/03252539_b28580f1b2/color/Navy Blue/color/Navy Blue',
 'Rating': '5',
 'RatingCount': '(2)',
 'Store': 'Express'}

#### Save catalog to JSON

In [7]:
with open('catalog_express.json', 'w') as outfile:
    json.dump(catalog, outfile, indent=4)