In [1]:
import json
import os
import re
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from selenium import webdriver # !pip install selenium

# Nike

Product listing scraped in this section: https://www.nike.com/w/shoes-y7ok

In [2]:
# First attempt: a plain HTTP GET. Kept for reference only -- the response
# contains just the first 24 shoes because the rest of the list is loaded
# as the page is scrolled, so page scrolling is needed instead.
# The timeout keeps this cell from hanging forever if the server never answers.
nike_page = requests.get("https://www.nike.com/w/shoes-y7ok/", timeout=30)
print("Request status: ", nike_page.status_code)

Request status:  200


In [3]:
# Use a Selenium-driven browser instead of requests: the page needs to be
# scrolled in order to get the list of all shoes. Without scrolling only the
# first 24 shoes are present in the HTML.

driver = webdriver.Chrome()
driver.get("https://www.nike.com/w/shoes-y7ok")
sleep(2)  # Allow 2 seconds for the web page to open
scroll_pause_time = 1 # in seconds; pause after every scroll step
screen_height = driver.execute_script("return window.screen.height;") # value of window.screen.height, used as the size of one scroll step
i = 1  # index of the next scroll step; the scroll target is screen_height * i

while True:
    # scroll one screen height each time
    driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))  
    i += 1
    sleep(scroll_pause_time)
    # re-read the scroll height after every step, as the scroll height can change after we scrolled the page
    scroll_height = driver.execute_script("return document.body.scrollHeight;")  
    # Break the loop when the next position we would scroll to is larger than the total scroll height
    if (screen_height) * i > scroll_height:
        break 

# Parse whatever the browser has rendered at this point.
# NOTE(review): the browser session is left open after this cell (driver.quit()
# is never called here) -- confirm whether later cells reuse `driver`.
nike_soup = BeautifulSoup(driver.page_source, "html.parser")
print (nike_soup.prettify()[:500])

<html class="js-focus-visible" data-js-focus-visible="" lang="en">
 <head>
  <script async="" src="https://www.googletagmanager.com/dclk/ns/v1.js" type="text/javascript">
  </script>
  <script async="" src="https://www.googletagmanager.com/dclk/ns/v1.js" type="text/javascript">
  </script>
  <script async="" src="https://www.googletagmanager.com/dclk/ns/v1.js" type="text/javascript">
  </script>
  <script async="" src="https://www.googletagmanager.com/dclk/ns/v1.js" type="text/javascript">
  </s


In [10]:
def get_label(tag):
    """Return the badge text shown on a product card, or "" if there is none.

    Examples of badges: "Best Seller", "Coming Soon", "Just In", "Sold Out",
    "Member Access", "Sustainable Materials", "Launching in SNKRS",
    "Available in SNKRS", "Customize".

    `tag` is a product-card element; the badge is looked up with a CSS selector.
    """
    badge = tag.select_one('figure .product-card__info .product-card__messaging')
    return badge.text if badge is not None else ""

def get_title(tag):
    """Return the product title of a product card, or None when it is missing.

    A card without a title element used to raise AttributeError (select_one
    returns None when nothing matches), which aborted the whole scrape.
    None is returned instead so the row shows up as a missing value, the same
    way get_price() reports a missing price.
    """
    title_tag = tag.select_one("figure .product-card__title")
    if title_tag is None:
        return None
    return title_tag.text

def get_subtitle(tag):
    """Return the subtitle (product category) of a card, or None when missing.

    Examples: "Shoes", "Men's Shoes", "Women's Shoes", "Big Kids' Shoes",
    "Basketball Shoes".

    select_one returns None when nothing matches; that case now yields None
    (a missing value in the dataframe) instead of raising AttributeError.
    """
    subtitle_tag = tag.select_one("figure .product-card__subtitle")
    if subtitle_tag is None:
        return None
    return subtitle_tag.text

def get_count(tag):
    """Return the "number of colors" text of a card, or None when missing.

    The text is returned as shown on the card, e.g. "2 Colors" or "1 Color".

    select_one returns None when nothing matches; that case now yields None
    (a missing value in the dataframe) instead of raising AttributeError.
    """
    count_tag = tag.select_one("figure .product-card__product-count")
    if count_tag is None:
        return None
    return count_tag.text

def get_reduced_price(tag):
    """Return the discounted price text of a card, or "" when it is not on sale.

    An empty string means the shoe sells at full price (see get_price()).
    """
    sale_tag = tag.find(attrs={'data-test': 'product-price-reduced'})
    return "" if sale_tag is None else sale_tag.text

def get_price(tag):
    """Return the full price text of a card, or None when no price is listed.

    None becomes N/A in the dataframe, so such rows need to be dropped later.
    """
    price_node = tag.find(attrs={'data-test': 'product-price'})
    if price_node is not None:
        return price_node.text
    return None

def get_url(tag):
    """Return the href of the first link inside the card's <figure>.

    The result is None when the link has no href attribute.
    """
    anchor = tag.select_one('figure a')
    return anchor.get('href')

### The helpers below work on individual shoe (product detail) pages
def get_page(url):
    """Download one shoe page and return it parsed as a BeautifulSoup object.

    Sleeps for one second before every request to throttle the scrape.

    Fixes compared with the previous version:
    - 'html.parser' was passed to requests.get (where the second positional
      argument is `params`, i.e. query-string data) instead of to
      BeautifulSoup, so the soup was built without an explicit parser.
    - the request had no timeout and could block forever on a stalled
      connection; it now gives up after 30 seconds.
    """
    sleep(1)
    response = requests.get(url, timeout=30)
    return BeautifulSoup(response.content, 'html.parser')

def get_description(page):
    """Return the text of the description preview paragraph, or None if absent."""
    paragraph = page.select_one('.description-preview p')
    return None if paragraph is None else paragraph.text

def get_colors(page):
    """Return the colorway names of a shoe page joined by "; ", or None.

    The names are taken from the `alt` attribute of the colorway images.

    Fixes compared with the previous version:
    - `select` returns a (possibly empty) list, never None, so the old
      `is None` check could not trigger; a page without colorway images now
      really returns None, as that check intended.
    - images without alt text are skipped instead of making the join raise
      TypeError.
    """
    swatches = page.select('.colorway-images img')
    alt_texts = (img.get('alt') for img in swatches)
    names = [alt for alt in alt_texts if alt]
    if not names:
        return None
    return '; '.join(names)

def get_reviews(page):
    """Return (n_reviews, avg_stars) read from the reviews section of a page.

    n_reviews is the text between the first "(" and the following ")" of the
    section's <h3> heading; avg_stars is the aria-label of the section's
    first <div>. Both are returned as found in the page (no conversion).

    Either element of the tuple is None when it cannot be found, and
    (None, None) is returned when the page has no reviews section. Previously
    a missing <h3>/<div> or a heading without parentheses raised
    AttributeError/ValueError and aborted the scrape.
    """
    review_section = page.find(attrs={'data-test': 'reviewsAccordionClick'})
    if review_section is None:
        return None, None

    n_reviews = None
    heading = review_section.select_one('h3')
    if heading is not None:
        # Split around the first "(" and the first ")" after it; the markers
        # come back empty when the corresponding parenthesis is absent.
        _, opened, remainder = heading.text.partition('(')
        count_text, closed, _ = remainder.partition(')')
        if opened and closed:
            n_reviews = count_text

    avg_stars = None
    rating = review_section.select_one('div')
    if rating is not None:
        avg_stars = rating.get('aria-label')

    return n_reviews, avg_stars
    
    

In [8]:
def parse_nike_shoes(nike_soup):
    """Turn the scraped listing page into a list of dicts, one per shoe.

    First every product card on the listing is read (label, title, subtitle,
    number of colors, prices, url). Then each shoe's own page is downloaded
    via get_page() and the description, colors (separated by "; "), number
    of reviews and average stars are added to its dict.
    """
    records = []
    for card in nike_soup.select('.product-card__body'):
        records.append({
            'label': get_label(card),
            'title': get_title(card),
            'subtitle': get_subtitle(card),
            'num_colors': get_count(card),
            'price': get_price(card),
            'reduced_price': get_reduced_price(card),
            'url': get_url(card),
        })

    # Second pass: one request per shoe for the details only shown on its page.
    for record in records:
        detail_page = get_page(record['url'])
        record['description'] = get_description(detail_page)
        record['colors'] = get_colors(detail_page)
        review_stats = get_reviews(detail_page)
        record['n_reviews'], record['avg_stars'] = review_stats[0], review_stats[1]

    return records
    

In [11]:
# Parse every product card and fetch each shoe's own page. This makes one
# request per shoe (with a 1 second pause each), so the cell takes a while.
nike_shoes_list = parse_nike_shoes(nike_soup)

# One row per shoe; the columns are the keys of the parsed dicts.
nike_shoes_df = pd.DataFrame(nike_shoes_list)
print("Total number of shoes:", len(nike_shoes_df))
# Note: the full page has approximately 1.8K shoes.
# The webdriver scrolling stopped a little early because of a subscription
# popup, so fewer shoes than that were collected.
nike_shoes_df.head()

Total number of shoes: 1368


Unnamed: 0,label,title,subtitle,num_colors,price,reduced_price,url,description,colors,n_reviews,avg_stars
0,Best Seller,Air Jordan 1 Mid,Shoes,2 Colors,$125,,https://www.nike.com/t/air-jordan-1-mid-shoes-...,"Inspired by the original AJ1, the Air Jordan 1...",Black/White/Fire Red; Black/Black/Black,2161.0,4.9
1,Best Seller,Nike Blazer Mid '77 Vintage,Women's Shoes,3 Colors,$105,,https://www.nike.com/t/blazer-mid-77-vintage-w...,Styled for the ‘70s. Loved in the ‘80s. Classi...,White/White/Peach/White; White/Sail/Peach/Blac...,519.0,4.8
2,Coming Soon,Nike Dunk Low Retro,Men's Shoes,1 Color,$110,,https://www.nike.com/t/dunk-low-retro-mens-sho...,Created for the hardwood but taken to the stre...,,,
3,Best Seller,Nike Air Force 1 '07,Men's Shoes,2 Colors,$110,,https://www.nike.com/t/air-force-1-07-mens-sho...,The radiance lives on in the Nike Air Force 1 ...,Black/White; White/Black; Design your own Nike...,128.0,4.9
4,Best Seller,Air Jordan 12 Retro,Men's Shoes,1 Color,$200,,https://www.nike.com/t/air-jordan-12-retro-men...,No need to call for a ride—MJ's game-winning l...,,333.0,4.9


In [12]:
# Create the output folder first: to_csv does not create missing directories,
# and failing here would throw away the result of the long scrape above.
os.makedirs("data", exist_ok=True)
# The dataframe index is written as well (pandas default), so the file gets
# an unnamed first column.
nike_shoes_df.to_csv("data/nike.csv")