# Adidas

Source listing page scraped by this notebook: https://www.adidas.com/us/shoes?grid=true

In [1]:
import json
# import re
from pathlib import Path
from time import sleep
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver # !pip install selenium
# import seaborn as sns
# import matplotlib.pyplot as plt

In [16]:
def get_title(tag):
    """Return the title text of a product card, or None when it has no title.

    `tag` is a product-card element; the title is the first element matching
    the ".glass-product-card__title" CSS selector.
    """
    title_node = tag.select_one(".glass-product-card__title")
    return None if title_node is None else title_node.text

def get_subtitle(tag):
    """Return the category line of a product card, or None when it is absent.

    Example values: "Women's Originals", "Sportswear", "Men's Essentials",
    "Running".
    """
    category_node = tag.select_one(".glass-product-card__category")
    if category_node is not None:
        return category_node.text
    return None

def get_count(tag):
    """Return the color-count label of a product card (e.g. "2 Colors").

    The label is returned as raw text, not as a number. Returns None when
    the card has no such label.
    """
    label_node = tag.select_one(".glass-product-card__label span")
    return label_node.text if label_node is not None else None

def get_price(tag):
    """Return the text of every ".gl-price-item" element on a product card.

    The list follows document order and is empty when no price is shown.
    It usually reads [original, sale], but that order has not been
    double-checked.
    """
    prices = []
    for price_node in tag.select(".gl-price-item"):
        prices.append(price_node.text)
    return prices

def get_url(tag):
    """Return the product-page link (`href`) of a product card.

    Returns None when the card has no anchor under
    ".glass-product-card__assets" (or the anchor has no `href`), so callers
    can skip the card instead of crashing on a missing element.
    """
    link = tag.select_one(".glass-product-card__assets a")
    if link is None:
        return None
    return link.get('href')

### The helpers below work on an individual shoe's product page (not the listing grid)
def get_page(url):
    """Load `url` in a fresh Chrome session and return the parsed page.

    The page is given 3 seconds to render before its source is captured and
    parsed with BeautifulSoup's "html.parser".

    The browser session is always shut down, even if navigation or parsing
    raises, so failed loads do not leave Chrome/chromedriver processes behind.
    Any exception from Selenium propagates to the caller.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        sleep(3)  # fixed wait for the page to finish rendering
        return BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # quit() ends the whole session; close() would only close the window.
        driver.quit()

def get_description(page):
    """Return the text of the first description paragraph on a product page.

    Returns None for shoes whose page has no description paragraph.
    """
    selector = "#navigation-target-description .gl-accordion__content p"
    paragraph = page.select_one(selector)
    if paragraph is not None:
        return paragraph.text
    # Some shoes don't have a description.
    return None

def get_details(page):
    """Return the specification bullets of a product page as one string.

    Every ".gl-vspace-bpall-small" item inside each ".gl-list" of the
    specifications accordion is collected in document order and joined with
    "; ". Returns None when the page has no specifications section, and an
    empty string when the section contains no bullets.
    """
    spec_section = page.select_one("#navigation-target-specifications .gl-accordion__content")
    if spec_section is None:
        return None

    bullet_text = []
    for bullet_list in spec_section.select(".gl-list"):
        for bullet in bullet_list.select(".gl-vspace-bpall-small"):
            bullet_text.append(bullet.text)
    return "; ".join(bullet_text)

def get_reviews(page):
    """Return `(n_reviews, avg_stars)` scraped from a product page.

    Both values are returned as text exactly as found on the page.

    * `n_reviews` is whatever sits between the first "(" and the following
      ")" in the reviews accordion title; None when the title carries no
      parenthesised count (previously this raised ValueError).
    * `avg_stars` is the text of the rating element; None when it is missing.

    Returns `(None, None)` when the page has no reviews section or the
    section has no title.
    """
    review_section = page.select_one("#navigation-target-reviews")
    if review_section is None:
        return None, None

    review_title = review_section.select_one('.gl-accordion__header .gl-accordion__title')
    if not review_title:
        return None, None

    title_text = review_title.text
    open_idx = title_text.find('(')
    close_idx = title_text.find(')', open_idx + 1)
    if open_idx == -1 or close_idx == -1:
        n_reviews = None  # title without a "(count)" part
    else:
        n_reviews = title_text[open_idx + 1 : close_idx]

    avg_stars = review_section.find(attrs = {'data-auto-id': 'ratings-reviews'})
    if avg_stars:
        # NOTE(review): the class name carries a build-generated-looking
        # suffix and may change when the site is redeployed - confirm.
        avg_stars = avg_stars.select_one('.out-of-5___i5A3q')
    if avg_stars:
        return n_reviews, avg_stars.text
    return n_reviews, None
                                      
def get_colors(page):
    """Return the color names found on a product page, joined with "; ".

    Colors are read from the `alt` text of the children of the first
    "color-variation" element inside the available-colors block. An alt such
    as "Product color: Core Black / Core Black / Cloud White" is one color;
    everything up to and including the first ":" is dropped (the remainder is
    not stripped).

    Children that carry no `alt` (e.g. whitespace text nodes) are skipped, and
    an alt without ":" is kept unchanged; both cases used to raise.

    Returns None when the page has no available-colors block (products with
    one color - the color can still be taken from the product details) or no
    "color-variation" element inside it.
    """
    available = page.find(attrs = {"aria-labelledby": "available-colors-label"})
    if available is None:
        return None

    # NOTE(review): `find` only returns the first match. If every color has
    # its own "color-variation" element, `find_all` would be needed to list
    # them all - confirm against the page markup.
    variation = available.find(attrs = {"data-testid": "color-variation"})
    if variation is None:
        return None

    color_list = []
    for child in variation:
        getter = getattr(child, "get", None)  # text nodes expose no .get()
        alt_text = getter("alt") if callable(getter) else None
        if not alt_text:
            continue
        _prefix, colon, color = alt_text.partition(":")
        color_list.append(color if colon else alt_text)
    return '; '.join(color_list) # each color is separated by a "; "


In [17]:
def parse_adidas_shoes(adidas_soup, url_root = "https://www.adidas.com"):
    """Extract every shoe on a listing page and enrich it from its own page.

    Parameters
    ----------
    adidas_soup : parsed listing page; each ".glass-product-card" element is
        treated as one shoe.
    url_root : base URL used to resolve relative product links.

    Returns
    -------
    list of dict, one per card, with the keys 'title', 'subtitle',
    'num_colors', 'url', 'price' and 'reduced_price' always present (None
    when unknown). Cards that have a url additionally get 'description',
    'details', 'colors', 'n_reviews' and 'avg_stars', read from the product
    page loaded with `get_page` (one browser launch per shoe).
    """
    shoes_list = []
    for s in adidas_soup.select('.glass-product-card'):
        # Every record starts with both price keys so all rows share one schema.
        shoe_dict = {'title': get_title(s),
                     'subtitle': get_subtitle(s),
                     'num_colors': get_count(s),
                     'url': get_url(s),
                     'price': None,
                     'reduced_price': None}
        price_listed = get_price(s)
        if len(price_listed) == 1:
            shoe_dict['price'] = price_listed[0]
        elif len(price_listed) == 2:
            # Assumed order is [original, sale]; see get_price.
            shoe_dict['price'] = price_listed[0]
            shoe_dict['reduced_price'] = price_listed[1]
        elif len(price_listed) > 2:
            # Ambiguous: report it and leave both prices as None.
            print(f"More than two prices listed: {shoe_dict['title']}")
            print(price_listed)
        shoes_list.append(shoe_dict)

    for s in shoes_list:
        if s['url'] is None:
            continue
        # urljoin resolves relative links against url_root and leaves
        # already-absolute links untouched.
        page = get_page(urljoin(url_root, s['url']))
        s['description'] = get_description(page)
        s['details'] = get_details(page)
        s['colors'] = get_colors(page) # each color is separated by "; "
        s['n_reviews'], s['avg_stars'] = get_reviews(page)

    return shoes_list

In [4]:
# Scrape the whole catalogue: load the first listing page, then follow the
# pagination "next" link until there is none.
# The pages are fetched through Selenium rather than `requests`
# (presumably because the grid is rendered client-side - TODO confirm).
url_root = 'https://www.adidas.com'
page_num = 1

driver = webdriver.Chrome()
try:
    driver.get("https://www.adidas.com/us/shoes?grid=true%2F")
    sleep(6)  # fixed wait for the grid to render
    print("Connected to the page")
    adidas_soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always end the browser session, even when the load fails.
    driver.quit()

adidas_shoes_list = parse_adidas_shoes(adidas_soup)

while True:
    # Find the next page to scrape in the pagination.
    next_page_element = adidas_soup.find(attrs = {'data-auto-id': 'plp-pagination-next'})
    if not next_page_element: # no next page
        break

    next_page_url = next_page_element.get('href')
    if not next_page_url: # "next" element without a link: nothing to follow
        break

    page_num += 1
    print(next_page_url)

    driver = webdriver.Chrome()
    try:
        driver.get(urljoin(url_root, next_page_url))
        sleep(6)
        adidas_soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()

    current_list = parse_adidas_shoes(adidas_soup)
    print(f"Number of shoes on page {page_num}: {len(current_list)}")
    if not current_list:
        # An empty page usually has no "next" link either, so the loop is about
        # to stop; make it visible that this may be a failed load rather than
        # the real end of the catalogue.
        print(f"Warning: page {page_num} returned no shoes; the scrape may be incomplete")
    adidas_shoes_list.extend(current_list)

adidas_shoes_df = pd.DataFrame(adidas_shoes_list)
print(f"\nTotal number of Adidas shoes: {len(adidas_shoes_list)}")

Connected to the page
/us/shoes?grid=true%2F&start=48
Number of shoes on page 2: 48
/us/shoes?grid=true%2F&start=96
Number of shoes on page 3: 48
/us/shoes?grid=true%2F&start=144
Number of shoes on page 4: 48
/us/shoes?grid=true%2F&start=192
Number of shoes on page 5: 0

Total number of Adidas shoes: 192


In [5]:
# Preview the scraped table, then persist it.
display(adidas_shoes_df.head())
# to_csv does not create missing folders, so make sure the target exists
# instead of failing after the scrape has already finished.
Path("data").mkdir(parents=True, exist_ok=True)
adidas_shoes_df.to_csv("data/adidas.csv")

Unnamed: 0,title,subtitle,num_colors,url,price,reduced_price,description,details,colors,n_reviews,avg_stars
0,Start Your Run Shoes,Women's Running,4 colors,/us/start-your-run-shoes/GY9233.html,$65,$33,You'll want these adidas running shoes the nex...,,Dash Grey / Matte Silver / Core Black,5,4.8
1,NMD_R1 Shoes,Youth Originals,,/us/nmd_r1-shoes/H03994.html,$130,$91,"One shoe to rule them all. School, work or kic...",,,131,4.6
2,Edge Lux Shoes,Women's Training,5 colors,/us/edge-lux-shoes/GZ6741.html,$90,$45,"Comfort is key, whether you're racing to catch...",,Core Black / Core Black / Iron Metallic,191,4.0
3,Adilette Comfort Slides,Sportswear,19 colors,/us/adilette-comfort-slides/GW9647.html,$40,$24,Classics for a reason. These adidas slides are...,,Core Black / Core White / Grey Six,9735,4.7
4,Fluidflow 2.0 Shoes,Men's Sportswear,3 colors,https://www.adidas.com/us/fluidflow-2.0-shoes/...,$85,$51,It doesn't really matter whether or not a run ...,,Legend Ink / Cloud White / Shadow Maroon,866,4.6


### Extract and save by page

In [20]:
# Scrape listing pages one by one, starting at `page_num`, and save each page
# to its own CSV so an interrupted run can be resumed from the last saved page.
page_num = 41  # first page to scrape (1-based); change this to resume elsewhere
SHOES_PER_PAGE = 48  # the listing's `start` offset advances by 48 per page

url_root = "https://www.adidas.com"
url_page = f"/us/shoes?grid=true%2F&start={SHOES_PER_PAGE * (page_num - 1)}"

# Fail fast: make sure the output folder exists before the slow scrape starts.
Path("data").mkdir(parents=True, exist_ok=True)

driver = webdriver.Chrome()
try:
    driver.get(urljoin(url_root, url_page))
    sleep(6)  # fixed wait for the grid to render
    print(f"Connected to the page {page_num}")
    adidas_soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always end the browser session, even when the load fails.
    driver.quit()

adidas_shoes_list = parse_adidas_shoes(adidas_soup)
adidas_shoes_df = pd.DataFrame(adidas_shoes_list)
print(f"\nTotal number of Adidas shoes on page {page_num}: {len(adidas_shoes_list)}")
adidas_shoes_df.to_csv(f"data/adidas_page{page_num}.csv")

while True:
    # Find the next page to scrape in the pagination.
    next_page_element = adidas_soup.find(attrs = {'data-auto-id': 'plp-pagination-next'})
    if not next_page_element: # no next page
        break

    next_page_url = next_page_element.get('href')
    if not next_page_url: # "next" element without a link: nothing to follow
        break

    page_num += 1
    print(f"url page {page_num}: {next_page_url}")

    driver = webdriver.Chrome()
    try:
        driver.get(urljoin(url_root, next_page_url))
        sleep(6)
        adidas_soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()

    adidas_shoes_list = parse_adidas_shoes(adidas_soup)
    adidas_shoes_df = pd.DataFrame(adidas_shoes_list)
    print(f"\nTotal number of Adidas shoes on page {page_num}: {len(adidas_shoes_list)}")
    if not adidas_shoes_list:
        # Possibly a failed load rather than the real end of the catalogue.
        print(f"Warning: page {page_num} returned no shoes; the scrape may be incomplete")
    adidas_shoes_df.to_csv(f"data/adidas_page{page_num}.csv")

Connected to the page 41

Total number of Adidas shoes on page 41: 48
url page 42: /us/shoes?grid=true%2F&start=1968

Total number of Adidas shoes on page 42: 48
url page 43: /us/shoes?grid=true%2F&start=2016

Total number of Adidas shoes on page 43: 48
url page 44: /us/shoes?grid=true%2F&start=2064

Total number of Adidas shoes on page 44: 48
url page 45: /us/shoes?grid=true%2F&start=2112

Total number of Adidas shoes on page 45: 48
url page 46: /us/shoes?grid=true%2F&start=2160

Total number of Adidas shoes on page 46: 48
url page 47: /us/shoes?grid=true%2F&start=2208

Total number of Adidas shoes on page 47: 48
url page 48: /us/shoes?grid=true%2F&start=2256

Total number of Adidas shoes on page 48: 48
url page 49: /us/shoes?grid=true%2F&start=2304

Total number of Adidas shoes on page 49: 48
url page 50: /us/shoes?grid=true%2F&start=2352

Total number of Adidas shoes on page 50: 33
