# Adidas

https://www.adidas.com/us/shoes?grid=true

In [1]:
from time import sleep
import json
# import re
import requests
from urllib.parse import urljoin
# from selenium import webdriver # !pip install selenium
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

In [2]:
def get_title(tag):
    """Return the product name displayed on a product card.

    The name is the text of the card's ``.glass-product-card__title``
    descendant.
    """
    title_element = tag.select_one(".glass-product-card__title")
    return title_element.text

def get_subtitle(tag):
    """Return the category line of a product card, or None if it has none.

    e.g. "Women's Originals", "Sportswear", "Men's Essentials", "Running" etc.
    """
    subtitle = tag.select_one(".glass-product-card__category")
    # select_one() returns None when nothing matches; report that as a
    # missing value (like get_description does) instead of crashing the
    # whole scrape with an AttributeError on `.text`.
    if subtitle is None:
        return None
    return subtitle.text

def get_count(tag):
    """Return the colour-count label of a product card, or None if absent.

    The label is returned as displayed, e.g. "2 Colors"; it is not converted
    to a number.
    """
    label = tag.select_one(".glass-product-card__label span")
    # select_one() returns None when the card carries no such label; treat
    # that as a missing value rather than failing on `None.text`.
    if label is None:
        return None
    return label.text

def get_price(tag):
    """Return the text of every ``.gl-price-item`` on a product card.

    The result is a list in document order; it is empty when the card shows
    no price.
    """
    # usually starts with [original, sale]
    # may need to double check the order
    prices = []
    for price_item in tag.select(".gl-price-item"):
        prices.append(price_item.text)
    return prices

def get_url(tag):
    """Return the link (``href``) of a product card, or None if it has none.

    The value is returned exactly as written in the page, so it may be a
    relative URL.
    """
    href = tag.get('href')
    if href is not None:
        return href
    # NOTE(review): the caller passes `.glass-product-card` elements; if those
    # are wrappers rather than the <a> itself they carry no href of their own,
    # so fall back to the first link nested inside the card.
    link = tag.select_one("a[href]")
    if link is None:
        return None
    return link.get('href')

### The functions below parse individual shoe pages
def get_page(url):
    """Download `url` and return the response parsed as a BeautifulSoup tree.

    Sleeps for one second before each request so that consecutive calls are
    spaced out.
    """
    sleep(1)
    # 'html.parser' names the BeautifulSoup parser. As the second positional
    # argument of requests.get() it would be sent as query `params` and leave
    # BeautifulSoup without an explicit parser, so pass it to BeautifulSoup.
    response = requests.get(url)
    return BeautifulSoup(response.content, 'html.parser')

def get_description(page):
    """Return the description text of a shoe page, or None.

    Some shoes don't have a description; for those pages the selector
    matches nothing and None is returned.
    """
    paragraph = page.select_one("#navigation-target-description .gl-accordion__content p")
    return paragraph.text if paragraph is not None else None

def get_details(page):
    """Return the specification bullets of a shoe page joined by "; ".

    Returns None when the page has no specifications element, and an empty
    string when that element contains no bullet items.
    """
    # NOTE(review): the selector ends in `p` like the description selector;
    # confirm the bullet lists really live inside that paragraph.
    details = page.select_one("#navigation-target-specifications .gl-accordion__content p")
    if details is None:
        return None
    # Select the <li> items directly. Collecting `x.select("li")` per list
    # yields result *lists*, which have no `.text`, so the old code raised
    # AttributeError as soon as a list was found.
    bullet_text = [item.text for item in details.select(".gl-list li")]
    return "; ".join(bullet_text)

def get_reviews(page):
    """Return ``(n_reviews, avg_stars)`` for a shoe page.

    Both values are returned as the text found in the page (no numeric
    conversion). Either one is None when the corresponding element is
    missing; ``(None, None)`` is returned when the page has no reviews
    section at all.
    """
    review_section = page.select_one("#navigation-target-reviews")
    if review_section is None:
        return None, None

    # The review count is the text between the first "(" and the following
    # ")" of the accordion title. A missing title element or a title without
    # parentheses yields None instead of raising.
    n_reviews = None
    title_tag = review_section.select_one('.gl-accordion__header .gl-accordion__title')
    if title_tag is not None:
        _, opening, remainder = title_tag.text.partition('(')
        count, closing, _ = remainder.partition(')')
        if opening and closing:
            n_reviews = count

    # The average rating lives inside the ratings container; that container
    # can be missing too, so look it up before selecting inside it.
    avg_stars = None
    ratings = review_section.find(attrs={'data-auto-id': 'ratings-reviews'})
    if ratings is not None:
        stars_tag = ratings.select_one('.out-of-5___i5A3q')
        if stars_tag is not None:
            avg_stars = stars_tag.text

    return n_reviews, avg_stars
                                      
def get_colors(page):
    """Return the available colors of a shoe page joined by "; ".

    Each color name is taken from an ``alt`` text such as
    "Product color: Core Black / Core Black / Cloud White" with the
    "Product color:" prefix removed. Returns None when the page has no
    color section or no color could be read from it.
    """
    color_panel = page.find(attrs={"aria-labelledby": "available-colors-label"})
    if color_panel is None:
        return None

    # find() returns only the first match, so use find_all() to get every
    # color variation instead of the children of the first one.
    variations = color_panel.find_all(attrs={"data-testid": "color-variation"})

    alt_texts = []
    for variation in variations:
        own_alt = variation.get('alt')
        if own_alt is not None:
            alt_texts.append(own_alt)
        else:
            # NOTE(review): assumes the alt text sits on an element nested in
            # the variation (e.g. its image) when the variation itself has
            # none -- confirm against the live page markup.
            alt_texts.extend(node.get('alt')
                             for node in variation.find_all(attrs={"alt": True}))

    color_list = []
    for alt in alt_texts:
        # Keep what follows the first ":"; an alt without a colon is used
        # as-is instead of raising ValueError.
        _, colon, name = alt.partition(":")
        cleaned = (name if colon else alt).strip()
        if cleaned:
            color_list.append(cleaned)

    if not color_list:
        return None
    return '; '.join(color_list)  # each color is separated by a "; "


In [3]:
def parse_adidas_shoes(adidas_soup, base_url="https://www.adidas.com"):
    """Scrape every shoe on one listing page, including its detail page.

    Parameters:
        adidas_soup: parsed listing page; every ``.glass-product-card`` in it
            is treated as one shoe.
        base_url: root used to resolve relative product links before the
            detail page is downloaded.

    Returns:
        A list with one dict per shoe. Keys: 'label', 'title', 'subtitle',
        'num_colors', 'url', 'price', 'reduced_price' (None when only one
        price is shown), 'description', 'details', 'colors', 'n_reviews',
        'avg_stars'.

    Raises:
        ValueError: if a card lists no price or more than two prices.
    """
    # get_label() is not defined in the part of the notebook visible here.
    # Look it up at call time so that a missing helper gives label=None
    # instead of a NameError on the first card.
    label_getter = globals().get('get_label')

    shoes_list = []
    for s in adidas_soup.select('.glass-product-card'):
        price_listed = get_price(s)
        if not price_listed:
            raise ValueError("No price listed")
        if len(price_listed) > 2:
            raise ValueError("More than two prices listed")

        shoes_list.append({
            'label': label_getter(s) if label_getter is not None else None,
            'title': get_title(s),
            'subtitle': get_subtitle(s),
            'num_colors': get_count(s),
            'url': get_url(s),
            # get_price() notes the order is usually [original, sale]; keep
            # the first price even when a second (reduced) one is shown.
            'price': price_listed[0],
            'reduced_price': price_listed[1] if len(price_listed) == 2 else None,
        })

    for s in shoes_list:
        if s['url'] is None:
            # No link on the card: nothing to download, leave details empty.
            s['description'] = None
            s['details'] = None
            s['colors'] = None
            s['n_reviews'] = None
            s['avg_stars'] = None
            continue

        # Product links may be relative; urljoin leaves absolute URLs as-is.
        page = get_page(urljoin(base_url, s['url']))
        s['description'] = get_description(page)
        s['details'] = get_details(page)
        s['colors'] = get_colors(page)  # each color is separated by "; "
        s['n_reviews'], s['avg_stars'] = get_reviews(page)

    return shoes_list

In [None]:
# Download the first listing page. The query string must end at "grid=true";
# a trailing "/" would become part of the parameter value ("true/").
adidas_page = requests.get("https://www.adidas.com/us/shoes?grid=true")
print("Request status: ", adidas_page.status_code)

adidas_soup = BeautifulSoup(adidas_page.content, "html.parser")

adidas_shoes_list = parse_adidas_shoes(adidas_soup)

url_root = 'https://www.adidas.com'
page_num = 1
print(f"Number of shoes on page {page_num}: {len(adidas_shoes_list)}")
while True:
    # Find the next page to scrape in the pagination.
    next_page_element = adidas_soup.find(attrs = {'data-auto-id': 'plp-pagination-next'})
    if not next_page_element: # no next page
        break

    page_num += 1

    # The pagination link may be relative; resolve it against the site root.
    next_page_url = next_page_element.get('href')
    url = urljoin(url_root, next_page_url)
    response = requests.get(url)
    # Use the same parser as for the first page so every listing page is
    # parsed the same way (and no optional lxml install is needed).
    adidas_soup = BeautifulSoup(response.content, "html.parser")

    current_list = parse_adidas_shoes(adidas_soup)
    print(f"Number of shoes on page {page_num}: {len(current_list)}")
    adidas_shoes_list.extend(current_list)

# One row per shoe, one column per scraped field.
adidas_shoes_df = pd.DataFrame(adidas_shoes_list)
print(f"\nTotal number of Adidas shoes: {len(adidas_shoes_list)}")

In [None]:
import os

# Preview the first rows, then save the full table.
display(adidas_shoes_df.head())
# to_csv() does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("data", exist_ok=True)
adidas_shoes_df.to_csv("data/adidas.csv")