In [1]:
from time import sleep

import requests
from selenium import webdriver # !pip install selenium
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

In [2]:
# Load the scraped Nike catalogue, using the CSV's first column as the row index.
nike_df = pd.read_csv("data/nike.csv", index_col=0)

# Preview the first three rows, then summarise dtypes and non-null counts.
display(nike_df.head(n=3))
nike_df.info()

Unnamed: 0,label,title,subtitle,num_colors,price,reduced_price,url,description,colors,n_reviews,avg_stars
0,Best Seller,Air Jordan 1 Mid,Shoes,2 Colors,$125,,https://www.nike.com/t/air-jordan-1-mid-shoes-...,"Inspired by the original AJ1, the Air Jordan 1...",Black/White/Fire Red; Black/Black/Black,2161.0,4.9
1,Best Seller,Nike Blazer Mid '77 Vintage,Women's Shoes,3 Colors,$105,,https://www.nike.com/t/blazer-mid-77-vintage-w...,Styled for the ‘70s. Loved in the ‘80s. Classi...,White/White/Peach/White; White/Sail/Peach/Blac...,519.0,4.8
2,Coming Soon,Nike Dunk Low Retro,Men's Shoes,1 Color,$110,,https://www.nike.com/t/dunk-low-retro-mens-sho...,Created for the hardwood but taken to the stre...,,,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1368 entries, 0 to 1367
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   label          409 non-null    object 
 1   title          1368 non-null   object 
 2   subtitle       1367 non-null   object 
 3   num_colors     1368 non-null   object 
 4   price          1366 non-null   object 
 5   reduced_price  579 non-null    object 
 6   url            1368 non-null   object 
 7   description    1340 non-null   object 
 8   colors         840 non-null    object 
 9   n_reviews      1271 non-null   float64
 10  avg_stars      1271 non-null   float64
dtypes: float64(2), object(9)
memory usage: 128.2+ KB


In [3]:
# There are some gift cards in the data set, which will be dropped
# as the project focuses solely on footwear products.
# Build the mask once so the preview and the removal act on exactly the same rows.
# regex=False matches the literal text; na=False treats a missing title as
# "not a gift card" instead of yielding NaN, which cannot be used as a boolean mask.
is_gift_card = nike_df['title'].str.contains("Gift Card", regex=False, na=False)
display(nike_df.loc[is_gift_card])
print("Dropping gift card items...")
# Rebind to a filtered copy instead of mutating the frame in place.
nike_df = nike_df.loc[~is_gift_card].copy()

Unnamed: 0,label,title,subtitle,num_colors,price,reduced_price,url,description,colors,n_reviews,avg_stars
248,,Nike Digital Gift Card,Emailed in Approximately 2 Hours or Less,9 Colors,,,https://www.nike.com/t/digital-gift-card-email...,"Emailed in approximately 2 hours or less, this...",; ; ; ; ; ; ; ; ; ; ;,,
978,Just In,Nike Gift Card,,5 Colors,,,https://www.nike.com/t/gift-card-RpyEVO/GIFTCA...,"Mailed in a mini Nike shoebox, this gift card ...",; ; ; ; ; ; ; ; ; ; ; ; ; ;,,


Dropping gift card items...


In [4]:
def get_page(url):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Sleeps for half a second before each request to throttle the scraping.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup
        Document parsed from the raw response body with ``html.parser``.
    """
    sleep(0.5)
    response = requests.get(url)
    # The parser name belongs to BeautifulSoup; as the second positional
    # argument of requests.get it would be sent as query parameters instead.
    return BeautifulSoup(response.content, 'html.parser')

def get_one_color(url):
    """Return the colour description text shown on a product page.

    Fetches the page with ``get_page`` and looks up the first element
    matching ``.description-preview__color-description``.  Returns ``None``
    when the page contains no such element.
    """
    color_tag = get_page(url).select_one('.description-preview__color-description')
    return color_tag.text if color_tag is not None else None

In [6]:
def _pick_colors(row):
    """Return the colours for one row, re-scraping them for "1 Color" products."""
    if row['num_colors'] == "1 Color":
        # Single-colour products get their colour fetched from the product page.
        return get_one_color(row['url'])
    # Every other row keeps the value already stored in the data set.
    return row['colors']


nike_df['colors'] = nike_df.apply(_pick_colors, axis=1)

In [7]:
# Re-check the non-null counts now that `colors` has been reassigned.
nike_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1366 entries, 0 to 1367
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   label          408 non-null    object 
 1   title          1366 non-null   object 
 2   subtitle       1366 non-null   object 
 3   num_colors     1366 non-null   object 
 4   price          1366 non-null   object 
 5   reduced_price  579 non-null    object 
 6   url            1366 non-null   object 
 7   description    1338 non-null   object 
 8   colors         1231 non-null   object 
 9   n_reviews      1271 non-null   float64
 10  avg_stars      1271 non-null   float64
dtypes: float64(2), object(9)
memory usage: 128.1+ KB


In [19]:
# Split the rows that still lack colour information by how many colourways
# were listed for them when the data was scraped.
missing_colors = nike_df.loc[nike_df['colors'].isna()]
is_single_color = missing_colors['num_colors'] == "1 Color"
na_1color = missing_colors.loc[is_single_color]
na_2plus_color = missing_colors.loc[~is_single_color]
print(f"There are {na_1color.shape[0]} observations with 1 color but their corresponding webpages/urls are no longer available.")
print(f"There are {na_2plus_color.shape[0]} observations with more than one color available when the data was first scraped from the Nike website. We will drop these rows because some products are no longer available and some now have a different number of colors.")

# Carry out the drop announced in the message above; previously the rows were
# only counted and stayed in the frame. Rebinding (rather than dropping in
# place) leaves the report slices computed above untouched.
nike_df = nike_df.drop(index=na_2plus_color.index)

There are 86 observations with 1 color but their corresponding webpages/urls are no longer available.
There are 49 observations with more than one color available when the data was first scraped from the Nike website. We will drop these rows because some products are no longer available and some now have a different number of colors.


In [20]:
# Save the current frame; to_csv writes the row index as the first column by default.
nike_df.to_csv("data/nike_cleaned.csv")