# Capstone Project: Dog Toy Recommendation System 
### Data Collection 
To collect my data, I will use Selenium and ChromeDriver to scrape dog toy reviews from Chewy. 

Sources used throughout this page:
https://selenium-python.readthedocs.io/locating-elements.html#locating-hyperlinks-by-link-text
https://www.scrapingbee.com/blog/selenium-python/
https://www.scrapingbee.com/blog/practical-xpath-for-web-scraping/
https://www.scrapingbee.com/blog/scraping-single-page-applications/
https://stackoverflow.com/questions/11549647/getting-the-url-of-the-current-page-using-selenium-webdriver
https://towardsdatascience.com/in-10-minutes-web-scraping-with-beautiful-soup-and-selenium-for-data-professionals-8de169d36319
https://medium.com/ymedialabs-innovation/web-scraping-using-beautiful-soup-and-selenium-for-dynamic-page-2f8ad15efe25
https://towardsdatascience.com/web-scraping-using-selenium-python-8a60f4cf40ab
https://towardsdatascience.com/5-top-tips-for-data-scraping-using-selenium-d8b83804681c

In [1]:
# Code in my selenium_practice.py file so far for scraping data

import pandas as pd

# imports 
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

In [2]:
def scrape_toy_title(page_source):
    """Return the product title text from a product page.

    Args:
        page_source: HTML of a single product page.

    Returns:
        The whitespace-stripped text of the ``<h1>`` inside
        ``div#product-title``, looked up within ``section#right-column``.

    Raises:
        AttributeError: if any of those elements is missing, because
            ``find`` returns ``None`` for a tag that is not present.
    """
    soup = BeautifulSoup(page_source, 'lxml')

    section = soup.find('section', id='right-column')
    heading = section.find('div', id='product-title').find('h1')
    return heading.get_text().strip()

In [3]:
def scrape_toy_price(page_source):
    """Return the price text shown on a product page.

    Args:
        page_source: HTML of a single product page.

    Returns:
        The whitespace-stripped text of the ``span.ga-eec__price`` element
        nested inside the ``div#pricing`` block.

    Raises:
        AttributeError: if any element along the path is missing.
    """
    # Each step narrows the search to the element found by the previous one.
    path = (
        ('div', {'id': 'pricing'}),
        ('ul', {'class_': 'product-pricing'}),
        ('li', {'class_': 'our-price'}),
        ('p', {'class_': 'price'}),
        ('span', {'class_': 'ga-eec__price'}),
    )
    node = BeautifulSoup(page_source, 'lxml')
    for tag, attrs in path:
        node = node.find(tag, **attrs)
    return node.get_text().strip()

In [4]:
def scrape_toy_description(page_source):
    """Return the text of every description paragraph on a product page.

    Args:
        page_source: HTML of a single product page.

    Returns:
        A list with the text of each ``<p>`` found in the left-hand
        description section (an empty list if the section has no ``<p>``).

    Raises:
        AttributeError: if the tabs container, the ``article#descriptions``
            element or the description section is missing.
    """
    soup = BeautifulSoup(page_source, 'lxml')
    section = soup.find(
        'div', class_='cw-tabs__body container').find(
        'article', id='descriptions').find(
        'section', class_='descriptions__content cw-tabs__content--left')

    # find_all copes with zero, one or many <p> tags, so no separate
    # single-paragraph fallback is needed.
    return [paragraph.get_text() for paragraph in section.find_all('p')]
    

In [5]:
# def scrape_toy_description(page_source):
#     soup = BeautifulSoup(page_source, 'lxml')
#     descriptions =  soup.find(
#             'div', class_='cw-tabs__body container').find(
#             'article', id='descriptions').find(
#             'section', class_='descriptions__content cw-tabs__content--left').find_all(
#             'p')
#     text_list = []
#     for description in descriptions:
#         text = description.get_text()
#         text_list.append(text)
#     return text_list

In [6]:
def scrape_toy_keybenefits(page_source):
    """Return the key-benefit bullet texts from a product page.

    Args:
        page_source: HTML of a single product page.

    Returns:
        A flat list with the text of each ``<li>`` in the first ``<ul>`` of
        the left-hand description section.

    Raises:
        AttributeError: if any element along the path (including the
            ``<ul>``) is missing.
    """
    soup = BeautifulSoup(page_source, 'lxml')
    tabs_body = soup.find('div', class_='cw-tabs__body container')
    article = tabs_body.find('article', id='descriptions')
    content = article.find(
        'section', class_='descriptions__content cw-tabs__content--left')
    bullet_list = content.find('ul')

    # One flat list of strings; wrap each text in its own list here if a
    # list-per-benefit layout is ever wanted instead.
    return [item.get_text() for item in bullet_list.find_all('li')]

In [7]:
def scrape_toy_rating(page_source):
    """Return the rating fragment taken from the rating image's ``src``.

    Args:
        page_source: HTML of a single product page.

    Returns:
        The three characters of the ``<img>`` ``src`` attribute that sit
        just before its last four characters.  NOTE(review): presumably the
        numeric rating embedded in the image file name -- confirm against a
        live page.

    Raises:
        AttributeError: if the header, the review block, the ``<picture>``
            or the ``<img>`` is missing.
    """
    soup = BeautifulSoup(page_source, 'lxml')
    header_extras = soup.find('div', class_='product-header-extras')
    ugc_head = header_extras.find('div', class_='ugc ugc-head')
    image = ugc_head.find('picture').find('img')

    # Tag attributes are read with dict-style access.
    src = image['src']
    return src[-7:-4]

In [8]:
def scrape_toy_reviews(page_source):
    """Return the text of every review shown in the page source.

    Args:
        page_source: HTML of a single product page.

    Returns:
        A list of strings, one per ``span.ugc-list__review__display``
        element (empty if the page shows no reviews).
    """
    soup = BeautifulSoup(page_source, 'lxml')
    reviews = soup.find_all('span', class_='ugc-list__review__display')

    # Keep the extracted text, not the Tag object itself.
    return [review.get_text() for review in reviews]

# Need to figure out the best ways to get all the reviews 

In [9]:
def scrape_toy(page_source):
    """Collect every scraped field of one product page into a dict.

    Args:
        page_source: HTML of a single product page.

    Returns:
        A dict with the keys ``title``, ``price``, ``descriptions``,
        ``rating`` and ``reviews``, plus ``key_benefits`` when the page has
        a key-benefits list.

    Raises:
        AttributeError: if a required element (title, price, description,
            rating) is missing from the page.
    """
    toy_dict = {
        'title': scrape_toy_title(page_source),
        'price': scrape_toy_price(page_source),
        'descriptions': scrape_toy_description(page_source),
    }

    try:
        toy_dict['key_benefits'] = scrape_toy_keybenefits(page_source)
    except AttributeError:
        # Key benefits are optional: a missing element surfaces as an
        # AttributeError on None, and the key is then simply left out.
        pass

    # toy rating -- NEEDS FIXING (note carried over from the original)
    toy_dict['rating'] = scrape_toy_rating(page_source)
    toy_dict['reviews'] = scrape_toy_reviews(page_source)
    return toy_dict

In [10]:
def scrape_toy_page(toy_cat_dict, toy_subcat, toy_links):
    """Scrape every product page in ``toy_links`` into ``toy_cat_dict``.

    Uses the module-level ``driver`` to load each link.

    Args:
        toy_cat_dict: dict that receives the results; it is modified in
            place under the key ``toy_subcat``.
        toy_subcat: subcategory name used as the key in ``toy_cat_dict``.
        toy_links: iterable of product page URLs.

    Returns:
        None.  ``toy_cat_dict[toy_subcat]`` maps each URL to the dict
        produced by ``scrape_toy``.

    Raises:
        Whatever ``driver.get`` or ``scrape_toy`` raises.  Pages scraped
        before the failure remain in ``toy_cat_dict[toy_subcat]``.
    """
    toy_subcat_dict = {}
    # Register the result dict up front so a failure part-way through the
    # batch does not throw away the pages that were already scraped.
    toy_cat_dict[toy_subcat] = toy_subcat_dict

    for link in toy_links:
        driver.get(link)
        toy_subcat_dict[link] = scrape_toy(driver.page_source)

In [11]:
def number_of_toys(page_source):
    """Return the counts shown in the page's ``span.category-count`` tags.

    Args:
        page_source: HTML of a category page.

    Returns:
        A list of ints, one per ``span.category-count`` element, in page
        order.  The first and last character of each span's text are
        dropped before conversion.

    Raises:
        ValueError: if the remaining text of a span is not an integer.
    """
    soup = BeautifulSoup(page_source, 'lxml')
    count_spans = soup.find_all('span', class_='category-count')
    return [int(span.text[1:-1]) for span in count_spans]

In [13]:
def grab_subcat_links(link, number_of_toys, toys_per_page=36):
    """Build the URL of every listing page of a subcategory.

    The first page is ``link`` itself; later pages follow the pattern
    seen on the site, e.g.

        https://www.chewy.com/b/moderate-2718      (page 1)
        https://www.chewy.com/b/moderate_c2718_p5  (page 5)

    Args:
        link: URL of the first listing page, ending in ``-<category id>``.
        number_of_toys: total number of toys in the subcategory.
        toys_per_page: number of toys shown per listing page.

    Returns:
        A list of page URLs, starting with ``link``.
    """
    # Split on the last '-' so category ids of any length are handled.
    base, _, category_id = link.rpartition('-')
    main_href = f'{base}_c{category_id}_p'

    # Ceiling division: a partially filled last page still needs a URL.
    page_count = -(-number_of_toys // toys_per_page)

    subcat_pages = [link]
    subcat_pages.extend(f'{main_href}{i}' for i in range(2, page_count + 1))
    return subcat_pages

In [15]:
def get_links(page_source):
    """Return absolute URLs for every ``a.facet_selection`` link on a page.

    Args:
        page_source: HTML of a category page.

    Returns:
        A list of URLs, each formed by prefixing the anchor's ``href`` with
        ``https://www.chewy.com``, in page order.

    Raises:
        KeyError: if a matching anchor has no ``href`` attribute.
    """
    soup = BeautifulSoup(page_source, 'lxml')
    anchors = soup.find_all('a', class_='facet_selection')
    return [f'https://www.chewy.com{anchor["href"]}' for anchor in anchors]

In [16]:
def grab_toy_links(subcat_pages):
    """Collect the product links found on each listing page.

    Uses the module-level ``driver`` to load every page.

    Args:
        subcat_pages: iterable of listing-page URLs.

    Returns:
        A list with the ``href`` attribute of every element of class
        ``product``, across all pages, in visiting order.
    """
    toys_links = []
    for page in subcat_pages:
        driver.get(page)
        # find_elements(By.CLASS_NAME, ...) is the locator call supported by
        # both Selenium 3 and 4; find_elements_by_class_name was removed in
        # Selenium 4.3.
        products = driver.find_elements(By.CLASS_NAME, 'product')
        toys_links.extend(
            product.get_attribute('href') for product in products)
    return toys_links

In [17]:
# CHEW TOYS
#
# Scrapes the Moderate, Tough and Extreme chew-toy subcategories, one after
# another, into `chew_toys` (subcategory name -> {product URL -> toy dict}).

# NOTE(review): machine-specific chromedriver location; adjust before rerunning.
DRIVER_PATH = '/Users/haleytaft/Downloads/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
original_link = "https://www.chewy.com/b/toys-315"
driver.get(original_link)

# Open the Chew Toys category from the main toys page.
# find_element(By.LINK_TEXT, ...) replaces find_element_by_link_text, which
# was removed in Selenium 4.3.
chew_toys_link = driver.find_element(By.LINK_TEXT, 'Chew Toys')
chew_toys_link.click()

# Defining a larger dictionary to hold subcat dictionaries
chew_toys = {}

# Going to MODERATE chew toys
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Moderate")))
element.click()

# Checking for number of toys in each subcategory (looking at side bar)
cat_page_source = driver.page_source
chew_numbers = number_of_toys(cat_page_source)

# Getting all first page links for each subcategory
chew_links = get_links(cat_page_source)
print(chew_links)

# Getting links for all pages for moderate toys.
# NOTE(review): index 0/1/2 is assumed to line up with Moderate/Tough/Extreme
# in both chew_links and chew_numbers -- check against the printed chew_links.
mod_pages_links = grab_subcat_links(chew_links[0], chew_numbers[0])
all_moderate_links = grab_toy_links(mod_pages_links)

# Collecting all MODERATE chew toys (wait for the page to show the
# "Chew Toys" link before scraping starts)
check = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Chew Toys")))
scrape_toy_page(chew_toys, 'moderate', all_moderate_links)

# Back to Chew Toys
driver.get('https://www.chewy.com/b/chew-toys-316')

print('Done with Moderate Toys')

################################################################################################

# To get to TOUGH chew toys
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Tough")))
element.click()

# Getting links for all pages for tough toys
tough_pages_links = grab_subcat_links(chew_links[1], chew_numbers[1])
print(tough_pages_links)
all_tough_links = grab_toy_links(tough_pages_links)

# Collecting all TOUGH chew toys
check = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Chew Toys")))
scrape_toy_page(chew_toys, 'tough', all_tough_links)

# To get back to Chew Toys
driver.get('https://www.chewy.com/b/chew-toys-316')

print("Done with Tough Toys")

################################################################################################
# To get to EXTREME chew toys
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Extreme")))
element.click()

# Getting links for all pages for extreme toys
extreme_pages_links = grab_subcat_links(chew_links[2], chew_numbers[2])
all_extreme_links = grab_toy_links(extreme_pages_links)

# To look at the extreme chew toys
check = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Chew Toys")))
scrape_toy_page(chew_toys, 'extreme', all_extreme_links)

print('Done with Extreme Toys and Chew Toys')

KeyboardInterrupt: 

In [283]:
# Flatten the nested chew_toys dict into one record per toy, tagging each
# record with its subcategory and category, then build the DataFrame.
chew_toy_list = []
for subcat in ['moderate', 'tough', 'extreme']:
    subcat_toys = chew_toys[subcat]
    for link in subcat_toys:
        toy = subcat_toys[link]
        toy['subcat'] = subcat
        toy['cat'] = 'chew toys'
        chew_toy_list.append(toy)
chew_toy_df = pd.DataFrame(chew_toy_list)

In [289]:
# convert chew toy data frame to csv -- uncomment to rerun 
# chew_toy_df.to_csv('./data/chewtoy_df.csv', index=False)

In [24]:
# Container for the plush-toy results, keyed by subcategory name.
plush_toys = {}

99

In [96]:
# PLUSH TOYS
#
# Collects the product links of the Stuffed Toys subcategory and scrapes them
# in batches, each batch into its own dict (plush_toys_1 ... plush_toys_14).

# NOTE(review): machine-specific chromedriver location; adjust before rerunning.
DRIVER_PATH = '/Users/haleytaft/Downloads/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get("https://www.chewy.com/b/toys-315")

# Open the Plush Toys category from the main toys page.  The variable keeps
# its original name even though it holds the Plush Toys link.
# find_element(By.LINK_TEXT, ...) replaces find_element_by_link_text, which
# was removed in Selenium 4.3.
chew_toys_link = driver.find_element(By.LINK_TEXT, 'Plush Toys')
chew_toys_link.click()

# plush_toys = {}

# Looking at the Stuffed toys
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Stuffed Toys")))
element.click()

# Checking for number of toys in each subcategory (looking at side bar)
cat_page_source = driver.page_source
plush_numbers = number_of_toys(cat_page_source)

# Getting all first page links for each subcategory
plush_links = get_links(cat_page_source)
print(plush_links)

# Getting links for all pages for stuffed toys
stuffed_pages_links = grab_subcat_links(plush_links[0], plush_numbers[0])
print(len(stuffed_pages_links))
all_stuffed_links = grab_toy_links(stuffed_pages_links)
print(len(all_stuffed_links))

# Wait until the page shows the "Plush Toys" link before scraping starts
check = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Plush Toys")))

# Scraping in batches.  Rounds 1-13 are kept commented out, exactly as they
# were written, so they are not repeated on a rerun.
# NOTE(review): every commented slice after the first starts one index late
# ([101:200], [201:300], ...), so indices 100, 200, ..., 1200 of
# all_stuffed_links are not covered by those rounds.
# plush_toys_1 = {}
# scrape_toy_page(plush_toys_1, 'stuffed', all_stuffed_links[:100])
# print('Done with 1st round')
# plush_toys_2 = {}
# scrape_toy_page(plush_toys_2, 'stuffed', all_stuffed_links[101:200])
# print('Done with 2nd round')
# plush_toys_3 = {}
# scrape_toy_page(plush_toys_3, 'stuffed', all_stuffed_links[201:300])
# print('Done with 3rd round')
# plush_toys_4 = {}
# scrape_toy_page(plush_toys_4, 'stuffed', all_stuffed_links[301:400])
# print('Done with 4th round')
# plush_toys_5 = {}
# scrape_toy_page(plush_toys_5, 'stuffed', all_stuffed_links[401:500])
# print('Done with 5th round')
# plush_toys_6 = {}
# scrape_toy_page(plush_toys_6, 'stuffed', all_stuffed_links[501:600])
# print('Done with 6th round')
# plush_toys_7 = {}
# scrape_toy_page(plush_toys_7, 'stuffed', all_stuffed_links[601:700])
# print('Done with 7th round')
# plush_toys_8 = {}
# scrape_toy_page(plush_toys_8, 'stuffed', all_stuffed_links[701:800])
# print('Done with 8th round')
# plush_toys_9 = {}
# scrape_toy_page(plush_toys_9, 'stuffed', all_stuffed_links[801:900])
# print('Done with 9th round')
# plush_toys_10 = {}
# scrape_toy_page(plush_toys_10, 'stuffed', all_stuffed_links[901:1000])
# print('Done with 10th round')
# plush_toys_11 = {}
# scrape_toy_page(plush_toys_11, 'stuffed', all_stuffed_links[1001:1100])
# print('Done with 11th round')
# plush_toys_12 = {}
# scrape_toy_page(plush_toys_12, 'stuffed', all_stuffed_links[1101:1200])
# print('Done with 12th round')
# plush_toys_13 = {}
# scrape_toy_page(plush_toys_13, 'stuffed', all_stuffed_links[1201:1300])
# print('Done with 13th round')
plush_toys_14 = {}
# Start at index 1300 (round 13 stops just before it) and take every
# remaining link, so no link is skipped between rounds 13 and 14.
scrape_toy_page(plush_toys_14, 'stuffed', all_stuffed_links[1300:])
print('Done with 14th round')

# To get back to Plush Toys
driver.get('https://www.chewy.com/b/plush-toys-320')

print("Done with Stuffed subcategory!")

# ##########################################################################################################

# # Looking a the Unstuffed toys
# element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Unstuffed Toys")))
# element.click()

# # Getting links for all pages for unstuffed toys 
# unstuffed_pages_links = grab_subcat_links(plush_links[1], plush_numbers[1])
# print(unstuffed_pages_links)
# all_unstuffed_links = grab_toy_links(unstuffed_pages_links)
# print(len(all_unstuffed_links))

# # To look at the unstuffed plush toys 
# check = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Plush Toys")))

# # Scraping 
# plush_toys_15 = {}
# scrape_toy_page(plush_toys_15, 'unstuffed', all_unstuffed_links[:50])
# print("Done with 1st unstuffed toys")
# plush_toys_16 = {}
# scrape_toy_page(plush_toys_16, 'unstuffed', all_unstuffed_links[51:100])
# print("Done with 2nd unstuffed toys")
# plush_toys_16 = {}
# scrape_toy_page(plush_toys_16, 'unstuffed', all_unstuffed_links[101:173])
# print("Done with 3rd unstuffed toys")

# print("Done with Unstuffed subcategory!")

# print('Done with Plush category!')


['https://www.chewy.com/b/stuffed-toys-2333', 'https://www.chewy.com/b/unstuffed-toys-2334']
38
1368
Done with 11th round
Done with 12th round
Done with 13th round


AttributeError: 'NoneType' object has no attribute 'find'

In [80]:
# Start with an empty 'stuffed' bucket; the per-batch dicts are merged into it.
plush_toys['stuffed'] = {}

In [97]:
# Per-batch result dicts from the chunked stuffed-toy scrape, in batch order.
# plush_toys_14 is not part of this list.
plushlist = [plush_toys_1, plush_toys_2, plush_toys_3, plush_toys_4, plush_toys_5, plush_toys_6,
            plush_toys_7, plush_toys_8, plush_toys_9, plush_toys_10, plush_toys_11,
            plush_toys_12, plush_toys_13]

In [98]:
# Merge every batch's 'stuffed' entries into plush_toys['stuffed'].  A link
# that appears in more than one batch keeps the record from the later batch.
for toydict in plushlist:
    batch = toydict['stuffed']
    for link in batch:
        plush_toys['stuffed'][link] = batch[link]

In [100]:
# Number of distinct product links in the merged 'stuffed' bucket.
len(plush_toys['stuffed'])

1273

In [102]:
# Flatten the nested plush_toys dict into one record per toy, tagging each
# record with its subcategory and category, then build the DataFrame.
plush_toy_list = []
for subcat in ['stuffed']: #, 'unstuffed'
    subcat_toys = plush_toys[subcat]
    for link in subcat_toys:
        toy = subcat_toys[link]
        toy['subcat'] = subcat
        toy['cat'] = 'plush toys'
        plush_toy_list.append(toy)
plush_toy_df = pd.DataFrame(plush_toy_list)

In [103]:
# Write the plush-toy table to disk.
# NOTE(review): the (commented-out) chew-toy export passes index=False; this
# call does not, so the DataFrame index is written as an extra first column
# -- confirm that is intended.
plush_toy_df.to_csv('./data/plushtoy_df.csv')

In [23]:
# ROPE & TUG TOYS
#
# NOTE(review): this cell reads as an unfinished draft; the notes below mark
# the statements that do not fit together yet.  Code is left unchanged.

DRIVER_PATH = '/Users/haleytaft/Downloads/chromedriver'
driver = webdriver.Chrome( executable_path=DRIVER_PATH) 
driver.get("https://www.chewy.com/b/toys-315")

# Getting the total number of toys
# `numbers` holds every span.category-count tag on the main toys page; only
# the first one is printed.
page_source = driver.page_source 
soup = BeautifulSoup(page_source, 'lxml')
numbers = soup.find_all('span', class_='category-count')
print(numbers[0])

# getting all the links to the pages to scrape
# NOTE(review): `link` is not assigned anywhere in this cell, so whatever an
# earlier cell left in it is used here; presumably `rope_toy_link` was meant.
rope_toy_link = 'https://www.chewy.com/b/rope-tug-toys-321'
main_href = f'{link[:-4]}_c{link[-3:]}_p'    
pages = []
pages.append(link)
# NOTE(review): `numbers` is the find_all result, not an integer, so
# `numbers / 36` looks like it will raise TypeError -- an int count parsed
# from one of the spans is probably intended; confirm which one.
for i in range(2, round(numbers / 36)+1):
    href = f'{main_href}{i}'
    pages.append(href)
print(pages)

# NOTE(review): these three lines rebuild the stuffed-toy (plush) links and
# rely on plush_links / plush_numbers from the plush cell; they look like a
# copy-paste leftover -- confirm before running.
stuffed_pages_links = grab_subcat_links(plush_links[0], plush_numbers[0])
print(len(stuffed_pages_links))
all_stuffed_links = grab_toy_links(stuffed_pages_links)

# To first just look at ROPE & TUG TOYS
rope_toys_link = driver.find_element_by_link_text('Rope & Tug Toys')
rope_toys_link.click()

rope_tug_toys = {}

# To look at the rope & tug toys 
check = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Toys")))

# Getting all the actual toy links
# NOTE(review): `rope_tug_links` is never assigned in the visible source; it
# presumably should come from grab_toy_links(pages) -- TODO confirm.


scrape_toy_page(rope_tug_toys, 'rope_toy_tugs', rope_tug_links )



36


In [51]:
# INTERACTIVE TOYS
#
# Scrapes three interactive-toy subcategories, one after another, into
# `interactive_toys` (subcategory name -> {product URL -> toy dict}).

# NOTE(review): machine-specific chromedriver location; adjust before rerunning.
DRIVER_PATH = '/Users/haleytaft/Downloads/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get("https://www.chewy.com/b/toys-315")

# Open the Interactive Toys category from the main toys page.  The variable
# keeps its original name even though it holds the Interactive Toys link.
# find_element(By.LINK_TEXT, ...) replaces find_element_by_link_text, which
# was removed in Selenium 4.3.
chew_toys_link = driver.find_element(By.LINK_TEXT, 'Interactive Toys')
chew_toys_link.click()

interactive_toys = {}

# The interactive toys
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Treat Toys & Dispensers")))
element.click()

# Checking for number of toys in each subcategory (looking at side bar)
cat_page_source = driver.page_source
interactive_numbers = number_of_toys(cat_page_source)

# Getting all first page links for each subcategory.  In the recorded run the
# printed order was: treat toys & dispensers, treat dispenser refills,
# puzzle toys & games -- the indices 0, 1, 2 below rely on that order.
interactive_links = get_links(cat_page_source)
print(interactive_links)

# Getting links for all pages for treat toys & dispensers
dispenser_pages_links = grab_subcat_links(interactive_links[0], interactive_numbers[0])
all_dispenser_links = grab_toy_links(dispenser_pages_links)

# Wait until the page shows the "Interactive Toys" link before scraping
check = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Interactive Toys")))

# Scraping
scrape_toy_page(interactive_toys, 'treat toys & dispensers', all_dispenser_links)

# Back to the Interactive Toys category page
driver.get('https://www.chewy.com/b/interactive-toys-319')

print('Done with Treat Toys & Dispensers subcategory')

############################################################################################################

# Treat Dispenser Refills
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Treat Dispenser Refills")))
element.click()

# Getting links for all pages for treat dispenser refills
refills_pages_links = grab_subcat_links(interactive_links[1], interactive_numbers[1])
all_refills_links = grab_toy_links(refills_pages_links)

# Wait until the page shows the "Interactive Toys" link before scraping
check = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Interactive Toys")))

# Scraping
scrape_toy_page(interactive_toys, 'treat dispenser refills', all_refills_links)

# Back to the Interactive Toys category page
driver.get('https://www.chewy.com/b/interactive-toys-319')

print('Done with Treat Dispenser Refills subcategory!')

########################################################################################################

# Puzzle toys and Games
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Puzzle Toys & Games")))
element.click()

# Getting links for all pages for puzzle toys & games
game_pages_links = grab_subcat_links(interactive_links[2], interactive_numbers[2])
all_game_links = grab_toy_links(game_pages_links)

# Wait until the page shows the "Interactive Toys" link before scraping
check = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Interactive Toys")))

# Scraping
scrape_toy_page(interactive_toys, 'puzzle toys & games', all_game_links)

# Back to the Interactive Toys category page
driver.get('https://www.chewy.com/b/interactive-toys-319')

print('Done with Puzzle Toys & Games')

#########################################################################################################

# # Automatic Ball Launchers
# element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Automatic Ball Launchers")))
# element.click()

# # Getting links for all pages for automatic ball launchers
# auto_pages_links = grab_subcat_links(interactive_links[3], interactive_numbers[3])
# all_auto_links = grab_toy_links(auto_pages_links)

# # To look at the dog automatic ball launchers
# check = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "Interactive Toys")))

# interactive_toys_2 = {}

# # Scraping
# scrape_toy_page(interactive_toys_2, 'automatic ball launchers', all_auto_links)

# print("Done with Automatic Ball Launchers subcategory!")

print("Done with Interactive Toys category!")

['https://www.chewy.com/b/treat-toys-dispensers-2336', 'https://www.chewy.com/b/treat-dispenser-refills-11139', 'https://www.chewy.com/b/puzzle-toys-games-2335']
Done with Treat Toys & Dispensers subcategory
Done with Treat Dispenser Refills subcategory!
Done with Puzzle Toys & Games
Done with Interactive Toys category!


In [50]:
# Display the current contents of the interactive_toys dict.
interactive_toys

{}

In [None]:
# interactive_toys_2

In [53]:
# Flatten the nested interactive_toys dict into one record per toy, tagging
# each record with its subcategory and category, then build the DataFrame.
interactive_toy_list = []
for subcat in ['treat toys & dispensers', 'treat dispenser refills', 'puzzle toys & games']: #, 'automatic ball launchers'
    subcat_toys = interactive_toys[subcat]
    for link in subcat_toys:
        toy = subcat_toys[link]
        toy['subcat'] = subcat
        toy['cat'] = 'interactive toys'
        interactive_toy_list.append(toy)
interactive_toy_df = pd.DataFrame(interactive_toy_list)

In [54]:
# interactive_df_list = []
# for cat in interactive_toys:
#     for toy in cat:
#         interactive_df_list.append(toy)
# interactive_df = pd.DataFrame(interactive_df_list)
# interactive_df

In [55]:
# interactive_toy_df.to_csv('./data/interactivetoy_df.csv')