## Getting the list of product URLs from the main page

In [1]:
import pandas as pd
import numpy as np
import time
import re
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# create an instance of webdriver object, which is like opening a browser when the driver is called
driver = webdriver.Chrome()

# navigate to the page
main_url = "https://shop.lululemon.com/c/women/_/N-7vf?icid=lp-story:women;1a;lphero;cdp:womens-clothes;seasonal:fa21"
driver.get(main_url)

# wait 5 seconds
time.sleep(5)

# scroll to the bottom of the page to load all of the products
elem = driver.find_element_by_tag_name("body")

no_of_pagedowns = 20

while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.2)
    no_of_pagedowns-=1
    
# clicking the 'View more products' button
load_button_xpath = '//*[@id="main-content"]/div/section/div/div[2]/div/button'
click_count = 0
max_clicks = 3

while click_count < max_clicks:
    try:
        loadMoreButton = driver.find_element_by_xpath(load_button_xpath)
        time.sleep(2)
        loadMoreButton.click()
        click_count += 1
        print(click_count)
        time.sleep(5)
    except Exception as e:
        print(e)
        break

    
# attribute of where the source code of page is stored in a string format
html = driver.page_source 

# create an instance of BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

link_xpath = '//a[@href]'

links = driver.find_elements_by_xpath(link_xpath)

link_list = []

for link in links:
    #print(link.get_attribute('href'))
    if link.get_attribute('href') not in link_list:
        link_list.append(link.get_attribute('href'))
    else:
        continue
    
# filter out all non-product links    
reg = re.compile('/p/')    

filtered_link_list = list(filter(reg.search, link_list))

len(filtered_link_list)

1
2
3


180

## Scraping each product

In [44]:
product_name = []
rating_list = []
title_list = []
review_list = []

# loop through each product link; launch and scrape one-by-one
for product in filtered_link_list[0:2]:
    url = product
    driver2 = webdriver.Chrome()
    driver2.get(url)
    
    # execute Javascript inside the browser to scroll the webpage
    driver2.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # wait 5 seconds
    time.sleep(5)
    
    # clicking the 'load more' button
    load_button_xpath = '//*[@id="BVRRContainer"]/div/div/div/div/div[3]/div/button'
    click_count = 0
    max_clicks = 3
    
    while click_count < max_clicks:
        try:
            loadMoreButton = driver2.find_element_by_xpath(load_button_xpath)
            time.sleep(2)
            loadMoreButton.click()
            click_count += 1
            print(click_count)
            time.sleep(5)
        except Exception as e:
            print(e)
            break


    soup = BeautifulSoup(driver2.page_source, 'html.parser')
    
    product = soup.find("div", class_="col-xs-12")
    
    stars = soup.find_all("span", class_= "bv-rating-stars-container")
    
    titles = soup.find_all("div", class_="bv-content-title-container")
    
    reviews = soup.find_all("div", class_="bv-content-summary-body-text")
    
    for s in stars:
        text = s.text
        print(text)
        match = re.search(r'(\d+) out of 5 stars', text)
        if match:
            rating_list.append(match.group(1))
    
    for t in titles:
        title_list.append(t.text)
        #print(t.text)
        
        product_name.append(product.text)
        #print(product.text)
    
    for r in reviews:
        review_list.append(r.text)
        #print(r.text)

    

Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="BVRRContainer"]/div/div/div/div/div[3]/div/button"}
  (Session info: chrome=95.0.4638.54)

  ★★★★★  No rating value 
1
2
3
  ★★★★★   ★★★★★  
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  4 out of 5 stars. 
  ★★★★★   ★★★★★  2 out of 5 stars. 
  ★★★★★   ★★★★★  2 out of 5 stars. 
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  4 out of 5 stars. 
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  2 out of 5 stars. 
  ★★★★★   ★★★★★  4 out of 5 stars. 
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  3 out of 5 stars. 
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  3 out of 5 stars. 
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  5 out of 5 stars. 
  ★★★★★   ★★★★★  4 out of 5 stars. 
  ★★★★★   ★★

## Creating the DataFrame

In [45]:
len(rating_list), len(title_list), len(review_list), len(product_name)

(98, 107, 107, 107)

In [None]:
df = pd.DataFrame()

# add columns
df['rating'] = rating_list
df['title'] = title_list
df['review'] = review_list
df['product'] = product_name

# remove tags from product name
df['product'] = df['product'].str.replace(r'New', '')
df['product'] = df['product'].str.replace(r'Nulu', '')
df['product'] = df['product'].str.replace(r'\u2122', '', regex=True)

# remove customer support responses
df = df.replace(r'^\s*$', np.nan, regex=True)
df = df.dropna()

print(len(df))
df.head()

In [None]:
df['product'].value_counts()

## Export DataFrame to CSV

In [69]:
file_name = 'lululemon_website_reviews_v2.csv'
df.to_csv(file_name)

## Test code

In [4]:
# create an instance of webdriver object, which is like opening a browser when the driver is called
test_driver = webdriver.Chrome()

# navigate to the page
test_driver.get("https://shop.lululemon.com/p/women-tanks/Swiftly-Tech-RB-2-Race/_/prod9750460?color=45849")

# exceute Javascript inside the browser to scroll the webpage
test_driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# wait 5 seconds
import time
time.sleep(5)

# clicking the 'load more' button
load_button_xpath = '//*[@id="BVRRContainer"]/div/div/div/div/div[3]/div/button/span'

while True:
    try:
        loadMoreButton = test_driver.find_element_by_xpath(load_button_xpath)
        time.sleep(2)
        loadMoreButton.click()
        time.sleep(5)
    except Exception as e:
        print(e)
        break

# attribute of where the source code of page is stored in a string format
html = test_driver.page_source 

# create an instance of BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

title_list_test = []

titles = soup.find_all("div", class_='bv-content-title-container')

for t in titles:
    title_list_test.append(t.text)
    print(t.text)
    
test_df = pd.DataFrame()

test_df['title'] = title_list_test

test_df

Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="BVRRContainer"]/div/div/div/div/div[3]/div/button/span"}
  (Session info: chrome=94.0.4606.71)

     My favorite tank    
     Best Length for Petites!    
     Half marathon approved!    
     Overpriced and tight    
     Swiftly tech    
     Disappointed fit and material    
     Swifly Tech tops are my fav!    
     LOVE THIS HAVE TONS OF THESE    
     Perfect length for 5’4”    
     Not My Favorite Tank    
     So comfy but droops    
     Cute but rides up    
     Buy it    
     Size Down    
     Pleasantly Surprised    
     Swiftly tank top    
     Okay but not great    
     Love this top!!!    
     Big chested women beware.    
     love    
     Crop top    
     Tiny Top, Big Money    
     super cute but..    
     Racer top    
     Love the fit but poor quality    
 
     cute, comfy, love the length    
     Great product    
     Race or Casual, This is the Best    
    

Unnamed: 0,title
0,My favorite tank
1,Best Length for Petites!
2,Half marathon approved!
3,Overpriced and tight
4,Swiftly tech
...,...
499,Perfect length!
500,Perfect length!
501,Didn't care for the fit.
502,Love this length!


In [38]:
product_name = []
title_list = []
review_list = []

# create an instance of webdriver object, which is like opening a browser when the driver is called
test_driver = webdriver.Chrome()

test_driver.get(filtered_link_list[29])

# exceute Javascript inside the browser to scroll the webpage
test_driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# wait 5 seconds
import time
time.sleep(5)

# clicking the 'load more' button
load_button_xpath = '//*[@id="BVRRContainer"]/div/div/div/div/div[3]/div/button/span'
click_count = 0
max_clicks = 3

while click_count < max_clicks:
    try:
        loadMoreButton = test_driver.find_element_by_xpath(load_button_xpath)
        time.sleep(2)
        loadMoreButton.click()
        click_count += 1
        print(click_count)
        time.sleep(5)
    except Exception as e:
        print(e)
        break


soup = BeautifulSoup(test_driver.page_source, 'html.parser')

product = soup.find("div", class_="col-xs-12")

titles = soup.find_all("div", class_="bv-content-title-container")

reviews = soup.find_all("div", class_="bv-content-summary-body-text")



for t in titles:
    title_list.append(t.text)
    #print(t.text)

    product_name.append(product.text)
    #print(product.text)

for r in reviews:
    review_list.append(r.text)
    #print(r.text)

len(title_list), len(review_list), len(product_name)

1
2
3


(132, 130, 132)

In [39]:
post = soup.find_all("div", class_= "bv-content-core")

post[1].text
post[2].text

'                ★★★★★   ★★★★★  5 out of 5 stars.      Jeanette805     · 9 months ago \xa0           High quality           Super soft material, tight knit, easy zipper, flattering      In a few words, what did you like?  High quality, soft knit    Size Purchased  small    Usual Size  small            How did your gear fit?            Smaller  Rating of 1 means Smaller  Larger  Rating of 5 means Larger  How did your gear fit?, average rating value is 3 of 5.     '

In [42]:
rate = soup.find_all("span", class_= "bv-rating-stars-container")

text = rate[1].text

num = re.search(r'(\d+) out of 5 stars', text)

if num:
    print(num.group(1))

5


In [28]:
# create an instance of webdriver object, which is like opening a browser when the driver is called
driver = webdriver.Chrome()

# navigate to the page
main_url = "https://shop.lululemon.com/c/women/_/N-7vf?icid=lp-story:women;1a;lphero;cdp:womens-clothes;seasonal:fa21"
driver.get(main_url)

# wait 5 seconds
time.sleep(5)

# scroll to the bottom of the page to load all of the products
elem = driver.find_element_by_tag_name("body")

no_of_pagedowns = 20

while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.2)
    no_of_pagedowns-=1
    
# clicking the 'View more products' button
load_button_xpath = '//*[@id="main-content"]/div/section/div/div[2]/div/button'

while True:
    try:
        loadMoreButton = driver.find_element_by_xpath(load_button_xpath)
        time.sleep(2)
        loadMoreButton.click()
        time.sleep(5)
    except Exception as e:
        print(e)
        break

    
# attribute of where the source code of page is stored in a string format
html = driver.page_source 

# create an instance of BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

link_xpath = '//a[@href]'

links = driver.find_elements_by_xpath(link_xpath)

link_list = []

for link in links:
    #print(link.get_attribute('href'))
    if link.get_attribute('href') not in link_list:
        link_list.append(link.get_attribute('href'))
    else:
        continue
    
reg = re.compile('/p/')    

filtered_link_list = list(filter(reg.search, link_list))

len(filtered_link_list)

Message: chrome not reachable
  (Session info: chrome=94.0.4606.71)



WebDriverException: Message: chrome not reachable
  (Session info: chrome=94.0.4606.71)


# Old Code

In [None]:
# create an instance of webdriver object, which is like opening a browser when the driver is called
driver = webdriver.Chrome()

# navigate to the page
main_url = "https://shop.lululemon.com/c/women/_/N-7vf?icid=lp-story:women;1a;lphero;cdp:womens-clothes;seasonal:fa21"
driver.get(main_url)

# wait 5 seconds
time.sleep(5)
    
# attribute of where the source code of page is stored in a string format
html = driver.page_source 

# create an instance of BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# create a list where the product links will be stored
product_links = []

# loop through each product link and add the unique links to the list
for link in soup.find_all('a', href=re.compile('/p/')):
    # removes duplicate URLs
    if link['href'] not in product_links:
        print(link['href'])
        product_links.append(link['href'])
    else:
        continue
    
print(product_links)