In [None]:
import csv
import os
import time
from getpass import getpass

import pandas as pd
import requests
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

In [None]:
def _text_or_null(tag):
    '''Return the whitespace-stripped text of a parsed tag, or the string 'NULL' when the tag was not found.'''
    return tag.get_text(strip=True) if tag else 'NULL'


def get_reviews(html, csv_writer):
    '''Function to get review information (reviewer name, rating, title, review date, review text, colour name) from a specific page

    Every element with data-hook="review" is treated as one review. For each review one row is
    written to the CSV file and the same values are printed. Fields that cannot be found on the
    page are written as the string 'NULL'.

    :param html: The HTML content of the page
    :param csv_writer: CSV writer object to write review data into the CSV file (only its writerow() method is used)
    :return: None'''

    soup = BeautifulSoup(html, 'html.parser')
    reviews = soup.find_all('div', {'data-hook': 'review'})

    for review in reviews:
        #get information from page (rating and title each have two alternative markups, tried in order)
        reviewer_name = _text_or_null(review.find('span', {'class': 'a-profile-name'}))
        rating = _text_or_null(
            review.find('i', {'data-hook': 'review-star-rating'})
            or review.find('i', {'data-hook': 'cmps-review-star-rating'})
        )
        title = _text_or_null(
            review.find('a', {'data-hook': 'review-title'})
            or review.find('span', {'data-hook': 'review-title'})
        )
        review_date = _text_or_null(review.find('span', {'data-hook': 'review-date'}))
        review_text = _text_or_null(review.find('span', {'data-hook': 'review-body'}))
        colour_name = _text_or_null(review.find('a', {'data-hook': 'format-strip'}))

        #for some reviews the title text comes back with the rating glued in front of it
        #(e.g. "5.0 out of 5 starsTrue Value") - presumably the rating icon is nested inside
        #the title element. Drop that prefix so the Title column only holds the title.
        if rating != 'NULL' and title.startswith(rating):
            remainder = title[len(rating):].strip()
            if remainder:  #keep the original text if nothing would be left
                title = remainder

        #write the review data to the CSV file
        csv_writer.writerow([reviewer_name, title, rating, review_date, review_text, colour_name])
        print(f"{reviewer_name}\n{title}\n\n{rating}\n{review_date}\n{colour_name}\n{review_text}")
        print("****************************************************************************")

In [ ]:
#setup Chrome options
chromeOptions = uc.ChromeOptions()
chromeOptions.headless = False

#credentials are never hard-coded in the notebook: they are read from the environment
#variables AMAZON_EMAIL / AMAZON_PASSWORD, with an interactive prompt as fallback
amazon_email = os.environ.get("AMAZON_EMAIL") or input("Amazon email: ")
amazon_password = os.environ.get("AMAZON_PASSWORD") or getpass("Amazon password: ")

#Chrome driver
driver = uc.Chrome(use_subprocess=True, options=chromeOptions)

#try/finally guarantees the browser is closed even when the login or a page load fails
try:
    #open Amazon sign-in page
    driver.get("https://www.amazon.ca/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.ca%2FWashable-Medium-Durable-Non-Slip-26%25C3%259719%25C3%259713%2Fproduct-reviews%2FB0C5DMLPNC%2Fref%3Dnav_ya_signin%3Fie%3DUTF8%26reviewerType%3Dall_reviews%26pageNumber%3D2&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=caflex&openid.mode=checkid_setup&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0")

    time.sleep(5) # Wait for a few seconds for the page to load

    #login email
    email = driver.find_element(By.ID, "ap_email")
    email.send_keys(amazon_email)
    driver.find_element(By.ID, "continue").click()

    time.sleep(5)

    #login password
    password = driver.find_element(By.ID, "ap_password")
    password.send_keys(amazon_password)
    driver.find_element(By.ID, "signInSubmit").click()

    time.sleep(10)

    # Open a CSV file to write the review data
    with open('AmazonCatCouchDataset2.csv', 'w', newline = '', encoding = 'utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Reviewer Name', 'Title', 'Rating', 'Review Date', 'Review Text', 'Colour Name'])

        # There are 10 pages of reviews to iterate through
        for i in range(1, 11):
            url = f"https://www.amazon.ca/Washable-Medium-Durable-Non-Slip-26%C3%9719%C3%9713/product-reviews/B0C5DMLPNC/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber={i}"
            driver.get(url)
            time.sleep(5) # give the page time to render before reading its source
            page_source = driver.page_source
            get_reviews(page_source, writer)
finally:
    driver.quit()

In [None]:
# Load the review dataset from disk into a DataFrame.
# NOTE(review): the scraping cell writes 'AmazonCatCouchDataset2.csv', while this cell reads
# 'AmazonCatCouchDataset.csv' from an absolute Desktop path - confirm this is the intended file.
dataset_path = r"/Users/johannaschmidle/Desktop/AmazonCatCouchDataset.csv"
df = pd.read_csv(dataset_path)

# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,Reviewer Name,Title,Rating,Review Date,Review Text,Colour Name
0,Shu,5.0 out of 5 starsTrue Value,5.0 out of 5 stars,"Reviewed in Canada on February 19, 2024","The mini sofa is great, big enough to fit my l...",Colour Name: Yellow
1,Rebecca Baker,5.0 out of 5 starsElderly cat loves it!,5.0 out of 5 stars,"Reviewed in Canada on May 22, 2024",I bought this in the hopes that one of my olde...,Colour Name: Green
2,CMac,5.0 out of 5 starsBetter than I expected!,5.0 out of 5 stars,"Reviewed in Canada on June 8, 2024",I was so skeptical even after I read the revie...,Colour Name: Green
3,Cj,4.0 out of 5 starsBetter than I thought,4.0 out of 5 stars,"Reviewed in Canada on March 4, 2024","It's soft, taking apart for washing might be a...",Colour Name: Yellow
4,MIKE,5.0 out of 5 starsMY CAT LOVES IT,5.0 out of 5 stars,"Reviewed in Canada on June 22, 2024",CAT WAS NOT SURE OF IT ON DAY ONE BUT ON DAY T...,Colour Name: Green
...,...,...,...,...,...,...
95,Yael Rootberg,Cat loves it,5.0 out of 5 stars,"Reviewed in the United States on June 21, 2024",My cat loves it so much that he won't let his ...,Colour Name: Pink
96,Zanna Kazen,Cat LOVES it!,5.0 out of 5 stars,"Reviewed in the United States on June 28, 2024","I was hesitant at first, but once this was ass...",Colour Name: Blue
97,Baylie Nixon,Soo cute!,5.0 out of 5 stars,"Reviewed in the United States on June 27, 2024",Both my dogs and cats love this little couch s...,Colour Name: Green
98,Daniel Garcia,Cute,4.0 out of 5 stars,"Reviewed in the United States on June 26, 2024","I bought this for my puppy, it looks adorable ...",Colour Name: Pink
