In [None]:
from bs4 import BeautifulSoup
import requests
import time 
import csv 
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By

In [None]:
def get_reviews(html, csv_writer):
    '''Function to get review information (reviewer name, rating, title, review date, review text, color name) from a specific page

    :param html: The HTML content of the page
    :param csv_writer: CSV writer object to write review data into the CSV file
    :return: None'''
    
    soup = BeautifulSoup(html, 'html.parser')
    reviews = soup.find_all('div', {'data-hook': 'review'})

    for review in reviews:
        #get information from page
        reviewer_name = review.find('span', {'class': 'a-profile-name'})
        rating = review.find('i', {'data-hook': 'review-star-rating'}) or review.find('i', {'data-hook': 'cmps-review-star-rating'})
        title = review.find('a', {'data-hook': 'review-title'}) or review.find('span', {'data-hook': 'review-title'})
        review_date = review.find('span', {'data-hook': 'review-date'})
        review_text = review.find('span', {'data-hook': 'review-body'})
        colour_name = review.find('a', {'data-hook': 'format-strip' })

        #get text or return NULL
        reviewer_name = reviewer_name.get_text(strip=True) if reviewer_name else 'NULL'
        rating = rating.get_text(strip=True) if rating else 'NULL'
        title = title.get_text(strip=True) if title else 'NULL'
        review_date = review_date.get_text(strip=True) if review_date else 'NULL'
        review_text = review_text.get_text(strip=True) if review_text else 'NULL'
        colour_name = colour_name.get_text(strip=True) if colour_name else 'NULL'

        #write the review data to the CSV file
        csv_writer.writerow([reviewer_name, title, colour_name, rating, review_date, review_text])

In [None]:
#setup Chrome options
chromeOptions = uc.ChromeOptions()
chromeOptions.headless = False

#Chrome driver
driver = uc.Chrome(use_subprocess=True, options=chromeOptions)

#open Amazon sign-in page
driver.get("https://www.amazon.ca/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.ca%2FWashable-Medium-Durable-Non-Slip-26%25C3%259719%25C3%259713%2Fproduct-reviews%2FB0C5DMLPNC%2Fref%3Dnav_ya_signin%3Fie%3DUTF8%26reviewerType%3Dall_reviews%26pageNumber%3D2&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=caflex&openid.mode=checkid_setup&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0")

time.sleep(5) # Wait for a few seconds for the page to load

#login email
email = driver.find_element(By.ID, "ap_email")
email.send_keys("email@email.com") 
driver.find_element(By.ID, "continue").click()

time.sleep(5)

#login password
password = driver.find_element(By.ID, "ap_password")
password.send_keys("xxxxxxxxx")
driver.find_element(By.ID, "signInSubmit").click()

time.sleep(10)

# Open a CSV file to write the review data
with open('AmazonCatCouchDataset.csv', 'w', newline = '', encoding = 'utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Reviewer Name', 'Title','Colour Name', 'Rating', 'Review Date', 'Review Text'])

    # There are 10 pages of reviews to iterate through
    for i in range(1, 11):
        url = f"https://www.amazon.ca/Washable-Medium-Durable-Non-Slip-26%C3%9719%C3%9713/product-reviews/B0C5DMLPNC/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber={i}"
        driver.get(url)
        time.sleep(5) 
        page_source = driver.page_source
        get_reviews(page_source, writer)

driver.quit()

In [5]:
df = pd.read_csv(r"/Users/.../AmazonCatCouchDataset.csv")

df

Unnamed: 0,Reviewer Name,Title,Colour Name,Rating,Review Date,Review Text
0,Shu,5.0 out of 5 starsTrue Value,Colour Name: Yellow,5.0 out of 5 stars,"Reviewed in Canada on February 19, 2024","The mini sofa is great, big enough to fit my l..."
1,Rebecca Baker,5.0 out of 5 starsElderly cat loves it!,Colour Name: Green,5.0 out of 5 stars,"Reviewed in Canada on May 22, 2024",I bought this in the hopes that one of my olde...
2,CMac,5.0 out of 5 starsBetter than I expected!,Colour Name: Green,5.0 out of 5 stars,"Reviewed in Canada on June 8, 2024",I was so skeptical even after I read the revie...
3,Cj,4.0 out of 5 starsBetter than I thought,Colour Name: Yellow,4.0 out of 5 stars,"Reviewed in Canada on March 4, 2024","It's soft, taking apart for washing might be a..."
4,MIKE,5.0 out of 5 starsMY CAT LOVES IT,Colour Name: Green,5.0 out of 5 stars,"Reviewed in Canada on June 22, 2024",CAT WAS NOT SURE OF IT ON DAY ONE BUT ON DAY T...
...,...,...,...,...,...,...
95,DeeBee,My cat loves this couch!,Colour Name: Blue,5.0 out of 5 stars,"Reviewed in the United States on June 26, 2024",This couch is super cute. And my cat loves it....
96,S. Reese,Instant success!,Colour Name: White,5.0 out of 5 stars,"Reviewed in the United States on June 2, 2024",My girl is a hefty kitty. I was worried she wo...
97,Susie,Super Little Sofa,Colour Name: Green,5.0 out of 5 stars,"Reviewed in the United States on June 19, 2024",My 14-year-old cat seems to have pain worse th...
98,Nathan Lee,As advertised,,5.0 out of 5 stars,"Reviewed in the United States on June 3, 2024","Looks comfortable, feels great. it's bigger th..."


### Add Sentiment Column
This part is just adding a simple sentiment column for later use. I will create a seperate database from the one above. I am using the code from my UFO project but I am making it simpler here, if you would like to see the code you can check it out __[here](https://www.kaggle.com/code/johannaschmidle7/ufo-project-part-2/notebook)__.
First I am going to tokenzie the reviews, remove stopwords, lemmatize, and get sentiment. Then I will drop the extra columns.

In [None]:
import nltk 
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import subjectivity
import string
import emoji

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [None]:
def tokenize_comments(comment):
    if isinstance(comment, str):
        # Remove punctuation
        comment = comment.translate(str.maketrans('', '', string.punctuation))
        
        # Remove emojis
        comment = emoji.get_emoji_regexp().sub(r'', comment)
        
        try:
            return list(set(word_tokenize(comment.lower())))
        except:
            return []
    else:
        return []

df['Tokenized Review'] = df['Review Text'].apply(tokenize_comments)

In [None]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokenized):
    filtered = []
    for word in tokenized:
        if word.lower() not in stop_words:
            filtered.append(word)
    return filtered
    
df['Filtered Review'] = df['Tokenized Review'].apply(remove_stopwords)

In [None]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(tagged_comment):
    lemmatized_tokens = []
    
    for token in tagged_comment:
        lemmatized_tokens.append(lemmatizer.lemmatize(token))

    return lemmatized_tokens

df['Lemmed Review'] = df['Filtered Review'].apply(lemmatize_words)

In [None]:
def sentiment_analysis(comment):
    sentence = ' '.join(comment)
    sentiment = SentimentIntensityAnalyzer().polarity_scores(sentence)
    
    if sentiment['compound'] >= 0.15:
        sentiment_category = "Positive"
    elif sentiment['compound'] <= -0.15:
        sentiment_category = "Negative"
    else:
        sentiment_category = "Neutral"

    return sentiment_category
    
df['Sentiment'] = df['Lemmed Review'].apply(sentiment_analysis)

In [None]:
df.drop(['Tokenized Review', 'Filtered Review', 'Lemmed Review'], axis=1, inplace=True)

In [22]:
df

Unnamed: 0,Reviewer Name,Title,Colour Name,Rating,Review Date,Review Text,Sentiment
0,Shu,5.0 out of 5 starsTrue Value,Colour Name: Yellow,5.0 out of 5 stars,"Reviewed in Canada on February 19, 2024","The mini sofa is great, big enough to fit my l...",Positive
1,Rebecca Baker,5.0 out of 5 starsElderly cat loves it!,Colour Name: Green,5.0 out of 5 stars,"Reviewed in Canada on May 22, 2024",I bought this in the hopes that one of my olde...,Positive
2,CMac,5.0 out of 5 starsBetter than I expected!,Colour Name: Green,5.0 out of 5 stars,"Reviewed in Canada on June 8, 2024",I was so skeptical even after I read the revie...,Positive
3,Cj,4.0 out of 5 starsBetter than I thought,Colour Name: Yellow,4.0 out of 5 stars,"Reviewed in Canada on March 4, 2024","It's soft, taking apart for washing might be a...",Positive
4,MIKE,5.0 out of 5 starsMY CAT LOVES IT,Colour Name: Green,5.0 out of 5 stars,"Reviewed in Canada on June 22, 2024",CAT WAS NOT SURE OF IT ON DAY ONE BUT ON DAY T...,Positive
...,...,...,...,...,...,...,...
95,DeeBee,My cat loves this couch!,Colour Name: Blue,5.0 out of 5 stars,"Reviewed in the United States on June 26, 2024",This couch is super cute. And my cat loves it....,Positive
96,S. Reese,Instant success!,Colour Name: White,5.0 out of 5 stars,"Reviewed in the United States on June 2, 2024",My girl is a hefty kitty. I was worried she wo...,Positive
97,Susie,Super Little Sofa,Colour Name: Green,5.0 out of 5 stars,"Reviewed in the United States on June 19, 2024",My 14-year-old cat seems to have pain worse th...,Negative
98,Nathan Lee,As advertised,,5.0 out of 5 stars,"Reviewed in the United States on June 3, 2024","Looks comfortable, feels great. it's bigger th...",Positive


In [None]:
df.to_csv('AmazonCatCouchSentiment.csv', encoding='utf-8')