# Getting the reviews of a product (Amazon)

In [1]:
import pandas as pd
import requests as req
from bs4 import BeautifulSoup

In [83]:
def getAmazonSearchQuery(searchQuery, pageNumber, header):
    urlQuery = "https://www.amazon.co.uk/s?k=" + searchQuery + "&page=" + str(pageNumber)
    pageRequest = req.get(urlQuery, headers = header)

    if pageRequest.status_code == 200:
        return pageRequest
    else:
        return "Error while making the query search on Amazon."

In [84]:
def getProductReviews(asin, pageNumber, header, cookies):
    urlQuery = "https://www.amazon.co.uk/product-reviews/" + asin + "/reviewerType=all_reviews?sortBy=recent&pageNumber=" + str(pageNumber)
    pageRequest = req.get(urlQuery, headers = header, cookies = cookies)

    if pageRequest.status_code == 200:
        return pageRequest
    else:
        return "Error while getting the product's reviews on Amazon."

In [85]:
productsNames = list()
productsLinks = list()
productsAsin = list()

searchQuery = "notebook asus"
searchQuery = searchQuery.replace(" ", "+")

In [87]:
headerResponse = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"}

responseGet = getAmazonSearchQuery(searchQuery, 1, headerResponse)
cookiesResponse = responseGet.cookies
contentResponse = responseGet.content


In [88]:
soupGet = BeautifulSoup(contentResponse)

for product in soupGet.findAll("h2", {"class": "a-size-mini a-spacing-none a-color-base s-line-clamp-4"}):
    productsNames.append(product.text)

    productHref = product.find("a", {"class": "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"})
    productLink = "https://www.amazon.com.br/" + productHref['href']
    productsLinks.append(productLink)

    indexDp = productLink.index("dp/")
    indexRef = productLink.index("/ref")
    productId = productLink[indexDp + 3:indexRef]

    productsAsin.append(productId)

In [89]:
reviewsTitles = list()
reviewsStars = list()
reviewsText = list()

responseReviewsGet = getProductReviews(productsAsin[0], 1, headerResponse, cookiesResponse)
contentResponseReviews = responseReviewsGet.content

In [90]:
soupReviewsGet = BeautifulSoup(contentResponseReviews)

for review in soupReviewsGet.findAll("a", {"data-hook": "review-title"}):
    reviewsTitles.append(review.text)

for review in soupReviewsGet.findAll("div", {"id": "cm_cr-review_list"}):
    for title in review.findAll("a", {"data-hook": "review-title"}):
        reviewsTitles.append(title.text.replace("\n", " "))
    for body in review.findAll("span", {"data-hook": "review-body"}):
        reviewsText.append(body.text)
    for stars in review.findAll("i", {"data-hook": "review-star-rating"}):
        reviewsStars.append(stars.text)

In [91]:
reviewsFinal = {"Stars": reviewsStars, "Title": reviewsTitles, "Text": reviewsText}

reviewsData = pd.DataFrame.from_dict(reviewsFinal, orient='index')
pd.set_option("max_colwidth", 8000)
reviewsData = reviewsData.transpose()
reviewsData

Unnamed: 0,Stars,Title,Text
0,1.0 out of 5 stars,\nNot as described throughout\n,\nThe item is not as described entirely. Just Read only the top of the page. Do not be mislead with all of the photos or write ups halfway down in the description area. Totally different item. You are purchasing a purely bare minimum one. Ignore any of the pictures of it doing snazzy things!\n
1,4.0 out of 5 stars,\nGreat for £129\n,\nBought for our little one and perfect for school work. Parent controls are very easy to set up. Quick for what using it for. Can't ask for more.Slightly larger than previous 11.6 laptop so doesn't fit Amazon basic case at that size\n
2,4.0 out of 5 stars,\necellent for the price\n,"\nwould be 5stars but it freezes on multiple apps, and battery is not the best, maybe 8hours or so.buy I was working it hard\n"
3,5.0 out of 5 stars,\nVery good\n,\nVery goodReliable\n
4,5.0 out of 5 stars,\nA useful laptop\n,\nThis laptop does almost everything that you want it to do. My only criticism is that the sound is a little quiet. Otherwise it's fine.\n
5,5.0 out of 5 stars,\nVery good for the price\n,\nFor my daughters school work\n
6,5.0 out of 5 stars,\nNice Macine\n,\nGreat laptop delivered as promised on time. worked straight out of the box after initial charge. Was delivered by Amazon with a code to give to driver for security which was great. am very pleased with it\n
7,5.0 out of 5 stars,\nAmazing laptop\n,\nDelivery on time . This laptop is perfect for any task needed easy to use and exactly what it said it can do ! Would definitely buy again\n
8,4.0 out of 5 stars,\nIts ok\n,"\nThe screen doesn't have touch on it, have to buy new mini mouse to go with this because its little difficult to touch on pad to move. Battery 🔋 is good, last long time before charge it.\n"
9,3.0 out of 5 stars,\nThis is not as described\n,\nBe aware this is not a touchscreen or 2 in 1 as described. This is just a standard Chromebook. There is a more expensive model that is the model in the pictures. As a Chromebook however it is probably great at the price.\n


In [95]:
df = pd.DataFrame(reviewsFinal)
df.to_csv(index=False)

ValueError: All arrays must be of the same length

# Sentiment analysis

In [16]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# Using the pipeline
classifier = pipeline("sentiment-analysis")
responseClassifier = classifier("We are happy today!")
print(responseClassifier)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998750686645508}]


In [43]:
# Using a model
sequence = reviewsText[4]
modelName = "citizenlab/twitter-xlm-roberta-base-sentiment-finetunned"

model = AutoModelForSequenceClassification.from_pretrained(modelName)
tokenizer = AutoTokenizer.from_pretrained(modelName)

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
responseClassifier = classifier(sequence)
print(responseClassifier)

[{'label': 'Positive', 'score': 0.99078369140625}]


In [44]:
# Tokenizer
response = tokenizer(sequence)
print(response)

{'input_ids': [0, 44704, 13, 4, 194155, 3533, 6, 100820, 38, 148668, 6, 100820, 41, 1175, 23, 1824, 188, 732, 4, 28, 1601, 17159, 34, 10, 8845, 5, 62, 7002, 3390, 112, 48, 73287, 28, 21734, 8, 124, 393, 191863, 38, 241, 36, 69537, 1027, 393, 23890, 46624, 12479, 4, 162102, 10, 7002, 3390, 112, 38, 21670, 6, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [45]:
tokens = tokenizer.tokenize(sequence)
print(tokens)

['▁Gent', 'e', ',', '▁adorei', '▁esse', '▁', 'fone', '!', '▁Primeiro', '▁', 'fone', '▁que', '▁eu', '▁in', 'vis', 'to', '▁mais', ',', '▁e', '▁super', '▁vale', 'u', '▁a', '▁pena', '.', '▁A', '▁qual', 'dia', 'de', '▁da', '▁saída', '▁e', '▁entrada', '▁de', '▁som', '▁é', '▁maravilhosa', '!', '▁E', '▁o', '▁preço', '▁não', '▁é', '▁tão', '▁caro', '▁assim', ',', '▁considerando', '▁a', '▁qual', 'dia', 'de', '!', '▁<3', '▁']


In [46]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[44704, 13, 4, 194155, 3533, 6, 100820, 38, 148668, 6, 100820, 41, 1175, 23, 1824, 188, 732, 4, 28, 1601, 17159, 34, 10, 8845, 5, 62, 7002, 3390, 112, 48, 73287, 28, 21734, 8, 124, 393, 191863, 38, 241, 36, 69537, 1027, 393, 23890, 46624, 12479, 4, 162102, 10, 7002, 3390, 112, 38, 21670, 6]


In [47]:
decodedSequence = tokenizer.decode(ids)
print(decodedSequence)

Gente, adorei esse fone! Primeiro fone que eu invisto mais, e super valeu a pena. A qualdiade da saída e entrada de som é maravilhosa! E o preço não é tão caro assim, considerando a qualdiade! <3 
