### Imports

In [2]:
import requests
import os
import json
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator

### Utils

In [3]:
def extract(ancestor, selector=None, attribute=None, multiple=False):
    """Read text or an attribute value from a parsed HTML element.

    Parameters
    ----------
    ancestor
        Element to read from. It must support ``select``/``select_one`` when
        ``selector`` is given, and item access by attribute name otherwise.
    selector : str, optional
        CSS selector evaluated relative to ``ancestor``. When omitted,
        ``attribute`` is read directly from ``ancestor`` itself.
    attribute : str, optional
        Attribute to read from the matched element(s). When omitted, the
        element's text is returned instead.
    multiple : bool, optional
        If true (and ``selector`` is given), every match is processed and a
        list is returned; otherwise only the first match is used.

    Returns
    -------
    str, list of str, or None
        * ``multiple=True``: a list of stripped strings, one per match
          (empty when nothing matches).
        * single match: the stripped text or attribute value, or ``None``
          when the element or the attribute is missing.
        * no selector: the raw (unstripped) attribute value of ``ancestor``,
          or ``None`` when it is missing.

    Raises
    ------
    KeyError
        Only in ``multiple`` mode with ``attribute`` set, when one of the
        matched elements lacks that attribute.
    """
    if selector:
        if multiple:
            tags = ancestor.select(selector)
            if attribute:
                return [tag[attribute].strip() for tag in tags]
            return [tag.get_text().strip() for tag in tags]
        if attribute:
            try:
                return ancestor.select_one(selector)[attribute].strip()
            # TypeError: nothing matched (select_one returned None).
            # KeyError: the matched element has no such attribute. Treated as
            # "missing", the same way the no-selector branch below does.
            except (TypeError, KeyError):
                return None
        try:
            return ancestor.select_one(selector).get_text().strip()
        except AttributeError:
            # Nothing matched the selector.
            return None
    try:
        return ancestor[attribute]
    except (TypeError, KeyError):
        return None

In [4]:
def translate(text, source = "pl", target = "en"):
    """Translate ``text`` from ``source`` to ``target`` with Google Translate.

    Parameters
    ----------
    text : str or None
        Text to translate.
    source : str, optional
        Source language code (default ``"pl"``).
    target : str, optional
        Target language code (default ``"en"``).

    Returns
    -------
    str or None
        The translated text. Empty or ``None`` input is returned unchanged
        without contacting the translation service, so callers can pass
        fields that were missing on the scraped page.
    """
    if not text:
        # Nothing to translate; the translator rejects non-string input.
        return text
    return GoogleTranslator(source=source, target=target).translate(text=text)

In [5]:
# Maps each output field of an opinion to the positional arguments that are
# passed to extract() after the opinion element itself:
#     (selector[, attribute[, multiple]])
# - selector None  -> the attribute is read from the opinion element directly
# - attribute set  -> that attribute's value is returned instead of the text
# - multiple True  -> a list with one entry per matching element is returned
selectors = {
    "opinion_id": (None, "data-entry-id"),
    "author": ("span.user-post__author-name",),
    # NOTE(review): "recomendation" is spelled this way in the selector;
    # presumably it mirrors the class name used by the page - confirm before "fixing".
    "recommendation": ("span.user-post__author-recomendation > em",),
    "stars": ("span.user-post__score-count",),
    "content_pl": ("div.user-post__text",),
    "pros_pl": ("div.review-feature__item--positive", None, True),
    "cons_pl": ("div.review-feature__item--negative", None, True),
    "vote_yes": ("button.vote-yes","data-total-vote"),
    "vote_no": ("button.vote-no","data-total-vote"),
    "published": ("span.user-post__published > time:nth-child(1)","datetime"),
    "purchased": ("span.user-post__published > time:nth-child(2)","datetime")
}

### Extraction of opinions

In [6]:
# Load the HTTP headers sent with every page request from a local JSON file
# (presumably including the session cookie, given the file name - confirm).
# The encoding is given explicitly so the file is read the same way on every
# platform, matching how this notebook writes its JSON output.
with open('./cookie.json', 'r', encoding='UTF-8') as file:
    headers = json.load(file)

In [7]:
# Download every review page of one product and collect the opinions,
# following the "next page" link until there is none left.
product_id = "84514582"
next_page = f"https://www.ceneo.pl/{product_id}#tab=reviews"
all_opinions = []
while next_page:
    # A timeout keeps a stalled connection from hanging the cell indefinitely.
    response = requests.get(next_page, headers = headers, timeout = 30)
    if response.status_code != 200:
        # next_page is only advanced after a successful response, so carrying
        # on here would request the same URL forever. Stop and keep what has
        # been collected so far.
        print(f"Stopping: {next_page} returned HTTP {response.status_code}")
        break
    print(next_page)
    page_dom = BeautifulSoup(response.text, 'html.parser')
    opinions = page_dom.select("div.js_product-review:not(.user-post--highlight)")
    print(len(opinions))
    for opinion in opinions:
        single_opinion = {
            key: extract(opinion, *value)
            for key, value in selectors.items()
        }
        # extract() returns None for fields missing from the page, so every
        # conversion below passes None through instead of raising.
        content_pl = single_opinion["content_pl"]
        single_opinion["content_en"] = translate(content_pl) if content_pl else content_pl
        single_opinion["pros_en"] = [translate(pros) for pros in single_opinion["pros_pl"]]
        single_opinion["cons_en"] = [translate(cons) for cons in single_opinion["cons_pl"]]
        # "Polecam" -> True, "Nie polecam" -> False, anything else -> None.
        single_opinion["recommendation"] = {"Polecam": True, "Nie polecam": False}.get(single_opinion["recommendation"])
        # Scores look like "4,5/5": keep the part before "/" and use a decimal point.
        stars = single_opinion["stars"]
        single_opinion["stars"] = float(stars.split("/")[0].replace(",", ".")) if stars else None
        for vote_key in ("vote_yes", "vote_no"):
            votes = single_opinion[vote_key]
            single_opinion[vote_key] = int(votes) if votes else None
        all_opinions.append(single_opinion)
    try:
        next_page = "https://www.ceneo.pl" + page_dom.select_one("a.pagination__next")["href"]
    except (TypeError, KeyError):
        # No "next" link (or one without href): this was the last page.
        next_page = None

https://www.ceneo.pl/84514582#tab=reviews
10


ValueError: could not convert string to float: '4,5'

In [None]:
# Write the collected opinions to ./opinions/<product_id>.json.
# makedirs(exist_ok=True) creates the directory only when needed, without the
# check-then-create gap of a separate exists() test.
os.makedirs('./opinions', exist_ok=True)
# ensure_ascii=False keeps non-ASCII characters readable in the output file.
with open(f"./opinions/{product_id}.json", 'w', encoding='UTF-8') as file:
    json.dump(all_opinions, file, ensure_ascii=False, indent=4)