### Imports

In [7]:
import os
import json
import requests
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator

### Utils


In [8]:
def extract(ancestor, selector=None, attribute=None, multiple=False):
    if selector:
        if multiple:
            if attribute:
                return [tag[attribute].strip() for tag in ancestor.select(selector)]
            return [tag.get_text().strip() for tag in ancestor.select(selector)]
        if attribute:
            try:
                return ancestor.select_one(selector)[attribute].strip()
            except TypeError:
                return None
        try:
            return ancestor.select_one(selector).get_text().strip()
        except AttributeError:
            return None
    try:
        return ancestor[attribute].strip()
    except (TypeError, KeyError):
        return None 

In [9]:
def translate(text,source='pl', target='en'):
    return GoogleTranslator(source,target).translate(text=text)

In [10]:
selectors = {
            'opinion_id' : (None,'data-entry-id'),
            'author' : ('span.user-post__author-name',),
            'reccommendation' : ('span.user-post__author-recomendation > em',),
            'stars' : ('span.user-post__score-count',),
            'content_pl' : ('div.user-post__text',),
            'pros_pl' : ('div.review-feature__item--positive',None,True),
            'cons_pl' : ('div.review-feature__item--negative',None,True),
            'vote_yes' : ('button.vote-yes','data-total-vote'),
            'vote_no' : ('button.vote-no','data-total-vote'),
            'published' : ('span.user-post__published > time:nth-child(1)',"datetime"),
            'purchased' : ('span.user-post__published > time:nth-child(2)',"datetime"),
}

### Extraction of opinions


In [11]:
with open("./cookie.json", 'r') as jf:
    headers = json.load(jf)

In [12]:
product_id = input("Enter product code,please: ")
next_page = f"https://www.ceneo.pl/{product_id}#tab=reviews"
all_opinions = []
while next_page:
    response = requests.get(next_page,headers= headers)
    if response.status_code == 200:
        print(next_page)
        page_dom = BeautifulSoup( response.text, 'html.parser')
        opinions = page_dom.select('div.js_product-review:not(.user-post--highlight)')
        print(len(opinions))
        for opinion in opinions:
            single_opinion = {
                key: extract(opinion, *value)
                for key, value in selectors.items()
            }
            single_opinion['content_en'] = translate(single_opinion['content_pl'])
            single_opinion['pros_en'] = [translate(pros) for pros in single_opinion['pros_pl']]
            single_opinion['cons_en'] = [translate(cons) for cons in single_opinion['cons_pl']]
            single_opinion['reccommendation'] = True if single_opinion['reccommendation']== 'Polecam' else False if  single_opinion['reccommendation']=='Nie polecam' else None
            single_opinion['stars'] = float(single_opinion['stars'].split('/')[0].replace(',','.'))
            single_opinion['vote_yes'] = int(single_opinion['vote_yes'])
            single_opinion['vote_no'] = int(single_opinion['vote_no'])
            all_opinions.append(single_opinion)
        try:
            next_page = 'https://www.ceneo.pl' + page_dom.select_one('a.pagination__next')['href']
        except TypeError: 
            next_page = None
print(all_opinions)

https://www.ceneo.pl/113304112#tab=reviews
10
https://www.ceneo.pl/113304112/opinie-2
10
https://www.ceneo.pl/113304112/opinie-3
10
https://www.ceneo.pl/113304112/opinie-4
10
https://www.ceneo.pl/113304112/opinie-5
10
https://www.ceneo.pl/113304112/opinie-6
10
https://www.ceneo.pl/113304112/opinie-7
10
https://www.ceneo.pl/113304112/opinie-8
10
https://www.ceneo.pl/113304112/opinie-9
10
https://www.ceneo.pl/113304112/opinie-10
10
https://www.ceneo.pl/113304112/opinie-11
10
https://www.ceneo.pl/113304112/opinie-12
10
https://www.ceneo.pl/113304112/opinie-13
1
[{'opinion_id': '17095943', 'author': 'j...7', 'reccommendation': True, 'stars': 4.5, 'content_pl': 'Wygląda na to, że wybór tej drukarki w gąszczu modeli był strzałem w dziesiątkę. Świetny sprzęt a Eco tank rewelacja - szkoda,że nie pojawił się 20 lat temu.', 'pros_pl': ['ekonomia wydruków', 'jakość wydruków', 'szybkość wydruku', 'wifi', 'wydajność'], 'cons_pl': [], 'vote_yes': 5, 'vote_no': 2, 'published': '2023-02-02 16:27:26', 

In [13]:
if not os.path.exists("./opinions"):
    os.mkdir("./opinions")
with open(f"./opinions/{product_id}.json", "w", encoding="UTF-8") as jf:
    json.dump(all_opinions, jf, ensure_ascii=False, indent=4)