### Import bibliotek

In [1]:
import json
import os
import requests
from bs4 import BeautifulSoup

### Narzędzia

In [2]:
def _read_attribute(node, attribute):
    """Return ``node[attribute]``, stripped when it is a string, or None if unavailable."""
    try:
        value = node[attribute]
    except (KeyError, TypeError):
        # KeyError: the element exists but does not carry the attribute.
        # TypeError: ``node`` is None because the selector matched nothing.
        return None
    return value.strip() if isinstance(value, str) else value


def _read_text(node):
    """Return the stripped ``.text`` of ``node``, or None when ``node`` has no text."""
    try:
        return node.text.strip()
    except AttributeError:
        # ``node`` is None because the selector matched nothing.
        return None


def extract(ancestor, selector=None, attribute=None, many=False):
    """Pull text or an attribute value out of a parsed HTML element.

    Parameters:
        ancestor: element to search in; it must support ``select``,
            ``select_one``, ``.text`` and ``[attribute]`` access.
        selector: CSS selector evaluated relative to ``ancestor``. When falsy,
            ``ancestor`` itself is the element that is read.
        attribute: name of the attribute to read. When falsy, the element's
            text is read instead.
        many: when true (and ``selector`` is given), read every match and
            return a list instead of a single value.

    Returns:
        * ``many`` with ``selector``: a list of stripped strings. Matches that
          lack ``attribute`` are skipped rather than raising ``KeyError``.
        * otherwise: a single stripped string, or None when the element or
          the attribute is missing.
    """
    if selector and many:
        nodes = ancestor.select(selector)
        if attribute:
            values = (_read_attribute(node, attribute) for node in nodes)
            return [value for value in values if value is not None]
        return [node.text.strip() for node in nodes]

    # Single-value mode: read either the first match or the ancestor itself.
    node = ancestor.select_one(selector) if selector else ancestor
    if attribute:
        return _read_attribute(node, attribute)
    return _read_text(node)

In [3]:
# Maps each output field of a review to the positional arguments passed to
# extract() after the review element: (selector[, attribute[, many]]).
# A selector of None means the value is read from the review element itself.
review_scheme = {
    "review_id": (None, "data-entry-id"),
    "author": ("span.user-post__author-name",),
    "recomendation": ("span.user-post__author-recomendation > em",),
    "stars": ("span.user-post__score-count",),
    "content": ("div.user-post__text",),
    # List-valued fields: every matching element contributes one entry.
    "pros": ("div.review-feature__item--positive", None, True),
    "cons": ("div.review-feature__item--negative", None, True),
    "likes": ("button.vote-yes > span",),
    "dislikes": ("button.vote-no > span",),
    # Dates are taken from the "datetime" attribute of the <time> elements.
    "publish_date": ("span.user-post__published > time:nth-child(1)", "datetime"),
    "purchase_date": ("span.user-post__published > time:nth-child(2)", "datetime"),
}

### Pobranie ze strony Ceneo.pl opinii o konkretnym produkcie


In [4]:
# Load the HTTP request headers sent with every request to the scraped site.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale default.
with open("./headers.json", "r", encoding="UTF-8") as jf:
    headers = json.load(jf)



In [5]:
product_id = "148578611"
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get may block indefinitely
next_page = f"https://www.ceneo.pl/{product_id}#tab=reviews"
all_reviews = []
while next_page:
    response = requests.get(next_page, headers=headers, timeout=REQUEST_TIMEOUT)
    print(next_page)
    if response.status_code != 200:
        # next_page would otherwise stay unchanged and the loop would request
        # the same URL forever, so stop paginating on any failed response.
        print(f"Stopping: HTTP {response.status_code} for {next_page}")
        break

    page_dom = BeautifulSoup(response.text, "html.parser")
    reviews = page_dom.select("div.js_product-review:not(.user-post--highlight)")
    print(len(reviews))
    for review in reviews:
        # Build one flat dict per review by applying every rule of the scheme.
        single_review = {
            key: extract(review, *value)
            for key, value in review_scheme.items()
        }
        all_reviews.append(single_review)

    # extract() returns None when the page has no "next" link, which ends the loop.
    next_href = extract(page_dom, "a.pagination__next", "href")
    next_page = f"https://www.ceneo.pl{next_href}" if next_href else None
print(all_reviews)
        

https://www.ceneo.pl/148578611#tab=reviews
10
https://www.ceneo.pl/148578611/opinie-2
0
[{'review_id': '15312267', 'author': 'd...5', 'recomendation': 'Polecam', 'stars': '5/5', 'content': 'Apple tu nie ma wad a o zaletach wiedza nawet użytkownicy androida', 'pros': ['czas pracy na baterii', 'funkcjonalność', 'jakość ekranu'], 'cons': [], 'likes': '2', 'dislikes': '1', 'publish_date': '2021-12-06 12:10:14', 'purchase_date': '2021-11-22 17:48:54'}, {'review_id': '17903230', 'author': 'j...s', 'recomendation': 'Polecam', 'stars': '5/5', 'content': 'Dużo lżejszy od poprzednich generacji , działa bardzo szybko ale bateria szybciej się rozładowuje niż we wcześniejszych generacjach', 'pros': ['funkcjonalność', 'jakość ekranu'], 'cons': ['czas pracy na baterii'], 'likes': '0', 'dislikes': '0', 'publish_date': '2023-09-14 22:31:27', 'purchase_date': '2023-09-06 13:30:35'}, {'review_id': '18513776', 'author': 'k...3', 'recomendation': 'Polecam', 'stars': '5/5', 'content': 'Całkiem fajna zabawka

### Zapisanie wszystkich opinii w pliku JSON

In [6]:
# Create the output directory if needed. exist_ok=True makes the cell safe to
# re-run and avoids the gap between checking for the directory and creating it.
os.makedirs("./opinions", exist_ok=True)

In [7]:
# Write all collected reviews to a per-product JSON file; ensure_ascii=False
# keeps non-ASCII characters readable instead of emitting \u escapes.
output_path = f"./opinions/{product_id}.json"
with open(output_path, "w", encoding="UTF-8") as output_file:
    json.dump(all_reviews, output_file, indent=4, ensure_ascii=False)