In [35]:
import httpx
from cytoolz import pipe, curried

In [36]:
# Download the JSON filter definitions that back the shop's listing widget.
resp = httpx.get('https://www.coffeedesk.pl/widgets/cms/navigation/ac9d993721af3405c2059beb5e91569d/filter')
# Fail here on a 4xx/5xx answer instead of later, when an error page is
# handed to the JSON decoder.
resp.raise_for_status()
filters = resp.json()

In [37]:
# Top-level keys of the decoded filter payload.
[*filters]


['manufacturer', 'price', 'rating', 'shipping-free', 'properties']

In [38]:
# Print the display name of every property group, one per line.
print(*(entity['name'] for entity in filters['properties']['entities']), sep="\n")

Producent
Kolor
Tworzywo
Ilość espresso
Płyty indukcyjne
Pojemność
Opakowanie
Pochodzenie
Rodzaj kawy
Arabica / Robusta
Obróbka
Stopień palenia ziaren
Przeznaczenie
Zastosowanie


In [41]:
from operator import itemgetter
from typing_extensions import Self

# Display names of the property groups used below, spelled exactly as they
# appear in the shop's filter payload (Polish); English gloss on the right.
PACKAGING = 'Opakowanie'               # packaging
ORIGIN = 'Pochodzenie'                 # origin
COFFEE_TYPE = 'Rodzaj kawy'            # kind of coffee
ARABICA = 'Arabica / Robusta'          # arabica / robusta
ROASTING = 'Stopień palenia ziaren'    # bean roast level
METHOD = 'Przeznaczenie'               # intended use


def extract_filters(
    attribute_name: str,
    raw_filters: dict,
    value_key: str = 'groupId',
) -> dict[str, str]:
    """Map the option names of one filter property to an identifier.

    Args:
        attribute_name: Display name of the property group to look up; it is
            compared against each entity's ``'name'``.
        raw_filters: Decoded filter payload. Must provide
            ``raw_filters['properties']['entities']``, an iterable of dicts
            carrying ``'name'`` and ``'options'``.
        value_key: Key read from every option to obtain the mapped value.
            Defaults to ``'groupId'``, the key this function has always read.

    Returns:
        ``{option['name']: option[value_key]}`` built from the first entity
        whose name equals ``attribute_name``.

    Raises:
        IndexError: If no entity is named ``attribute_name``. The type is kept
            from the previous implementation; the message now names the
            missing property.
    """
    entities = raw_filters['properties']['entities']
    # Stop at the first match instead of materialising every matching entity.
    entity = next((e for e in entities if e['name'] == attribute_name), None)
    if entity is None:
        raise IndexError(f'no filter property named {attribute_name!r}')
    # NOTE(review): with the default key every option of a group maps to the
    # same value, so the result cannot tell options apart. The options
    # presumably also carry a per-option id (e.g. 'id') — confirm against the
    # payload and pass it via ``value_key`` if so.
    return {option['name']: option[value_key] for option in entity['options']}


class CoffeeDeskFilters:
    """Filter options of the shop listing, grouped by property name."""

    # Endpoint serving the raw filter definitions as JSON.
    FILTERS_URL = 'https://www.coffeedesk.pl/widgets/cms/navigation/ac9d993721af3405c2059beb5e91569d/filter'

    def __init__(self, filters: dict[str, dict[str, str]]):
        # {property name: {option name: identifier}}
        self.filters = filters

    @classmethod
    def from_raw(cls, raw_filters: dict) -> Self:
        """Build an instance from an already decoded filter payload.

        Args:
            raw_filters: Payload providing
                ``raw_filters['properties']['entities']``, an iterable of
                dicts with a ``'name'`` and, usually, ``'options'``.

        Returns:
            An instance whose ``filters`` maps every property name to
            ``{option name: option 'groupId'}``.
        """
        # NOTE(review): 'groupId' mirrors what extract_filters reads; it looks
        # identical for all options of one group — confirm it is the intended
        # identifier.
        return cls({
            entity['name']: {
                option['name']: option['groupId']
                # A property without options yields an empty mapping rather
                # than aborting the whole parse.
                for option in entity.get('options') or ()
            }
            for entity in raw_filters['properties']['entities']
        })

    @classmethod
    def download_and_parse(cls) -> Self:
        """Download the filter payload from ``FILTERS_URL`` and parse it.

        Returns:
            The parsed filters (previously the payload was fetched and then
            discarded, so the method returned ``None``).

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        """
        resp = httpx.get(cls.FILTERS_URL)
        resp.raise_for_status()
        return cls.from_raw(resp.json())


# Show the option mapping for the coffee-type property; pass ORIGIN or
# ARABICA as attribute_name to inspect those groups instead.
extract_filters(attribute_name=COFFEE_TYPE, raw_filters=filters)


{'Mielona': '4cc333f76170852e16e1de8777aa695a',
 'Rozpuszczalna': '4cc333f76170852e16e1de8777aa695a',
 'W kapsułkach': '4cc333f76170852e16e1de8777aa695a',
 'Ziarnista': '4cc333f76170852e16e1de8777aa695a'}

In [42]:
from furl import furl

# Example listing request, parsed so its query string can be inspected.
# The literal is split per query parameter; adjacent string literals are
# concatenated into the same single URL.
u = furl(
    'https://www.coffeedesk.pl/widgets/cms/navigation/ac9d993721af3405c2059beb5e91569d'
    '?no-aggregations=1&order=dostepnosc&p=1'
    '&properties=577c9e2997af4ede887246c7afd0213e'
    '%7C9087b0efc7c7acd1ef7e153678809c77'
    '%7Cf9fd2624beefbc7808e4e405d73f57ab'
    '%7Ca36b0dcd1e6384abc0e1867860ad3ee3'
    '&slots=0603c3ed87a34617bf1cda0443835ab1'
)


In [46]:
# Display the query parameters furl parsed out of the example listing URL.
u.query.params


omdict1D([('no-aggregations', '1'), ('order', 'dostepnosc'), ('p', '1'), ('properties', '577c9e2997af4ede887246c7afd0213e|9087b0efc7c7acd1ef7e153678809c77|f9fd2624beefbc7808e4e405d73f57ab|a36b0dcd1e6384abc0e1867860ad3ee3'), ('slots', '0603c3ed87a34617bf1cda0443835ab1')])

In [50]:
# Fetch the second page (p=2) of the product listing as HTML.
r = httpx.get(
    'https://www.coffeedesk.pl/widgets/cms/navigation/ac9d993721af3405c2059beb5e91569d?no-aggregations=1&order=dostepnosc&p=2')
# Stop on a 4xx/5xx answer so an error page is never parsed as a listing.
r.raise_for_status()
html = r.text

In [52]:
from bs4 import BeautifulSoup

# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so the tree could differ between machines.
# 'html.parser' ships with Python, so no extra dependency is needed.
soup = BeautifulSoup(html, 'html.parser')


In [57]:
# Save the parsed page for offline inspection. The page contains Polish
# characters, so write UTF-8 explicitly instead of the platform default.
with open("index.html", "w", encoding="utf-8") as f:
    f.write(str(soup))

In [60]:
# Count the elements matching the ".product-info" selector on the downloaded page.
len(soup.select(".product-info"))

30

In [78]:
# Spot-check one product card: the title attribute of its name link.
node = soup.select(".product-info")[10]
node.select('a.product-name')[0]['title']

'Illy Classico - Filter Roast - Kawa mielona'

In [None]:
import dataclasses
from bs4.element import Tag
from datetime import date, datetime
import re

DATE_RE = re.compile(r'\d{2}.\d{2}.\d{4}')


@dataclasses.dataclass
class Coffee:
    name: str
    link: str
    roasting_date: date | None

    @classmethod
    def parse_from_soup(cls, node: Tag) -> Self:
        a_tag = node.select('a.product-name')[0]
        link = a_tag.attrs['href']
        name = a_tag.attrs['title']
        image_node = node.select('img.product-image')[0]
        image_url = image_node.attrs['src']
        roasting_date = None
        if roasting_data := node.select('p.product-box__roasting-data'):
            raw_date = DATE_RE.findall(roasting_data[0].text)[0]
            roasting_date = datetime.strptime(raw_date, '%d.%m.%Y').date()
        return cls(
            name=name,
            link=link,
            roasting_date=roasting_date,
            image_url=image_url
        )



In [85]:
# Show the raw markup of the ".product-info" element at index 10.
soup.select(".product-info")[10]

<div class="product-info">
<a class="product-name" data-dl-options='{"event":"eec.impressionClick","ecommerce":{"click":{"actionField":{"list":""},"products":[{"id":"0101000739","name":"Illy\\u0020Classico\\u0020\\u002D\\u0020Filter\\u0020Roast\\u0020\\u002D\\u0020Kawa\\u0020mielona","category":"Mielona\\\/Speciality\\\/Nasze\\u0020marki\\\/Kawa\\\/Single\\u0020origin","position":11}]}}}' href="https://www.coffeedesk.pl/product/13901/Illy-Classico-Filter-Roast-Kawa-Mielona" title="Illy Classico - Filter Roast - Kawa mielona">
				Illy Classico - Filter Roast - Kawa mielona
			</a>
<div class="product-box__data">
<p class="product-box__roasting-data">Data palenia: 18.05.2021</p>
<p class="product-box__manufacturer-name">Producent: Illy</p>
</div>
<div class="product-description">
                                    
	Illy Classico Filter to włoska kawa mielona o ciemnym profilu palenia. To 100% Arabika o przyjemnej słodyczy z łagodnymi nutami karmelu, kwiatu pomarańczy i jaśminu. 
	
	Ze

In [81]:
import json

# NOTE(review): absolute, machine-specific path — make it configurable before
# sharing this notebook.
# Read the file as UTF-8 explicitly rather than relying on the platform
# default encoding.
with open("/Users/iv/Code/coffeedesk-crawler/coffee.json", encoding="utf-8") as f:
    coffee = json.load(f)

# Three most recently roasted coffees; entries without a roasting date are
# skipped. Sorting the raw strings is chronological as long as every date is
# formatted like 'YYYY-MM-DD' (as in the displayed output) — assumed, not checked.
sorted((c for c in coffee if c.get('roasting_date')), reverse=True, key=itemgetter('roasting_date'))[:3]

[{'link': 'https://www.coffeedesk.pl/product/24446/Coffee-Plant-Gwatemala-Piedra-Azul-Washed-Filter-250G',
  'name': 'COFFEE PLANT - Gwatemala Piedra Azul Washed Filter 250g',
  'roasting_date': '2022-12-20'},
 {'link': 'https://www.coffeedesk.pl/product/23789/Coffee-Plant-Salwador-Finca-El-Cerro-Natural-Espresso-1Kg',
  'name': 'COFFEE PLANT - Salwador Finca El Cerro Natural Espresso 1kg',
  'roasting_date': '2022-12-20'},
 {'link': 'https://www.coffeedesk.pl/product/24445/Coffee-Plant-Kenia-David-Ndirangu-Washed-Filter-250G',
  'name': 'COFFEE PLANT - Kenia David Ndirangu Washed Filter 250g',
  'roasting_date': '2022-12-20'}]