In [1]:
import json
import os
from json import JSONDecodeError

import pandas as pd
import requests
from pyquery import PyQuery as pq
from tqdm import tqdm_notebook

In [6]:
# Fetch page 1 of the recipe listing; the cells below read the category lists
# from the `.seo-footer` section of this page.
url = "https://eda.ru/recepty?page={}"
# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(url.format(1), timeout=30)
# Fail loudly here instead of letting an error page be parsed as "no categories".
res.raise_for_status()

In [47]:
# Build a list of {"title", "href", "num"} dicts, one per matching <li> in the
# footer lists of the listing page.
# NOTE(review): in the second selector there is a space before `_empty`, which CSS
# reads as a descendant element named `_empty`, not as a second class on the <li>.
# Confirm whether `li.seo-footer__list-title._empty` was intended.
recipes_subcat_list = []
for footer_list in pq(res.text).find(".seo-footer .seo-footer__list"):
    footer_items = pq(footer_list).find("li.seo-footer__list-item, li.seo-footer__list-title _empty")
    for footer_item in footer_items:
        node = pq(footer_item)
        # The item text is split on a double non-breaking space: field 0 is used
        # as the title, field 1 is parsed as an integer.
        fields = node.text().split("\xa0\xa0")
        link = node.find("a").attr("href")
        recipes_subcat_list.append({
            "title": fields[0],
            "href": link,
            "num": int(fields[1]),
        })

In [None]:
# Crawl every subcategory page by page and store one record per recipe tile in
# `data`, keyed by the recipe's href.
data = {}


def _to_int(text, default=None):
    """Parse a counter scraped from the page; return `default` if it is not an integer."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return default


for rec_cat in recipes_subcat_list:
    print(rec_cat["title"])
    if not rec_cat["href"]:
        # No link was found for this footer item, so there is nothing to crawl.
        continue

    page = 1
    while True:
        # Strip leading slashes so the URL has a single "/" after the host
        # whether or not the scraped href starts with one.
        page_url = "https://eda.ru/{cat}?page={page}".format(
            cat=rec_cat["href"].lstrip("/"), page=page
        )
        # Kept in its own name: `res` still has to hold the listing-page response
        # for the cells that parse the footer.
        page_html = requests.get(page_url, timeout=30).text

        # Parse the page once and reuse the tile list for both the emptiness
        # check and the iteration.
        tiles = pq(page_html).find(".recipes-page__recipes .tile-list__horizontal-tile")
        if not len(tiles):
            # A page without recipe tiles marks the end of this subcategory.
            break
        page += 1

        for tile in tiles:
            recipe = pq(tile)
            href = recipe.find(".horizontal-tile__item-title a").attr("href")
            if href is None:
                # Without a link there is no key to store the record under.
                continue

            # The like widget's text is split on whitespace: first token is used
            # as likes, second as dislikes. Missing tokens become None.
            vote_counts = recipe.find(".widget-list__like-count").text().split()
            likes = _to_int(vote_counts[0]) if len(vote_counts) > 0 else None
            dislikes = _to_int(vote_counts[1]) if len(vote_counts) > 1 else None

            ingredients = []
            for ingredient in recipe.find(".ingredients-list .ingredients-list__content-item"):
                raw = pq(ingredient).attr("data-ingredient-object")
                if raw is None:
                    # Item carries no JSON payload; nothing to record for it.
                    continue
                try:
                    # Doubled quotes are collapsed to single ones before parsing.
                    ingredients.append(json.loads(raw.replace('""', '"')))
                except JSONDecodeError:
                    # Show the offending payload and stop reading this recipe's
                    # ingredients; entries parsed so far are kept.
                    print(raw)
                    break

            data[href] = {
                "title": recipe.find(".horizontal-tile__item-title").text(),
                "img_src": recipe.find(".horizontal-tile__preview .lazy-load-container").attr("data-src"),
                "ingredients": ingredients,
                # A blank or non-numeric counter is stored as None instead of
                # aborting the whole crawl.
                "booked": _to_int(recipe.find(".js-bookmark__counter").text()),
                "likes": likes,
                "dislikes": dislikes,
                "time_to_cook": recipe.find(".prep-time").text(),
                "portions": recipe.find(".js-portions-count-print").text(),
                "category": rec_cat["title"]
            }

In [3]:
# Number of entries in `data` (one per distinct recipe href used as a key).
len(data)

20759

In [99]:
# Give every record a sequential integer "id", following the dict's iteration order.
for recipe_id, record in enumerate(data.values()):
    record["id"] = recipe_id

In [None]:
# Persist the scraped records. The context manager closes (and therefore flushes)
# the file as soon as the dump finishes instead of leaving the handle open.
with open("recipes.json", "wt", encoding="utf8") as recipes_file:
    json.dump(data, recipes_file)

In [2]:
# Reload the records from disk; the context manager closes the file handle
# once parsing is done.
with open("recipes.json", "rt", encoding="utf8") as recipes_file:
    data = json.load(recipes_file)

In [3]:
# One row per entry of `data`: the dict keys become the index and the fields of
# each record become the columns.
df = pd.DataFrame.from_dict(data, orient="index")

In [108]:
# Count the rows whose "img_src" value is missing.
df["img_src"].isna().sum()

4656

In [5]:
# Preview the first rows of the "category" column (indexed by recipe href).
df["category"].head()

/recepty/bulony/bulon-iz-kashtanov-25717                         Овощной бульон
/recepty/bulony/bulon-kurinij-s-kleckami-iz-maci-16656           Куриный бульон
/recepty/bulony/bulon-ovoschnoj-14259                            Овощной бульон
/recepty/bulony/holodec-iz-teljatini-kurinih-potroshkov-23603           Холодец
/recepty/bulony/klassicheskiy-svetlyy-kurinyy-bulon-93912        Куриный бульон
Name: category, dtype: object

In [None]:
# Save the frame to disk. `DataFrame.to_msgpack` was deprecated in pandas 0.25 and
# removed in 1.0, so it is only used when the installed pandas still has it;
# otherwise the frame is pickled instead (note the different file name).
if hasattr(df, "to_msgpack"):
    df.to_msgpack("recipes.msg")
else:
    df.to_pickle("recipes.pkl")

In [7]:
# Download every recipe image to imgs/<id>.<extension>.
# Resuming an interrupted run is automatic: images already on disk are skipped,
# so no hard-coded id range has to be edited between runs.
os.makedirs("imgs", exist_ok=True)

for recipe_href, d in tqdm_notebook(data.items()):
    if not d["img_src"]:
        continue

    # The extension is whatever follows the last "." of the image URL.
    path = "imgs/" + str(d["id"]) + "." + d["img_src"].split(".")[-1]
    if os.path.exists(path):
        continue

    # "img_src" is prefixed with the scheme to form the download URL.
    # Own names are used so `url` and `res` from earlier cells stay intact.
    img_url = "https:" + d["img_src"]
    img_res = requests.get(img_url, timeout=30)
    if not img_res.ok:
        # Do not save an error page under an image file name.
        print("skip {}: HTTP {}".format(img_url, img_res.status_code))
        continue

    # Write to a temporary name first so an interrupted run never leaves a
    # truncated file that the exists-check above would then skip.
    tmp_path = path + ".part"
    with open(tmp_path, "wb") as img_file:
        img_file.write(img_res.content)
    os.replace(tmp_path, path)

HBox(children=(IntProgress(value=0, max=20759), HTML(value='')))




In [33]:
# Map every subcategory title to the title of its parent category, taken from the
# footer lists of the listing page: in each list the first <li> is used as the
# category and the remaining <li> items as its subcategories.
# The listing page is fetched explicitly so this cell does not depend on
# whatever `res` currently holds.
listing_res = requests.get("https://eda.ru/recepty?page=1", timeout=30)
listing_res.raise_for_status()

cat = {}
for recipes_list in pq(listing_res.text).find(".seo-footer .seo-footer__list"):
    rec_lis = pq(recipes_list).find("li")
    if not len(rec_lis):
        # A list without items has neither a category title nor subcategories.
        continue
    # Titles are the part of the item text before the double non-breaking space.
    cat_name = pq(rec_lis[0]).text().split("\xa0\xa0")[0]

    for rec_li in rec_lis[1:]:
        subcat_name = pq(rec_li).text().split("\xa0\xa0")[0]
        cat[subcat_name] = cat_name

In [36]:
# Attach the parent category to every record by looking its "category" up in `cat`.
for record in data.values():
    record["sup_category"] = cat[record["category"]]