In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm, tqdm_notebook

In [2]:
# Base URL of a recipe page; the scraping cells append each recipe's slug to it.
root = 'https://www.panelinha.com.br/receita/'

In [3]:
# Web scraping of main dishes: collect the recipe page link of every result
# returned by the search API for the "Pratos principais" category.

main_links = []

# The bar has no fixed total because the loop stops at the first empty page;
# it simply counts the pages that contained results.
with tqdm_notebook(desc='Main dish pages') as progress:
    # At most 99 pages (12 results each) are requested.
    for pagenumber in range(1, 100):
        url_main = 'https://panelinha-api-server-prod.herokuapp.com/v1/search?pageType=receita&category=Pratos%20principais&pageSize=12&pageNumber=' + str(pagenumber)

        # A timeout keeps a stalled connection from hanging the cell forever,
        # and raise_for_status reports HTTP errors instead of parsing an error body.
        response = requests.get(url_main, timeout=30)
        response.raise_for_status()

        results = response.json()['data']['results']
        if not results:
            # An empty page is treated as the end of the listing, so the
            # remaining page numbers are not requested.
            break

        main_links.extend(root + recipe['slug'] for recipe in results)
        progress.update(1)


HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [4]:
# Web scraping of side dishes: collect the recipe page link of every result
# returned by the search API for the "Acompanhamentos" category, skipping
# links that were already collected as main dishes.

side_links = []

# Set copy of the main-dish links so each membership check is constant time
# instead of a scan over the whole list.
known_links = set(main_links)

# The bar has no fixed total because the loop stops at the first empty page;
# it simply counts the pages that contained results.
with tqdm_notebook(desc='Side dish pages') as progress:
    # At most 99 pages (12 results each) are requested.
    for pagenumber in range(1, 100):
        url_side = 'https://panelinha-api-server-prod.herokuapp.com/v1/search?category=Acompanhamentos&pageType=receita&pageSize=12&pageNumber=' + str(pagenumber)

        # A timeout keeps a stalled connection from hanging the cell forever,
        # and raise_for_status reports HTTP errors instead of parsing an error body.
        response = requests.get(url_side, timeout=30)
        response.raise_for_status()

        results = response.json()['data']['results']
        if not results:
            # An empty page is treated as the end of the listing, so the
            # remaining page numbers are not requested.
            break

        for recipe in results:
            link = root + recipe['slug']
            if link not in known_links:
                side_links.append(link)
        progress.update(1)


HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [5]:
# Web scraping of desserts: collect the recipe page link of every result
# returned by the search API for the "Sobremesas" category, skipping links
# that were already collected as main or side dishes.

dessert_links = []

# Set of the links collected so far, so each membership check is constant
# time instead of a scan over two lists.
known_links = set(main_links).union(side_links)

# The bar has no fixed total because the loop stops at the first empty page;
# it simply counts the pages that contained results.
with tqdm_notebook(desc='Dessert pages') as progress:
    # At most 99 pages (12 results each) are requested.
    for pagenumber in range(1, 100):
        url_dessert = 'https://panelinha-api-server-prod.herokuapp.com/v1/search?pageType=receita&category=Sobremesas&pageSize=12&pageNumber=' + str(pagenumber)

        # A timeout keeps a stalled connection from hanging the cell forever,
        # and raise_for_status reports HTTP errors instead of parsing an error body.
        response = requests.get(url_dessert, timeout=30)
        response.raise_for_status()

        results = response.json()['data']['results']
        if not results:
            # An empty page is treated as the end of the listing, so the
            # remaining page numbers are not requested.
            break

        for recipe in results:
            link = root + recipe['slug']
            if link not in known_links:
                dessert_links.append(link)
        progress.update(1)


HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [6]:
# Web scraping of drinks: collect the recipe page link of every result
# returned by the search API for the "Bebidas" category, skipping links that
# were already collected as main dishes, side dishes or desserts.

drink_links = []

# Set of the links collected so far, so each membership check is constant
# time instead of a scan over three lists.
known_links = set(main_links).union(side_links, dessert_links)

# The bar has no fixed total because the loop stops at the first empty page;
# it simply counts the pages that contained results.
with tqdm_notebook(desc='Drink pages') as progress:
    # At most 99 pages (12 results each) are requested.
    for pagenumber in range(1, 100):
        url_drink = 'https://panelinha-api-server-prod.herokuapp.com/v1/search?pageType=receita&category=Bebidas&pageSize=12&pageNumber=' + str(pagenumber)

        # A timeout keeps a stalled connection from hanging the cell forever,
        # and raise_for_status reports HTTP errors instead of parsing an error body.
        response = requests.get(url_drink, timeout=30)
        response.raise_for_status()

        results = response.json()['data']['results']
        if not results:
            # An empty page is treated as the end of the listing, so the
            # remaining page numbers are not requested.
            break

        for recipe in results:
            link = root + recipe['slug']
            if link not in known_links:
                drink_links.append(link)
        progress.update(1)

HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [7]:
def df_builder(link_list, typ):
    """Build a two-column dataframe that labels every link with one category.

    Parameters
    ----------
    link_list : list
        Values for the 'links' column, one row per element.
    typ : object
        Label repeated on every row of the 'type' column.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns 'links' and 'type', in that order.
    """
    frame = pd.DataFrame({'links': link_list})
    # One copy of the label per row, attached as the second column.
    return frame.assign(type=[typ] * len(frame))

# Build one labelled dataframe per recipe category
df_main, df_side, df_dessert, df_drink = (
    df_builder(links, label)
    for links, label in (
        (main_links, 'main'),
        (side_links, 'side'),
        (dessert_links, 'dessert'),
        (drink_links, 'drink'),
    )
)

# Stack the four category frames into one table with a fresh 0..n-1 index
recipes = pd.concat(
    [df_main, df_side, df_dessert, df_drink],
    ignore_index=True,
)

In [8]:
# Save the combined link table to CSV, leaving out the index column.
recipes.to_csv('recipes_links.csv', index=False)

In [3]:
# Load the link table back from the 'recipes_links.csv' file written by the previous cell.
recipes=pd.read_csv('recipes_links.csv')

In [4]:
# Preview the first rows of the link table.
recipes.head()

Unnamed: 0,links,type
0,https://www.panelinha.com.br/receita/rosbife,main
1,https://www.panelinha.com.br/receita/lombo-ass...,main
2,https://www.panelinha.com.br/receita/polenta-c...,main
3,https://www.panelinha.com.br/receita/macarrao-...,main
4,https://www.panelinha.com.br/receita/arroz-fri...,main


In [5]:
# Scrape every recipe page listed in `recipes.links` and add the extracted
# fields to `recipes` as the columns name, portions, prep_time, ingredients,
# directions and images.

names_list = []
portions_list = []
prep_time_list = []
ingredients_list = []
directions_list = []
images_list = []

# Iterating the column directly visits the links in row order without
# assuming the index labels are 0..n-1.
for url in tqdm_notebook(recipes.links):
    # A timeout keeps a stalled connection from hanging the cell forever,
    # and raise_for_status reports HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "lxml")

    try:
        # Each selector is queried once and the result reused for both fields.
        info_spans = soup.find_all('span', {'class': 'dd'})
        text_blocks = soup.find_all('div', {'class': 'editor ng-star-inserted'})

        name = soup.find('h1').get_text()
        # Portions are read from the third 'dd' span, prep time from the second.
        portions = info_spans[2].get_text().strip()
        prep_time = info_spans[1].get_text()
        # The first 'editor' block is read as ingredients, the second as directions.
        ingredients = text_blocks[0].get_text(separator=' ')
        directions = text_blocks[1].get_text(separator=' ').replace('\n', ' ').replace('\xa0', ' ')
        images = soup.find('img', {'alt': 'Imagem da receita', 'class': 'expanded'})['src']
    except (AttributeError, IndexError, KeyError, TypeError) as exc:
        # A missing element would otherwise fail without saying which page it was.
        raise RuntimeError('Unexpected page layout, could not parse ' + url) from exc

    # Append only after all six fields were extracted, so the lists always
    # have the same length.
    names_list.append(name)
    portions_list.append(portions)
    prep_time_list.append(prep_time)
    ingredients_list.append(ingredients)
    directions_list.append(directions)
    images_list.append(images)

recipes['name'] = names_list
recipes['portions'] = portions_list
recipes['prep_time'] = prep_time_list
recipes['ingredients'] = ingredients_list
recipes['directions'] = directions_list
recipes['images'] = images_list

HBox(children=(IntProgress(value=0, max=1537), HTML(value='')))




In [6]:
# Preview the first rows now that the scraped recipe fields have been added.
recipes.head()

Unnamed: 0,links,type,name,portions,prep_time,ingredients,directions,images
0,https://www.panelinha.com.br/receita/rosbife,main,Rosbife,Até 4 porções,Até 1h,1 peça de filé mignon para rosbife (cerca de 7...,Preaqueça o forno a 220 ºC (temperatura alta...,https://cdn.panelinha.com.br/receita/156950588...
1,https://www.panelinha.com.br/receita/lombo-ass...,main,Lombo assado com tomates e ervas,8 porções,Mais de 2h,1 kg de lombo de porco em peça com capa de gor...,Seque bem a peça de lombo com papel-toalha o...,https://cdn.panelinha.com.br/receita/156590045...
2,https://www.panelinha.com.br/receita/polenta-c...,main,Polenta caprese com refogado de legumes,Até 2 porções,Pá-Pum,1 abobrinha 1 alho-poró 2 tomates maduros ½ xí...,Lave e seque o alho poró. Corte o talo em ro...,https://cdn.panelinha.com.br/receita/156589878...
3,https://www.panelinha.com.br/receita/macarrao-...,main,Macarrão à carbonara,2 porções,Pá-Pum,160 g de macarrão bavette (ou outro massa long...,Leve uma panela média com 2 litros de água a...,https://cdn.panelinha.com.br/receita/156589381...
4,https://www.panelinha.com.br/receita/arroz-fri...,main,Arroz frito com omelete japonesa,Até 2 porções,Pá-Pum,2 ovos 2 xícaras (chá) de arroz cozido 1 ce...,Descasque e passe a cenoura pela parte gross...,https://cdn.panelinha.com.br/receita/156589304...


In [7]:
# Save the table with the scraped recipe details to CSV, leaving out the index column.
recipes.to_csv('recipes_full_info.csv', index=False)