<a href="https://colab.research.google.com/github/jddfrance/ifoodextract/blob/main/ifood_extract_0.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
#Objetivo:
#Obter os dados do ifood
#Executar limpeza e formatação dos dados
#Analisar promoções 
#Obter estatísticas de preço
#Dividir por setor atendido
#Produzir gráficos com as informações obtidas

In [2]:
import requests
import json
import urllib.request 
from urllib.request import urlopen, Request 
from urllib.error import URLError, HTTPError
import requests
from requests import Session
from bs4 import BeautifulSoup 
import re 
from zipfile import ZipFile
import ssl
import pandas as pd

In [9]:
def trata_html(input):
    """Normalise raw HTML text before it is handed to BeautifulSoup.

    Every run of whitespace is collapsed to a single space (leading and
    trailing whitespace disappears), and the single space left between two
    adjacent tags (``> <``) is then removed.

    Args:
        input: HTML document as ``str``. The name shadows the built-in
            ``input``; it is kept so existing callers keep working.

    Returns:
        The compacted HTML string.
    """
    tokens = input.split()
    single_spaced = " ".join(tokens)
    return single_spaced.replace('> <', '><')
def get_prices(url, id):
  """Download a restaurant page and summarise its menu prices.

  The page's embedded ``__NEXT_DATA__`` JSON is parsed, the items of every
  menu group are flattened into one table (tagged with the group name in a
  ``Grupo`` column) and ``DataFrame.describe()`` is taken for the
  ``unitMinPrice`` column.

  Args:
    url: Address of the restaurant page (coerced to ``str``).
    id: Restaurant identifier; used as the index of the result and in the
      log messages. The name shadows the built-in ``id`` but is kept for
      backward compatibility.

  Returns:
    A one-row DataFrame indexed by ``id`` holding the ``describe()``
    statistics of ``unitMinPrice``, or ``None`` when the request fails, the
    page carries no usable menu data, or the menu has no items.
  """
  url = str(url)
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',}
  try:
    # The context manager closes the connection as soon as the body is read.
    with urlopen(Request(url, headers = headers)) as response:
      html = response.read()
  except HTTPError:
    print(f'HTTPError on restaurant id: {id}')
    return None
  except URLError:
    print(f'URLError on restaurant id: {id}')
    return None

  html = trata_html(html.decode('utf-8'))
  soup = BeautifulSoup(html, 'html.parser')

  # A page without the Next.js data script (or with an unexpected layout)
  # must not abort the whole scrape: report it and skip the restaurant.
  script = soup.find('script', {'id': "__NEXT_DATA__"})
  if script is None:
    print(f'Missing __NEXT_DATA__ on restaurant id: {id}')
    return None
  try:
    jsondata = json.loads(script.getText())
    pedidos = pd.json_normalize(jsondata)['props.initialState.restaurant.menu'][0]
  except (ValueError, KeyError):
    print(f'Unreadable menu data on restaurant id: {id}')
    return None
  if not isinstance(pedidos, (list, dict)):
    return None

  subitem = pd.json_normalize(pedidos)
  itemlist = []
  for _, row in subitem.iterrows():
    itens = row.get('itens')
    if not isinstance(itens, (list, dict)):
      continue  # group without an item list
    df = pd.json_normalize(itens)
    if df.empty:
      continue  # empty groups add no rows and would break describe()
    df['Grupo'] = row.get('name')
    itemlist.append(df)
  if not itemlist:
    return None

  stats = pd.concat(itemlist).describe()
  if 'unitMinPrice' not in stats.columns:
    print(f'No unitMinPrice on restaurant id: {id}')
    return None
  pricestats = stats['unitMinPrice'].to_frame().transpose()
  pricestats['id'] = id
  return pricestats.set_index('id')


def get_merchant(restaurant_id):
  """Fetch the "extra" record of one merchant and flatten it to a single row.

  The JSON returned by ``/v1/merchants/{restaurant_id}/extra`` is normalised
  into a one-row DataFrame. A ``link`` to the restaurant page is built from
  the address fields, every column from ``locale`` up to the end of the
  original payload is dropped together with a fixed list of unused fields,
  ``categories`` is reduced to a list of descriptions, and the shifts are
  pivoted into ``<DAY>_start`` / ``<DAY>_duration`` columns.

  Args:
    restaurant_id: Merchant identifier used in the request URL and the link.

  Returns:
    A one-row DataFrame, or ``None`` when the request fails or the merchant
    has no shifts.
  """
  url = f"https://marketplace.ifood.com.br/v1/merchants/{restaurant_id}/extra"
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36', }
  try:
    # Same error policy as get_prices: one unreachable merchant must not
    # abort the whole scrape. The context manager closes the connection.
    with urlopen(Request(url, headers = headers)) as response:
      responseread = response.read()
  except HTTPError:
    print(f'HTTPError on restaurant id: {restaurant_id}')
    return None
  except URLError:
    print(f'URLError on restaurant id: {restaurant_id}')
    return None

  merchant = pd.json_normalize(json.loads(responseread))

  # Keep the raw shift list now: the column clean-up below may remove it.
  shift_records = merchant['shifts'][0] if 'shifts' in merchant.columns else None
  if not isinstance(shift_records, list) or not shift_records:
    return None

  # Spaces are replaced in every slug part so the link is a valid URL.
  name = merchant['name'][0].replace(" ", "-")
  city = merchant['address.city'][0].replace(" ", "-")
  state = merchant['address.state'][0]
  district = merchant['address.district'][0].replace(" ", "-")

  # 'drop_target' is a sentinel marking the end of the original payload;
  # 'link' is appended after it and therefore survives the drop below.
  merchant['drop_target'] = None
  merchant['link'] = f"https://www.ifood.com.br/delivery/{city}-{state}/{name}-{district}/{restaurant_id}"
  if 'logo' in merchant:
    merchant['logo'] = f"https://static-images.ifood.com.br/image/upload/t_thumbnail/logosgde/{pd.json_normalize(merchant['resources'][0]).fileName[0]}.jpg"

  columns = list(merchant.columns)
  last = columns.index('drop_target')
  first = columns.index('locale') if 'locale' in columns else last
  merchant = merchant.drop(columns = columns[first:last + 1])

  dellist = ['shortId', 'companyCode',
              'resources', 'enabled',
              'tags', 'phoneIf',
              'groups', 'features',
              'priceRange', 'name',
              'minimumOrderValue', 'deliveryTime',
              'takeoutTime']
  merchant = merchant.drop(columns = [item for item in dellist if item in merchant.columns])

  # Best effort: any failure while reading the categories leaves None.
  try:
    categories = pd.json_normalize(merchant['categories'][0]).description.to_list()
    merchant.at[0, 'categories'] = categories
  except Exception:
    merchant.at[0, 'categories'] = None

  for item in ['id', 'description', 'shifts', 'categories', 'userRatingCount']:
    if item not in merchant.columns:
      merchant[item] = None

  shiftog = pd.json_normalize(shift_records).set_index('dayOfWeek')
  # Days without a shift get None for the start and 0 for the duration.
  start = _shift_columns(shiftog, 'start', None)
  duration = _shift_columns(shiftog, 'duration', 0)

  merchant = pd.concat([merchant, start, duration], axis = 1)
  return merchant.drop(columns = ['shifts', 'description'])


def _shift_columns(shiftog, field, fill):
  """Pivot one shift field into a single row of ``<DAY>_<field>`` columns.

  NOTE(review): several shifts on the same day are combined with ``sum()``;
  if ``start`` is textual that concatenates the values - confirm this is
  the intended aggregation.
  """
  wide = pd.DataFrame(shiftog.loc[:, field]
                      .add_suffix(f'_{field}')
                      .to_frame().groupby('dayOfWeek')[field]
                      .sum()).transpose().reset_index(drop = True)
  for day in ['FRIDAY', 'MONDAY', 'SATURDAY', 'SUNDAY',
              'THURSDAY', 'TUESDAY', 'WEDNESDAY']:
    column = f'{day}_{field}'
    if column not in wide.columns:
      wide[column] = fill
  return wide
   

def get_rest(latitude, longitude, size = 300, max_pages = 10):
    """List the restaurants around a coordinate from the merchants endpoint.

    Pages are requested one after another until ``max_pages`` is reached or
    a page comes back without merchants. For every page the nested
    ``mainCategory`` and ``deliveryFee`` objects are flattened (category
    name becomes ``type``, fee value becomes ``deliveryValue``), a fixed
    list of unused columns is dropped and ``id`` becomes the index.

    Args:
        latitude: Latitude interpolated into the request URL.
        longitude: Longitude interpolated into the request URL.
        size: Page size sent to the API.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        DataFrame indexed by restaurant id, without the rows whose
        ``userRating`` is exactly 5 or 0. An empty DataFrame is returned
        when no page contains merchants.
    """
    # NOTE(review): this disables TLS certificate verification for every
    # later urllib HTTPS request in the process. Kept because the other
    # fetchers may rely on it, but it should be reviewed.
    ssl._create_default_https_context = ssl._create_unverified_context

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36', }
    dellist = ['mainCategory', 'paymentCodes',
               'contextSetup', 'deliveryFee',
               'resources', 'currency',
               'code', 'merchantChain',
               'available', 'slug',
               'features',]

    appendrest = []
    for page in range(max_pages):
        url = f"https://marketplace.ifood.com.br/v1/merchants?latitude={latitude}&longitude={longitude}&channel=IFOOD&size={size}&page={page}"
        with urlopen(Request(url, headers = headers)) as response:
            ifoodjson = json.loads(response.read())

        # Author's note on the response keys: "total" is the number of
        # restaurants in the area, "merchants" is the list of restaurant
        # records and "facets" is a summary of them.
        merchants = ifoodjson.get('merchants')
        if not merchants:
            # Past the last page: an empty list has no 'mainCategory'
            # column, so stop instead of failing on it.
            break

        restaurantes = pd.DataFrame.from_dict(merchants)
        maincat = (pd.json_normalize(restaurantes['mainCategory'])
                   .rename(columns = {'name' : 'type'}))
        defee = pd.json_normalize(restaurantes['deliveryFee'])
        restaurantes = (pd.concat([restaurantes, maincat, defee['value']], axis=1)
                        .rename(columns = {'value' : 'deliveryValue'}))
        restaurantes = restaurantes.drop(
            columns = [item for item in dellist if item in restaurantes.columns])
        appendrest.append(restaurantes.set_index('id'))

    if not appendrest:
        return pd.DataFrame()

    restaurantes = pd.concat(appendrest)
    # Ratings of exactly 5 or 0 are excluded, as in the original analysis.
    return restaurantes[(restaurantes['userRating'] != 5) &
                        (restaurantes['userRating'] != 0)]


def get_ifood(latitude, longitude, size = 300):
  """Build the full restaurant table for a coordinate.

  Combines, column-wise on the restaurant id, the listing returned by
  ``get_rest``, the per-merchant details from ``get_merchant`` and the
  price statistics from ``get_prices``.

  Args:
    latitude: Latitude forwarded to ``get_rest``.
    longitude: Longitude forwarded to ``get_rest``.
    size: Page size forwarded to ``get_rest``.

  Returns:
    The combined DataFrame. When no merchant details (or no price
    statistics) could be fetched, the table is returned without those
    columns instead of raising.
  """
  restaurantes = get_rest(latitude, longitude, size)

  # Merchants that could not be fetched or have no shifts come back as None.
  mercupdate = [get_merchant(item) for item in restaurantes.index]
  mercupdate = [merchant for merchant in mercupdate if merchant is not None]
  if not mercupdate:
    # pd.concat raises on an empty list; nothing more can be joined.
    return restaurantes
  merchant = pd.concat(mercupdate).set_index('id')
  restaurantes = pd.concat([restaurantes, merchant], axis = 1)

  prices = []
  for index, link in restaurantes['link'].items():
    if pd.isnull(link):
      continue
    # Non-ASCII characters are stripped from the link before the request.
    pricestats = get_prices(link.encode('ascii', 'ignore').decode("utf-8"), index)
    if pricestats is not None:
      prices.append(pricestats)

  if prices:
    restaurantes = pd.concat([restaurantes, pd.concat(prices)], axis = 1)

  return restaurantes

 


HTTPError on restaurant id: 94afa23e-2200-4c9a-a769-c819850744e4


In [None]:
# Search centre, kept as strings because the values are interpolated
# directly into the request URL.
latitude, longitude = '-1.3735', '-48.4496'

# Listing + merchant details + price statistics for that point.
restlist = get_ifood(latitude = latitude, longitude = longitude)

In [13]:
# Export the complete result table to an Excel workbook (relative path).
restlist.to_excel('restlist.xlsx')

In [11]:
# Export only the rows whose category is 'Hambúrguer' or 'Lanches'.
restlist[restlist['type'].isin(['Hambúrguer', 'Lanches'])].to_excel('borgar.xlsx')