# ###############################################################

# A general scraper 
# This code scrapes restaurant data from n search-result pages; here n is 5.
# Each page lists 15 entries; entries that are not restaurants are skipped.
# For this particular search, the location is Dubai Marina, which can be changed by editing search_link.

# ###############################################################

# In [1]:
# Importing Libraries 
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from itertools import chain
import time

# ---- Configuration ----

# Root of the site; the relative restaurant hrefs are appended to this
base_url = "https://www.talabat.com"

# Search-results listing for delivery to one area (here Dubai Marina, id 1272).
# The page number is appended to this string; point it at another area to
# scrape a different delivery location.
search_link = "https://www.talabat.com/uae/restaurants/1272/dubai-marina?page="

# Number of search-result pages to walk (one page lists 15 restaurants)
n = 5

# ---- Result containers ----

# Restaurant hyperlinks gathered from the search-result pages
links = []

# One entry per scraped restaurant in each of the lists below.
# Each menu_items entry is itself a list with one
# [name, description, price, image] row per menu item, e.g.
# [[name, description, price, image], [name, description, price, image], ...]
restaurant_name, restaurant_logo = [], []
latitude, longitude = [], []
cuisine_tags, menu_items = [], []

# In [2]:
# Collect restaurant hyperlinks from the first n search-result pages and
# report how long the crawl took
start_time = time.time()

# Hrefs are gathered per page in a local list and flattened once at the end,
# so re-running this cell never re-flattens links that are already flat
page_links = []

# This loop goes page by page to get hyperlinks for different restaurants
for i in range(0, n):

    # Requesting the webpage; the timeout stops a stalled connection from
    # hanging the run forever (requests waits indefinitely by default)
    r = requests.get(search_link + str(i), timeout=30)

    # An error page has no listing to slice, so skip it instead of
    # harvesting unrelated anchors from it
    if not r.ok:
        print('Skipping page', i, '- HTTP status', r.status_code)
        continue

    soup = BeautifulSoup(r.content, 'html.parser')

    # href of every anchor in document order (None for an <a> without href)
    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]

    # Positions 22:37 hold the links for the 15 listings on a page
    # NOTE(review): these offsets are tied to the page layout - re-check them
    # if the site markup changes
    page_links.append(hrefs[22:37])

# Flatten to 1d, drop anchors that had no href (they cannot be requested),
# and remove repeats while keeping first-seen order so a restaurant linked
# from more than one page is scraped only once
links = [href for href in dict.fromkeys(chain.from_iterable(page_links)) if href]

# time it took to go through to n pages
print(time.time() - start_time, 'seconds')

# Output: 7.110396862030029 seconds


# In [3]:
# Scrape every restaurant page in links into the result lists, build the
# table dict from them, and report how long the scrape took
start_time = time.time()

# Loop for parsing required data from restaurant pages in links
for link in links:

    # Requesting individual restaurant webpages and loading soup; the timeout
    # keeps one stalled connection from hanging the whole run
    r = requests.get(base_url + link, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')

    # The JSON-LD script carries the listing details; find() returns None
    # when the page has no such tag, so skip the page rather than crash
    scripts = soup.find('script', type='application/ld+json')
    if scripts is None:
        print('Skipping', link, '- no application/ld+json script found')
        continue

    # Loading to json for easy extraction
    data = json.loads(scripts.text)

    # To check if the link is not a restaurant
    if not isinstance(data, dict) or data.get('@type') != 'Restaurant':
        continue

    # The menu lives in a separate JSON script on the same page
    data_menu = soup.find('script', type='application/json')
    if data_menu is None:
        print('Skipping', link, '- no application/json script found')
        continue

    # Loading to json for easy extraction
    data_menu = json.loads(data_menu.text)

    try:
        menu_list = data_menu['props']['pageProps']['initialMenuState']['menuData']['items']
    except (KeyError, TypeError):
        print('Skipping', link, '- menu data not found in page JSON')
        continue

    # One [name, description, price, image] row per menu item; a field that
    # is absent from the page is recorded as None instead of aborting the run
    menu_rows = [
        [item.get('name'), item.get('description'),
         item.get('price'), item.get('image')]
        for item in menu_list
    ]

    geo = data.get('geo') or {}

    # Append to every column only after all parsing succeeded, so the six
    # lists always stay the same length (pd.DataFrame needs equal lengths)
    restaurant_name.append(data.get('name'))
    restaurant_logo.append(data.get('image'))
    latitude.append(geo.get('latitude'))
    longitude.append(geo.get('longitude'))
    cuisine_tags.append(data.get('servesCuisine'))
    menu_items.append(menu_rows)

# Column name -> list of values, one entry per scraped restaurant
table = {

  "restaurant_name": restaurant_name,
  "restaurant_logo": restaurant_logo,
  "latitude": latitude,
  "longitude": longitude,
  "cuisine_tags": cuisine_tags,
  "menu_items": menu_items

}

# time it took
print(time.time() - start_time, 'seconds')


# Output: 119.00781798362732 seconds


# In [4]:
# Build a DataFrame from the collected columns in table
df = pd.DataFrame(data=table)

# Write the DataFrame out as a CSV file
df.to_csv(path_or_buf='many_many_restaurants.csv')