In [1]:
import nltk
# Download the 'punkt' package via NLTK; the call's return value is the cell output.
punkt_package = 'punkt'
nltk.download(punkt_package)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\janan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

                                                  Data Scraping Overview
                                                  
This project involves scraping restaurant data from Deliveroo using Selenium and BeautifulSoup. The goal is to collect data on various restaurants in Nottingham, specifically in the Sneinton Bakersfield area, filtered by different cuisines.

Tools and Libraries Used:

1. Pandas: For data manipulation and creating a DataFrame to store the restaurant data.
2. BeautifulSoup: A Python library for parsing HTML and XML documents. It is used here to extract relevant information from the webpage.
3. Selenium: A web automation tool that allows for dynamic interaction with web pages. It is used for opening the webpage, handling cookies, and scrolling to load all restaurant listings.
4. Regular Expressions (re): Employed for searching and extracting specific patterns from the HTML content, such as ratings and delivery costs.


STEP 1: Setup:

* Defined the target postcode and a list of cuisines to filter the restaurants.
* Initialized the Selenium WebDriver to control the Chrome browser.

STEP 2: Scraping Logic:

* Iterated through the predefined list of cuisines, constructing the appropriate URL for each cuisine on Deliveroo.
* Opened the webpage for each cuisine and accepted cookies if prompted.
* Implemented a scrolling mechanism to ensure all restaurant listings were loaded dynamically.

STEP 3: Data Extraction:

Utilized BeautifulSoup to parse the loaded HTML content and extract relevant data such as:
* Restaurant name
* Distance to the delivery location
* Delivery cost
* Restaurant rating
* Hyperlink to the restaurant's menu
Each field is extracted by a dedicated helper function (restaurant_name, find_distance_data, find_delivery_cost, find_rating and find_hyperlink) that locates the relevant tag and returns its value.

STEP 4: Data Storage:

* Compiled all extracted data into a list of dictionaries, with each dictionary representing a restaurant.
* Converted the list into a Pandas DataFrame for easy analysis and manipulation.
  
STEP 5: Output:

The final DataFrame, restaurant_df1, contains one row per scraped restaurant listing, with its rank, name, distance, delivery cost, menu link, rating, and cuisine type.

In [None]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time

# Postcode and cuisine list
postcode = 'NG2 4AP'
# Each entry is substituted into the `cuisine=` query parameter of the listing URL.
# The list previously held 'all', 'day', 'breakfast' as three separate entries plus a
# second 'breakfast', which scraped the breakfast page twice (duplicate rows) and
# requested two filters that look like fragments of "all day breakfast".
# NOTE(review): 'all+day+breakfast' follows the '+' convention used for multi-word
# filters elsewhere in this notebook - confirm the exact slug against the site.
cuisines = [
    'all+day+breakfast', 'american', 'asian', 'breakfast',
    'british', 'brunch', 'café', 'Caribbean', 'Chinese',
    'drinks', 'grocery', 'healthy', 'indian', 'italian',
    'jamaican', 'japanese', 'mexican', 'shopping', 'thai',
    'turkish'
]

# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
chrome_driver_path = r"C:\Users\janan\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
initial_load_time = 2       # seconds to sleep after driver.get() before interacting
scroll_pause_time = 0.0001  # seconds to sleep between individual scroll steps
bottom_settle_time = 1      # seconds to wait at the apparent bottom before re-measuring

# Initialize WebDriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

# Create an empty list to store all restaurant data
all_restaurants_list = []

# Once the consent button has been clicked, later pages skip the lookup instead of
# spending the full 10 s wait timeout on every remaining cuisine.
# NOTE(review): assumes the consent choice persists for the browser session.
cookies_accepted = False

# Loop through each cuisine
for cuisine in cuisines:
    url = f'https://deliveroo.co.uk/restaurants/nottingham/sneinton-bakersfield?postcode={postcode}&cuisine={cuisine}&collection=all-restaurants'
    
    # Open the webpage
    driver.get(url)
    time.sleep(initial_load_time)

    # Get screen height for scrolling (used as the scroll step size)
    screen_height = driver.execute_script('return window.screen.height;')

    # Accept cookies (best effort: a failure is reported and retried on the next page)
    if not cookies_accepted:
        try:
            wait = WebDriverWait(driver, 10)
            cookie_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[text()="Accept All"]')))
            # Click through JavaScript rather than a native click.
            driver.execute_script("arguments[0].click();", cookie_button)
            cookies_accepted = True
        except Exception as e:
            print(f"Could not find or click the accept cookies button: {e}")

    # Scroll down the page one screen height at a time
    i = 1
    while True:
        driver.execute_script(f"window.scrollTo(0, {screen_height} * {i});")
        i += 1
        time.sleep(scroll_pause_time)

        scroll_height = driver.execute_script("return document.body.scrollHeight")
        if screen_height * i > scroll_height:
            # The per-step pause is too short for newly requested cards to be appended,
            # so wait briefly and re-measure before deciding this really is the bottom.
            time.sleep(bottom_settle_time)
            scroll_height = driver.execute_script("return document.body.scrollHeight")
            if screen_height * i > scroll_height:
                print("Reached the bottom of the page.")
                break

    # Create a BeautifulSoup object from the fully scrolled page
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Store class names and the serialized HTML of every matching element
    a_tag_class = 'HomeFeedUICard-3e299003014c14f9'
    links_html = [str(x) for x in soup.find_all(class_=a_tag_class)]
    
    a_tag_class_restaurant = 'HomeFeedUICard-619f89858093a1f4'
    restaurants_html = [str(x) for x in soup.find_all(class_=a_tag_class_restaurant)]

    # Define data extraction functions
    def find_rating(links):
        """Return the rating text found in the first link's aria-label.

        Looks up the first <a> element in `links`, reads its `aria-label`
        attribute and extracts the number that follows the word "Rated".
        Returns the string "No rating found" when there is no link, no
        aria-label attribute, or no "Rated <number>" text in the label.
        """
        anchor = links.find('a')
        if not anchor or 'aria-label' not in anchor.attrs:
            return "No rating found"

        rated = re.search(r'Rated\s+([\d.]+)', anchor['aria-label'])
        return rated.group(1) if rated else "No rating found"

    def restaurant_name(restaurant):
        """Return the stripped text of the first <p> element in `restaurant`.

        Returns the string "No name found" when the card contains no <p>
        element, so a single malformed card cannot abort the whole scrape
        with an AttributeError.
        """
        name_tag = restaurant.find('p')
        if name_tag is None:
            return "No name found"
        return name_tag.text.strip()

    def find_distance_data(restaurant):
        """Return the distance text (e.g. "0.8 mi") from the card's first <span>.

        Searches the stripped text of the first <span> element for a number
        followed by "mi". Returns the string "No distance found" when the
        card has no <span> or its text contains no such distance.
        """
        info_tag = restaurant.find('span')
        if info_tag is None:
            # A card without a <span> previously raised AttributeError.
            return "No distance found"

        info_text = info_tag.text.strip()
        # (?!n) stops a duration such as "25 min" from being read as "25 mi".
        distance_match = re.search(r'(\d+\.?\d*)\s*mi(?!n)', info_text)
        return distance_match.group(0) if distance_match else "No distance found"

    def find_delivery_cost(restaurant):
        """Return the delivery cost text (e.g. "£2.49 delivery") from the card's first <span>.

        Searches the stripped text of the first <span> element for a "£"
        amount followed by the word "delivery". Returns "Free delivery" when
        the span text has no such amount, and "No delivery cost found" when
        the card has no <span> at all.
        """
        info_tag = restaurant.find('span')
        if info_tag is None:
            # A card without a <span> previously raised AttributeError; report the
            # missing data instead of claiming the delivery is free.
            return "No delivery cost found"

        info_text = info_tag.text.strip()
        cost_match = re.search(r'£(\d+\.?\d*)\s*delivery', info_text)
        return cost_match.group(0) if cost_match else "Free delivery"

    def find_hyperlink(restaurant):
        """Return the absolute URL built from the first link's href.

        Prefixes the `href` of the first <a> element in `restaurant` with
        the site root. Returns the string "No link found" when there is no
        <a> element or it carries no href attribute.
        """
        anchor = restaurant.find('a')
        if not anchor or 'href' not in anchor.attrs:
            return "No link found"
        return 'http://deliveroo.co.uk' + anchor['href']

    # Extract restaurant data for the current cuisine
    for restaurant_html, links_html in zip(restaurants_html, links_html):
        restaurant = BeautifulSoup(restaurant_html, 'html.parser')
        links = BeautifulSoup(links_html, 'html.parser')

        restaurant_dictionary = {
            'rank': len(all_restaurants_list) + 1,
            'restaurant_name': restaurant_name(restaurant),
            'distance': find_distance_data(restaurant),
            'delivery_cost': find_delivery_cost(restaurant),
            'link': find_hyperlink(links),
            'rating': find_rating(links),
            'cuisine': cuisine  # Add the current cuisine
        }
        
        all_restaurants_list.append(restaurant_dictionary)

# Close the WebDriver
driver.quit()

# Create a DataFrame from the aggregated data
restaurant_df1 = pd.DataFrame(all_restaurants_list)
restaurant_df1


In [20]:
# Output file for the cuisine-based scrape
csv_filename = 'restaurants1_data.csv'

# Rebuild the DataFrame from the collected records and write it out without the index column
restaurant_df1 = pd.DataFrame(all_restaurants_list)
restaurant_df1.to_csv(path_or_buf=csv_filename, index=False)

print(f"Data saved to {csv_filename}")


Data saved to restaurants1_data.csv


                                                    2. WEB SCRAPING BASED ON DISHES SERVED

1. Navigating the Web Page: For each dish, we construct a URL to access the corresponding restaurant listings on Deliveroo. The script opens the URL and waits for the page to load.

2. Handling Cookies: Since many websites require users to accept cookies, the script looks for the cookie consent button and clicks it if found.

3. Scrolling for Data: To ensure we capture all restaurant listings on the page, the script scrolls down repeatedly until it reaches the bottom of the page. This is necessary because the website loads additional content dynamically as you scroll.

4. Data Extraction: Using BeautifulSoup, we parse the page's HTML to find the relevant restaurant information. We define several functions to extract:

* Restaurant Name: The name of the restaurant.
* Rating: The rating of the restaurant, extracted from the aria-label attribute of links.
* Distance: The distance from the postcode, parsed from a specific span tag.
* Delivery Cost: The cost of delivery, also extracted from a span tag.
* Link: The URL linking to the restaurant's page on Deliveroo.
* Storing Data: For each restaurant, we create a dictionary containing all extracted information and append it to a list. After processing all dishes, we close the WebDriver.

5. Creating a DataFrame: Finally, we convert the list of dictionaries into a Pandas DataFrame, providing a structured format for further analysis.

Output:

The output of this project is a DataFrame containing detailed information about various restaurants, including their names, ratings, delivery costs, distances, and associated dish types. This data can be used for further analysis, such as comparing ratings against delivery costs or visualizing the distribution of distances.

In [None]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time

# Postcode and dish list
postcode = 'NG2 4AP'
# Each entry is substituted into the `dish=` query parameter of the listing URL.
# The list is named `dishes` so the loop variable `dish` no longer overwrites it.
dishes = [
    'alcohol', 'burgers', 'cakes', 'chicken', 'coffee', 'curry', 
    'dessert', 'fish+and+chips', 'fried+chicken', 'fries', 
    'kebab', 'milkshakes', 'noodles', 'pizza', 'salads', 
    'sandwiches', 'seafood', 'wraps'
]
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
chrome_driver_path = r"C:\Users\janan\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
initial_load_time = 2       # seconds to sleep after driver.get() before interacting
scroll_pause_time = 0.0001  # seconds to sleep between individual scroll steps
bottom_settle_time = 1      # seconds to wait at the apparent bottom before re-measuring

# Initialize WebDriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

# Create an empty list to store all restaurant data
all_restaurants_list = []

# Once the consent button has been clicked, later pages skip the lookup instead of
# spending the full 10 s wait timeout on every remaining dish.
# NOTE(review): assumes the consent choice persists for the browser session.
cookies_accepted = False

# Loop through each dish
for dish in dishes:
    url = f'https://deliveroo.co.uk/restaurants/nottingham/sneinton-bakersfield?postcode={postcode}&dish={dish}&collection=all-restaurants'
    
    # Open the webpage
    driver.get(url)
    time.sleep(initial_load_time)

    # Get screen height for scrolling (used as the scroll step size)
    screen_height = driver.execute_script('return window.screen.height;')

    # Accept cookies (best effort: a failure is reported and retried on the next page)
    if not cookies_accepted:
        try:
            wait = WebDriverWait(driver, 10)
            cookie_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[text()="Accept All"]')))
            # Click through JavaScript rather than a native click.
            driver.execute_script("arguments[0].click();", cookie_button)
            cookies_accepted = True
        except Exception as e:
            print(f"Could not find or click the accept cookies button: {e}")

    # Scroll down the page one screen height at a time
    i = 1
    while True:
        driver.execute_script(f"window.scrollTo(0, {screen_height} * {i});")
        i += 1
        time.sleep(scroll_pause_time)

        scroll_height = driver.execute_script("return document.body.scrollHeight")
        if screen_height * i > scroll_height:
            # The per-step pause is too short for newly requested cards to be appended,
            # so wait briefly and re-measure before deciding this really is the bottom.
            time.sleep(bottom_settle_time)
            scroll_height = driver.execute_script("return document.body.scrollHeight")
            if screen_height * i > scroll_height:
                print("Reached the bottom of the page.")
                break

    # Create a BeautifulSoup object from the fully scrolled page
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Store class names and the serialized HTML of every matching element
    a_tag_class = 'HomeFeedUICard-3e299003014c14f9'
    links_html = [str(x) for x in soup.find_all(class_=a_tag_class)]
    
    a_tag_class_restaurant = 'HomeFeedUICard-619f89858093a1f4'
    restaurants_html = [str(x) for x in soup.find_all(class_=a_tag_class_restaurant)]

    # Define data extraction functions
    def find_rating(links):
        """Return the rating text found in the first link's aria-label.

        Looks up the first <a> element in `links`, reads its `aria-label`
        attribute and extracts the number that follows the word "Rated".
        Returns the string "No rating found" when there is no link, no
        aria-label attribute, or no "Rated <number>" text in the label.
        """
        anchor = links.find('a')
        if not anchor or 'aria-label' not in anchor.attrs:
            return "No rating found"

        rated = re.search(r'Rated\s+([\d.]+)', anchor['aria-label'])
        return rated.group(1) if rated else "No rating found"

    def restaurant_name(restaurant):
        """Return the stripped text of the first <p> element in `restaurant`.

        Returns the string "No name found" when the card contains no <p>
        element, so a single malformed card cannot abort the whole scrape
        with an AttributeError.
        """
        name_tag = restaurant.find('p')
        if name_tag is None:
            return "No name found"
        return name_tag.text.strip()

    def find_distance_data(restaurant):
        """Return the distance text (e.g. "0.8 mi") from the card's first <span>.

        Searches the stripped text of the first <span> element for a number
        followed by "mi". Returns the string "No distance found" when the
        card has no <span> or its text contains no such distance.
        """
        info_tag = restaurant.find('span')
        if info_tag is None:
            # A card without a <span> previously raised AttributeError.
            return "No distance found"

        info_text = info_tag.text.strip()
        # (?!n) stops a duration such as "25 min" from being read as "25 mi".
        distance_match = re.search(r'(\d+\.?\d*)\s*mi(?!n)', info_text)
        return distance_match.group(0) if distance_match else "No distance found"

    def find_delivery_cost(restaurant):
        """Return the delivery cost text (e.g. "£2.49 delivery") from the card's first <span>.

        Searches the stripped text of the first <span> element for a "£"
        amount followed by the word "delivery". Returns "Free delivery" when
        the span text has no such amount, and "No delivery cost found" when
        the card has no <span> at all.
        """
        info_tag = restaurant.find('span')
        if info_tag is None:
            # A card without a <span> previously raised AttributeError; report the
            # missing data instead of claiming the delivery is free.
            return "No delivery cost found"

        info_text = info_tag.text.strip()
        cost_match = re.search(r'£(\d+\.?\d*)\s*delivery', info_text)
        return cost_match.group(0) if cost_match else "Free delivery"

    def find_hyperlink(restaurant):
        """Return the absolute URL built from the first link's href.

        Prefixes the `href` of the first <a> element in `restaurant` with
        the site root. Returns the string "No link found" when there is no
        <a> element or it carries no href attribute.
        """
        anchor = restaurant.find('a')
        if not anchor or 'href' not in anchor.attrs:
            return "No link found"
        return 'http://deliveroo.co.uk' + anchor['href']

    # Extract restaurant data for the current cuisine
    for restaurant_html, links_html in zip(restaurants_html, links_html):
        restaurant = BeautifulSoup(restaurant_html, 'html.parser')
        links = BeautifulSoup(links_html, 'html.parser')

        restaurant_dictionary = {
            'rank': len(all_restaurants_list) + 1,
            'restaurant_name': restaurant_name(restaurant),
            'distance': find_distance_data(restaurant),
            'delivery_cost': find_delivery_cost(restaurant),
            'link': find_hyperlink(links),
            'rating': find_rating(links),
            'dish': dish  # Add the current cuisine
        }
        
        all_restaurants_list.append(restaurant_dictionary)

# Close the WebDriver
driver.quit()

# Create a DataFrame from the aggregated data
restaurant_df = pd.DataFrame(all_restaurants_list)
restaurant_df


In [3]:
# Output file for the dish-based scrape
csv_filename = 'dish_data.csv'

# Rebuild the DataFrame from the collected records and write it out without the index column
restaurant_df = pd.DataFrame(all_restaurants_list)
restaurant_df.to_csv(path_or_buf=csv_filename, index=False)

print(f"Data saved to {csv_filename}")


Data saved to dish_data.csv
