In [280]:
#importing libraries 
from selenium import webdriver
import selenium 
from selenium.webdriver.common.by import By
from selenium import webdriver # used to control web browsers programmatically
from selenium.webdriver.chrome.service import Service #used to manage the lifecycle of the ChromeDriver service.
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException 
import pandas as pd 
import time 
import csv

# Task 1: Web Scraping Zomato Restaurant Listings

Objective: Create a Python script that scrapes at least 300 restaurant listings from Zomato and saves the data in a pandas DataFrame, or uses the gspread
library together with a Google service account to load the data into a Google spreadsheet automatically.

In [149]:
# Launch Chrome through a Service built from the driver path that ChromeDriverManager installs
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)

In [150]:
# Load the Zomato NCR delivery listing page in the browser session started above
driver.get("https://www.zomato.com/ncr/delivery")  

In [282]:
def scroll_page(driver, pause=30, max_scrolls=None):
    """Scroll to the bottom of the page repeatedly until its height stops growing.

    After each scroll the function sleeps for ``pause`` seconds and then
    re-reads ``document.body.scrollHeight``; scrolling stops as soon as two
    consecutive readings are equal.

    Parameters
    ----------
    driver : object exposing ``execute_script(script)``
        The browser session to scroll (a Selenium WebDriver in this notebook).
    pause : float, default 30
        Seconds to wait after each scroll so lazily loaded content can arrive.
    max_scrolls : int or None, default None
        Upper bound on the number of scrolls. ``None`` keeps scrolling until
        the height stabilises, which may never happen on an endless feed.

    Returns
    -------
    None
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # Allow time for the page to load
        scrolls_done += 1
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a scroll: nothing more was loaded.
            break
        last_height = new_height

        
# Use this function before finding elements
#scroll_page(driver)

In [152]:
# Number of restaurant rows to collect; the extraction loop stops once this many have been gathered
target_rows = 300 

In [153]:
# Explicit-wait helper bound to the driver, constructed with a timeout argument of 40
wait = WebDriverWait(driver, 40)

In [154]:
# Scroll the listing page to the bottom until no more content loads,
# printing a progress message before and after (scroll_page sleeps between scrolls)
print("Scrolling page...")
scroll_page(driver)
print("Scrolling complete, looking for containers...")

Scrolling page...
Scrolling complete, looking for containers...


In [231]:

# Parallel lists, one entry per restaurant card; the cells below read these names
restaurant_name = []
ratings = []
cuisine_type = []
cost_for_one = []
delivery_time = []
offers = []
urls = []


def _card_text(card, by, selector, default="N/A"):
    """Return the text of the first element inside `card` matching the locator, or `default` if the lookup fails."""
    try:
        return card.find_element(by, selector).text
    except Exception:
        # Best effort: one missing or unreadable field must not abort the whole scrape.
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still gets through.
        return default


def _card_link(card, default="N/A"):
    """Return the href of the first <a> inside `card`, or `default` if the lookup fails."""
    try:
        return card.find_element(By.TAG_NAME, 'a').get_attribute('href')
    except Exception:
        return default


# Wait for the restaurant cards to become visible and collect them
try:
    containers = wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//div[contains(@class, 'sc-kDgGX') and contains(@class, 'fjYwDL')]" )))
    print(f"Found {len(containers)} containers")
except TimeoutException:
    print("Containers not found")
    containers = []

try:
    # Extract data from each card
    for box in containers:
        name = _card_text(box, By.CLASS_NAME, "sc-1hp8d8a-0")
        rating = _card_text(box, By.XPATH, ".//div[@class='sc-1q7bklc-1 cILgox']")
        cuisine = _card_text(box, By.XPATH, ".//p[@class='sc-1hez2tp-0 sc-dcOKER zafot']")
        cost = _card_text(box, By.XPATH, ".//p[@class='sc-1hez2tp-0 sc-dcOKER imYCjj']")
        # Deliberately NOT named `time`: rebinding that name would shadow the
        # imported `time` module and break every later `time.sleep(...)` call.
        eta = _card_text(box, By.XPATH, ".//div[@class='min-basic-info-right']")
        offer = _card_text(box, By.XPATH, ".//p[@class='sc-1hez2tp-0 sc-hPeUyl kxuuMh']", default='No Offers')
        link = _card_link(box)

        # Append all fields together so the lists stay the same length
        restaurant_name.append(name)
        ratings.append(rating)
        cuisine_type.append(cuisine)
        cost_for_one.append(cost)
        delivery_time.append(eta)
        offers.append(offer)
        urls.append(link)

        # Stop once the target number of rows has been collected
        if len(restaurant_name) >= target_rows:
            break
finally:
    # Close the driver even if extraction is interrupted or fails
    driver.quit()

Found 744 containers


In [621]:
# Print every scraped list with its label; a line holding a single space separates consecutive lists
restaurant_dump = (
    ("restaurant_name:", restaurant_name),
    ("ratings:", ratings),
    ("cuisine_type:", cuisine_type),
    ("cost_for_one:", cost_for_one),
    ("delivery_time:", delivery_time),
    ("offers:", offers),
    ("urls:", urls),
)

for position, (label, values) in enumerate(restaurant_dump):
    if position > 0:
        print(" ")
    print(label, values)

restaurant_name: ["McDonald's", 'KFC', 'Pizza Hut', "La Pino'z Pizza", 'The Burger Club', 'Burger Singh - Big Punjabi Burgers', "Haldiram's", "Kwality Wall's Ice Cream And More", 'RollsKing', "Wendy's Burgers", 'Veer Ji Malai Chaap Wale', 'Nazeer Foods', 'Biryani Blues', 'Bikkgane Biryani', 'Khadak Singh Da Dhaba', 'Bakingo', 'Apni Rasoi', 'Chinese Wok', 'Subway', 'Behrouz Biryani', 'WOW! Momo', 'Gianis - Ice Cream, Shakes & Sundaes', 'The Pizza Kings', 'BTW', 'Bikanervala', 'Faasos - Wraps, Rolls & Shawarma', 'NIC Ice Creams', "Nirula's", 'Thalairaj Biryani', 'Daily Kitchen - Homely Meals', 'LunchBox - Meals and Thalis', 'Biryani By Kilo', 'The Waffle Co.', 'Giani', 'Chicago Pizza', 'Rominus Pizza And Burger', 'Vadilal Ice Creams', 'Punjabi Rasoi', 'The Burger Company', 'Flavours', 'Wraps Kathi Rolls', 'Theobroma', 'Ghee and Turmeric', 'Mansi Chinese Food', 'Sweet Truth - Cake and Desserts', 'Baskin Robbins - Ice Cream Desserts', 'MOJO Pizza - 2X Toppings', 'Parantha Express', 'Pizza 

In [235]:
# Build the restaurant table: labels and scraped lists are paired positionally,
# so the column order of the DataFrame follows the order written here
restaurant_labels = [
    'Restaurant_Name',
    'Rating',
    'Cuisine_Type',
    'Cost_for_one',
    'Delivery_time',
    'Offers',
    'Restaurant_url',
]
restaurant_values = [
    restaurant_name,
    ratings,
    cuisine_type,
    cost_for_one,
    delivery_time,
    offers,
    urls,
]
df = pd.DataFrame(dict(zip(restaurant_labels, restaurant_values)))


In [241]:
# Display the restaurant DataFrame as the cell output
df

Unnamed: 0,Restaurant_Name,Rating,Cuisine_Type,Cost_for_one,Delivery_time,Offers,Restaurant_url
0,McDonald's,4.4,"Burger, Wraps, Fast Food, Beverages",₹200 for one,18 min,No Offers,https://www.zomato.com/ncr/mcdonalds-1-connaug...
1,KFC,4.1,"Burger, Rolls, Fast Food",₹200 for one,36 min,No Offers,https://www.zomato.com/ncr/kfc-2-paharganj-new...
2,Pizza Hut,4.0,"Pizza, Fast Food, Italian, Desserts, Beverages",₹250 for one,34 min,20% OFF,https://www.zomato.com/ncr/pizza-hut-1-karol-b...
3,La Pino'z Pizza,4.0,"Pizza, Italian, Pasta, Fast Food, Desserts, Be...",₹250 for one,29 min,₹150 OFF,https://www.zomato.com/ncr/la-pinoz-pizza-karo...
4,The Burger Club,4.0,"Burger, Fast Food, Coffee, Beverages",₹350 for one,25 min,₹150 OFF,https://www.zomato.com/ncr/the-burger-club-1-c...
...,...,...,...,...,...,...,...
295,Hide Out Cafe,3.9,"Rolls, Sandwich, North Indian",₹100 for one,41 min,No Offers,https://www.zomato.com/ncr/hide-out-cafe-inder...
296,The Changezi Chicken,3.5,North Indian,₹200 for one,51 min,No Offers,https://www.zomato.com/ncr/the-changezi-chicke...
297,Dev Rasoi,4.1,"North Indian, Rolls, Chinese",₹150 for one,31 min,No Offers,https://www.zomato.com/ncr/dev-rasoi-karol-bag...
298,Nagpal's Chole Bhature,3.6,"North Indian, Beverages",₹150 for one,41 min,Free Sweet Kullad Lassi,https://www.zomato.com/ncr/nagpals-chole-bhatu...


In [256]:
# Write the restaurant table to zomato_restaurant.csv, omitting the row index,
# with 'utf-8-sig' passed as the file encoding
df.to_csv('zomato_restaurant.csv', index=False, encoding='utf-8-sig')

# Task 2: Scrape 50 Reviews from a Zomato Restaurant Page

Objective: Develop a Python function, `scrape_zomato_reviews`, that scrapes the first 50 user reviews from a given Zomato restaurant URL and stores them in
a pandas DataFrame.

In [637]:
# Start a fresh Chrome session for the review scrape, using the driver path that ChromeDriverManager installs
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)

In [638]:
# Load the reviews page of the restaurant to scrape
driver.get("https://www.zomato.com/ncr/dominos-pizza-4-connaught-place-new-delhi/reviews")  
# Explicit-wait helper bound to the new driver, constructed with a timeout argument of 40
# NOTE(review): `wait` is not referenced by the review-scraping cell shown below — confirm it is still needed
wait = WebDriverWait(driver, 40)  

In [639]:
# Scroll the reviews page to the bottom until its height stops changing,
# printing a progress message before and after
print("Scrolling page...")
scroll_page(driver)
print("Scrolling complete, looking for containers...")

Scrolling page...
Scrolling complete, looking for containers...


In [647]:

# Parallel lists, one entry per review; the cells below read these names
user_name = []
numberOfReviews = []
user_follower = []
rating = []
timeOfPosting = []
vote_list = []
review = []
review_type = []

REVIEW_PAGES = 10      # number of review pages to walk through
PAGE_LOAD_PAUSE = 60   # seconds to wait after clicking "next" before reading the new page


def _texts_at(elements, start=0, step=1, default="N/A"):
    """Return `.text` of every `step`-th element starting at index `start`; `default` where reading fails."""
    texts = []
    for element in elements[start::step]:
        try:
            texts.append(element.text)
        except Exception:
            # Best effort: keep one placeholder per element so the lists stay aligned.
            texts.append(default)
    return texts


# Collect the review fields page by page
for page in range(REVIEW_PAGES):
    # Element holding all the review information of the current page
    box = driver.find_element(By.XPATH, '//*[@id="root"]/div/main/div/section[4]/div/div/section/div[2]')

    # Elements with this class are consumed in groups of four:
    # index 0 -> user name, 1 -> time of posting, 2 -> review text, 3 -> votes.
    # NOTE(review): the stride of 4 assumes the page keeps emitting exactly four
    # such nodes per review — verify against the live markup.
    names = box.find_elements(By.CLASS_NAME, "sc-1hez2tp-0")
    user_name.extend(_texts_at(names, 0, 4))
    timeOfPosting.extend(_texts_at(names, 1, 4))
    review.extend(_texts_at(names, 2, 4))
    vote_list.extend(_texts_at(names, 3, 4))

    # <span> elements are consumed in groups of seven:
    # index 0 -> number of reviews, 1 -> follower count.
    # NOTE(review): same caveat as above for the stride of 7.
    nreviews = box.find_elements(By.CSS_SELECTOR, "span")
    numberOfReviews.extend(_texts_at(nreviews, 0, 7))
    user_follower.extend(_texts_at(nreviews, 1, 7))

    # Star rating of each review
    container = box.find_elements(By.XPATH, ".//div[@class='sc-1q7bklc-1 cILgox']")
    rating.extend(_texts_at(container))

    # Review type of each review
    containers = box.find_elements(By.XPATH, ".//div[@class='sc-1q7bklc-9 dYrjiw']")
    review_type.extend(_texts_at(containers))

    # Last requested page already collected: no need to load (and wait for) another one
    if page == REVIEW_PAGES - 1:
        break

    # Handling pagination: only the lookup and the click are guarded, so an
    # unrelated failure is not misreported as a missing "next" button
    try:
        next_button = driver.find_element(By.XPATH, ".//div[@class='sc-kAPOMq gbiCdQ']" )
        next_button.click()
    except Exception:
        print("Next page not found or error in clicking.")
        break  # Exit loop if there's no next page

    time.sleep(PAGE_LOAD_PAUSE)  # Wait for the content to load


# The browser is left open here; run driver.quit() once scraping is finished
# driver.quit()



   

In [648]:
# Print every scraped review list with its label; a line holding a single space separates consecutive lists
review_dump = (
    ("user_name:", user_name),
    ("timeOfPosting:", timeOfPosting),
    ("vote_list:", vote_list),
    ("user_follower:", user_follower),
    ("numberOfReviews:", numberOfReviews),
    ("rating:", rating),
    ("review:", review),
    ("review_type:", review_type),
)

for position, (label, values) in enumerate(review_dump):
    if position > 0:
        print(" ")
    print(label, values)


user_name: ['Peeyush Agarwal', 'Megha Jain', 'Gaurav', 'Gaurav Christian', 'Mohd Kasim', 'SWAGATA MOHAPATRA', 'Nitisha Pande', 'Sanjana Nayak', 'Shantanu', 'Vikas Bhardwaj', 'Vanamali Gowthami', 'Manita Kumari', 'Sawan Yaduvanshi', 'Akanksha Chauhan', 'Arjun Singh Negi', 'Avinash Sappidi', 'Tanya Yadav', 'Kruthi', 'Satish Prajapati', 'Vrinda Sharma', 'Aarti Lamba', 'Gagandeep Kaur', 'Tanvi Khanna', 'Achu2312', 'Drishti', 'Abhishek Singh', 'Kruthi', 'Godfrey', 'Manoj Kumar', 'Hrishikesh', 'Yukta Keer', 'Vijay', 'Afaque Jawaid', 'Anjali', 'Dhananjay Aneja', 'UDIT KAPAHI', 'Prachi Sharma', 'Vishnu', 'Vibhav', 'Dinesh Kumar', 'Nishi Kavi', 'Kruthi', 'Creative Manoj', 'Rajan', 'Randeep Singh', 'Aarush Nayak', 'Aditya Sen', 'Sumegdha', 'Harshit', 'Drishti']
 
timeOfPosting: ['yesterday', '10 days ago', '10 days ago', '11 days ago', '13 days ago', '26 days ago', '27 days ago', '28 days ago', '30 days ago', 'one month ago', 'one month ago', 'one month ago', 'one month ago', 'one month ago', '2

In [651]:
# Build the reviews table from the scraped lists, one column per list.
# The 'time_of_posting' key previously carried a trailing space, which made
# df1['time_of_posting'] raise KeyError and leaked the space into the CSV header.
# NOTE(review): pandas requires all lists to have the same length; the stride-based
# scraping above can produce unequal lists if the page markup differs — confirm.
df1 = pd.DataFrame({
    'user_name': user_name,
    'numberOfReviews': numberOfReviews,
    'user_follower': user_follower,
    'star_rating': rating,
    'time_of_posting': timeOfPosting,
    'votes': vote_list,
    'review': review,
    'review_type': review_type
})


In [653]:
# Display the reviews DataFrame as the cell output
df1

Unnamed: 0,user_name,numberOfReviews,user_follower,star_rating,time_of_posting,votes,review,review_type
0,Peeyush Agarwal,0 reviews,0 Followers,1,yesterday,"0 Votes for helpful, 0 Comments",not delivered order,DELIVERY
1,Megha Jain,0 reviews,1 Followers,1,10 days ago,"0 Votes for helpful, 0 Comments",very bad experience cheese is rotten,DELIVERY
2,Gaurav,0 reviews,0 Followers,5,10 days ago,"0 Votes for helpful, 0 Comments",,DELIVERY
3,Gaurav Christian,0 reviews,1 Followers,1,11 days ago,"0 Votes for helpful, 0 Comments","terrible packing, pizzq stuck to the pizzq box...",DELIVERY
4,Mohd Kasim,0 reviews,0 Followers,2,13 days ago,"0 Votes for helpful, 0 Comments",cost bhot jayada hai,DELIVERY
5,SWAGATA MOHAPATRA,0 reviews,1 Followers,5,26 days ago,"0 Votes for helpful, 0 Comments",,DELIVERY
6,Nitisha Pande,0 reviews,1 Followers,4,27 days ago,"0 Votes for helpful, 0 Comments",,DELIVERY
7,Sanjana Nayak,0 reviews,0 Followers,5,28 days ago,"0 Votes for helpful, 0 Comments",,DELIVERY
8,Shantanu,0 reviews,17 Followers,1,30 days ago,"0 Votes for helpful, 0 Comments",I received a wrong pizza.,DELIVERY
9,Vikas Bhardwaj,0 reviews,0 Followers,1,one month ago,"0 Votes for helpful, 0 Comments",I ordered stuffed garlic bread and the sent ga...,DELIVERY


In [655]:
# Write the reviews table to zomato_reviews.csv, omitting the row index,
# with 'utf-8-sig' passed as the file encoding
df1.to_csv('zomato_reviews.csv', index=False, encoding='utf-8-sig') 