# In [None]:
import requests
from bs4 import BeautifulSoup
import re
import csv
import time

# Record the wall-clock start so the total run time can be reported at the end of the script.
start_time = time.time()

# Pattern for phone numbers written as two digits, a dash, then seven digits.
_PHONE_PATTERN = re.compile(r'\d{2}-\d{7}')


def _nested_text(parent, tag, itemprop):
    """Return the stripped text of the first matching child of *parent*, or "n/a"."""
    child = parent.find(tag, itemprop=itemprop)
    # find() returns None when nothing matches, so the guard must be on the
    # child, not on the parent (which is always present when it comes from find_all()).
    return child.text.strip() if child is not None else "n/a"


# Function to scrape data from a single page
def scrape_page(url):
    """Scrape one listing page and return its fields as five lists.

    Args:
        url: Address of the listing page to download.

    Returns:
        A 5-tuple ``(restaurant_info, city_names, location_details,
        pobox_numbers, products_and_services)``. ``restaurant_info`` is a list
        of ``{'name': ..., 'phone': ...}`` dicts; the other four are lists of
        strings. Values that cannot be found are reported as ``"n/a"``.
        Five empty lists are returned when the page cannot be fetched
        (connection error, timeout, or a status code other than 200).

    Note:
        Each list is collected independently from its own CSS class, so the
        lists only line up index-by-index if every listing on the page
        contains every one of those elements.
    """
    try:
        # A timeout keeps one unresponsive request from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch page: {url} ({exc})")
        return [], [], [], [], []

    if response.status_code != 200:
        print(f"Failed to fetch page: {url}")
        return [], [], [], [], []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the P.O. Box elements on the page
    pobox_numbers = [pobox.text.strip() for pobox in soup.find_all('span', class_='pobox')]

    # Find all location elements on the page and pull out the street address
    location_details = [
        _nested_text(location, 'span', 'streetAddress')
        for location in soup.find_all('span', class_='location')
    ]

    # Find all city elements on the page and pull out the locality name
    city_names = [
        _nested_text(city, 'strong', 'addressLocality')
        for city in soup.find_all('span', class_='locationCity')
    ]

    # Extract restaurant names and phone numbers from every 'col-md-10' container
    restaurant_info = []
    for restaurant in soup.find_all('div', class_='col-md-10'):
        name_element = restaurant.find('h2', class_='cmp_name')
        if name_element is None:
            # Container without a name heading: keep a placeholder entry.
            restaurant_info.append({'name': "n/a", 'phone': "n/a"})
            continue

        phone_element = restaurant.find('span', class_='phone')
        phone = _PHONE_PATTERN.search(phone_element.text) if phone_element is not None else None
        restaurant_info.append({
            'name': name_element.text.strip(),
            'phone': phone.group() if phone else "n/a",
        })

    # Extract the text of every element carrying the 'productslist' class
    products_and_services = [
        products_list.text.strip()
        for products_list in soup.find_all('div', class_='productslist')
    ]

    return restaurant_info, city_names, location_details, pobox_numbers, products_and_services

# Lists to store restaurant information, city names, location details, P.O. Box numbers,
# and products/services. They are treated as parallel lists: index i in each list is
# combined into one output entry.
restaurant_info = []
city_names = []
location_details = []
pobox_numbers = []
products_and_services = []

# Loop through all pages
for page_num in range(1, 100):  # Adjust range as needed
    url = f"https://www.yellowpages-uae.com/uae/restaurants?page={page_num}"
    print(f"Scraping page {page_num}...")
    (
        page_restaurant_info,
        page_city_names,
        page_location_details,
        page_pobox_numbers,
        page_products_and_services,
    ) = scrape_page(url)
    restaurant_info.extend(page_restaurant_info)
    city_names.extend(page_city_names)
    location_details.extend(page_location_details)
    pobox_numbers.extend(page_pobox_numbers)
    products_and_services.extend(page_products_and_services)

# Print all entries once, after scraping has finished. Doing this inside the page loop
# re-printed every previously collected entry for each new page.
shortest_column = min(
    len(city_names),
    len(location_details),
    len(pobox_numbers),
    len(products_and_services),
)
for i, restaurant in enumerate(restaurant_info):
    # Only entries that have a counterpart in every other list can be printed in full
    if i < shortest_column:
        print(f"Entry {i + 1}: {restaurant['name']}, {city_names[i]}, {location_details[i]}, {pobox_numbers[i]}, {products_and_services[i]}")
    else:
        print(f"Entry {i + 1}: Index out of range")

# Calculate the total number of entries (valid restaurant names)
total_entries = len([restaurant for restaurant in restaurant_info if restaurant['name'] != "n/a"])
print(f"Total number of entries (restaurant names): {total_entries}")

# zip() below stops at the shortest list, so say so explicitly instead of dropping rows silently.
if len(restaurant_info) != shortest_column or len({
    len(city_names),
    len(location_details),
    len(pobox_numbers),
    len(products_and_services),
}) > 1:
    print(
        "Warning: scraped lists have different lengths; "
        f"only the first {min(len(restaurant_info), shortest_column)} rows are written "
        "and columns may be misaligned."
    )

# Create a CSV file and write the data
with open("restaurant_data_yp_new.csv", "w", newline='', encoding='utf-8') as csvfile:
    fieldnames = ['City', 'Location Detail', 'Name', 'Phone', 'P.O. Box', 'Products and Services']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for restaurant, city, location_detail, pobox_number, products_services in zip(restaurant_info, city_names, location_details, pobox_numbers, products_and_services):
        writer.writerow({
            'City': city,
            'Location Detail': location_detail,
            'Name': restaurant.get('name', 'n/a'),
            'Phone': restaurant.get('phone', 'n/a'),
            'P.O. Box': pobox_number,
            'Products and Services': products_services
        })

end_time = time.time()
execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")