In [1]:
!pip install selenium webdriver-manager pandas





In [None]:
import os
import re
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Search matrix: every category below is queried in every city.
cities = ["Cape Town", "Puebla", "Jakarta", "Hanoi"]
categories = ["Cafe", "Salon", "Pet Shop"]

# Folder that receives the exported spreadsheet; created if it is missing.
output_dir = "scraped_output"
os.makedirs(output_dir, exist_ok=True)

# Launch Chrome using the driver binary that webdriver-manager installs.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)
driver.maximize_window()

# Shared explicit wait (timeout argument: 10) and the list that collects one
# dict per scraped listing.
wait = WebDriverWait(driver, 10)
all_data = []

# ---------------------------------------------------------------------------
# Helpers that read individual fields from the currently open page.
# Each one is best-effort: a failed lookup yields a placeholder string rather
# than aborting the listing. `except Exception` (not a bare `except`) is used
# so that Ctrl-C / SystemExit still stop the run.
# ---------------------------------------------------------------------------

def _scroll_results_to_end(browser, waiter):
    """Scroll the results feed until its scroll height stops growing."""
    try:
        feed = waiter.until(EC.presence_of_element_located((By.XPATH, '//div[@role="feed"]')))
        last_height = 0
        while True:
            browser.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight);", feed)
            time.sleep(2)
            new_height = browser.execute_script("return arguments[0].scrollHeight", feed)
            if new_height == last_height:
                break
            last_height = new_height
    except Exception as e:
        print(f"Scroll error: {e}")


def _parse_info_line(info_line):
    """Return (rating, reviews, price, listing_category) parsed from *info_line*.

    *info_line* is the text of the ``F7nice`` element. Every piece falls back
    to its placeholder string when it cannot be found in the text.
    """
    rating_match = re.search(r"(\d\.\d)", info_line)
    rating = rating_match.group(1) if rating_match else "Not found"

    reviews_match = re.search(r"\(([\d,]+)\)", info_line)
    reviews = reviews_match.group(1) if reviews_match else "Not found"

    price_match = re.search(r"[₹$€¥₱]{1,3}", info_line)
    price = price_match.group(0) if price_match else "Not listed"

    # The listing's own category is whatever follows the last "·" separator.
    listing_category = info_line.split("·")[-1].strip() if "·" in info_line else "Unknown"
    return rating, reviews, price, listing_category


def _text_or_default(browser, locator_by, locator_value, default):
    """Return the text of the first matching element, or *default* on failure."""
    try:
        return browser.find_element(locator_by, locator_value).text
    except Exception:
        return default


def _find_phone(browser):
    """Return the first <span> text shaped like a phone number, else "Not available"."""
    try:
        for span in browser.find_elements(By.XPATH, "//span"):
            text = span.text.strip()
            if re.match(r'^\+?\d[\d\s\-\(\)]{6,}$', text):
                return text
    except Exception:
        # Best effort: an element failing mid-scan just means no phone was found.
        pass
    return "Not available"


def _find_social_links(browser):
    """Return (facebook, instagram) hrefs found among the page's links.

    The last matching link of each kind wins; "Not found" is used when none
    matches. Values gathered before a mid-scan failure are kept.
    """
    facebook = instagram = "Not found"
    try:
        for link in browser.find_elements(By.XPATH, "//a"):
            href = link.get_attribute("href")
            if not href:
                continue
            if "facebook.com" in href:
                facebook = href
            elif "instagram.com" in href:
                instagram = href
    except Exception:
        # Best effort: keep whatever was collected before the failure.
        pass
    return facebook, instagram


def _find_delivery_links(browser):
    """Return the ubereats/dineplan hrefs joined by ", ", or "None" if there are none."""
    hrefs = []
    try:
        ext_links = browser.find_elements(
            By.XPATH, "//a[contains(@href, 'ubereats') or contains(@href, 'dineplan')]"
        )
        for link in ext_links:
            hrefs.append(link.get_attribute("href"))
    except Exception:
        # Best effort: keep whatever was collected before the failure.
        pass
    return ', '.join(hrefs) if hrefs else "None"


def _scrape_open_listing(browser, waiter):
    """Collect every field of the listing currently open and return it as a dict.

    The returned dict does not contain "City" or "Business Type"; the caller
    adds those from the search that produced the listing.
    """
    # Business name
    try:
        name = waiter.until(EC.presence_of_element_located((By.CLASS_NAME, "DUwDvf"))).text
    except Exception:
        name = "Unknown"

    # Rating, review count, price level, and the listing's own category
    try:
        info_line = browser.find_element(By.CLASS_NAME, "F7nice").text
    except Exception:
        rating = reviews = price = listing_category = "Not found"
    else:
        rating, reviews, price, listing_category = _parse_info_line(info_line)

    desc = _text_or_default(browser, By.CLASS_NAME, "UsdlK", "Not available")
    address = _text_or_default(
        browser, By.XPATH, "//button[contains(@data-item-id, 'address')]", "Not available"
    )
    hours = _text_or_default(
        browser, By.XPATH, "//div[contains(@aria-label, 'Hours')]", "Not available"
    )

    # Website
    try:
        website = browser.find_element(
            By.XPATH, "//a[contains(@aria-label, 'Website')]"
        ).get_attribute("href")
    except Exception:
        website = "No website"

    phone = _find_phone(browser)

    # Email
    try:
        email = browser.find_element(
            By.XPATH, "//a[starts-with(@href, 'mailto:')]"
        ).get_attribute("href").replace("mailto:", "")
    except Exception:
        email = "Not found"

    facebook, instagram = _find_social_links(browser)

    # Plus code / Map Code
    plus_code = _text_or_default(
        browser, By.XPATH, "//button[contains(@data-item-id, 'oloc')]", "Not found"
    )

    # Delivery/Menu sites (ubereats, dineplan etc.)
    delivery = _find_delivery_links(browser)

    # Key order here determines the column order of the exported sheet.
    return {
        "Business Name": name,
        "Rating": rating,
        "Reviews": reviews,
        "Price Level": price,
        "Category": listing_category,
        "Description": desc,
        "Address": address,
        "Hours": hours,
        "Phone": phone,
        "Email": email,
        "Website": website,
        "Facebook": facebook,
        "Instagram": instagram,
        "Plus Code": plus_code,
        "Delivery/Menu Links": delivery,
    }


# Main scrape: one Google Maps search per (city, category) pair. The whole run
# sits in try/finally so the browser is closed even if a search fails or the
# user interrupts the notebook.
try:
    for city in cities:
        for category in categories:
            print(f"\n🔍 Scraping {category}s in {city}")
            driver.get("https://www.google.com/maps")
            time.sleep(2)

            search_box = wait.until(EC.presence_of_element_located((By.ID, "searchboxinput")))
            search_box.clear()
            search_box.send_keys(f"{category}s in {city}")
            driver.find_element(By.ID, "searchbox-searchbutton").click()
            time.sleep(5)

            # Scroll all cards
            _scroll_results_to_end(driver, wait)

            cards = driver.find_elements(By.CLASS_NAME, "hfpxzc")
            print(f"→ Found {len(cards)} listings")

            for i in range(len(cards)):
                try:
                    # The cards are looked up again on every pass and addressed
                    # by index (presumably because earlier element references
                    # stop being usable after a click; TODO confirm).
                    cards = driver.find_elements(By.CLASS_NAME, "hfpxzc")
                    ActionChains(driver).move_to_element(cards[i]).click().perform()
                    time.sleep(4)

                    row = _scrape_open_listing(driver, wait)
                    row["City"] = city
                    # `category` is the search term of the enclosing loop. The
                    # scraped category lives under "Category" and must never
                    # overwrite this loop variable.
                    row["Business Type"] = category
                    all_data.append(row)

                    print(f"{i+1}. {row['Business Name']} ✅")

                except Exception as e:
                    print(f"❌ Error at index {i}: {e}")
finally:
    # Close browser
    driver.quit()

# Export every collected row to one workbook, without the DataFrame index column.
output_path = f"{output_dir}/Full_Business_Data.xlsx"
df = pd.DataFrame(all_data)
df.to_excel(output_path, index=False)
print("\n✅ All data saved to:", output_path)



🔍 Scraping Cafes in Cape Town
→ Found 120 listings
1. The Capetown Cafe ✅
2. Truth Coffee Roasting ✅
3. Rosetta Roastery Cafe (Bree Street) ✅
4. Ground Art Caffe ✅
5. Origin Coffee Roasting ✅
6. JARRYDS Brunch & Bistro ✅
7. The Ladder ✅
8. Four & Twenty Cafe ✅
9. I Love Coffee Roastery and Deli Claremont ✅
10. Kanéla Café ✅
11. Molten Toffee. ✅
12. Starlings ✅
13. Swan Café ✅
14. Sonder Café ✅
15. Honest Chocolate Cafe - Cape Town City Centre ✅
16. The Blue Cafe ✅
17. Heaven Coffee Shop ✅
18. Lazari ✅
19. Ground Culture Cafe ✅
20. Liquorice & Lime. ✅
21. Boston Coffee Roasters ✅
22. Red Sofa Café & Deli ✅
23. Bean There Coffee Company - Cape Town (by appointment only) ✅
24. Mulino Cafe & Bar ✅
25. The Strangers Club ✅
26. The Backyard Café ✅
27. Table Mountain ✅
28. Hey Stranger Coffee Collective ✅
29. Pang Specialty Coffee ✅
30. Deluxe Coffeeworks ✅
31. Café Crave ✅
32. Naked coffee ✅
33. Park Café ✅
34. The Hans & Lloyd Coffee Co | Newlands ✅
35. Pauline's ✅
36. Olympia Cafe ✅
37. C