In [None]:
from playwright.async_api import async_playwright
from parsel import Selector
import csv,random, re, os
from datetime import datetime



INPUT_FILE = "google_map_queries_new_categories_part_3.csv"
now = datetime.now()
formatted_date = now.strftime("%Y-%m-%d_%H-%M-%S")
OUTPUT_FILE = f"DATA-03-google_map_{formatted_date}.csv"


SKIP_RAW = 0





RAW_NUMBER = 0
TIMEOUT = 20*1000
CDP_WS = "http://localhost:9221"
playwright = await async_playwright().start()
browser = await playwright.chromium.connect_over_cdp(CDP_WS)
context = browser.contexts[0] if browser.contexts else await browser.new_context()
page = await context.new_page()

In [2]:
# Read the input CSV. 'utf-8-sig' transparently strips a leading BOM, and
# newline='' lets the csv module handle line endings inside quoted fields itself.
data_rows = []
with open(INPUT_FILE, 'r', newline='', encoding='utf-8-sig') as file:
    csv_reader = csv.DictReader(file)
    headers = csv_reader.fieldnames  # None when the file is completely empty
    data_rows = list(csv_reader)

# Take the column names from the header line rather than from the first data
# row, so an input that has a header but no rows still produces an output file
# with the full set of columns. dict.fromkeys() drops duplicate names while
# keeping their order, which is what row.keys() yielded previously.
original_fieldnames = list(dict.fromkeys(headers or []))
new_fieldnames = ["listing_url"]
all_fieldnames = original_fieldnames + new_fieldnames

# Start a fresh output file that contains only the header row; the processing
# loop appends one record at a time to this file afterwards.
with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=all_fieldnames)
    writer.writeheader()

In [3]:
# Process each row and save immediately
processed_count = 0
for index, row in enumerate(data_rows):
    if index < SKIP_RAW:
        continue
    current_row = index + 1
    print(f"Processing Raw Number: {RAW_NUMBER} ...")
    RAW_NUMBER += 1


    
    website = (row.get('google_map_search_query') or '').strip()
    if not website or 'facebook' in website.lower():
        result_row = row.copy()
        result_row.update({
            "listing_url": None,

        })

        with open(OUTPUT_FILE, 'a', newline='', encoding='utf-8-sig') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=all_fieldnames)
            writer.writerow(result_row)

        # continue 



    else:
        try:
            await page.goto(website, timeout=TIMEOUT)
            await page.wait_for_timeout(300)  # brief pause for lazy content
        except:
            print("== Website loaded Error")
            pass


        


        END_XPATH = "//span[contains(., \"You've reached the end of the list.\")]"

        max_scrolls = 200
        stall_limit = 3
        stalled = 0
        last_count = -1

        for i in range(max_scrolls):
            await page.evaluate("""() => {
                const el = document.querySelector('div[role="feed"]');
                if (el) el.scrollBy(0, el.scrollHeight);
                window.scrollBy(0, 500); // mild nudge for good measure
            }""")
            await page.wait_for_timeout(1500)

            
            html = await page.content()
            sel = Selector(text=html)

            
            if sel.xpath(END_XPATH).get():
                print("Reached the end of the list banner. Stopping.")
                break


            cards = sel.xpath("//div[@role='feed']//a[contains(@href,'/maps/place') or contains(@href,'/maps/search')]").getall()
            curr_count = len(cards)
            if curr_count == last_count:
                stalled += 1
                if stalled >= stall_limit:
                    print("No new results after multiple scrolls. Stopping.")
                    break
            else:
                stalled = 0
                last_count = curr_count

            print(f"Scrolled {i+1}, items seen: {curr_count}")
        







        await page.wait_for_timeout(500)
        html = await page.content()
        response = Selector(text=html)


        div = response.xpath("//a[@class='hfpxzc']")

        for d in div:
            listing_url = d.xpath("./@href").get()
            

            result_row = row.copy()
            result_row.update({
                "listing_url": listing_url,
            })
            with open(OUTPUT_FILE, 'a', newline='', encoding='utf-8-sig') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=all_fieldnames)
                writer.writerow(result_row)

print(f"\nProcessing complete! Processed and saved {processed_count} records to {OUTPUT_FILE}.")


Processing Raw Number: 0 ...
Scrolled 1, items seen: 0
Scrolled 2, items seen: 0
Scrolled 3, items seen: 0
No new results after multiple scrolls. Stopping.
Processing Raw Number: 1 ...
Scrolled 1, items seen: 15
Scrolled 2, items seen: 20
Scrolled 3, items seen: 25
Scrolled 4, items seen: 29
Scrolled 5, items seen: 29
Scrolled 6, items seen: 29
No new results after multiple scrolls. Stopping.
Processing Raw Number: 2 ...
Scrolled 1, items seen: 10
Scrolled 2, items seen: 15
Scrolled 3, items seen: 20
Scrolled 4, items seen: 25
Scrolled 5, items seen: 30
Scrolled 6, items seen: 35
Scrolled 7, items seen: 37
Scrolled 8, items seen: 37
Scrolled 9, items seen: 37
No new results after multiple scrolls. Stopping.
Processing Raw Number: 3 ...
Scrolled 1, items seen: 15
Scrolled 2, items seen: 20
Scrolled 3, items seen: 25
Scrolled 4, items seen: 30
Scrolled 5, items seen: 35
Scrolled 6, items seen: 40
Scrolled 7, items seen: 45
Scrolled 8, items seen: 50
Scrolled 9, items seen: 55
Scrolled 10

In [4]:
# await page.goto("https://www.google.com/maps/search/restaurant+gastronomique+Paris+1e")

In [5]:
# # Keep scrolling until the "end of list" banner shows up
# # and also bail if results stall for a few iterations.
# from parsel import Selector

# END_XPATH = "//span[contains(., \"You've reached the end of the list.\")]"

# max_scrolls = 200          # hard cap (safety)
# stall_limit = 3            # how many times in a row we're allowed to see no growth
# stalled = 0
# last_count = -1

# for i in range(max_scrolls):
#     # scroll the results panel
#     await page.evaluate("""() => {
#         const el = document.querySelector('div[role="feed"]');
#         if (el) el.scrollBy(0, el.scrollHeight);
#         window.scrollBy(0, 500); // mild nudge for good measure
#     }""")
#     await page.wait_for_timeout(1500)

#     # check page state
#     html = await page.content()
#     sel = Selector(text=html)

#     # 1) stop when the end-of-list banner is visible (your XPath idea)
#     if sel.xpath(END_XPATH).get():
#         print("Reached the end of the list banner. Stopping.")
#         break

#     # 2) optional: stop if results are no longer increasing
#     #    (helps when the banner doesn't appear for some reason)
#     cards = sel.xpath("//div[@role='feed']//a[contains(@href,'/maps/place') or contains(@href,'/maps/search')]").getall()
#     curr_count = len(cards)
#     if curr_count == last_count:
#         stalled += 1
#         if stalled >= stall_limit:
#             print("No new results after multiple scrolls. Stopping.")
#             break
#     else:
#         stalled = 0
#         last_count = curr_count

#     print(f"Scrolled {i+1}, items seen: {curr_count}")

# # proceed to parse the final HTML if needed
# html = await page.content()
# response = Selector(text=html)




In [6]:
# div = response.xpath("//a[@class='hfpxzc']")

# for d in div:
#     a = d.xpath("./@href").get()
#     print(a)

In [7]:
# len(div)

In [8]:
# //a[@class='hfpxzc']

In [9]:
# # scroll 5 times
# for i in range(5):
#     print(f"Scrolling {i+1}/5 ...")
#     await page.evaluate("""() => {
#         const scrollable = document.querySelector('div[role="feed"]');
#         if (scrollable) scrollable.scrollBy(0, scrollable.scrollHeight);
#     }""")
#     await page.wait_for_timeout(2000)



# html = await page.content()
# response = Selector(text = html)


# response.xpath("//span[contains(text(), 'reached the end of th')]/text()").get()
# # "You've reached the end of the list."

In [10]:
# html = await page.content()
# response = Selector(text = html)


# response.xpath("//span[contains(text(), 'reached the end of th')]/text()").get()