In [1]:
import pandas as pd
import json
import os

In [97]:
places_path = './src/savedPlaces.json'

### Google maps scraper
Source: # https://scrapfly.io/blog/how-to-scrape-google-maps/

In [4]:
from parsel import Selector
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async

script = """
function waitCss(selector, n=1, require=false, timeout=5000) {
  console.log(selector, n, require, timeout);
  var start = Date.now();
  while (Date.now() - start < timeout){
  	if (document.querySelectorAll(selector).length >= n){
      return document.querySelectorAll(selector);
    }
  }
  if (require){
      throw new Error(`selector "${selector}" timed out in ${Date.now() - start} ms`);
  } else {
      return document.querySelectorAll(selector);
  }
}

var results = waitCss("div[role*=article]>a", n=10, require=false);
return Array.from(results).map((el) => el.getAttribute("href"))
"""


async def search(query, page):
    url = f"https://www.google.com/maps/search/{query.replace(' ', '+')}/?hl=en"
    await page.goto(url)
    urls = await page.evaluate("() => {" + script + "}")
    return urls or [url]

def parse_place(selector):
    """parse Google Maps place"""

    def aria_with_label(label):
        """gets aria element as is"""
        return selector.css(f"*[aria-label*='{label}']::attr(aria-label)")

    def aria_no_label(label):
        """gets aria element as text with label stripped off"""
        text = aria_with_label(label).get("")
        return text.split(label, 1)[1].strip()

    result = {
        "category": selector.css("button[jsaction='pane.rating.category']::text").get(),
        # most of the data can be extracted through accessibility labels:
        "website": aria_no_label("Website: "),
        "phone": aria_no_label("Phone: "),
        "review_count": aria_with_label(" reviews").get(),
        # to extract star numbers from text we can use regex pattern for numbers: "\d+"
        "stars": aria_with_label(" stars").re("\d+.*\d+")[0],
        "5_stars": aria_with_label("5 stars").re(r"(\d+) review")[0],
        "4_stars": aria_with_label("4 stars").re(r"(\d+) review")[0],
        "3_stars": aria_with_label("3 stars").re(r"(\d+) review")[0],
        "2_stars": aria_with_label("2 stars").re(r"(\d+) review")[0],
        "1_stars": aria_with_label("1 stars").re(r"(\d+) review")[0],
    }
    return result

places = []
index_error = []
type_error = []
other_error = []
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    page = await browser.new_page()
    for marker in markers['features']:
        try:
            urls = await search(marker['properties']['Title'] +' '+ marker['properties']['Location']['Address'], page=page)
            await stealth_async(page)
            await page.goto(urls[0])
            await page.wait_for_selector("button[jsaction='pane.rating.category']")
            content = await page.content()
            parsed_info = parse_place(Selector(text=content.encode('utf-8', 'ignore').decode()))

            place = {
                "name": marker['properties']['Title'],
                "category": parsed_info['category'],
                "google_link": marker['properties']['Google Maps URL'],
                "phone": parsed_info['phone'],
                "website": parsed_info['website']

            }
            places.append(place)
    
        except IndexError:
            index_error.append(marker['properties']['Title'])
            continue
        except TypeError:
            type_error.append(marker['properties']['Title'])
            continue
        except:
            other_error.append(marker['properties']['Title'])
            continue
            


In [98]:
# df_places = pd.DataFrame.from_records(places)
df_places = pd.read_json('./')

In [94]:
print(f"Total markers from google {len(markers['features'])}")

Total markers from google 741


In [92]:
print(f"Index Errors:{len(index_error)} Type Errors:{len(type_error)} Other Errors:{len(other_error)}")

Index Errors:153 Type Errors:74 Other Errors:295


## Data Engineering
The following will set up the data to be used by the frontend

In [84]:
food_category = ['restaurant', 'cafe', 'bistro']
hostel_category = ['hostel', 'hotel', 'bed']
tourism_category = ['tourism', 'museum']
outdoors = ['park', 'lake', 'river', 'waterfall', 'activity']
other_category = []
# TODO: re-scrap data and compare category guesses with list of unique categories from google

places = open(places_path)
df_places = pd.DataFrame.from_records(places) 
# df_places.to_json('parsed_places.json') # will overwrite file

In [95]:
print(f"Unique categories from google {len(df_places.category.unique())}")

AttributeError: 'DataFrame' object has no attribute 'category'

In [None]:
'''
Iter through rows of places and add marker icon to place
'''
for place in df_places.items:
    if any(cat in place.category for cat in food_category):
        print(place)
        df['name' == place.name]['marker'] = 'food'
        print(place.marker)
    else if any(cat in place.category for cat in hostel_category):
        df['name' == place.name]['marker'] = 'hostel'
    else if any(cat in place.category for cat in tourism_category):
        df['name' == place.name]['marker'] = 'tourism'
    else if any(cat in place.category for cat in outdoors_category):
        df['name' == place.name]['marker'] = 'outdoors'

    else:
        df['name' == place.name]['marker'] = 'other'

## Data API Merge
Add the webscraped data to the json file used by ryandev

In [74]:
def add_to_website_json(dataframe, json_file_path):
    categories_added = 0
    categories_ctn = 0
    with open(json_file_path, 'r') as file:
        places = json.load(file)
        for place in places['features']:
            if 'category' not in place:
                if place['properties']['Title'] in dataframe['name'].values:
                    place['properties']['category'] = dataframe[dataframe['name'] == place['properties']['Title']].category.values[0]
                    categories_added = categories_added + 1 
                    categories_ctn = categories_ctn + 1
            else:
                categories_ctn = categories_ctn + 1
    with open(json_file_path, 'w') as file:
#         json.dump(places, file) # will overwrite page
    
    return f"Categories added: {categories_added} Total Categories: {categories_ctn}"


In [96]:
add_to_website_json(df_places, './savedPlaces.json')

KeyError: 'name'