In [1]:
import pandas as pd
import json
import os
import shutil
from datetime import datetime
import websocket
websocket._logging._logger.level = -99

In [2]:
savedPlaces_path = './savedPlaces.json'

In [3]:
def backup_file(file):
    source = './'+file+'.json'
    destination = './backups/'+file+'/'+file+'_' + datetime.today().strftime('%Y_%m_%d_%H%M') + '.json'
    shutil.copy(source, destination)

### Google maps scraper
Source: # https://scrapfly.io/blog/how-to-scrape-google-maps/

In [4]:
from parsel import Selector
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async

script = """
function waitCss(selector, n=1, require=false, timeout=5000) {
  console.log(selector, n, require, timeout);
  var start = Date.now();
  while (Date.now() - start < timeout){
  	if (document.querySelectorAll(selector).length >= n){
      return document.querySelectorAll(selector);
    }
  }
  if (require){
      throw new Error(`selector "${selector}" timed out in ${Date.now() - start} ms`);
  } else {
      return document.querySelectorAll(selector);
  }
}

var results = waitCss("div[role*=article]>a", n=10, require=false);
return Array.from(results).map((el) => el.getAttribute("href"))
"""
places_file = open(savedPlaces_path)
markers = json.load(places_file)

async def search(query, page):
    url = f"https://www.google.com/maps/search/{query.replace(' ', '+')}/?hl=en"
    await page.goto(url)
    urls = await page.evaluate("() => {" + script + "}")
    return urls or [url]

def parse_place(selector):
    """parse Google Maps place"""

    def aria_with_label(label):
        """gets aria element as is"""
        return selector.css(f"*[aria-label*='{label}']::attr(aria-label)")

    def aria_no_label(label):
        """gets aria element as text with label stripped off"""
        text = aria_with_label(label).get("")
        return text.split(label, 1)[1].strip()

    result = {
        "category": selector.css("button[jsaction='pane.rating.category']::text").get(),
        # most of the data can be extracted through accessibility labels:
        "website": aria_no_label("Website: "),
        "phone": aria_no_label("Phone: "),
        "review_count": aria_with_label(" reviews").get(),
        # to extract star numbers from text we can use regex pattern for numbers: "\d+"
        "stars": aria_with_label(" stars").re("\d+.*\d+")[0],
        "5_stars": aria_with_label("5 stars").re(r"(\d+) review")[0],
        "4_stars": aria_with_label("4 stars").re(r"(\d+) review")[0],
        "3_stars": aria_with_label("3 stars").re(r"(\d+) review")[0],
        "2_stars": aria_with_label("2 stars").re(r"(\d+) review")[0],
        "1_stars": aria_with_label("1 stars").re(r"(\d+) review")[0],
    }
    return result


async def scrape_google_maps():
    '''
    Scrape google maps
    :return Dataframe of places
    '''
    places = []
    index_error = []
    type_error = []
    other_error = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()
        for marker in markers['features']:
            try:
                urls = await search(marker['properties']['Title'] +' '+ marker['properties']['Location']['Address'], page=page)
                await stealth_async(page)
                await page.goto(urls[0])
                await page.wait_for_selector("button[jsaction='pane.rating.category']")
                content = await page.content()
                parsed_info = parse_place(Selector(text=content.encode('utf-8', 'ignore').decode()))

                place = {
                    "name": marker['properties']['Title'],
                    "category": parsed_info['category'],
                    "google_link": marker['properties']['Google Maps URL'],
                    "phone": parsed_info['phone'],
                    "website": parsed_info['website']

                }
                places.append(place)

            except IndexError:
                index_error.append(marker['properties']['Title'])
                continue
            except TypeError:
                type_error.append(marker['properties']['Title'])
                continue
            except:
                other_error.append(marker['properties']['Title'])
                continue
            
#             if len(places) == 3:
#                 return pd.DataFrame.from_records(places)
            
    backup_file('parsed_places')
    df_places = pd.DataFrame.from_records(places)
    df_places.to_json('parsed_places.json')
    print(f"Index Errors:{len(index_error)} Type Errors:{len(type_error)} Other Errors:{len(other_error)}")
    return df_places



In [5]:
df_places = await scrape_google_maps()
# df_places = pd.read_json('./')
print(f"Total places scraped from google:{len(df_places)}")

Index Errors:40 Type Errors:0 Other Errors:777
Total places scraped from google:29


In [6]:
print(f"All my markers from google:{len(markers['features'])}")

All my markers from google:846


In [7]:
# print(f"Index Errors:{len(index_error)} Type Errors:{len(type_error)} Other Errors:{len(other_error)}")

## Data Engineering
The following will set up the data to be used by the frontend

In [8]:
df_places.head()

Unnamed: 0,name,category,google_link,phone,website
0,Batu Caves,Shrine,http://maps.google.com/?cid=17491958630477438713,03-6189 6284,tourism.gov.my
1,Kolam Air Panas Kuala Kubu Baru,Nature preserve,http://maps.google.com/?cid=17100438188293160934,016-673 4383,m.facebook.com
2,Nagu Curry House (Indian),Indian restaurant,http://maps.google.com/?cid=10139427453734398584,03-6064 1154,m.facebook.com
3,Jemari Srikandi Spa,Spa,http://maps.google.com/?cid=16549140982370725794,019-281 2664,facebook.com
4,Cocoya_Malaysia @kampar coconut shake,Cafe,http://maps.google.com/?cid=9977942180261847222,010-268 8573,facebook.com


In [9]:
food_category = ['restaurant', 'bistro', 'vegan', 'vegetarian']
cafe_category = ['cafe', 'coffee', 'bar', 'pub']
hostel_category = ['hostel', 'hotel', 'bed', 'guest']
market_category = ['market', 'supermarket', 'grocery', 'fruit', 'vegetable']
money_category = ['bank', 'atm']
tourism_category = ['tourism', 'museum', 'church', 'cathedral', 'school']
travel_category = ['bus', 'station', 'train', 'boat']
outdoors_category = ['park', 'lake', 'river', 'waterfall', 'activity', 'gym', 'beach', 'mountain']
other_category = []
# TODO: re-scrap data and compare category guesses with list of unique categories from google

# places = open(places_path)

In [10]:
print(f"Unique categories from google {len(df_places.category.unique())}")

Unique categories from google 22


In [11]:
df_places['category'].unique()

array(['Shrine', 'Nature preserve', 'Indian restaurant', 'Spa', 'Cafe',
       'Rafting', 'Bakery', 'Tourist attraction', 'Organic shop', 'Park',
       'Vegetarian cafe and deli', 'Research foundation', 'State park',
       'South Indian restaurant', 'Vegan restaurant', 'Buddhist temple',
       '餐廳', 'Rock climbing gym', 'Bicycle Shop', 'Farm',
       'Vegetarian restaurant', 'Resort hotel'], dtype=object)

In [12]:
def add_markers(dataframe):
    '''
    Iter through rows of places and add marker icon to place
    '''
    df = dataframe.copy()
    df['marker'] = ''
    for ind, place in dataframe.iterrows():
        category = 'other'
        place_category = place.category.lower()
        place_name = place['name'].lower()
        if any(cat in place_category or cat in place_name for cat in food_category):
            category = 'food'
        elif any(cat in place_category or cat in place_name for cat in cafe_category):
            category = 'cafe'
        elif any(cat in place_category or cat in place_name for cat in money_category):
            category = 'money'
        elif any(cat in place_category or cat in place_name for cat in market_category):
            category = 'market'
        elif any(cat in place_category or cat in place_name for cat in travel_category):
            category = 'travel'
        elif any(cat in place_category or cat in place_name for cat in hostel_category):
            category = 'hostel'
        elif any(cat in place_category or cat in place_name for cat in tourism_category):
            category = 'tourism'
        elif any(cat in place_category or cat in place_name for cat in outdoors_category):
            category = 'outdoors'
        df.loc[df_places['name'] == place['name'], ['marker']] = category
    return df

## Data API Merge
Add the webscraped data to the json file used by ryandev

In [13]:
df_to_merge = add_markers(df_places)
print(f"Total places to merge: {len(df_to_merge['name'])}")
df_to_merge.head()

Total places to merge: 29


Unnamed: 0,name,category,google_link,phone,website,marker
0,Batu Caves,Shrine,http://maps.google.com/?cid=17491958630477438713,03-6189 6284,tourism.gov.my,other
1,Kolam Air Panas Kuala Kubu Baru,Nature preserve,http://maps.google.com/?cid=17100438188293160934,016-673 4383,m.facebook.com,cafe
2,Nagu Curry House (Indian),Indian restaurant,http://maps.google.com/?cid=10139427453734398584,03-6064 1154,m.facebook.com,food
3,Jemari Srikandi Spa,Spa,http://maps.google.com/?cid=16549140982370725794,019-281 2664,facebook.com,other
4,Cocoya_Malaysia @kampar coconut shake,Cafe,http://maps.google.com/?cid=9977942180261847222,010-268 8573,facebook.com,cafe


In [14]:
def add_to_website_json(dataframe, json_file_path):
    categories_added = 0
    categories_ctn = 0
    with open(json_file_path, 'r') as file:
        places = json.load(file)
        print(len(places['features']))
        print(len(dataframe.name))
        for place in places['features']:
            place['properties']['marker'] = ''
            if dataframe['name'].eq(place['properties']['Title']).any():
                place['properties']['marker'] = dataframe.loc[dataframe['name'] == place['properties']['Title'],['marker']].values[0][0]
            else:
                place['properties']['marker'] = 'other'
                
            if 'category' not in place:
                if place['properties']['Title'] in dataframe['name'].values:
                    place['properties']['category'] = dataframe.loc[dataframe['name'] == place['properties']['Title'],['category']].values[0][0]
                    categories_added = categories_added + 1 
                    categories_ctn = categories_ctn + 1
            else:
                categories_ctn = categories_ctn + 1
    backup_file('savedPlaces')
    with open(json_file_path, 'w') as file:
        json.dump(places, file) # will overwrite page
    
    return f"Categories added: {categories_added} Total Categories: {categories_ctn}"


In [15]:
add_to_website_json(df_to_merge, savedPlaces_path)

846
29


'Categories added: 29 Total Categories: 29'