# Taking a Look at a Given OSM Dataset

In [5]:
# Data Manipulation
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession, functions, types

# Plotting
import matplotlib.pyplot as plt
import folium


# Create (or reuse) the Spark session for this notebook and limit its logging to WARN.
spark = (
    SparkSession.builder
    .appName('Preprocess data')
    .getOrCreate()
)
spark.sparkContext.setLogLevel('WARN')

# Feature schema: coordinates, timestamp and amenity type are declared non-nullable;
# the name and the string-to-string tag map are nullable.
amenity_schema = (
    types.StructType()
    .add('lat', types.DoubleType(), nullable=False)
    .add('lon', types.DoubleType(), nullable=False)
    .add('timestamp', types.TimestampType(), nullable=False)
    .add('amenity', types.StringType(), nullable=False)
    .add('name', types.StringType(), nullable=True)
    .add('tags', types.MapType(types.StringType(), types.StringType()), nullable=True)
)

## Amenities

The given dataset is reduced considerably by filtering out untitled entries, so we will use a node only if it:
- Has a WikiData tag
- Has a name in either the OSM or the WikiData dataset

To make recommendations, we will eventually join the provided OSM data (Amenity, Non-Amenity, Transportation) with data retrieved from the WikiData Query Service.
Since some data points have varying 'location' tags, an additional request is made to include data:
- For areas: British Columbia / Vancouver, BC / Richmond, BC / Greater Vancouver / Burnaby, BC
- Image (if it exists) + its link
- WikiData Tag

In [6]:
# List the contents of the working directory for reference.
# NOTE(review): a bare `ls` presumably relies on IPython's automagic being enabled — confirm.
ls

 01-ETL.ipynb              main.py             [0m[01;34m__pycache__[0m/
 02-EDA.ipynb              mapDriver.py        requirements.txt
 03-Dataset_Merge.ipynb    map.html            [01;34mtransportation[0m/
 04-Recommendation.ipynb   [01;34mnon-amenities[0m/      [01;34mUserWalk[0m/
 [01;34mamenities-cleaned[0m/       [01;34m'Prediction Data'[0m/   utilHyp.py


In [10]:
def tagType(item, string):
    """Look up a single tag on one entry's tag mapping.

    Parameters
    ----------
    item : mapping or missing value
        Tag mapping of one entry (any object exposing ``.get``). Objects
        without a ``.get`` method, such as ``None`` or ``NaN``, are treated
        as carrying no tags instead of raising ``AttributeError``.
    string : str
        Tag key to look up.

    Returns
    -------
    object or bool
        The stored tag value, or ``False`` when the key is absent, is mapped
        to ``None``, or the entry has no tag mapping at all. ``False`` is the
        "no tag" sentinel that callers filter on with ``!= False``.
    """
    getter = getattr(item, 'get', None)
    if getter is None:
        # Missing tag column value (e.g. None / NaN): nothing to look up.
        return False

    value = getter(string)  # single lookup instead of two
    return value if value is not None else False

# Read the preprocessed amenities from their line-delimited JSON file.
data_amenities = pd.read_json(
    './amenities-cleaned/part-00000-89ef8ff3-0af4-4810-bb63-fb93dc6d8d25-c000.json',
    lines = True,
)
# Keep only the rows that carry a name.
amenities = data_amenities[data_amenities['name'].notna()]

# Distinct amenity types present among the named rows.
amenities['amenity'].unique()

array(['cafe', 'fast_food', 'place_of_worship', 'fuel', 'restaurant',
       'parking_entrance', 'pub', 'school', 'community_centre',
       'pharmacy', 'dentist', 'doctors', 'post_office', 'childcare',
       'public_building', 'bank', 'cinema', 'theatre', 'ferry_terminal',
       'bar', 'library', 'car_rental', 'bicycle_rental', 'clinic',
       'public_bookcase', 'university', 'dojo', 'toilets', 'arts_centre',
       'ice_cream', 'bench', 'shelter', 'bicycle_parking', 'recycling',
       'drinking_water', 'fountain', 'veterinary', 'bureau_de_change',
       'car_wash', 'nightclub', 'social_facility', 'post_box',
       'bus_station', 'college', 'construction', 'post_depot', 'atm',
       'nursery', 'clock', 'kindergarten', 'conference_centre',
       'vending_machine', 'car_sharing', 'marketplace', 'hospital',
       'taxi', 'police', 'fire_station', 'parking', 'motorcycle_parking',
       'charging_station', 'gambling', 'family_centre', 'townhall',
       'waste_basket', 'music_sch

In [11]:
# Size of the named-amenities table as (rows, columns);
# the recorded run of this cell reported 8169 rows.
amenities.shape

(8169, 5)

What can we retrieve from here?<br>
It may make sense to categorize the types of places we have at our disposal as follows:

1) Entertainment  
   
2) Food

3) Pubs and Nightclubs

4) Infrastructure

In [12]:
# Retrieve Entertainment-Related Entities.
# Every amenity type appears exactly once in this list so that no row can be
# selected (and therefore exported) twice.
entertainment_types = [
    'theatre',
    'cinema',
    'gambling',
    'leisure',
    'internet_cafe',
    'arts_centre',
    'townhall',
    'lounge',
    'casino',
]

# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# Rows stay grouped by amenity type, in the order of the list above.
enterntainment = pd.concat(
    [amenities[amenities['amenity'] == kind] for kind in entertainment_types]
)

# Persist the subset as a JSON array of records, then show its (rows, columns) shape.
enterntainment.to_json('entr_only', orient = 'records')
enterntainment.shape

(82, 5)

In [13]:
# Retrieve Nightlife-Related Entities.
nightlife_types = ['nightclub', 'pub', 'bar', 'events_venue']

# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# Rows stay grouped by amenity type, in the order of the list above.
night = pd.concat(
    [amenities[amenities['amenity'] == kind] for kind in nightlife_types]
)

# Persist the subset as JSON, then show its (rows, columns) shape.
night.to_json('night_only')
night.shape

(380, 5)

In [14]:
# Retrieve Food-Related Entities.
food_types = [
    'restaurant',
    'cafe',
    'fast_food',
    'ice_cream',
    'bistro',
    'food_court',
    'juice_bar',
]

# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# Rows stay grouped by amenity type, in the order of the list above.
food = pd.concat(
    [amenities[amenities['amenity'] == kind] for kind in food_types]
)

In [15]:
# Persist the food subset as JSON first; the shape is the last expression so
# that the notebook actually displays it.
food.to_json('food_only')
food.shape

## Non-Amenity & Tourism Data

In [18]:
# Read the previously preprocessed non-amenity rows from their line-delimited JSON file.
data_non_amenities = pd.read_json(
    './non-amenities/part-00000-1cc7500a-32ba-4219-b04b-86429b55ad7a-c000.json',
    lines = True,
)


In [19]:
# Historic spots: record each row's 'historic' tag value in a new column
# (tagType yields False when the tag is absent).
data_non_amenities['historic'] = data_non_amenities['tags'].apply(
    lambda tags: tagType(tags, 'historic')
)

# Keep the rows that have a 'historic' tag and a name.
historic = (
    data_non_amenities
    .loc[lambda df: df['historic'] != False]
    .loc[lambda df: df['name'].notna()]
)

# Distinct kinds of historic places that remain.
historic['historic'].unique()

array(['monument', 'memorial', 'ruins', 'yes', 'marker', 'aircraft',
       'boundary_stone', 'milepost', 'milestone', 'building', 'tree',
       'church', 'ship', 'locomotive'], dtype=object)

In [20]:
# Write the historic subset to the file 'historic_only' as JSON,
# then display its (rows, columns) shape.
historic.to_json('historic_only')
historic.shape

(200, 5)

In [21]:
# Tourist spots: record each row's 'tourism' tag value in a new column
# (tagType yields False when the tag is absent).
data_non_amenities['tourism'] = data_non_amenities['tags'].apply(
    lambda tags: tagType(tags, 'tourism')
)

# Keep the rows that have a 'tourism' tag and a name; the 'historic' helper
# column added earlier is dropped from this view.
tourism = (
    data_non_amenities
    .loc[lambda df: df['tourism'] != False]
    .loc[lambda df: df['name'].notna()]
    .drop(columns = 'historic')
)

# Distinct kinds of tourism-related spots that remain.
tourism['tourism'].unique()

array(['information', 'attraction', 'hostel', 'artwork', 'guest_house',
       'museum', 'camp_site', 'hotel', 'viewpoint', 'apartment',
       'picnic_site', 'gallery', 'theme_park', 'motel', 'caravan_site',
       'winery', 'lookout', 'Plane_Spotting_Platform', 'house'],
      dtype=object)

In [22]:
# Write the tourism subset to the file 'tourism_only' as JSON,
# then display its (rows, columns) shape.
tourism.to_json('tourism_only')
tourism.shape

(690, 5)

## Transportation

In [23]:
# Read the previously preprocessed transportation rows from their line-delimited JSON file.
data_transport = pd.read_json(
    './transportation/part-00000-032c8b88-03eb-4f92-a2dd-7f95285f7a7b-c000.json',
    lines = True,
)

# Subway data only: there are too many bus stops, and without route info it
# would not make sense to guide the user to them.
# A row is kept when its tags contain both a 'subway' and a 'wikidata' entry.
subway = (
    data_transport
    .loc[lambda df: df['tags'].apply(lambda tags: tagType(tags, 'subway')) != False]
    .loc[lambda df: df['tags'].apply(lambda tags: tagType(tags, 'wikidata')) != False]
)
subway.head()

Unnamed: 0,lat,lon,name,tags
1,49.273152,-123.100444,Main Street–Science World,"{'wheelchair': 'yes', 'alt_name': 'Main Street..."
2,49.282015,-123.118936,Vancouver City Centre,"{'subway': 'yes', 'public_transport': 'stop_po..."
3,49.266647,-123.115425,Olympic Village,"{'subway': 'yes', 'public_transport': 'stop_po..."
4,49.226035,-123.116495,Langara-49th Avenue,"{'subway': 'yes', 'public_transport': 'stop_po..."
5,49.209617,-123.116938,Marine Drive,"{'subway': 'yes', 'public_transport': 'stop_po..."


In [24]:
# Write the subway subset to the file 'transit_only' as JSON,
# then display its (rows, columns) shape.
subway.to_json('transit_only')
subway.shape

(48, 4)

## OSM data Visualization Example

In [28]:
def add_markers(places, color, target_map):
    """Add one marker per row of ``places`` to ``target_map``.

    Parameters
    ----------
    places : pandas.DataFrame
        Frame with 'lat', 'lon' and 'name' columns.
    color : str
        Icon colour passed to ``folium.Icon``.
    target_map : folium.Map
        Map that receives the markers.
    """
    for lat, lon, name in zip(places['lat'], places['lon'], places['name']):
        folium.Marker(
            [lat, lon],
            popup = name,
            icon = folium.Icon(color = color),
        ).add_to(target_map)


# Initialize a Map object focusing on Area with desired Coordinates.
map_full = folium.Map(
    location=[49.28, -123.12],
    zoom_start = 12
)

# Every layer that can be drawn: layer name -> (places, marker colour).
layers = {
    'entertainment': (enterntainment, 'red'),    # Amenities
    'food':          (food, 'red'),
    'night':         (night, 'red'),
    'historic':      (historic, 'green'),        # Non-amenities
    'tourism':       (tourism, 'green'),
    'subway':        (subway, 'blue'),           # Transportation
}

# ADD LAYER NAMES HERE TO SEE WHERE PLACES OF CERTAIN TYPES ARE LOCATED.
layers_to_show = ['entertainment']

for layer_name in layers_to_show:
    layer_places, layer_color = layers[layer_name]
    add_markers(layer_places, layer_color, map_full)

display(map_full)