# Leisure Walking Recommendation

Analysing the POIs in the description - both verbal descriptions of POIs and the geometric and thematic characterisitics

## Setup

Loading libraries and models

In [1]:
# working with files
import os.path
# IO
import json
# calling Webservices
import requests
# systematic thread stops for polite crawling
import time

# set random seed for reproducibility of results
from umap import UMAP

# dataframe 
import numpy as np
import pandas as pd
import geopandas as gpd

# geocoding
from geopy.geocoders import Nominatim

# getting data from OSM
import osmnx as ox

# topic modelling
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

# nlp
from sentence_transformers import SentenceTransformer, util
import spacy
from nltk.corpus import stopwords

# en_core_web_lg must be downloaded, if not run: 'python -m spacy download en_core_web_lg' first!
nlp = spacy.load('en_core_web_lg')

umap_model = UMAP(random_state=42)

stopword_removal = False

## Dataset

Reading the dataset crawled from WalkingMap website.

In [2]:
with open('dataset/walkingmaps.json', 'r', encoding='utf-8') as fp:
    dataset = json.load(fp)

In [3]:
dataset[1]

{'markers': [{'markerId': 1581,
   'snapToRoad': False,
   'lat': -38.46876091,
   'lng': 144.08441126},
  {'markerId': 1582,
   'snapToRoad': False,
   'lat': -38.46847531,
   'lng': 144.09128845},
  {'markerId': 1583,
   'snapToRoad': False,
   'lat': -38.46838291,
   'lng': 144.09777939},
  {'markerId': 1584,
   'snapToRoad': False,
   'lat': -38.46883651,
   'lng': 144.10096586},
  {'markerId': 1585,
   'snapToRoad': False,
   'lat': -38.46712287,
   'lng': 144.1010946},
  {'markerId': 1586,
   'snapToRoad': False,
   'lat': -38.46678686,
   'lng': 144.10134137},
  {'markerId': 1587,
   'snapToRoad': False,
   'lat': -38.46640884,
   'lng': 144.0987128},
  {'markerId': 1588,
   'snapToRoad': False,
   'lat': -38.46572001,
   'lng': 144.09890592},
  {'markerId': 1589,
   'snapToRoad': False,
   'lat': -38.46549319,
   'lng': 144.09875572},
  {'markerId': 1590,
   'snapToRoad': False,
   'lat': -38.46572841,
   'lng': 144.09900248},
  {'markerId': 1591,
   'snapToRoad': False,
   'la

### Dataset Transformation

Aim: Transforming the dataset into pandas and geopandas dataframes, with a focus on POIs

A basic preprocessing step to create a dataset of POI descriptions, also including A preliminary analysis their location.

In [4]:
counter = 0
total = 0
for record in dataset:
    if 'pois' in record.keys() and len(record['pois']) > 0:
        counter+=1
        total += len(record['pois'])
print('records: {0} total POIs: {1} - average per record: {2}'.format(counter, total, round(total/counter)))

records: 386 total POIs: 4392 - average per record: 11


In [5]:
data_structure = {'record_title':[], 'record_description':[], 'poi_title':[], 'poi_summary':[], 'latitude': [], 'longitude': []}
for record in dataset:
    if 'pois' in record.keys() and len(record['pois']) > 0:
        for poi in record['pois']:
            data_structure['record_title'].append(record['title'])
            data_structure['record_description'].append(record['description'])
            data_structure['poi_title'].append(poi['title'])
            data_structure['poi_summary'].append(poi['summary'])
            data_structure['latitude'].append(poi['lat'])
            data_structure['longitude'].append(poi['lng'])

In [6]:
df = pd.DataFrame(data_structure)

gdf = gpd.GeoDataFrame(df[['poi_title', 'poi_summary', 'latitude', 'longitude']], geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326")

In [None]:
gdf.explore()

In [8]:
gdf.head()

Unnamed: 0,poi_title,poi_summary,latitude,longitude,geometry
0,Fairhaven Surf Life Saving Club,Fairhaven is a well known surf beach. The beac...,-38.468759,144.084459,POINT (144.08446 -38.46876)
1,Beach walk,"From Sprout Creek, Eastern View, Moggs Creek, ...",-38.468542,144.089693,POINT (144.08969 -38.46854)
2,Rock pools,See what sort of shells and stones you can col...,-38.468459,144.09242,POINT (144.09242 -38.46846)
3,Sand dunes,The beautiful rolling sand dunes shape the bea...,-38.468418,144.095318,POINT (144.09532 -38.46842)
4,Painkalac Creek,The creek separates Aireys Inlet from Fairhave...,-38.46839,144.097312,POINT (144.09731 -38.46839)


In [9]:
df.head()

Unnamed: 0,record_title,record_description,poi_title,poi_summary,latitude,longitude
0,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Fairhaven Surf Life Saving Club,Fairhaven is a well known surf beach. The beac...,-38.468759,144.084459
1,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Beach walk,"From Sprout Creek, Eastern View, Moggs Creek, ...",-38.468542,144.089693
2,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Rock pools,See what sort of shells and stones you can col...,-38.468459,144.09242
3,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Sand dunes,The beautiful rolling sand dunes shape the bea...,-38.468418,144.095318
4,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Painkalac Creek,The creek separates Aireys Inlet from Fairhave...,-38.46839,144.097312


## Topic Modelling

**Aim**: Findings topics describing the POIs using BERT embeddings

**Approach**: Using raw textual description (title + summary) and feed it BERTopic for categorisation. Manunally investigating the categories to find out the categories are coherent and meaningful.

In [10]:
gdf['description'] = gdf['poi_title']+': '+gdf['poi_summary']

In [11]:
def remove_stopwords(row):
    return ' '.join([w for w in row['description'].split() if w.lower() not in stopwords.words('english')])

In [12]:
if stopword_removal:
    gdf['description'] = gdf.apply(remove_stopwords, axis=1)

In [13]:
gdf.head()

Unnamed: 0,poi_title,poi_summary,latitude,longitude,geometry,description
0,Fairhaven Surf Life Saving Club,Fairhaven is a well known surf beach. The beac...,-38.468759,144.084459,POINT (144.08446 -38.46876),Fairhaven Surf Life Saving Club: Fairhaven is ...
1,Beach walk,"From Sprout Creek, Eastern View, Moggs Creek, ...",-38.468542,144.089693,POINT (144.08969 -38.46854),"Beach walk: From Sprout Creek, Eastern View, M..."
2,Rock pools,See what sort of shells and stones you can col...,-38.468459,144.09242,POINT (144.09242 -38.46846),Rock pools: See what sort of shells and stones...
3,Sand dunes,The beautiful rolling sand dunes shape the bea...,-38.468418,144.095318,POINT (144.09532 -38.46842),Sand dunes: The beautiful rolling sand dunes s...
4,Painkalac Creek,The creek separates Aireys Inlet from Fairhave...,-38.46839,144.097312,POINT (144.09731 -38.46839),Painkalac Creek: The creek separates Aireys In...


In [14]:
docs = list(gdf['description'])

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
topic_model = BERTopic(umap_model=umap_model, ctfidf_model=ctfidf_model)
topics, probs = topic_model.fit_transform(docs)

In [15]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1454,-1_reserve_street_through_hall,"[reserve, street, through, hall, water, centre...",[Boroondara Shops: A great little shopping str...
1,0,328,0_track_steep_signage_gravel,"[track, steep, signage, gravel, top, path, wal...",[The walking route: The route is flat and acce...
2,1,165,1_beach_bay_sand_tide,"[beach, bay, sand, tide, breakwater, surf, coa...","[Beach walkers: If the tide is out, walking al..."
3,2,161,2_bridge_pedestrian_footbridge_railway,"[bridge, pedestrian, footbridge, railway, unde...",[Overhead Railway Bridge : The Overhead Bridge...
4,3,148,3_sculpture_artists_mural_art,"[sculpture, artists, mural, art, wall, artwork...",[Three Businessmen Who Brought Their Own Lunch...
...,...,...,...,...,...
59,58,12,58_boydys_anthills_center_mornington,"[boydys, anthills, center, mornington, ice, di...",[Boydys Take Away: Excellent place for a meal ...
60,59,11,59_brummys_weeks_burnt_bremner,"[brummys, weeks, burnt, bremner, elms, smoke, ...","[As weeks go by - Laurens Tan, 2004: As Weeks ..."
61,60,11,60_tramway_formation_grub_lysterfield,"[tramway, formation, grub, lysterfield, expens...",[incline tramway: This is the only incline fir...
62,61,10,61_wisteria_emblem_riceflower_dilwinia,"[wisteria, emblem, riceflower, dilwinia, flowe...",[Wisteria heaven: Come in spring time for an e...


In [16]:
topic_model.visualize_topics()

In [17]:
topic_model.visualize_hierarchy()

In [18]:
# what happen when merging topics as well
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 62/62 [00:00<00:00, 228.09it/s]


In [19]:
topic_model.visualize_barchart()

In [20]:
topic_model.visualize_heatmap()

In [21]:
# topic outlier reduction - if necessary (todo)
topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf")

In [22]:
topic_df = pd.DataFrame({'topic': topics, 'document': docs})

In [23]:
topic_df.head()

Unnamed: 0,topic,document
0,1,Fairhaven Surf Life Saving Club: Fairhaven is ...
1,1,"Beach walk: From Sprout Creek, Eastern View, M..."
2,12,Rock pools: See what sort of shells and stones...
3,1,Sand dunes: The beautiful rolling sand dunes s...
4,35,Painkalac Creek: The creek separates Aireys In...


### Conclusions 
Some categories are not atomic, several biases toward famous geographic names including LGAs and suburbs. Removing the geographic names from the POI descriptions will help to understand 'what' categories they belong rather than 'where' they are.

#### Work ToDo:
1. **Classification**: Analysing the hierarchical organization of topics to define classes describing what POIs are described.
2. **Space and Classes/Topics**: LGA? SA2/SA3? Question: Is there a relationship between categories and location - how it changes before and after removing place names?
3. **Walks and Classes/Topics**: Type of the walk - finding a categorization of leisure walks and check the association of topics to leisure walk purpose. Question: Is there a association between walks/categories of walks and certian POIs selected based on their classification/topics.

### Possible Future Work: Multimodel Topic Modelling

Using both text and images for topic modelling.

Could be a good future work or an extension to the work - I need change the crawler information for this to capture links for description and also for images.

See: https://maartengr.github.io/BERTopic/getting_started/multimodal/multimodal.html#text-images

## OSM Points of Interests

In [25]:
# bounding box of each path
paths = []
for record in dataset:
    min_lat = 90
    max_lat = -90
    min_lng = 180
    max_lng = -180
    for latlng in record['pathDetails']:
        if min_lat > latlng['lat']:
            min_lat = latlng['lat']
        if max_lat < latlng['lat']:
            max_lat = latlng['lat']
        if min_lng > latlng['lng']:
            min_lng = latlng['lng']
        if max_lng < latlng['lng']:
            max_lng = latlng['lng']
    paths.append({'min_lat': min_lat, 'max_lat': max_lat, 'min_lng': min_lng, 'max_lng': max_lng})

In [26]:
paths

[{'min_lat': -37.82326007,
  'max_lat': -37.81401352,
  'min_lng': 144.96751249,
  'max_lng': 144.97828424},
 {'min_lat': -38.46883651,
  'max_lat': -38.46549319,
  'min_lng': 144.08441126,
  'max_lng': 144.1040799},
 {'min_lat': -37.80922028,
  'max_lat': -37.79740113,
  'min_lng': 144.96275961,
  'max_lng': 144.97345358},
 {'min_lat': -34.190259657952346,
  'max_lat': -34.1828022667217,
  'min_lng': 142.15728521325218,
  'max_lng': 142.16644227499273},
 {'min_lat': -37.81905011,
  'max_lat': -37.81639942,
  'min_lng': 144.9672094,
  'max_lng': 144.98397186},
 {'min_lat': -38.34566157,
  'max_lat': -38.31852415,
  'min_lng': 144.69141662000004,
  'max_lng': 144.73630071000002},
 {'min_lat': -37.57328182,
  'max_lat': -37.54903468,
  'min_lng': 149.75053489,
  'max_lng': 149.76544261},
 {'min_lat': -37.80127748,
  'max_lat': -37.79937006,
  'min_lng': 145.00765997,
  'max_lng': 145.01382369},
 {'min_lat': -36.39264513,
  'max_lat': -36.38615029,
  'min_lng': 145.39311469,
  'max_lng': 

In [27]:
tags = {'amenity': True, 'natural': True, 'animal': True, 'leisure': True}

In [28]:
path = paths[0]
feature_gdf = ox.features_from_bbox(north=path['max_lat'], south=path['min_lat'], east=path['max_lng'], west=path['min_lng'], tags=tags)

In [None]:
feature_gdf.explore()

In [30]:
feature_gdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,addr:city,addr:housenumber,addr:postcode,addr:street,amenity,name,operator,website,wikidata,geometry,...,bus,motor_vehicle,building:part,not:operator:wikidata,water,unisex,ways,type,intermittent,salt
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,176729780,Melbourne,191.0,3000.0,Collins Street,theatre,Regent Theatre,Marriner Group,https://www.marrinergroup.com.au/theatre-regen...,Q7308110,POINT (144.96760 -37.81550),...,,,,,,,,,,
node,243097159,,,,,loading_dock,,,,,POINT (144.96796 -37.82161),...,,,,,,,,,,
node,247024808,,,,,parking_entrance,,,,,POINT (144.97019 -37.81548),...,,,,,,,,,,
node,247689970,,,,,parking_entrance,,,,,POINT (144.97070 -37.81789),...,,,,,,,,,,
node,266733834,,,,,parking,Sofitel Hotel Carpark,Wilson Parking,,,POINT (144.97302 -37.81451),...,,,,,,,,,,


In [31]:
list(feature_gdf.columns)

['addr:city',
 'addr:housenumber',
 'addr:postcode',
 'addr:street',
 'amenity',
 'name',
 'operator',
 'website',
 'wikidata',
 'geometry',
 'parking',
 'level',
 'capacity:disabled',
 'fee',
 'layer',
 'barrier',
 'created_by',
 'wheelchair',
 'check_date',
 'addr:housename',
 'food',
 'outdoor_seating',
 'phone',
 'smoking',
 'access',
 'male',
 'operator:wikidata',
 'toilets:disposal',
 'tourism',
 'maxheight',
 'internet_access',
 'source',
 'collection_times',
 'covered',
 'brand',
 'brand:wikidata',
 'brand:wikipedia',
 'operator:wikipedia',
 'addr:state',
 'addr:suburb',
 'building',
 'opening_hours',
 'cuisine',
 'takeaway',
 'note',
 'atm',
 'bicycle_parking',
 'capacity',
 'brewery',
 'leisure',
 'natural',
 'backrest',
 'name:en',
 'email',
 'fountain',
 'mapillary',
 'survey:date',
 'changing_table',
 'source:geometry',
 'official_name',
 'healthcare',
 'payment:credit_cards',
 'addr:unit',
 'payment:card',
 'payment:cash',
 'alt_name',
 'payment:cards',
 'payment:coins',


In [32]:
feature_gdf.amenity = feature_gdf.amenity.astype(str)
feature_gdf.natural = feature_gdf.natural.astype(str)
feature_gdf.leisure = feature_gdf.leisure.astype(str)
feature_gdf.name = feature_gdf.name.astype(str)

In [33]:
feature_gdf = feature_gdf[['name', 'amenity', 'natural', 'leisure', 'geometry']].dropna(how='all')
feature_gdf

Unnamed: 0_level_0,Unnamed: 1_level_0,name,amenity,natural,leisure,geometry
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
node,176729780,Regent Theatre,theatre,,,POINT (144.96760 -37.81550)
node,243097159,,loading_dock,,,POINT (144.96796 -37.82161)
node,247024808,,parking_entrance,,,POINT (144.97019 -37.81548)
node,247689970,,parking_entrance,,,POINT (144.97070 -37.81789)
node,266733834,Sofitel Hotel Carpark,parking,,,POINT (144.97302 -37.81451)
...,...,...,...,...,...,...
way,1239949234,,,,pitch,"POLYGON ((144.97781 -37.82089, 144.97757 -37.8..."
relation,954522,,,water,,"POLYGON ((144.96548 -37.81920, 144.96554 -37.8..."
relation,1718900,Court 3,,,stadium,"POLYGON ((144.97799 -37.82051, 144.97797 -37.8..."
relation,1718902,1573 Arena,,,stadium,"POLYGON ((144.97677 -37.82102, 144.97673 -37.8..."


In [34]:
for idx, path in enumerate(paths):
    
    if os.path.isfile('dataset/features-osm-{}.geojson'.format(idx)):
        print('features for path {0} out of {1} is already loaded and saved.'.format(idx, len(paths)))
        continue;
    try:
        feature_gdf = ox.features_from_bbox(north=path['max_lat'], south=path['min_lat'], east=path['max_lng'], west=path['min_lng'], tags=tags)
        cols = feature_gdf.columns
        #preprocess
        if 'amenity' not in cols:
            feature_gdf['amenity'] = np.nan
        feature_gdf.amenity = feature_gdf.amenity.astype(str)    
        if 'natural' not in cols:
            feature_gdf['natural'] = np.nan
        feature_gdf.natural = feature_gdf.natural.astype(str)    
        if 'leisure' not in cols:
            feature_gdf['leisure'] = np.nan
        feature_gdf.leisure = feature_gdf.leisure.astype(str)
        if 'name' not in cols:
            feature_gdf['name'] = np.nan
        feature_gdf.name = feature_gdf.name.astype(str)
        feature_gdf = feature_gdf[['name', 'amenity', 'natural', 'leisure', 'geometry']].dropna(how='all')
    
        feature_gdf.to_file("dataset/features-osm-{}.geojson".format(idx), driver='GeoJSON')
        print('features for path {0} out of {1} is loaded from OSM and saved ...'.format(idx, len(paths)))
    except Exception as e:
        print('error in writing path {0} out of {1}...'.format(idx, len(paths)))
        print(e)

features for path 0 out of 387 is already loaded and saved.
features for path 1 out of 387 is already loaded and saved.
features for path 2 out of 387 is already loaded and saved.
features for path 3 out of 387 is already loaded and saved.
features for path 4 out of 387 is already loaded and saved.
features for path 5 out of 387 is already loaded and saved.
features for path 6 out of 387 is already loaded and saved.
features for path 7 out of 387 is already loaded and saved.
features for path 8 out of 387 is already loaded and saved.
features for path 9 out of 387 is already loaded and saved.
features for path 10 out of 387 is already loaded and saved.
features for path 11 out of 387 is already loaded and saved.
features for path 12 out of 387 is already loaded and saved.
features for path 13 out of 387 is already loaded and saved.
features for path 14 out of 387 is already loaded and saved.
features for path 15 out of 387 is already loaded and saved.
features for path 16 out of 387 is


The `north`, `south`, `east`, and `west` parameters are deprecated and will be removed in the v2.0.0 release. Use the `bbox` parameter instead.



error in writing path 59 out of 387...
No data elements in server response. Check log and query location/tags.
features for path 60 out of 387 is already loaded and saved.
features for path 61 out of 387 is already loaded and saved.
features for path 62 out of 387 is already loaded and saved.
features for path 63 out of 387 is already loaded and saved.
features for path 64 out of 387 is already loaded and saved.
features for path 65 out of 387 is already loaded and saved.
features for path 66 out of 387 is already loaded and saved.
features for path 67 out of 387 is already loaded and saved.
features for path 68 out of 387 is already loaded and saved.
features for path 69 out of 387 is already loaded and saved.
features for path 70 out of 387 is already loaded and saved.
features for path 71 out of 387 is already loaded and saved.
features for path 72 out of 387 is already loaded and saved.
features for path 73 out of 387 is already loaded and saved.
features for path 74 out of 387 is 


The `north`, `south`, `east`, and `west` parameters are deprecated and will be removed in the v2.0.0 release. Use the `bbox` parameter instead.



error in writing path 128 out of 387...
No data elements in server response. Check log and query location/tags.
features for path 129 out of 387 is already loaded and saved.
features for path 130 out of 387 is already loaded and saved.
features for path 131 out of 387 is already loaded and saved.
features for path 132 out of 387 is already loaded and saved.
features for path 133 out of 387 is already loaded and saved.
features for path 134 out of 387 is already loaded and saved.
features for path 135 out of 387 is already loaded and saved.
features for path 136 out of 387 is already loaded and saved.
features for path 137 out of 387 is already loaded and saved.
features for path 138 out of 387 is already loaded and saved.
features for path 139 out of 387 is already loaded and saved.
features for path 140 out of 387 is already loaded and saved.
features for path 141 out of 387 is already loaded and saved.
features for path 142 out of 387 is already loaded and saved.
features for path 14


The `north`, `south`, `east`, and `west` parameters are deprecated and will be removed in the v2.0.0 release. Use the `bbox` parameter instead.



error in writing path 151 out of 387...
No data elements in server response. Check log and query location/tags.
features for path 152 out of 387 is already loaded and saved.
features for path 153 out of 387 is already loaded and saved.
features for path 154 out of 387 is already loaded and saved.
features for path 155 out of 387 is already loaded and saved.
features for path 156 out of 387 is already loaded and saved.
features for path 157 out of 387 is already loaded and saved.
features for path 158 out of 387 is already loaded and saved.
features for path 159 out of 387 is already loaded and saved.
features for path 160 out of 387 is already loaded and saved.
features for path 161 out of 387 is already loaded and saved.
features for path 162 out of 387 is already loaded and saved.
features for path 163 out of 387 is already loaded and saved.
features for path 164 out of 387 is already loaded and saved.
features for path 165 out of 387 is already loaded and saved.
features for path 16


The `north`, `south`, `east`, and `west` parameters are deprecated and will be removed in the v2.0.0 release. Use the `bbox` parameter instead.



error in writing path 309 out of 387...
No data elements in server response. Check log and query location/tags.
features for path 310 out of 387 is already loaded and saved.
features for path 311 out of 387 is already loaded and saved.
features for path 312 out of 387 is already loaded and saved.
features for path 313 out of 387 is already loaded and saved.
features for path 314 out of 387 is already loaded and saved.
features for path 315 out of 387 is already loaded and saved.
features for path 316 out of 387 is already loaded and saved.
features for path 317 out of 387 is already loaded and saved.
features for path 318 out of 387 is already loaded and saved.
features for path 319 out of 387 is already loaded and saved.
features for path 320 out of 387 is already loaded and saved.
features for path 321 out of 387 is already loaded and saved.
features for path 322 out of 387 is already loaded and saved.
features for path 323 out of 387 is already loaded and saved.
features for path 32


The `north`, `south`, `east`, and `west` parameters are deprecated and will be removed in the v2.0.0 release. Use the `bbox` parameter instead.



error in writing path 338 out of 387...
No data elements in server response. Check log and query location/tags.
features for path 339 out of 387 is already loaded and saved.
features for path 340 out of 387 is already loaded and saved.
features for path 341 out of 387 is already loaded and saved.
features for path 342 out of 387 is already loaded and saved.
features for path 343 out of 387 is already loaded and saved.
features for path 344 out of 387 is already loaded and saved.
features for path 345 out of 387 is already loaded and saved.
features for path 346 out of 387 is already loaded and saved.
features for path 347 out of 387 is already loaded and saved.
features for path 348 out of 387 is already loaded and saved.
features for path 349 out of 387 is already loaded and saved.
features for path 350 out of 387 is already loaded and saved.
features for path 351 out of 387 is already loaded and saved.
features for path 352 out of 387 is already loaded and saved.
features for path 35

In [35]:
feature_gdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,amenity,natural,leisure,geometry
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
node,176729780,Regent Theatre,theatre,,,POINT (144.96760 -37.81550)
node,243097159,,loading_dock,,,POINT (144.96796 -37.82161)
node,247024808,,parking_entrance,,,POINT (144.97019 -37.81548)
node,247689970,,parking_entrance,,,POINT (144.97070 -37.81789)
node,266733834,Sofitel Hotel Carpark,parking,,,POINT (144.97302 -37.81451)


## Matching POIs to OSM POIs

- spatial criteria: defining containment
- semantic criteria: defining semantic similarity using word embeddings

### Semantic Matching

Ranking the relevance of textual descriptions in OSM POIs and LW POIS

Example to test how it works

In [36]:
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens') # symmetric semantic search
msmarco_model = SentenceTransformer('sentence-transformers/msmarco-distilbert-dot-v5')  # asymmetric semantic search

# embedding derived from BERT for the pois
def embed_texts(sentences, model=sbert_model):
    sentence_embeddings = model.encode(sentences)
    return sentence_embeddings


def compute_similarities(query, sentences, sentence_embeddings, model=sbert_model):
    query_vec = embed_texts(query)
    scores = util.dot_score(query_vec, sentence_embeddings)[0].cpu().tolist()
    doc_score_pairs = list(zip(sentences, scores))
    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
    print("Query:", query)
    for doc, score in doc_score_pairs:
        print('\t', score, '\t', doc)
    return doc_score_pairs


The `ipykernel.comm.Comm` class has been deprecated. Please use the `comm` module instead.For creating comms, use the function `from comm import create_comm`.



Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/6.51k [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [37]:
def only_noun_phrases(sentence):
    doc = nlp(sentence)
    phrases = set() 
    for nc in doc.noun_chunks:
        phrases.add(nc.text)
        phrases.add(doc[nc.root.left_edge.i:nc.root.right_edge.i+1].text)
    return ' '.join(phrases)


In [38]:
example_poi_osm = "Gaswork park theatre"
example_sentences = ["Gasworks Park: There are artists studios, a theatre and a cafe. Every 3rd Saturday there is a Farmers' Market.  In the park you'll also come across various wonderful sculptures and installations.",
                                                 "Australia's Number One university and world leader in education, teaching and research excellence.",
                                                 "Completed in 1870, the Melbourne Town Hall is at the heart of the city's cultural and civic activity",
                                                 "The magnificent octagonal domed reading room is both a quiet space for study and an iconic Melbourne location to take an unforgettable selfie.?"]
example_sentence_embeddings = embed_texts(example_sentences)
compute_similarities(example_poi_osm, example_sentences, example_sentence_embeddings)

Query: Gaswork park theatre
	 112.5803451538086 	 Completed in 1870, the Melbourne Town Hall is at the heart of the city's cultural and civic activity
	 92.63168334960938 	 Gasworks Park: There are artists studios, a theatre and a cafe. Every 3rd Saturday there is a Farmers' Market.  In the park you'll also come across various wonderful sculptures and installations.
	 84.31318664550781 	 The magnificent octagonal domed reading room is both a quiet space for study and an iconic Melbourne location to take an unforgettable selfie.?
	 62.57233810424805 	 Australia's Number One university and world leader in education, teaching and research excellence.


[("Completed in 1870, the Melbourne Town Hall is at the heart of the city's cultural and civic activity",
  112.5803451538086),
 ("Gasworks Park: There are artists studios, a theatre and a cafe. Every 3rd Saturday there is a Farmers' Market.  In the park you'll also come across various wonderful sculptures and installations.",
  92.63168334960938),
 ('The magnificent octagonal domed reading room is both a quiet space for study and an iconic Melbourne location to take an unforgettable selfie.?',
  84.31318664550781),
 ("Australia's Number One university and world leader in education, teaching and research excellence.",
  62.57233810424805)]

In [39]:
# with preprocessing
only_noun_example_sentences = [only_noun_phrases(sentence) for sentence in example_sentences]
example_sentence_embeddings = embed_texts(only_noun_example_sentences)
compute_similarities(example_poi_osm, only_noun_example_sentences, example_sentence_embeddings)

Query: Gaswork park theatre
	 133.74261474609375 	 Gasworks Park artists studios, a theatre and a cafe installations various wonderful sculptures a theatre and a cafe a theatre you artists studios the park a cafe various wonderful sculptures and installations a Farmers' Market Gasworks Park:
	 121.1363296508789 	 the Melbourne Town Hall the heart of the city's cultural and civic activity the city's cultural and civic activity the heart
	 75.88065338134766 	 The magnificent octagonal domed reading room an unforgettable selfie study an iconic Melbourne location to take an unforgettable selfie an iconic Melbourne location a quiet space a quiet space for study and an iconic Melbourne location to take an unforgettable selfie
	 58.85948181152344 	 excellence Australia's Number One university and world leader in education, teaching and research excellence. teaching and research excellence research education, teaching and research excellence Australia's Number One university and world leader t

[("Gasworks Park artists studios, a theatre and a cafe installations various wonderful sculptures a theatre and a cafe a theatre you artists studios the park a cafe various wonderful sculptures and installations a Farmers' Market Gasworks Park:",
  133.74261474609375),
 ("the Melbourne Town Hall the heart of the city's cultural and civic activity the city's cultural and civic activity the heart",
  121.1363296508789),
 ('The magnificent octagonal domed reading room an unforgettable selfie study an iconic Melbourne location to take an unforgettable selfie an iconic Melbourne location a quiet space a quiet space for study and an iconic Melbourne location to take an unforgettable selfie',
  75.88065338134766),
 ("excellence Australia's Number One university and world leader in education, teaching and research excellence. teaching and research excellence research education, teaching and research excellence Australia's Number One university and world leader teaching education",
  58.8594818

In [40]:
example_sentence_embeddings = embed_texts(example_sentences, model=msmarco_model)
compute_similarities(example_poi_osm, example_sentences, example_sentence_embeddings, model=msmarco_model)

Query: Gaswork park theatre
	 42.83140182495117 	 Gasworks Park: There are artists studios, a theatre and a cafe. Every 3rd Saturday there is a Farmers' Market.  In the park you'll also come across various wonderful sculptures and installations.
	 30.71784019470215 	 Completed in 1870, the Melbourne Town Hall is at the heart of the city's cultural and civic activity
	 25.08165740966797 	 The magnificent octagonal domed reading room is both a quiet space for study and an iconic Melbourne location to take an unforgettable selfie.?
	 18.95928382873535 	 Australia's Number One university and world leader in education, teaching and research excellence.


[("Gasworks Park: There are artists studios, a theatre and a cafe. Every 3rd Saturday there is a Farmers' Market.  In the park you'll also come across various wonderful sculptures and installations.",
  42.83140182495117),
 ("Completed in 1870, the Melbourne Town Hall is at the heart of the city's cultural and civic activity",
  30.71784019470215),
 ('The magnificent octagonal domed reading room is both a quiet space for study and an iconic Melbourne location to take an unforgettable selfie.?',
  25.08165740966797),
 ("Australia's Number One university and world leader in education, teaching and research excellence.",
  18.95928382873535)]

In [41]:
example_sentence_embeddings = embed_texts(only_noun_example_sentences, model=msmarco_model)
compute_similarities(example_poi_osm, only_noun_example_sentences, example_sentence_embeddings, model=msmarco_model)

Query: Gaswork park theatre
	 44.30448913574219 	 Gasworks Park artists studios, a theatre and a cafe installations various wonderful sculptures a theatre and a cafe a theatre you artists studios the park a cafe various wonderful sculptures and installations a Farmers' Market Gasworks Park:
	 30.507658004760742 	 the Melbourne Town Hall the heart of the city's cultural and civic activity the city's cultural and civic activity the heart
	 23.770713806152344 	 The magnificent octagonal domed reading room an unforgettable selfie study an iconic Melbourne location to take an unforgettable selfie an iconic Melbourne location a quiet space a quiet space for study and an iconic Melbourne location to take an unforgettable selfie
	 17.956480026245117 	 excellence Australia's Number One university and world leader in education, teaching and research excellence. teaching and research excellence research education, teaching and research excellence Australia's Number One university and world leader

[("Gasworks Park artists studios, a theatre and a cafe installations various wonderful sculptures a theatre and a cafe a theatre you artists studios the park a cafe various wonderful sculptures and installations a Farmers' Market Gasworks Park:",
  44.30448913574219),
 ("the Melbourne Town Hall the heart of the city's cultural and civic activity the city's cultural and civic activity the heart",
  30.507658004760742),
 ('The magnificent octagonal domed reading room an unforgettable selfie study an iconic Melbourne location to take an unforgettable selfie an iconic Melbourne location a quiet space a quiet space for study and an iconic Melbourne location to take an unforgettable selfie',
  23.770713806152344),
 ("excellence Australia's Number One university and world leader in education, teaching and research excellence. teaching and research excellence research education, teaching and research excellence Australia's Number One university and world leader teaching education",
  17.956480

In [42]:
topic_df[topic_df.document.str.contains('Gasworks', na=False)]

Unnamed: 0,topic,document
673,58,"Gasworks Park: There are artists studios, a th..."
4373,31,The Weighbridge: This is where the coal was br...
4374,3,Brick Wall: I love the patterning of this bric...
4375,39,Solar!: Note the solar panels appearing on hou...


In [43]:
topic_model.get_topic(-1)

[('reserve', 0.11792720130243667),
 ('street', 0.1142422991259655),
 ('through', 0.11406649920539602),
 ('hall', 0.1136732770113999),
 ('water', 0.1098914926540999),
 ('centre', 0.10860185819207639),
 ('town', 0.10813687336068976),
 ('stop', 0.10793617779580789),
 ('shopping', 0.1066996453060482),
 ('spot', 0.10664274791347784)]

In [44]:
topic_df.head()

Unnamed: 0,topic,document
0,1,Fairhaven Surf Life Saving Club: Fairhaven is ...
1,1,"Beach walk: From Sprout Creek, Eastern View, M..."
2,12,Rock pools: See what sort of shells and stones...
3,1,Sand dunes: The beautiful rolling sand dunes s...
4,35,Painkalac Creek: The creek separates Aireys In...


## Case Investigation

Checking the POIs in description with respect to OSM POIs

In [45]:
def get_case(idx):
    record = dataset[idx]  
    pois = {'title': [], 'summary': [], 'lat': [], 'lng': []}
    for poi in record['pois']:
        pois['title'].append(poi['title'])
        pois['summary'].append(poi['summary'])
        pois['lat'].append(poi['lat'])
        pois['lng'].append(poi['lng'])
        
    df = pd.DataFrame(pois)
    gdf = gpd.GeoDataFrame(df[['title', 'summary', 'lat', 'lng']],
                           geometry=gpd.points_from_xy(df.lng, df.lat), crs="EPSG:4326")
    
    if os.path.isfile('dataset/features-osm-{}.geojson'.format(idx)):
        osm_pois = gpd.read_file('dataset/features-osm-{}.geojson'.format(idx))
    else:
        print('OSM features are not loaded - potentially empty dataframe')
        osm_pois = None
    return gdf, osm_pois

In [46]:
lw_poi, osm_poi = get_case(1)

In [47]:
lw_poi

Unnamed: 0,title,summary,lat,lng,geometry
0,Fairhaven Surf Life Saving Club,Fairhaven is a well known surf beach. The beac...,-38.468759,144.084459,POINT (144.08446 -38.46876)
1,Beach walk,"From Sprout Creek, Eastern View, Moggs Creek, ...",-38.468542,144.089693,POINT (144.08969 -38.46854)
2,Rock pools,See what sort of shells and stones you can col...,-38.468459,144.09242,POINT (144.09242 -38.46846)
3,Sand dunes,The beautiful rolling sand dunes shape the bea...,-38.468418,144.095318,POINT (144.09532 -38.46842)
4,Painkalac Creek,The creek separates Aireys Inlet from Fairhave...,-38.46839,144.097312,POINT (144.09731 -38.46839)
5,Rocks and caves under the light house,There are more rockpools and rocky outcrops to...,-38.468822,144.100861,POINT (144.10086 -38.46882)
6,Aireys Inlet playground and picnic ground,There is small skateboard ramp for children to...,-38.466199,144.098772,POINT (144.09877 -38.46620)
7,Aireys Inlet lower shops,"Pick up a coffee, newspaper or Fish and Chips!...",-38.465536,144.098801,POINT (144.09880 -38.46554)
8,Loutit Bay lookout,Return to Painkalac Creek inlet and walk to th...,-38.467916,144.103435,POINT (144.10344 -38.46792)
9,Historical homestead and building,At the lighthouse is the original homestead fo...,-38.468048,144.103832,POINT (144.10383 -38.46805)


In [48]:
osm_poi

Unnamed: 0,element_type,osmid,name,amenity,natural,leisure,geometry
0,node,831201200,,toilets,,,POINT (144.09837 -38.46594)
1,node,831201305,,toilets,,,POINT (144.10104 -38.46734)
2,node,831201411,,bbq,,,POINT (144.09857 -38.46592)
3,node,831201826,,shelter,,,POINT (144.09864 -38.46595)
4,node,5315720235,,,,picnic_table,POINT (144.10073 -38.46681)
5,node,8568393481,,waste_basket,,,POINT (144.10029 -38.46666)
6,way,30501938,Painkalac Creek Estuary,,water,,"POLYGON ((144.09591 -38.46359, 144.09625 -38.4..."
7,way,69366065,,parking,,,"POLYGON ((144.10000 -38.46659, 144.09998 -38.4..."
8,way,69366070,,parking,,,"POLYGON ((144.09836 -38.46586, 144.09836 -38.4..."
9,way,69366078,,,,playground,"POLYGON ((144.10198 -38.46581, 144.10217 -38.4..."


In [49]:
# projection 

lw_projected = lw_poi.to_crs("EPSG:32755")
osm_projected = osm_poi.to_crs("EPSG:32755")

In [50]:
poi_case = 6

lw_poi.iloc[poi_case]

title               Aireys Inlet playground and picnic ground
summary     There is small skateboard ramp for children to...
lat                                                -38.466199
lng                                                144.098772
geometry                    POINT (144.09877169 -38.46619881)
Name: 6, dtype: object

In [51]:
osm_projected.distance(lw_projected.iloc[poi_case]['geometry'])

0      45.318589
1     235.344292
2      35.687710
3      30.304774
4     184.116072
5     142.141188
6      51.867211
7     115.908134
8      37.170695
9     283.902017
10     13.688306
11    271.562240
12     15.267822
13     78.705460
14      8.479294
15    277.569933
16    279.422113
17      0.902516
18    968.728085
19    394.194319
20    242.844765
21    148.460931
22     79.652279
23    388.207314
24    277.569933
dtype: float64

## Enrich the OSM Description of POIs
- using hierarchy (contained by)
- combining names, types and extratags

In [52]:
# todo - maybe creating a dataset as well! the task is actually difficult!
def generate_req_id(row):
    return row['element_type'][0].upper()+str(row['osmid'])

osm_poi['req_id'] = osm_poi.apply(generate_req_id, axis=1)
osm_poi.req_id.tolist()

['N831201200',
 'N831201305',
 'N831201411',
 'N831201826',
 'N5315720235',
 'N8568393481',
 'W30501938',
 'W69366065',
 'W69366070',
 'W69366078',
 'W69366081',
 'W69366092',
 'W69366108',
 'W69366116',
 'W69560073',
 'W95186468',
 'W161748270',
 'W283542690',
 'W865565686',
 'W865569273',
 'W1007494584',
 'W1009404376',
 'R9212148',
 'R9212157',
 'R9457256']

In [53]:
req_ids = set()

# read all osm ids and save in a file
for idx, path in enumerate(paths):
    if os.path.isfile('dataset/features-osm-{}.geojson'.format(idx)):
        osm_poi = gpd.read_file('dataset/features-osm-{}.geojson'.format(idx))
        osm_poi['req_id'] = osm_poi.apply(generate_req_id, axis=1)
        req_ids.update(osm_poi.req_id.tolist())

In [54]:
headers = {"Content-Type": "application/json; charset=utf-8"}
address_endpoint_template = "https://nominatim.openstreetmap.org/lookup?osm_ids={}&format=json&extratags=1"

def download_osm_details(rids):
    resp = requests.get(address_endpoint_template.format(','.join(rids), headers=headers))
    return resp.json()

In [55]:
req_ids = list(req_ids)

all_osm_info = []

if os.path.isfile('dataset/osm-detailed-pois.json'):
    with open('dataset/osm-detailed-pois.json', 'r') as fp:
        all_osm_info = json.load(fp)
else:
    bucket_size = 50  # maximum value for OSM lookup!
    for i in range(0, len(req_ids), bucket_size):
        try:
            all_osm_info.append(download_osm_details(req_ids[i:i+bucket_size]))
            print('bucket done: {}'.format(i))
            time.sleep(0.5)
        except Exception as e:
            print(e)
            print('bucket: {}'.format(i))

In [56]:
all_osm_list = []
for bucket in all_osm_info:
    all_osm_list.extend(bucket)

In [58]:
with open('dataset/osm-detailed-pois.json', 'w', encoding='utf-8') as fp:
    json.dump(all_osm_info, fp)

with open('dataset/processed-osm-detailed-pois.json', 'w', encoding='utf-8') as fp:
    json.dump(all_osm_list, fp)
    
print('Detailed information about OSM pois are stored in `dataset/osm-detailed-pois.json`')

Detailed information about OSM pois are stored in `dataset/osm-detailed-pois.json`


In [59]:
print(len(req_ids))
print(len(all_osm_info))
print(len(all_osm_list))
osm_poi_details_df = pd.DataFrame(all_osm_list)
osm_poi_details_df.head()

84203
1685
27900


Unnamed: 0,place_id,licence,osm_type,osm_id,lat,lon,class,type,place_rank,importance,addresstype,name,display_name,address,extratags,boundingbox
0,50105769,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,210529635,-37.77131895,144.88922947664923,amenity,parking,30,1e-05,amenity,David Jones Carpark,"David Jones Carpark, Primary Place, Maribyrnon...","{'amenity': 'David Jones Carpark', 'road': 'Pr...","{'parking': 'multi-storey', 'building': 'parki...","[-37.7716973, -37.7708987, 144.8879962, 144.89..."
1,50264145,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,1005592702,-37.755842,144.79434671579116,amenity,parking,30,1e-05,amenity,,"Ken Jordan Road, Cairnlea, Melbourne, City of ...","{'road': 'Ken Jordan Road', 'suburb': 'Cairnle...",{'parking': 'street_side'},"[-37.7559597, -37.7557312, 144.7942967, 144.79..."
2,49748802,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,948227337,-38.33887055,144.72523383795718,leisure,swimming_pool,30,1e-05,leisure,,"Stonecutters Road, Portsea, Melbourne, Shire o...","{'road': 'Stonecutters Road', 'suburb': 'Ports...",,"[-38.3389150, -38.3388240, 144.7251718, 144.72..."
3,50013438,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,542417354,-37.9853244,145.2116291306154,amenity,parking,30,1e-05,amenity,,"Robinson Street, Dandenong, Melbourne, City of...","{'road': 'Robinson Street', 'suburb': 'Dandeno...",,"[-37.9856572, -37.9849960, 145.2114001, 145.21..."
4,50156137,"Data © OpenStreetMap contributors, ODbL 1.0. h...",node,678349689,-37.800412,144.966749,amenity,restaurant,30,1e-05,amenity,Il Cantuccio,"Il Cantuccio, 209, Lygon Street, Little Italy,...","{'amenity': 'Il Cantuccio', 'house_number': '2...","{'phone': '+61 3 9347 9959', 'cuisine': 'itali...","[-37.8004620, -37.8003620, 144.9666990, 144.96..."


In [60]:
osm_poi_details_df[(osm_poi_details_df['osm_type'] == 'way') & (osm_poi_details_df['osm_id'] == 542417354)]['display_name'].values[0]

'Robinson Street, Dandenong, Melbourne, City of Greater Dandenong, Victoria, 3177, Australia'

In [61]:
def enrich(row):
    info = osm_poi_details_df[(osm_poi_details_df['osm_type'] == row['element_type']) & 
    (osm_poi_details_df['osm_id'] == row['osmid'])]
    if len(info) > 0:
        t_name = ' '.join(info['display_name'].values[0].split(',')[:2])
        if info['extratags'].values[0] is not None:
            t_name += ' '.join([k+' '+v for (k,v) in info['extratags'].values[0].items()])
    else:
        t_name = ''
    h_name = ''
    parents = osm_poi.loc[(osm_poi.geometry.contains(row.geometry)) & (osm_poi.id != row.id)]['name'].values.tolist()
    if len(parents) > 0:
        h_name += ' in '+ ', '.join([p for p in parents if p != 'nan'])
    p_name = ''
    if row['amenity'] != 'nan':
        p_name += 'amenity {} '.format(row['amenity']).replace('_', ' ')
    if row['natural'] != 'nan':
        p_name += 'natural {} '.format(row['natural']).replace('_', ' ')
    if row['leisure'] != 'nan':
        p_name += 'leisure {} '.format(row['leisure']).replace('_', ' ')
    if row['name'] == 'nan':
        return p_name+ t_name + h_name
    return row['name'] + ' ' + p_name + t_name + h_name

In [62]:
osm_poi.reset_index(inplace=True)
osm_poi = osm_poi.rename(columns= {'index': 'id'})
osm_poi

Unnamed: 0,id,element_type,osmid,name,amenity,natural,leisure,geometry,req_id
0,0,node,10889194475,,,tree,,POINT (145.05762 -37.65706),N10889194475
1,1,node,10889194476,,,tree,,POINT (145.05774 -37.65697),N10889194476
2,2,node,10889194477,,,tree,,POINT (145.05782 -37.65706),N10889194477
3,3,node,10889194478,,,tree,,POINT (145.05787 -37.65710),N10889194478
4,4,node,10889194481,,,tree,,POINT (145.05794 -37.65696),N10889194481
...,...,...,...,...,...,...,...,...,...
69,69,way,1171940505,,,,pitch,"POLYGON ((145.05668 -37.65476, 145.05677 -37.6...",W1171940505
70,70,way,1171940506,,,,pitch,"POLYGON ((145.05469 -37.65574, 145.05475 -37.6...",W1171940506
71,71,way,1171940507,,,,pitch,"POLYGON ((145.05555 -37.65548, 145.05560 -37.6...",W1171940507
72,72,way,1171940508,,,,pitch,"POLYGON ((145.05563 -37.65674, 145.05557 -37.6...",W1171940508


In [63]:
osm_poi['full_name'] = osm_poi.apply(enrich, axis=1)
osm_poi

Unnamed: 0,id,element_type,osmid,name,amenity,natural,leisure,geometry,req_id,full_name
0,0,node,10889194475,,,tree,,POINT (145.05762 -37.65706),N10889194475,natural tree
1,1,node,10889194476,,,tree,,POINT (145.05774 -37.65697),N10889194476,natural tree
2,2,node,10889194477,,,tree,,POINT (145.05782 -37.65706),N10889194477,natural tree
3,3,node,10889194478,,,tree,,POINT (145.05787 -37.65710),N10889194478,natural tree
4,4,node,10889194481,,,tree,,POINT (145.05794 -37.65696),N10889194481,natural tree
...,...,...,...,...,...,...,...,...,...,...
69,69,way,1171940505,,,,pitch,"POLYGON ((145.05668 -37.65476, 145.05677 -37.6...",W1171940505,leisure pitch Gilroy Crescent Mill Parksport ...
70,70,way,1171940506,,,,pitch,"POLYGON ((145.05469 -37.65574, 145.05475 -37.6...",W1171940506,leisure pitch Lady Penrhyn Avenue Mill Parksp...
71,71,way,1171940507,,,,pitch,"POLYGON ((145.05555 -37.65548, 145.05560 -37.6...",W1171940507,leisure pitch Gilroy Crescent Mill Parksport ...
72,72,way,1171940508,,,,pitch,"POLYGON ((145.05563 -37.65674, 145.05557 -37.6...",W1171940508,leisure pitch Lady Penrhyn Avenue Mill Parksp...


In [64]:
osm_poi.loc[(osm_poi.geometry.contains(osm_poi.loc[70].geometry)) & (osm_poi.index != 70)]['name'].values.tolist()

['Mill Park Recreation Reserve']

In [65]:
osm_poi.iloc[70]['full_name']

'leisure pitch Lady Penrhyn Avenue  Mill Parksport softball in Mill Park Recreation Reserve'

In [66]:
for idx, path in enumerate(paths):
    if os.path.isfile('dataset/features-osm-{}.geojson'.format(idx)):
        if os.path.isfile("dataset/features-osm-poi-{}.geojson".format(idx)):
            continue
        osm_poi = gpd.read_file('dataset/features-osm-{}.geojson'.format(idx))
        print('analysing: {0} - number of features: {1}'.format(idx, len(osm_pois)))
        osm_poi.reset_index(inplace=True)
        osm_poi = osm_poi.rename(columns= {'index': 'id'})
        osm_poi['full_name'] = osm_poi.apply(enrich, axis=1)
        osm_poi.to_file("dataset/features-osm-poi-{}.geojson".format(idx), driver='GeoJSON')
        print('enriched features for path {0} out of {1} is loaded from OSM and saved ...'.format(idx, len(paths)))

In [67]:
osm_poi

Unnamed: 0,id,element_type,osmid,name,amenity,natural,leisure,geometry,req_id,full_name
0,0,node,10889194475,,,tree,,POINT (145.05762 -37.65706),N10889194475,natural tree
1,1,node,10889194476,,,tree,,POINT (145.05774 -37.65697),N10889194476,natural tree
2,2,node,10889194477,,,tree,,POINT (145.05782 -37.65706),N10889194477,natural tree
3,3,node,10889194478,,,tree,,POINT (145.05787 -37.65710),N10889194478,natural tree
4,4,node,10889194481,,,tree,,POINT (145.05794 -37.65696),N10889194481,natural tree
...,...,...,...,...,...,...,...,...,...,...
69,69,way,1171940505,,,,pitch,"POLYGON ((145.05668 -37.65476, 145.05677 -37.6...",W1171940505,leisure pitch Gilroy Crescent Mill Parksport ...
70,70,way,1171940506,,,,pitch,"POLYGON ((145.05469 -37.65574, 145.05475 -37.6...",W1171940506,leisure pitch Lady Penrhyn Avenue Mill Parksp...
71,71,way,1171940507,,,,pitch,"POLYGON ((145.05555 -37.65548, 145.05560 -37.6...",W1171940507,leisure pitch Gilroy Crescent Mill Parksport ...
72,72,way,1171940508,,,,pitch,"POLYGON ((145.05563 -37.65674, 145.05557 -37.6...",W1171940508,leisure pitch Lady Penrhyn Avenue Mill Parksp...


## Matching LW POIs to OSM POIs

Using:

- *Spatial criterion*: nearby or contained
- *Thematic criterion*: topic representation of POI with types in OSM POIs
- *Linguistic criterion*: description of the POI with detailed contextual information from OSM (name, type, hierarchy)

**Note** The POIs might be missing in OSM data

In [68]:
def get_case_with_details(idx):
    record = dataset[idx]  
    pois = {'title': [], 'summary': [], 'lat': [], 'lng': []}
    for poi in record['pois']:
        pois['title'].append(poi['title'])
        pois['summary'].append(poi['summary'])
        pois['lat'].append(poi['lat'])
        pois['lng'].append(poi['lng'])
        
    df = pd.DataFrame(pois)
    gdf = gpd.GeoDataFrame(df[['title', 'summary', 'lat', 'lng']],
                           geometry=gpd.points_from_xy(df.lng, df.lat), crs="EPSG:4326")
    
    if os.path.isfile('dataset/features-osm-poi-{}.geojson'.format(idx)):
        osm_pois = gpd.read_file('dataset/features-osm-poi-{}.geojson'.format(idx))
    else:
        print('OSM features are not loaded - potentially empty dataframe')
        osm_pois = None
    return gdf, osm_pois

In [69]:
test_case_idx = 2
test_case_gdf, test_case_pois = get_case_with_details(test_case_idx)

In [70]:
test_case_gdf = test_case_gdf.to_crs("EPSG:32755")
test_case_gdf

Unnamed: 0,title,summary,lat,lng,geometry
0,1. Tramway signal box,Built in 1928 soon after the electrification o...,-37.806953,144.962813,POINT (320663.065 5813648.747)
1,2. City Baths,"Built in 1903, the design reflected the social...",-37.807382,144.96299,POINT (320679.723 5813601.482)
2,3. Magistrates Court,Built on the site of the earlier Supreme Court...,-37.808828,144.966112,POINT (320958.059 5813447.052)
3,4. Old Melbourne Gaol,Built between 1851 - 1864. As the oldest survi...,-37.807569,144.96571,POINT (320919.660 5813585.973)
4,5. Eight Hour Day Monument,"Built in 1923, the monument commemorates the E...",-37.807126,144.965808,POINT (320927.197 5813635.299)
5,6. Trades Hall,"Built in stages from 1873 - 1926, Trades Hall ...",-37.806905,144.965989,POINT (320942.628 5813660.156)
6,7. Medley Hall,"Built in 1893 as a private residence, the buil...",-37.805803,144.967618,POINT (321083.333 5813785.590)
7,8. Lygon shop corner,Lygon Buildings is architecturally significant...,-37.804863,144.966279,POINT (320963.170 5813887.275)
8,9. Matthais House,A two storeyed stucco faced bluestone house of...,-37.803827,144.967759,POINT (321091.019 5814005.140)
9,10. Sacred Heart Catholic Church,Built in 1855-56. In the 1930s and 1940s the C...,-37.803051,144.969378,POINT (321231.646 5814094.295)


In [71]:
test_case_pois = test_case_pois.to_crs("EPSG:32755")
test_case_pois

Unnamed: 0,id,element_type,osmid,name,amenity,natural,leisure,full_name,geometry
0,0,node,242538793,,post_box,,,amenity post box Queensberry Street Carlton,POINT (320727.351 5813899.378)
1,1,node,242540159,,telephone,,,amenity telephone Swanston Street East End Th...,POINT (320749.536 5813412.933)
2,2,node,242823091,,telephone,,,amenity telephone Swanston Street East End Th...,POINT (320730.759 5813422.172)
3,3,node,242823102,,post_box,,,amenity post box Pelham Street Carlton,POINT (321220.734 5814072.447)
4,4,node,242823114,,toilets,,,amenity toilets Rathdowne Street Carltonfee n...,POINT (321272.049 5814137.476)
...,...,...,...,...,...,...,...,...,...
2375,2375,way,1201832429,,,,garden,leisure garden Porters Lane Parkville in The ...,"POLYGON ((320637.296 5814510.471, 320647.393 5..."
2376,2376,way,1217340419,Melbourne Sexual Health Centre,clinic,,,Melbourne Sexual Health Centre amenity clinic ...,"POLYGON ((320778.428 5814047.124, 320780.211 5..."
2377,2377,way,1232632180,Lemon Tree Children's Centre,kindergarten,,,Lemon Tree Children's Centre amenity kindergar...,"POLYGON ((321255.627 5814355.303, 321255.375 5..."
2378,2378,relation,6614802,Carlton Gardens,,,park,Carlton Gardens leisure park Carlton Gardens ...,"MULTIPOLYGON (((321219.805 5813634.926, 321210..."


In [72]:
test_row_id = 6
test_row = test_case_gdf.loc[test_row_id]
test_row

title                                          7. Medley Hall
summary     Built in 1893 as a private residence, the buil...
lat                                                -37.805803
lng                                                144.967618
geometry         POINT (321083.33274179127 5813785.590421503)
Name: 6, dtype: object

In [73]:
test_case_pois.loc[test_row.geometry.distance(test_case_pois.geometry) < 100]

Unnamed: 0,id,element_type,osmid,name,amenity,natural,leisure,full_name,geometry
2153,2153,way,265141186,Lygon Street Christian Chapel,place_of_worship,,,Lygon Street Christian Chapel amenity place of...,"POLYGON ((321008.928 5813761.985, 320994.084 5..."
2231,2231,way,710777495,,parking,,,amenity parking McDonald Lane Carltonaccess p...,"POLYGON ((321000.885 5813812.657, 320997.612 5..."
2232,2232,way,710777496,,parking,,,amenity parking Elm Tree Place Carltonaccess ...,"POLYGON ((321135.119 5813776.077, 321133.814 5..."
2254,2254,way,743141724,,parking,,,amenity parking McDonald Lane Carltonaccess c...,"POLYGON ((320978.251 5813859.025, 321001.671 5..."
2283,2283,way,831017470,,parking,,,amenity parking Hudson Place Carltonaccess pr...,"POLYGON ((321146.543 5813801.593, 321159.873 5..."
2284,2284,way,831017484,,parking,,,amenity parking Trades Hall Place Carltonacce...,"POLYGON ((321003.601 5813711.301, 321007.125 5..."
2285,2285,way,831017485,,parking,,,amenity parking Trades Hall Place Carltonacce...,"POLYGON ((321009.308 5813760.340, 321010.362 5..."


In [74]:
# missing tags - tourisms * - historic * - monument *

In [74]:
# geocoding results for all POIs
titles = [pt for pt in data_structure['poi_title']]

In [75]:
geocoder = Nominatim(user_agent='research_app')

In [76]:
paths[test_case_idx]

{'min_lat': -37.80922028,
 'max_lat': -37.79740113,
 'min_lng': 144.96275961,
 'max_lng': 144.97345358}

In [77]:
result = geocoder.geocode("Medley Hall", viewbox=[(paths[test_case_idx]['max_lat'], paths[test_case_idx]['max_lng']), 
                                                         (paths[test_case_idx]['min_lat'], paths[test_case_idx]['min_lng'])],
                          bounded=True)

In [78]:
result

Location(Medley Hall, 48-56, Drummond Street, Carlton, Melbourne, City of Melbourne, Victoria, 3053, Australia, (-37.80576115, 144.9679888773678, 0.0))

In [79]:
def geocode_by_name(name, path):
    name = name.lstrip('0123456789.- ')
    return geocoder.geocode(name, viewbox=[(path['max_lat'], path['max_lng']), 
                                                         (path['min_lat'], path['min_lng'])],
                          bounded=True)

In [80]:
if os.path.isfile('dataset/nominatim-geocoding.json'):
    with open('dataset/nominatim-geocoding.json') as fp:
        nominatim_output = json.load(fp)
     
    print('nominatim dump file is already loaded')
else:
    geocoding_results = []
    for idx, path in enumerate(paths):
        record = dataset[idx]
        for poi in record['pois']:
            name = poi['title']
            result = geocode_by_name(name, path)
            geocoding_results.append(result)
            if result is not None:
                print(name, path)
        if idx%10 == 0:
            time.sleep(1)
            print('idx: {}'.format(idx))

    nominatim_output = {}
    counter = 0
    for idx, path in enumerate(paths):
        record = dataset[idx]
        for poi in record['pois']:
            geocoding_result = geocoding_results[counter]
        
            name = poi['title']
            description = poi['summary']
            lat = poi['lat']
            lng = poi['lng']

            nominatim_output[counter] = {'walk_id': idx, 'title': name, 'summary': description, 'lat': lat, 'lng': lng}
            if geocoding_result is not None:
                nominatim_output[counter]['osm'] = geocoding_result.raw
            else:
                nominatim_output[counter]['osm'] = None
            counter += 1

nominatim dump file is already loaded


In [81]:
nominatim_output

{'0': {'walk_id': 1,
  'title': 'Fairhaven Surf Life Saving Club',
  'summary': 'Fairhaven is a well known surf beach. The beach is patrolled during December to the Easter. The Fairhaven SLSC was established in 1958.',
  'lat': -38.46875894,
  'lng': 144.08445884,
  'osm': None},
 '1': {'walk_id': 1,
  'title': 'Beach walk',
  'summary': 'From Sprout Creek, Eastern View, Moggs Creek, Fairhaven to the bottom of the light house there is approx 6 km of beach, featuring rolling surf and spectacular views of the Otway Ranges and Lorne.',
  'lat': -38.46854155,
  'lng': 144.08969344,
  'osm': None},
 '2': {'walk_id': 1,
  'title': 'Rock pools',
  'summary': 'See what sort of shells and stones you can collect. Can you find a bright red sea star? You can see all sorts of sea vegetation, small fish and crabs. Look at the bird life, gulls and cormorants.',
  'lat': -38.4684592,
  'lng': 144.09241977,
  'osm': None},
 '3': {'walk_id': 1,
  'title': 'Sand dunes',
  'summary': 'The beautiful rollin

In [82]:
topic_df.iloc[23]

topic                                                      15
document    13. Royal Exhibition Building: The Royal Exhib...
Name: 23, dtype: object

In [83]:
nominatim_output['23']

{'walk_id': 2,
 'title': '13. Royal Exhibition Building',
 'summary': 'The Royal Exhibition Building is the only surviving Great Hall that once housed a 19th-century international exhibition and is still used for exhibitions. ',
 'lat': -37.80513488,
 'lng': 144.97123539,
 'osm': {'place_id': 17546919,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
  'osm_type': 'way',
  'osm_id': 4817059,
  'lat': '-37.804666850000004',
  'lon': '144.9714669305319',
  'class': 'historic',
  'type': 'building',
  'place_rank': 30,
  'importance': 0.39044459367468287,
  'addresstype': 'historic',
  'name': 'Royal Exhibition Building',
  'display_name': 'Royal Exhibition Building, 9, Nicholson Street, Carlton, Melbourne, City of Melbourne, Victoria, 3053, Australia',
  'boundingbox': ['-37.8051500', '-37.8041865', '144.9705305', '144.9724671']}}

In [84]:
topic_df.reset_index(inplace=True)

In [85]:
def is_found(row):
    idx = str(row['index'])
    if idx not in nominatim_output:
        print('index not found')
        return None
    if nominatim_output[idx]['osm'] is not None:
        return True
    return False

def which_class_type(row):
    idx = str(row['index'])
    if idx not in nominatim_output:
        print('index not found')
        return None
    if nominatim_output[idx]['osm'] is not None:
        return nominatim_output[idx]['osm']['class'] + ' ' + nominatim_output[idx]['osm']['type']
    return None


In [86]:
topic_df['is_found'] = topic_df.apply(is_found, axis=1)
topic_df['class'] = topic_df.apply(which_class_type, axis=1)

In [87]:
topic_df.head(50)

Unnamed: 0,index,topic,document,is_found,class
0,0,1,Fairhaven Surf Life Saving Club: Fairhaven is ...,False,
1,1,1,"Beach walk: From Sprout Creek, Eastern View, M...",False,
2,2,12,Rock pools: See what sort of shells and stones...,False,
3,3,1,Sand dunes: The beautiful rolling sand dunes s...,False,
4,4,35,Painkalac Creek: The creek separates Aireys In...,False,
5,5,14,Rocks and caves under the light house: There a...,False,
6,6,8,Aireys Inlet playground and picnic ground: The...,False,
7,7,7,"Aireys Inlet lower shops: Pick up a coffee, ne...",False,
8,8,1,Loutit Bay lookout: Return to Painkalac Creek ...,True,tourism viewpoint
9,9,13,Historical homestead and building: At the ligh...,False,


In [88]:
topic_agg_df = topic_df.groupby(['topic']).agg({'is_found': ['sum', 'count']})

In [89]:
topic_agg_df[('is_found', 'ratio')] = topic_agg_df[('is_found', 'sum')] / topic_agg_df[('is_found', 'count')]

In [90]:
topic_agg_df.reset_index(inplace=True)
topic_agg_df

Unnamed: 0_level_0,topic,is_found,is_found,is_found
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,ratio
0,-1,2,4,0.500000
1,0,13,406,0.032020
2,1,32,209,0.153110
3,2,27,198,0.136364
4,3,19,176,0.107955
...,...,...,...,...
59,58,5,36,0.138889
60,59,4,24,0.166667
61,60,3,19,0.157895
62,61,2,40,0.050000


In [91]:
def add_represenation(row):
    return '-'.join(topic_model.get_topic_info(row['topic'])['Representation'].values[0])

topic_agg_df.sort_values(by=('is_found', 'ratio'), inplace=True)

In [92]:
topic_agg_df.head(20)

Unnamed: 0_level_0,topic,is_found,is_found,is_found
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,ratio
58,57,0,23,0.0
54,53,1,39,0.025641
13,12,3,107,0.028037
19,18,2,71,0.028169
8,7,3,104,0.028846
47,46,1,33,0.030303
1,0,13,406,0.03202
39,38,2,56,0.035714
33,32,2,55,0.036364
22,21,4,86,0.046512


In [93]:
topic_agg_df.tail(20)

Unnamed: 0_level_0,topic,is_found,is_found,is_found
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,ratio
14,13,16,83,0.192771
42,41,7,36,0.194444
26,25,10,50,0.2
29,28,10,45,0.222222
53,52,6,26,0.230769
5,4,33,132,0.25
50,49,8,31,0.258065
16,15,23,88,0.261364
20,19,16,60,0.266667
52,51,10,36,0.277778


In [94]:
not_founds = list(topic_agg_df.head(20)['topic'])
for idx in not_founds:
    print(topic_model.get_topic_info(idx)['Representation'].values[0])

['fungi', 'mushrooms', 'mushroom', 'spider', 'descend', 'did', 'darker', 'ask', 'thirty', 'wasnt']
['crusoe', 'reservoir', 'gate', 'millers', 'spillover', 'tank', 'crest', 'bass', 'gravel', 'mitigation']
['bird', 'hide', 'birdlife', 'nesting', 'birds', 'peter', 'feeding', 'parrots', 'ostrich', 'rump']
['ducks', 'pelicans', 'swans', 'pelican', 'cormorants', 'duck', 'seagulls', 'black', 'waterbirds', 'bread']
['kangaroos', 'animals', 'wildlife', 'echidna', 'lizard', 'zoo', 'constellation', 'wallaby', 'insects', 'wombat']
['wildflowers', 'wildflower', 'daisies', 'yam', 'meadow', 'daisy', 'feathery', 'dainty', 'bumpy', 'edgars']
['track', 'steep', 'signage', 'gravel', 'top', 'path', 'walking', 'markers', 'valley', 'lookout']
['car', 'parking', 'carpark', 'thunder', 'dee', 'bike', 'decide', 'endeavour', 'wonderland', 'board']
['gum', 'gums', 'red', 'gumtree', 'twisting', 'stabilising', 'flowering', 'plays', 'greet', 'trees']
['seat', 'seats', 'seating', 'rest', 'plenty', 'sit', 'tired', 'le

In [95]:
founds = list(topic_agg_df.tail(20)['topic'])
for idx in founds:
    print(topic_model.get_topic_info(idx)['Representation'].values[0])

['homestead', 'house', 'houses', 'prefabricated', 'imported', 'housing', 'ruins', 'flats', 'architecture', 'desbrowe']
['rowing', 'boat', 'rowers', 'clubs', 'apex', 'band', 'ramp', 'blackbird', 'canoe', 'piers']
['mildura', 'milduras', 'wharf', 'shillidays', 'ave', 'langtree', 'deakin', 'earliest', 'daily', 'sunraysia']
['skate', 'bmx', 'incredibly', 'bigger', 'jumps', 'velodrome', 'skaters', 'ramp', 'park', 'better']
['ringwood', 'maroondah', 'nswwestpac', 'finnish', 'savings', 'operates', 'trading', 'wares', 'bank', 'northwestern']
['cafe', 'bakery', 'restaurant', 'coffee', 'shop', 'vietnamese', 'shops', 'caf', 'cake', 'rolls']
['elmore', 'preschool', 'emr', 'station', 'kathleen', 'railway', 'kindergarten', '1972', 'miniature', 'progress']
['ballarat', 'civic', 'hall', 'cinema', 'building', 'imax', 'nash', 'coburn', 'theatre', 'sturt']
['church', 'uniting', 'cathedral', 'sacred', 'anglican', 'presbyterian', 'heart', 'andrews', 'catholic', 'christ']
['lonsdale', 'womens', 'lt', 'clini

## Analyzing Topics

Finding better solutions for the topic modelling part:

- many topics are biased toward geographic areas (place names of suburbs and LGAs) rather than what is captured as POI - Topics 3, 12, 14, 20, 44, 45, 57, 59, 64, 69, 78
- topics with mixed themes: 53, 63, 74, 75
- not so clear topics: 17, 34, 35, 37, 46, 52, 55, 72, 76, 77, 78  

In [96]:
topic_model.visualize_barchart(top_n_topics=79)


distutils Version classes are deprecated. Use packaging.version instead.



## Refining Topic Models

Due to frequency of famous place names: LGAs and suburbs, several clusters are just defined by their location, while we are more interested to identify their characteristics.

Source:
https://public.opendatasoft.com/explore/dataset/georef-australia-local-government-area/table/?disjunctive.ste_code&disjunctive.ste_name&disjunctive.lga_code&disjunctive.lga_name

https://public.opendatasoft.com/explore/dataset/georef-australia-state-suburb/table/?disjunctive.ste_code&disjunctive.ste_name&disjunctive.lga_code&disjunctive.lga_name&disjunctive.scc_code&disjunctive.scc_name

In [97]:
with open('dataset/georef-australia-state-suburb.json', 'r') as fp:
    suburbs = json.load(fp)
with open('dataset/georef-australia-local-government-area.json', 'r') as fp:
    lgas = json.load(fp)

In [98]:
suburb_names = [suburb['scc_name'][0] for suburb in suburbs]

In [99]:
lga_names = [lga['lga_name'][0] for lga in lgas]

In [100]:
lga_names_lower = [lga.lower() for lga in lga_names]
suburb_names_lower = [suburb.lower() for suburb in suburb_names]

In [102]:
def replace_by_list(desc, names):
    for n in names:
        if n in desc:
            desc = desc.replace(n, '')
    return desc

def refined_description(row):
    desc = row['description']
    desc = replace_by_list(desc, lga_names)
    desc = replace_by_list(desc, lga_names_lower)
    desc = replace_by_list(desc, suburb_names)
    return replace_by_list(desc, suburb_names_lower)

In [103]:
gdf['refined_description'] = gdf.apply(refined_description, axis=1)

In [104]:
gdf.head()

Unnamed: 0,poi_title,poi_summary,latitude,longitude,geometry,description,refined_description
0,Fairhaven Surf Life Saving Club,Fairhaven is a well known surf beach. The beac...,-38.468759,144.084459,POINT (144.08446 -38.46876),Fairhaven Surf Life Saving Club: Fairhaven is ...,Surf Life Saving Club: is a well known surf ...
1,Beach walk,"From Sprout Creek, Eastern View, Moggs Creek, ...",-38.468542,144.089693,POINT (144.08969 -38.46854),"Beach walk: From Sprout Creek, Eastern View, M...","Beach walk: From Sprout Creek, , , to the bot..."
2,Rock pools,See what sort of shells and stones you can col...,-38.468459,144.09242,POINT (144.09242 -38.46846),Rock pools: See what sort of shells and stones...,Rock pools: See what sort of shells and stones...
3,Sand dunes,The beautiful rolling sand dunes shape the bea...,-38.468418,144.095318,POINT (144.09532 -38.46842),Sand dunes: The beautiful rolling sand dunes s...,Sand dunes: The beautiful rolling sand dunes s...
4,Painkalac Creek,The creek separates Aireys Inlet from Fairhave...,-38.46839,144.097312,POINT (144.09731 -38.46839),Painkalac Creek: The creek separates Aireys In...,Painkalac Creek: The creek setes from and fo...


In [105]:
refined_docs = list(gdf['refined_description'])

refined_topic_model = BERTopic(umap_model=umap_model, ctfidf_model=ctfidf_model)
refined_topics, refined_probs = refined_topic_model.fit_transform(refined_docs)


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.


`alltrue` is deprecated as of NumPy 1.25.0, and will be removed in NumPy 2.0. Please use `all` instead.



In [106]:
refined_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1590,-1_reserve_street_up_birds,"[reserve, street, up, birds, information, path...",[While the cars slog away above its peaceful d...
1,0,234,0_building_office_post_court,"[building, office, post, court, offices, civic...",[70 Avenue (LHS of Ave): Once the site of th...
2,1,160,1_beach_bay_sand_tide,"[beach, bay, sand, tide, foreshore, surf, coas...",[Coastal walking path: The narrow unsealed coa...
3,2,144,2_sculpture_art_mural_wall,"[sculpture, art, mural, wall, sculptures, arti...","[9. Mural, Civic /Library Car Park, off Market..."
4,3,138,3_bridge_footbridge_pedestrian_cross,"[bridge, footbridge, pedestrian, cross, railwa...",[Overhead Railway Bridge: The Overhead Bridge ...
...,...,...,...,...,...
66,65,13,65_market_meat_queen_1878,"[market, meat, queen, 1878, largest, class, vi...",[Queen Victoria Market: The Queen Victoria Mar...
67,66,12,66_library_kensington_neighbourhood_knh,"[library, kensington, neighbourhood, knh, broo...",[State Library Victoria: The State Library is ...
68,67,11,67_weir_turbine_generation_diving,"[weir, turbine, generation, diving, picturevie...",[ 11: Completed in 1928. Paddle steamers still...
69,68,11,68_jk_crofton_reservecherry_booth,"[jk, crofton, reservecherry, booth, turnstile,...","[ Reserve: If you're in need of more space, R..."


In [107]:
refined_topic_model.visualize_topics()


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.


distutils Version classes are deprecated. Use packaging.version instead.



In [108]:
refined_topic_model.visualize_hierarchy()


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead



In [109]:
# what happen when merging topics as well
hierarchical_refined_topics = refined_topic_model.hierarchical_topics(refined_docs)
refined_topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_refined_topics)

100%|██████████| 69/69 [00:00<00:00, 252.80it/s]

scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


distutils Version classes are deprecated. Use packaging.version instead.



In [110]:
# topic outlier reduction - if necessary (todo)
refined_topics = refined_topic_model.reduce_outliers(refined_docs, refined_topics, strategy="c-tf-idf")

refined_topic_df = pd.DataFrame({'topic': refined_topics, 'document': refined_docs})

In [111]:
refined_topic_df

Unnamed: 0,topic,document
0,1,Surf Life Saving Club: is a well known surf ...
1,1,"Beach walk: From Sprout Creek, , , to the bot..."
2,62,Rock pools: See what sort of shells and stones...
3,1,Sand dunes: The beautiful rolling sand dunes s...
4,20,Painkalac Creek: The creek setes from and fo...
...,...,...
4387,9,Lots of native flora: There are many native tr...
4388,6,Continue the loop or go elsewhere: There are p...
4389,6,Residential Street: This section of the walk t...
4390,4,The Stables Playground: This playground is hug...


In [112]:
refined_topic_model.visualize_barchart(top_n_topics=65)


distutils Version classes are deprecated. Use packaging.version instead.



In [113]:
refined_topic_df.reset_index(inplace=True)

refined_topic_df['is_found'] = refined_topic_df.apply(is_found, axis=1)
refined_topic_df['class'] = refined_topic_df.apply(which_class_type, axis=1)

refined_topic_agg_df = refined_topic_df.groupby(['topic']).agg({'is_found': ['sum', 'count']})

refined_topic_agg_df[('is_found', 'ratio')] = refined_topic_agg_df[('is_found', 'sum')] / refined_topic_agg_df[('is_found', 'count')]

refined_topic_agg_df.reset_index(inplace=True)

refined_topic_agg_df.sort_values(by=('is_found', 'ratio'), inplace=True)


In [114]:
not_founds = list(refined_topic_agg_df.head(20)['topic'])
for idx in not_founds:
    print(refined_topic_model.get_topic_info(idx)['Representation'].values[0])

['smoking', 'aboriginal', 'interpretive', 'ceremony', 'gippslands', 'yorta', 'gippsland', 'stories', 'welcomes', 'walker']
['ducks', 'pelicans', 'duck', 'pacific', 'ducklings', 'pelican', 'black', 'seagull', 'waterbirds', 'bread']
['olive', 'strip', 'moments', 'poinciana', 'dobsons', 'iramoo', 'vas', 'agricultural', 'grasslands', 'nature']
['echidna', 'zoo', 'wildlife', 'animals', 'koala', 'insects', 'creatures', 'rustle', 'safari', 'doing']
['wildflowers', 'wildflower', 'daisies', 'yam', 'flowers', 'daisy', 'fingernail', 'edgars', 'bumpy', 'peculiar']
['gym', 'exercise', 'equipment', 'outdoor', 'consists', 'fitness', 'ups', 'gear', 'crunches', 'equipments']
['bird', 'birdlife', 'nesting', 'hide', 'parrots', 'birds', 'heron', 'loads', 'cormorants', 'feng']
['orchid', 'orchids', 'waxlip', '43', 'hyacinth', 'roseum', 'specimens', 'sp', 'fringing', 'nodding']
['toilets', 'toilet', 'public', 'showers', 'clean', 'amenities', 'basll', 'rules', 'stables', 'unisex']
['fishing', 'trout', 'fish'

In [115]:
founds = list(refined_topic_agg_df.tail(20)['topic'])
for idx in founds:
    print(refined_topic_model.get_topic_info(idx)['Representation'].values[0])

['tennis', 'courts', 'netball', 'basketball', 'cricket', 'wanting', 'tucked', 'football', 'anyone', 'tournaments']
['skate', 'bmx', 'incrbly', 'newport', 'bigger', 'jumps', 'scooters', 'facility', 'skaters', 'ramp']
['shops', 'store', 'shopping', 'shillidays', 'general', 'shop', 'preston', 'clothes', 'strath', 'fashion']
['pool', 'aquamoves', 'swimming', 'swim', 'sadler', 'creche', 'olympic', 'pools', 'waterworld', 'swimland']
['bakery', 'vietnamese', 'bun', 'rolls', 'grocery', 'pho', 'salad', 'savoury', 'crust', 'sweet']
['boat', 'ramp', 'marina', 'ship', 'yacht', 'altona', 'rowing', 'paddlesteamer', 'canoe', 'te']
['coffee', 'cafe', 'caf', 'restaurant', 'chai', 'cafes', 'bakery', 'british', 'harbour', 'salingers']
['jk', 'crofton', 'reservecherry', 'booth', 'turnstile', 'hut', 'reserve', 'exit', 'cubbyh', 'subdivisions']
['conservatory', 'ford', 'recital', 'sites', 'sub', 'ngv', 'potter', 'headlines', 'leachs', 'bitten']
['weir', 'turbine', 'generation', 'diving', 'pictureview', 'con

In [116]:
refined_topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_refined_topics)


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


distutils Version classes are deprecated. Use packaging.version instead.



In [118]:
# TODO: Outdated topic ids
labelled_topics = {'cultural, historical and social': [46,14,0,17,41,25,6,1,50,34],
                   'affordance, function': [55,19,24,29,36,62,7,31,40,44,32,53,54,51,21,8,58,61,48,52,56,37],
                   'signs': [26,28,35,45],
                   'natural features': [39,23,57,49,13,15,5,2,3,4,11,10],
                   'species': [59,43,47,38,9,22,18,27,16,12,29,69],
                   'unknown': [-1]
                  }

In [119]:
def labelled_stats(topic_ids, labelled_topics = labelled_topics):
    stat_topics = {}
    for idx in topic_ids:
        for key, values in labelled_topics.items():
            if idx in values:
                if key not in stat_topics.keys():
                    stat_topics[key] = 1
                else:
                    stat_topics[key] += 1
    return stat_topics

In [120]:
not_found_topics = labelled_stats(not_founds)
not_found_topics

{'affordance, function': 6,
 'natural features': 2,
 'cultural, historical and social': 2,
 'species': 4,
 'signs': 2}

In [121]:
found_topics = labelled_stats(founds)
found_topics

{'affordance, function': 8,
 'species': 4,
 'cultural, historical and social': 3,
 'natural features': 1,
 'unknown': 1,
 'signs': 1}

In [122]:
refined_topic_agg_df

Unnamed: 0_level_0,topic,is_found,is_found,is_found
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,ratio
65,64,0,26,0.000000
32,31,0,49,0.000000
38,37,0,40,0.000000
12,11,1,75,0.013333
45,44,1,33,0.030303
...,...,...,...,...
52,51,10,31,0.322581
0,-1,3,9,0.333333
66,65,12,28,0.428571
36,35,18,40,0.450000


## Labelling Process:

- label if the geocoding was correct - if geocoding was successful
- find 10 most likely using textual and spatial criteria and record the element as a match

In [123]:
classes_found = list(refined_topic_df['class'].unique())
sup_classes = set([c.split(' ')[0] for c in classes_found if c is not None])
sup_classes

{'amenity',
 'boundary',
 'bridge',
 'building',
 'club',
 'highway',
 'historic',
 'landuse',
 'leisure',
 'man_made',
 'natural',
 'office',
 'place',
 'railway',
 'shop',
 'tourism',
 'waterway'}

In [124]:
tags = dict([(k, True) for k in sup_classes])
tags['animal'] = True
tags

{'landuse': True,
 'tourism': True,
 'leisure': True,
 'railway': True,
 'man_made': True,
 'bridge': True,
 'natural': True,
 'place': True,
 'club': True,
 'amenity': True,
 'highway': True,
 'historic': True,
 'office': True,
 'shop': True,
 'waterway': True,
 'boundary': True,
 'building': True,
 'animal': True}

In [125]:
def get_feature_from_osm(lat, lng, dist=200, tags=tags):
    return ox.features_from_point((lat, lng), tags, dist)

In [126]:
case_id = 23
test_case = nominatim_output[str(case_id)]

In [127]:
test_case

{'walk_id': 2,
 'title': '13. Royal Exhibition Building',
 'summary': 'The Royal Exhibition Building is the only surviving Great Hall that once housed a 19th-century international exhibition and is still used for exhibitions. ',
 'lat': -37.80513488,
 'lng': 144.97123539,
 'osm': {'place_id': 17546919,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
  'osm_type': 'way',
  'osm_id': 4817059,
  'lat': '-37.804666850000004',
  'lon': '144.9714669305319',
  'class': 'historic',
  'type': 'building',
  'place_rank': 30,
  'importance': 0.39044459367468287,
  'addresstype': 'historic',
  'name': 'Royal Exhibition Building',
  'display_name': 'Royal Exhibition Building, 9, Nicholson Street, Carlton, Melbourne, City of Melbourne, Victoria, 3053, Australia',
  'boundingbox': ['-37.8051500', '-37.8041865', '144.9705305', '144.9724671']}}

In [128]:
dist_threshold = 200
features = get_feature_from_osm(test_case['lat'], test_case['lng'], dist=dist_threshold)
features

Unnamed: 0_level_0,Unnamed: 1_level_0,highway,traffic_signals:direction,geometry,access,amenity,fee,operator,toilets:disposal,unisex,wheelchair,...,name:mk,short_name,political_division,heritage,heritage:operator,heritage:website,area,intermittent,salt,water
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,319157917,,,POINT (144.96920 -37.80626),yes,toilets,no,Melbourne City Council,flush,yes,yes,...,,,,,,,,,,
node,368393200,,,POINT (144.97064 -37.80351),,cinema,,Melbourne museum,,,,...,,,,,,,,,,
node,371974432,,,POINT (144.97138 -37.80545),,fountain,,,,,,...,,,,,,,,,,
node,493873180,crossing,,POINT (144.96961 -37.80495),,,,,,,,...,,,,,,,,,,
node,501027469,crossing,,POINT (144.97350 -37.80553),,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
relation,13238592,pedestrian,,"POLYGON ((144.97319 -37.80469, 144.97316 -37.8...",,,,,,,,...,,,,,,,yes,,,
relation,16464561,,,"POLYGON ((144.96877 -37.80399, 144.96879 -37.8...",,,,,,,,...,,,,,,,,,,
relation,16505481,pedestrian,,"POLYGON ((144.97232 -37.80461, 144.97205 -37.8...",,,,,,,,...,,,,,,,,,,
relation,17205856,,,"POLYGON ((144.97222 -37.80608, 144.97215 -37.8...",,,,,,,,...,,,,,,,,no,no,pond


In [129]:
list(features.columns)

['highway',
 'traffic_signals:direction',
 'geometry',
 'access',
 'amenity',
 'fee',
 'operator',
 'toilets:disposal',
 'unisex',
 'wheelchair',
 'crossing',
 'check_date',
 'name',
 'payment:mastercard',
 'payment:visa',
 'phone',
 'screen',
 'website',
 'wikidata',
 'bicycle',
 'artist',
 'covered',
 'drinking_water',
 'indoor',
 'tactile_paving',
 'button_operated',
 'crossing:markings',
 'foot',
 'traffic_signals:sound',
 'traffic_signals:vibration',
 'backrest',
 'source',
 'railway',
 'parking',
 'crossing:island',
 'kerb',
 'bicycle_parking',
 'fountain',
 'leisure',
 'brand',
 'brand:wikidata',
 'brand:wikipedia',
 'operator:wikidata',
 'operator:wikipedia',
 'payment:cash',
 'payment:credit_cards',
 'toilets:wheelchair',
 'place',
 'network',
 'public_transport',
 'ref',
 'tram',
 'natural',
 'bench',
 'bin',
 'bus',
 'lit',
 'network:wikidata',
 'network:wikipedia',
 'ref:ptv_website',
 'route_ref',
 'shelter',
 'capacity',
 'location',
 'historic',
 'traffic_signals',
 'mat

In [130]:
for key, value in nominatim_output.items():
    print('key: {}'.format(key))
    if os.path.isfile('dataset/osm-poi-{0}-dist-{1}-features.geojson'.format(key, dist_threshold)):
        print('already investigated...')
        continue;
    try:
        features = get_feature_from_osm(value['lat'], value['lng'])
        cols = list(features.columns)
        cols.remove('geometry')
        features[cols] = features[cols].astype(str)
        features.to_file('dataset/osm-poi-{0}-dist-{1}-features.geojson'.format(key, dist_threshold), driver="GeoJSON") 
        print('done')
    except Exception as e:
        print(e)

key: 0
already investigated...
key: 1
already investigated...
key: 2
already investigated...
key: 3
already investigated...
key: 4
already investigated...
key: 5
already investigated...
key: 6
already investigated...
key: 7
already investigated...
key: 8
already investigated...
key: 9
already investigated...
key: 10
already investigated...
key: 11
already investigated...
key: 12
already investigated...
key: 13
already investigated...
key: 14
already investigated...
key: 15
already investigated...
key: 16
already investigated...
key: 17
already investigated...
key: 18
already investigated...
key: 19
already investigated...
key: 20
already investigated...
key: 21
already investigated...
key: 22
already investigated...
key: 23
already investigated...
key: 24
already investigated...
key: 25
already investigated...
key: 26
already investigated...
key: 27
already investigated...
key: 28
already investigated...
key: 29
already investigated...
key: 30
already investigated...
key: 31
already in

In [131]:
import pyproj
from shapely.geometry import Point
from shapely.ops import transform

wgs84_pt = Point(test_case['lng'], test_case['lat'])
wgs84 = pyproj.CRS('EPSG:4326')
utm = pyproj.CRS('EPSG:32755')

project = pyproj.Transformer.from_crs(wgs84, utm, always_xy=True).transform

utm_point = transform(project, wgs84_pt)

features = gpd.read_file('dataset/osm-poi-{0}-dist-{1}-features.geojson'.format(case_id, dist_threshold))
features = features.to_crs('EPSG:32755')

features['distance'] = [utm_point.distance(geom) for geom in features.geometry]

In [132]:
features

Unnamed: 0,element_type,osmid,highway,traffic_signals:direction,access,amenity,fee,operator,toilets:disposal,unisex,...,source:population,name:mk,short_name,political_division,heritage,heritage:operator,heritage:website,area,geometry,distance
0,node,319157917,,,yes,toilets,no,Melbourne City Council,flush,yes,...,,,,,,,,,POINT (321223.384 5813738.129),218.599034
1,node,368393200,,,,cinema,,Melbourne museum,,,...,,,,,,,,,POINT (321343.567 5814046.274),188.373535
2,node,371974432,,,,fountain,,,,,...,,,,,,,,,POINT (321413.415 5813831.827),37.211904
3,node,493873180,crossing,,,,,,,,...,,,,,,,,,POINT (321256.686 5813883.765),144.563983
4,node,501027469,crossing,,,,,,,,...,,,,,,,,,POINT (321600.961 5813827.093),204.586235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,relation,6614802,,,,,,Melbourne City Council,,,...,,,,,,,,,"MULTIPOLYGON (((321219.805 5813634.926, 321210...",3.048837
436,relation,6623509,,,,,,,,,...,,,,,1,whc,http://whc.unesco.org/en/list/1131,,"MULTIPOLYGON (((321415.382 5813963.959, 321416...",3.048837
437,relation,13238592,pedestrian,,,,,,,,...,,,,,,,,yes,"POLYGON ((321571.451 5813919.342, 321569.416 5...",96.331620
438,relation,16464561,,,,,,,,,...,,,,,,,,,"POLYGON ((321163.337 5813990.989, 321180.796 5...",209.794588


In [133]:
# filter geometry, distance
# filter nan, no values
# filter values for yes
# involve name, short_name if not nan first
# location inside if exists!
not_consider = ['geometry', 'distance', 'element_type', 'osmid']
consider_first = ['name', 'short_name']
cols = list(features.columns)

In [134]:
def generate_textual_descriptions(row, cols=cols):
    full_name = ''
    for c in consider_first:
        if c in cols and row[c] != 'nan':
            full_name += ' ' + row[c]
    for c in cols:
        if c not in consider_first and c not in not_consider and ':' not in c and row[c] != 'nan' and row[c] != 'no' and 'http' not in row[c] and '[' not in row[c] and 'wiki' not in c:
            if row[c] == 'yes':
                full_name += ' ' + c
            else:
                full_name += ' {0} {1}'.format(c, row[c])
    return full_name.strip()

In [135]:
features['full_name'] = features.apply(generate_textual_descriptions, axis=1)
features

Unnamed: 0,element_type,osmid,highway,traffic_signals:direction,access,amenity,fee,operator,toilets:disposal,unisex,...,name:mk,short_name,political_division,heritage,heritage:operator,heritage:website,area,geometry,distance,full_name
0,node,319157917,,,yes,toilets,no,Melbourne City Council,flush,yes,...,,,,,,,,POINT (321223.384 5813738.129),218.599034,access amenity toilets operator Melbourne City...
1,node,368393200,,,,cinema,,Melbourne museum,,,...,,,,,,,,POINT (321343.567 5814046.274),188.373535,IMAX Melbourne amenity cinema operator Melbour...
2,node,371974432,,,,fountain,,,,,...,,,,,,,,POINT (321413.415 5813831.827),37.211904,amenity fountain
3,node,493873180,crossing,,,,,,,,...,,,,,,,,POINT (321256.686 5813883.765),144.563983,highway crossing crossing zebra
4,node,501027469,crossing,,,,,,,,...,,,,,,,,POINT (321600.961 5813827.093),204.586235,highway crossing crossing traffic_signals tact...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,relation,6614802,,,,,,Melbourne City Council,,,...,,,,,,,,"MULTIPOLYGON (((321219.805 5813634.926, 321210...",3.048837,Carlton Gardens operator Melbourne City Counci...
436,relation,6623509,,,,,,,,,...,,,,1,whc,http://whc.unesco.org/en/list/1131,,"MULTIPOLYGON (((321415.382 5813963.959, 321416...",3.048837,Royal Exhibition Building and Carlton Gardens ...
437,relation,13238592,pedestrian,,,,,,,,...,,,,,,,yes,"POLYGON ((321571.451 5813919.342, 321569.416 5...",96.331620,highway pedestrian type multipolygon area
438,relation,16464561,,,,,,,,,...,,,,,,,,"POLYGON ((321163.337 5813990.989, 321180.796 5...",209.794588,building type multipolygon


In [136]:
def compute_similarities_topk(query, sentences, sentence_embeddings, model=sbert_model, k=10, verbose=False):
    query_vec = embed_texts(query)
    scores = util.dot_score(query_vec, sentence_embeddings)[0].cpu().tolist()
    doc_score_pairs = list(zip(sentences, scores))
    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
    if verbose:
        print("Query:", query)
        for doc, score in doc_score_pairs:
            print('\t', score, '\t', doc)
    return np.argsort(scores)[-k:]

In [137]:
# example_sentence_embeddings = embed_texts(example_sentences, model=msmarco_model)
# compute_similarities(example_poi_osm, example_sentences, example_sentence_embeddings, model=msmarco_model)
feature_descriptions = list(features['full_name'])
feature_embeddings = embed_texts(feature_descriptions, model=msmarco_model)
case_description = nominatim_output[str(case_id)]['title'] + ' ' + nominatim_output[str(case_id)]['summary']
k_similar = compute_similarities_topk(case_description, feature_descriptions, feature_embeddings, model=msmarco_model, verbose=True)
k_similar

Query: 13. Royal Exhibition Building The Royal Exhibition Building is the only surviving Great Hall that once housed a 19th-century international exhibition and is still used for exhibitions. 
	 44.04184341430664 	 Royal Exhibition Building and Carlton Gardens tourism attraction type multipolygon heritage 1
	 40.16721725463867 	 Royal Exhibition Building source Vicmap Address historic building tourism attraction building height 20 layer 1
	 39.15629196166992 	 Royal Exhibition Building Opening historic memorial memorial plaque
	 36.941375732421875 	 Melbourne Museum fee operator Museum Victoria wheelchair phone +61 3 8341 7777 source Vicmap Address tourism museum building layer 1 atm internet_access wlan opening_hours Mo-Su 10:00-17:00,09:00-17:00
	 36.46418380737305 	 access historic monument inscription To Victoria from one of her earliest colonists in pleasant remeberance 1840 - 88
	 35.37289047241211 	 Exhibition Building/Rathdowne Street highway bus_stop tactile_paving network PTV

array([ 46,   1,  26,  27,  44,  49, 298,  43, 297, 436], dtype=int64)

In [138]:
features.iloc[k_similar]

Unnamed: 0,element_type,osmid,highway,traffic_signals:direction,access,amenity,fee,operator,toilets:disposal,unisex,...,name:mk,short_name,political_division,heritage,heritage:operator,heritage:website,area,geometry,distance,full_name
46,node,9106562132,,,,,,,,,...,,,,,,,,POINT (321559.646 5813927.545),170.659614,artwork_type sculpture tourism artwork
1,node,368393200,,,,cinema,,Melbourne museum,,,...,,,,,,,,POINT (321343.567 5814046.274),188.373535,IMAX Melbourne amenity cinema operator Melbour...
26,node,4061250667,bus_stop,,,,,,,,...,,,,,,,,POINT (321243.477 5813954.530),179.719018,Exhibition Building/Rathdowne Street highway b...
27,node,4332324003,,,,charging_station,,Museums Victoria,,,...,,,,,,,,POINT (321463.713 5813962.011),114.58008,amenity charging_station operator Museums Vict...
44,node,7248901076,bus_stop,,,,,,,,...,,,,,,,,POINT (321211.629 5813902.092),191.907516,Exhibition Building/Rathdowne Street highway b...
49,node,9307551791,,,yes,,,,,,...,,,,,,,,POINT (321592.650 5813874.406),192.577146,access historic monument inscription To Victor...
298,way,4817074,,,,,yes,Museum Victoria,,,...,,,,,,,,"POLYGON ((321505.691 5814023.082, 321459.066 5...",159.043248,Melbourne Museum fee operator Museum Victoria ...
43,node,6810298878,,,,,,,,,...,,,,,,,,POINT (321503.096 5813888.150),105.093724,Royal Exhibition Building Opening historic mem...
297,way,4817059,,,,,,,,,...,,,,,,,,"POLYGON ((321415.382 5813963.959, 321416.338 5...",11.700249,Royal Exhibition Building source Vicmap Addres...
436,relation,6623509,,,,,,,,,...,,,,1.0,whc,http://whc.unesco.org/en/list/1131,,"MULTIPOLYGON (((321415.382 5813963.959, 321416...",3.048837,Royal Exhibition Building and Carlton Gardens ...


In [139]:
def get_top_k(case_id, k=10, model = msmarco_model, verbose=False):
    test_case = nominatim_output[str(case_id)]
    
    features = gpd.read_file('dataset/osm-poi-{0}-dist-{1}-features.geojson'.format(case_id, dist_threshold))
    cols = list(features.columns)
    if verbose:
        print(features.head())
        print(cols)
    features['full_name'] = features.apply(lambda row: generate_textual_descriptions(row, cols), axis=1)
    
    feature_descriptions = list(features['full_name'])
    feature_embeddings = embed_texts(feature_descriptions, model=model)
    
    case_description = test_case['title'] + ' ' + test_case['summary']
    k_similar = compute_similarities_topk(case_description, feature_descriptions, feature_embeddings, model=model, verbose=verbose)
    return features.iloc[np.flip(k_similar)]

## Annotation Experiment

In [140]:
if os.path.isfile('dataset/annotated-osm-entities.json'):
    with open('dataset/annotated-osm-entities.json', 'r') as fp:
        annotations = json.load(fp)
else:
    annotations = {}
annotations

{'0': [['way', 1089591567]],
 '1': [['way', 1007494584]],
 '2': [],
 '3': [],
 '4': [['way', 30501938]],
 '5': [],
 '6': [['way', 69366108], ['node', 5315720235]],
 '7': [['node', 11158854687],
  ['node', 4583492091],
  ['node', 831201411],
  ['node', 11158854665],
  ['node', 11158854688]],
 '8': [['node', 3621405496]],
 '9': [],
 '10': [['node', 831201041]],
 '11': [['node', 3933086392]],
 '12': [['way', 26564416]],
 '13': [['way', 49961266]],
 '14': [['way', 26564392]],
 '15': [['node', 5371634608]],
 '16': [['node', 7228362154], ['way', 32710391]],
 '17': [['way', 435967756]],
 '18': [],
 '19': [],
 '20': [['way', 1016758027]],
 '21': [['node', 368393200]],
 '22': [['way', 4817074]],
 '23': [['way', 4817059]],
 '24': [['relation', 6614802]],
 '25': [['way', 29369985]],
 '26': [['way', 30066634]],
 '27': [['way', 830995835], ['way', 830995836], ['way', 830995837]],
 '28': [['node', 6515021014]],
 '29': [['node', 331165028]],
 '30': [['node', 243419251]],
 '31': [['node', 649798969]],

### Iterate from here!

In [176]:
already_investigated = [int(key) for key in annotations.keys()]
case_id = max(already_investigated) + 1
print(case_id)
print(nominatim_output[str(case_id)])
top_k_df = get_top_k(case_id)
top_k_df[['element_type','osmid', 'name', 'full_name']]

341
{'walk_id': 23, 'title': 'Playground', 'summary': "A colourful playground to let your little ones run wild (if you've got kids to entertain).", 'lat': -37.77917777, 'lng': 144.96626599, 'osm': None}


Unnamed: 0,element_type,osmid,name,full_name
139,way,715659053,,leisure playground
85,way,198843622,Hardy Gallagher Reserve,Hardy Gallagher Reserve leisure park
146,way,1100733362,,leisure park
52,way,22787631,Communal Gardens,Communal Gardens leisure garden
54,way,26131217,Linear Park Reserve,Linear Park Reserve leisure park
21,node,10074058696,,natural tree
27,node,10074058702,,natural tree
25,node,10074058700,,natural tree
24,node,10074058699,,natural tree
23,node,10074058698,,natural tree


In [177]:
gdf.iloc[case_id:case_id+1].explore(marker_type='marker')

In [178]:
print('https://www.openstreetmap.org/#map=18/{0}/{1}'.format(nominatim_output[str(case_id)]['lat'], nominatim_output[str(case_id)]['lng']))

https://www.openstreetmap.org/#map=18/-37.77917777/144.96626599


In [179]:
annotations[case_id] = [('way', 715659053)]
if len(annotations.keys()) % 10 == 0:
    with open('dataset/annotated-osm-entities.json', 'w', encoding='utf-8') as fp:
        json.dump(annotations, fp)
    print('annotation file saved')
annotations

annotation file saved


{'0': [['way', 1089591567]],
 '1': [['way', 1007494584]],
 '2': [],
 '3': [],
 '4': [['way', 30501938]],
 '5': [],
 '6': [['way', 69366108], ['node', 5315720235]],
 '7': [['node', 11158854687],
  ['node', 4583492091],
  ['node', 831201411],
  ['node', 11158854665],
  ['node', 11158854688]],
 '8': [['node', 3621405496]],
 '9': [],
 '10': [['node', 831201041]],
 '11': [['node', 3933086392]],
 '12': [['way', 26564416]],
 '13': [['way', 49961266]],
 '14': [['way', 26564392]],
 '15': [['node', 5371634608]],
 '16': [['node', 7228362154], ['way', 32710391]],
 '17': [['way', 435967756]],
 '18': [],
 '19': [],
 '20': [['way', 1016758027]],
 '21': [['node', 368393200]],
 '22': [['way', 4817074]],
 '23': [['way', 4817059]],
 '24': [['relation', 6614802]],
 '25': [['way', 29369985]],
 '26': [['way', 30066634]],
 '27': [['way', 830995835], ['way', 830995836], ['way', 830995837]],
 '28': [['node', 6515021014]],
 '29': [['node', 331165028]],
 '30': [['node', 243419251]],
 '31': [['node', 649798969]],

## Annotation Analysis

- a place
- a path
- a vista

In [180]:
api_endpoint = 'https://www.openstreetmap.org/api/0.6/{0}/{1}.json'

def get_info(otype, oid):
    if otype not in ['node', 'way', 'relation']:
        print('wrong type - {}'.format(otype))
    resp = requests.get(api_endpoint.format(otype, oid), headers=headers)
    return resp.json()

In [181]:
get_info('node', 4506300751)

{'version': '0.6',
 'generator': 'CGImap 0.8.10 (1564108 spike-06.openstreetmap.org)',
 'copyright': 'OpenStreetMap and contributors',
 'attribution': 'http://www.openstreetmap.org/copyright',
 'license': 'http://opendatacommons.org/licenses/odbl/1-0/',
 'elements': [{'type': 'node',
   'id': 4506300751,
   'lat': 50.405,
   'lon': 9.2805862,
   'timestamp': '2023-01-21T17:13:36Z',
   'version': 5,
   'changeset': 131548253,
   'user': 'cEvLGWiQ',
   'uid': 5432507,
   'tags': {'addr:city': 'Birstein',
    'addr:housenumber': '1',
    'addr:postcode': '63633',
    'addr:street': 'Volkartshainer Weg',
    'name': 'Scheffehof',
    'phone': '+496054 6920',
    'tourism': 'trail_riding_station'}}]}