# POIs from OSM

Aim is to download all candidate POIs from OSM. This set is used to provide suggestions for manaul annotations and later to build a candidate set of not-recommended POIs.

## Setup

Loading libraries and models

In [1]:
# working with files
import os.path
# sys
import sys

# warning off
import warnings
# IO
import json
# calling Webservices
import requests
# systematic thread stops for polite crawling
import time

# set random seed for reproducibility of results
from umap import UMAP

# dataframe 
import numpy as np
import pandas as pd
import geopandas as gpd

# geocoding
from geopy.geocoders import Nominatim

# getting data from OSM
import osmnx as ox

# topic modelling
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

# nlp
from sentence_transformers import SentenceTransformer, util
import spacy
from nltk.corpus import stopwords

# visualization
import matplotlib.pyplot as plt

# logging
from loguru import logger

# set logger level
logger.remove(0)
logger.add(sys.stderr, level="INFO")

warnings.filterwarnings("ignore")

# en_core_web_lg must be downloaded, if not run: 'python -m spacy download en_core_web_lg' first!
nlp = spacy.load('en_core_web_lg')

umap_model = UMAP(random_state=42)

stopword_removal = False

## Dataset

Reading the dataset crawled from WalkingMap website.

In [2]:
with open('dataset/walkingmaps.json', 'r', encoding='utf-8') as fp:
    dataset = json.load(fp)

In [3]:
logger.debug(f'an example record in dataset: {dataset[1]}')
logger.info(f'structure of records in dataset: {dataset[1].keys()}')

[32m2025-01-25 20:25:12.429[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mstructure of records in dataset: dict_keys(['markers', 'pathDetails', 'pois', 'title', 'description'])[0m


### Dataset Transformation

Aim: Transforming the dataset into pandas and geopandas dataframes, with a focus on POIs

A basic preprocessing step to create a dataset of POI descriptions, also including A preliminary analysis their location.

In [4]:
counter = 0
total = 0
for record in dataset:
    if 'pois' in record.keys() and len(record['pois']) > 0:
        counter+=1
        total += len(record['pois'])
logger.info('records: {0} total POIs: {1} - average per record: {2}'.format(counter, total, round(total/counter)))

[32m2025-01-25 20:25:14.304[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mrecords: 386 total POIs: 4392 - average per record: 11[0m


In [5]:
import statistics 

verbal_descriptions = []
poi_descriptions = []
for record in dataset:
    if 'pois' in record.keys() and len(record['pois']) > 0:
        vb = record['title']+' '+record['description']
        verbal_descriptions.append(vb)
        for poi in record['pois']:
            pvb = poi['title']+' '+poi['summary']
            poi_descriptions.append(pvb)

wc_vb = [len(vb.split(' ')) for vb in verbal_descriptions]
logger.info(f'word count verbal description \n\t- average: {int(statistics.mean(wc_vb))} - median: {statistics.median(wc_vb)} - min: {min(wc_vb)} - max: {max(wc_vb)}')
wc_pvb = [len(pvb.split(' ')) for pvb in poi_descriptions]
logger.info(f'word count POI verbal description \n\t- average: {int(statistics.mean(wc_pvb))} - median: {statistics.median(wc_pvb)} - min: {min(wc_pvb)} - max: {max(wc_pvb)}')
cc_vb = [len(vb) for vb in verbal_descriptions]
logger.info(f'character count verbal description \n\t- average: {int(statistics.mean(cc_vb))} - median: {statistics.median(cc_vb)} - min: {min(cc_vb)} - max: {max(cc_vb)}')
cc_pvb = [len(pvb) for pvb in poi_descriptions]
logger.info(f'character count POI verbal description \n\t- average: {int(statistics.mean(cc_pvb))} - median: {statistics.median(cc_pvb)} - min: {min(cc_pvb)} - max: {max(cc_pvb)}')

[32m2025-01-25 20:25:18.824[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mword count verbal description 
	- average: 181 - median: 130.0 - min: 7 - max: 540[0m
[32m2025-01-25 20:25:18.836[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mword count POI verbal description 
	- average: 22 - median: 23.0 - min: 2 - max: 116[0m
[32m2025-01-25 20:25:18.837[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mcharacter count verbal description 
	- average: 1062 - median: 764.0 - min: 43 - max: 3052[0m
[32m2025-01-25 20:25:18.840[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m20[0m - [1mcharacter count POI verbal description 
	- average: 130 - median: 127.0 - min: 11 - max: 296[0m


In [6]:
data_structure = {'record_title':[], 'record_description':[], 'poi_title':[], 'poi_summary':[], 'latitude': [], 'longitude': []}
for record in dataset:
    if 'pois' in record.keys() and len(record['pois']) > 0:
        for poi in record['pois']:
            data_structure['record_title'].append(record['title'])
            data_structure['record_description'].append(record['description'])
            data_structure['poi_title'].append(poi['title'])
            data_structure['poi_summary'].append(poi['summary'])
            data_structure['latitude'].append(poi['lat'])
            data_structure['longitude'].append(poi['lng'])

In [7]:
df = pd.DataFrame(data_structure)

gdf = gpd.GeoDataFrame(df[['poi_title', 'poi_summary', 'latitude', 'longitude']], geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326")

In [9]:
gdf.head()

Unnamed: 0,poi_title,poi_summary,latitude,longitude,geometry
0,Fairhaven Surf Life Saving Club,Fairhaven is a well known surf beach. The beac...,-38.468759,144.084459,POINT (144.08446 -38.46876)
1,Beach walk,"From Sprout Creek, Eastern View, Moggs Creek, ...",-38.468542,144.089693,POINT (144.08969 -38.46854)
2,Rock pools,See what sort of shells and stones you can col...,-38.468459,144.09242,POINT (144.09242 -38.46846)
3,Sand dunes,The beautiful rolling sand dunes shape the bea...,-38.468418,144.095318,POINT (144.09532 -38.46842)
4,Painkalac Creek,The creek separates Aireys Inlet from Fairhave...,-38.46839,144.097312,POINT (144.09731 -38.46839)


In [10]:
df.head()

Unnamed: 0,record_title,record_description,poi_title,poi_summary,latitude,longitude
0,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Fairhaven Surf Life Saving Club,Fairhaven is a well known surf beach. The beac...,-38.468759,144.084459
1,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Beach walk,"From Sprout Creek, Eastern View, Moggs Creek, ...",-38.468542,144.089693
2,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Rock pools,See what sort of shells and stones you can col...,-38.468459,144.09242
3,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Sand dunes,The beautiful rolling sand dunes shape the bea...,-38.468418,144.095318
4,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Painkalac Creek,The creek separates Aireys Inlet from Fairhave...,-38.46839,144.097312


## OSM Points of Interests

**Aim**: Collect rich OSM POI information in the bounding box area of the lesiure walk.

**Approach**: Using OSM tags for `{'amenity': True, 'natural': True, 'animal': True, 'leisure': True}` to collect information inside the bounding boxes of leisure walks.

In [11]:
# bounding box of each path
paths = []
for record in dataset:
    min_lat = 90
    max_lat = -90
    min_lng = 180
    max_lng = -180
    for latlng in record['pathDetails']:
        if min_lat > latlng['lat']:
            min_lat = latlng['lat']
        if max_lat < latlng['lat']:
            max_lat = latlng['lat']
        if min_lng > latlng['lng']:
            min_lng = latlng['lng']
        if max_lng < latlng['lng']:
            max_lng = latlng['lng']
    paths.append({'min_lat': min_lat, 'max_lat': max_lat, 'min_lng': min_lng, 'max_lng': max_lng})

In [12]:
logger.debug(f'all path bounding boxes: {paths}')  # bounding box information leisure walks
logger.info(f'example path bounding boxes: {paths[0]}')  # bounding box information leisure walks

[32m2025-01-25 20:25:55.110[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mexample path bounding boxes: {'min_lat': -37.82326007, 'max_lat': -37.81401352, 'min_lng': 144.96751249, 'max_lng': 144.97828424}[0m


In [13]:
tags = {'amenity': True, 'natural': True, 'animal': True, 'leisure': True}

In [14]:
path = paths[0]
feature_gdf = ox.features_from_bbox(north=path['max_lat'], south=path['min_lat'], east=path['max_lng'], west=path['min_lng'], tags=tags)

In [15]:
feature_gdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,addr:city,addr:housenumber,addr:postcode,addr:street,amenity,name,operator,website,wikidata,geometry,...,motor_vehicle,contact:instagram,building:part,not:operator:wikidata,water,unisex,ways,type,intermittent,salt
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,176729780,Melbourne,191.0,3000.0,Collins Street,theatre,Regent Theatre,Marriner Group,https://www.marrinergroup.com.au/theatre-regen...,Q7308110,POINT (144.96760 -37.81550),...,,,,,,,,,,
node,247024808,,,,,parking_entrance,,,,,POINT (144.97019 -37.81548),...,,,,,,,,,,
node,247689970,,,,,parking_entrance,,,,,POINT (144.97070 -37.81789),...,,,,,,,,,,
node,266733834,,,,,parking,Sofitel Hotel Carpark,Wilson Parking,,,POINT (144.97302 -37.81451),...,,,,,,,,,,
node,304169365,,,,,theatre,Playhouse Theatre,,,,POINT (144.96840 -37.82172),...,,,,,,,,,,


In [16]:
logger.debug(f'feature columns: {feature_gdf.columns}')
logger.info(f'number of feature columns in feature gdf: {len(feature_gdf.columns)}')

[32m2025-01-25 20:26:01.589[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mnumber of feature columns in feature gdf: 139[0m


In [17]:
feature_gdf.amenity = feature_gdf.amenity.astype(str)
feature_gdf.natural = feature_gdf.natural.astype(str)
feature_gdf.leisure = feature_gdf.leisure.astype(str)
feature_gdf.name = feature_gdf.name.astype(str)

In [18]:
feature_gdf = feature_gdf[['name', 'amenity', 'natural', 'leisure', 'geometry']].dropna(how='all')
feature_gdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,amenity,natural,leisure,geometry
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
node,176729780,Regent Theatre,theatre,,,POINT (144.96760 -37.81550)
node,247024808,,parking_entrance,,,POINT (144.97019 -37.81548)
node,247689970,,parking_entrance,,,POINT (144.97070 -37.81789)
node,266733834,Sofitel Hotel Carpark,parking,,,POINT (144.97302 -37.81451)
node,304169365,Playhouse Theatre,theatre,,,POINT (144.96840 -37.82172)


In [19]:
for idx, path in enumerate(paths):
    
    if os.path.isfile('dataset/features-osm-{}.geojson'.format(idx)):
        logger.debug('features for path {0} out of {1} is already loaded and saved.'.format(idx, len(paths)))
        continue;
    try:
        feature_gdf = ox.features_from_bbox(north=path['max_lat'], south=path['min_lat'], east=path['max_lng'], west=path['min_lng'], tags=tags)
        cols = feature_gdf.columns
        #preprocess
        if 'amenity' not in cols:
            feature_gdf['amenity'] = np.nan
        feature_gdf.amenity = feature_gdf.amenity.astype(str)    
        if 'natural' not in cols:
            feature_gdf['natural'] = np.nan
        feature_gdf.natural = feature_gdf.natural.astype(str)    
        if 'leisure' not in cols:
            feature_gdf['leisure'] = np.nan
        feature_gdf.leisure = feature_gdf.leisure.astype(str)
        if 'name' not in cols:
            feature_gdf['name'] = np.nan
        feature_gdf.name = feature_gdf.name.astype(str)
        feature_gdf = feature_gdf[['name', 'amenity', 'natural', 'leisure', 'geometry']].dropna(how='all')
    
        feature_gdf.to_file("dataset/features-osm-{}.geojson".format(idx), driver='GeoJSON')
        logger.info('features for path {0} out of {1} is loaded from OSM and saved ...'.format(idx, len(paths)))
    except Exception as e:
        logger.warning('error in writing path {0} out of {1}...'.format(idx, len(paths)))
        logger.warning(e)



In [20]:
feature_gdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,amenity,natural,leisure,geometry
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
node,176729780,Regent Theatre,theatre,,,POINT (144.96760 -37.81550)
node,247024808,,parking_entrance,,,POINT (144.97019 -37.81548)
node,247689970,,parking_entrance,,,POINT (144.97070 -37.81789)
node,266733834,Sofitel Hotel Carpark,parking,,,POINT (144.97302 -37.81451)
node,304169365,Playhouse Theatre,theatre,,,POINT (144.96840 -37.82172)


## Matching POIs to OSM POIs

**Aim** To match collected OSM POIs with described POIs

**Approach** using textual matching of POIs description to OSM tags (*semantic criterion*) and spatial matching based on proximity (*spatial criterion*):

- spatial criterion: defining containment
- semantic criterion: defining semantic similarity using word embeddings

### Semantic Matching

Ranking the relevance of textual descriptions in OSM POIs and LW POIS

Example to test how it works

In [21]:
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens') # symmetric semantic search
msmarco_model = SentenceTransformer('sentence-transformers/msmarco-distilbert-dot-v5')  # asymmetric semantic search

# embedding derived from BERT for the pois
def embed_texts(sentences, model=sbert_model):
    sentence_embeddings = model.encode(sentences)
    return sentence_embeddings


def compute_similarities(query, sentences, sentence_embeddings, model=sbert_model):
    query_vec = embed_texts(query)
    scores = util.dot_score(query_vec, sentence_embeddings)[0].cpu().tolist()
    doc_score_pairs = list(zip(sentences, scores))
    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
    logger.debug("Query:", query)
    for doc, score in doc_score_pairs:
        logger.info(f'\t{score}\t{doc}')
    return doc_score_pairs

In [22]:
def only_noun_phrases(sentence):
    doc = nlp(sentence)
    phrases = set() 
    for nc in doc.noun_chunks:
        phrases.add(nc.text)
        phrases.add(doc[nc.root.left_edge.i:nc.root.right_edge.i+1].text)
    return ' '.join(phrases)

In [23]:
example_poi_osm = "Gaswork park theatre"
example_sentences = ["Gasworks Park: There are artists studios, a theatre and a cafe. Every 3rd Saturday there is a Farmers' Market.  In the park you'll also come across various wonderful sculptures and installations.",
                                                 "Australia's Number One university and world leader in education, teaching and research excellence.",
                                                 "Completed in 1870, the Melbourne Town Hall is at the heart of the city's cultural and civic activity",
                                                 "The magnificent octagonal domed reading room is both a quiet space for study and an iconic Melbourne location to take an unforgettable selfie.?"]
example_sentence_embeddings = embed_texts(example_sentences)
compute_similarities(example_poi_osm, example_sentences, example_sentence_embeddings)

[32m2025-01-25 20:26:15.807[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	112.580322265625	Completed in 1870, the Melbourne Town Hall is at the heart of the city's cultural and civic activity[0m
[32m2025-01-25 20:26:15.808[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	92.63180541992188	Gasworks Park: There are artists studios, a theatre and a cafe. Every 3rd Saturday there is a Farmers' Market.  In the park you'll also come across various wonderful sculptures and installations.[0m
[32m2025-01-25 20:26:15.809[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	84.313232421875	The magnificent octagonal domed reading room is both a quiet space for study and an iconic Melbourne location to take an unforgettable selfie.?[0m
[32m2025-01-25 20:26:15.809[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	62.572380065

[("Completed in 1870, the Melbourne Town Hall is at the heart of the city's cultural and civic activity",
  112.580322265625),
 ("Gasworks Park: There are artists studios, a theatre and a cafe. Every 3rd Saturday there is a Farmers' Market.  In the park you'll also come across various wonderful sculptures and installations.",
  92.63180541992188),
 ('The magnificent octagonal domed reading room is both a quiet space for study and an iconic Melbourne location to take an unforgettable selfie.?',
  84.313232421875),
 ("Australia's Number One university and world leader in education, teaching and research excellence.",
  62.57238006591797)]

In [24]:
# with preprocessing
only_noun_example_sentences = [only_noun_phrases(sentence) for sentence in example_sentences]
example_sentence_embeddings = embed_texts(only_noun_example_sentences)
compute_similarities(example_poi_osm, only_noun_example_sentences, example_sentence_embeddings)

[32m2025-01-25 20:26:16.815[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	125.21781921386719	a theatre and a cafe a cafe various wonderful sculptures and installations a Farmers' Market the park artists studios artists studios, a theatre and a cafe various wonderful sculptures you a theatre Gasworks Park: Gasworks Park installations[0m
[32m2025-01-25 20:26:16.816[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	121.1483154296875	the heart of the city's cultural and civic activity the city's cultural and civic activity the heart the Melbourne Town Hall[0m
[32m2025-01-25 20:26:16.816[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	72.54638671875	an iconic Melbourne location The magnificent octagonal domed reading room a quiet space for study and an iconic Melbourne location to take an unforgettable selfie a quiet space study an unforgettable selfie an 

[("a theatre and a cafe a cafe various wonderful sculptures and installations a Farmers' Market the park artists studios artists studios, a theatre and a cafe various wonderful sculptures you a theatre Gasworks Park: Gasworks Park installations",
  125.21781921386719),
 ("the heart of the city's cultural and civic activity the city's cultural and civic activity the heart the Melbourne Town Hall",
  121.1483154296875),
 ('an iconic Melbourne location The magnificent octagonal domed reading room a quiet space for study and an iconic Melbourne location to take an unforgettable selfie a quiet space study an unforgettable selfie an iconic Melbourne location to take an unforgettable selfie',
  72.54638671875),
 ("education Australia's Number One university and world leader research Australia's Number One university and world leader in education, teaching and research excellence. teaching teaching and research excellence excellence education, teaching and research excellence",
  60.1592559814

In [25]:
example_sentence_embeddings = embed_texts(example_sentences, model=msmarco_model)
compute_similarities(example_poi_osm, example_sentences, example_sentence_embeddings, model=msmarco_model)

[32m2025-01-25 20:26:17.606[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	42.83140563964844	Gasworks Park: There are artists studios, a theatre and a cafe. Every 3rd Saturday there is a Farmers' Market.  In the park you'll also come across various wonderful sculptures and installations.[0m
[32m2025-01-25 20:26:17.606[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	30.717838287353516	Completed in 1870, the Melbourne Town Hall is at the heart of the city's cultural and civic activity[0m
[32m2025-01-25 20:26:17.607[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	25.081661224365234	The magnificent octagonal domed reading room is both a quiet space for study and an iconic Melbourne location to take an unforgettable selfie.?[0m
[32m2025-01-25 20:26:17.607[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	18.9592

[("Gasworks Park: There are artists studios, a theatre and a cafe. Every 3rd Saturday there is a Farmers' Market.  In the park you'll also come across various wonderful sculptures and installations.",
  42.83140563964844),
 ("Completed in 1870, the Melbourne Town Hall is at the heart of the city's cultural and civic activity",
  30.717838287353516),
 ('The magnificent octagonal domed reading room is both a quiet space for study and an iconic Melbourne location to take an unforgettable selfie.?',
  25.081661224365234),
 ("Australia's Number One university and world leader in education, teaching and research excellence.",
  18.959270477294922)]

In [26]:
example_sentence_embeddings = embed_texts(only_noun_example_sentences, model=msmarco_model)
compute_similarities(example_poi_osm, only_noun_example_sentences, example_sentence_embeddings, model=msmarco_model)

[32m2025-01-25 20:26:19.009[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	43.246639251708984	a theatre and a cafe a cafe various wonderful sculptures and installations a Farmers' Market the park artists studios artists studios, a theatre and a cafe various wonderful sculptures you a theatre Gasworks Park: Gasworks Park installations[0m
[32m2025-01-25 20:26:19.010[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	30.413190841674805	the heart of the city's cultural and civic activity the city's cultural and civic activity the heart the Melbourne Town Hall[0m
[32m2025-01-25 20:26:19.010[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities[0m:[36m17[0m - [1m	24.648412704467773	an iconic Melbourne location The magnificent octagonal domed reading room a quiet space for study and an iconic Melbourne location to take an unforgettable selfie a quiet space study an unforgettable selfi

[("a theatre and a cafe a cafe various wonderful sculptures and installations a Farmers' Market the park artists studios artists studios, a theatre and a cafe various wonderful sculptures you a theatre Gasworks Park: Gasworks Park installations",
  43.246639251708984),
 ("the heart of the city's cultural and civic activity the city's cultural and civic activity the heart the Melbourne Town Hall",
  30.413190841674805),
 ('an iconic Melbourne location The magnificent octagonal domed reading room a quiet space for study and an iconic Melbourne location to take an unforgettable selfie a quiet space study an unforgettable selfie an iconic Melbourne location to take an unforgettable selfie',
  24.648412704467773),
 ("education Australia's Number One university and world leader research Australia's Number One university and world leader in education, teaching and research excellence. teaching teaching and research excellence excellence education, teaching and research excellence",
  18.05719

#### Conclusions

The process of matching cannot be fully autoamted - the tasks is more complex than using BERT embeddings for matching. Even trained embedding on MSMARCO do not lead to good performance and seems to be confused between correct and incorrect matches.

### Matching: Case Investigation

Checking the POIs in description with respect to OSM POIs

**Aim**: Manually checking few examples in the dataset to see how the descriptions provided by people is different from tags stored in OSM to design a better approach for labelling the dataset.

**Approach**: Given a case_id (walk), we pull all the POI information in WalkingMap dataset and extracted POIs from OSM in previous step, and we analysis spatial and semantic criteria and their success/failure in performing matching process.

In [28]:
def get_case(idx):
    record = dataset[idx]  
    pois = {'title': [], 'summary': [], 'lat': [], 'lng': []}
    for poi in record['pois']:
        pois['title'].append(poi['title'])
        pois['summary'].append(poi['summary'])
        pois['lat'].append(poi['lat'])
        pois['lng'].append(poi['lng'])
        
    df = pd.DataFrame(pois)
    gdf = gpd.GeoDataFrame(df[['title', 'summary', 'lat', 'lng']],
                           geometry=gpd.points_from_xy(df.lng, df.lat), crs="EPSG:4326")
    
    if os.path.isfile('dataset/features-osm-{}.geojson'.format(idx)):
        osm_pois = gpd.read_file('dataset/features-osm-{}.geojson'.format(idx))
    else:
        logger.warning('OSM features are not loaded - potentially empty dataframe')
        osm_pois = None
    return gdf, osm_pois

In [29]:
lw_poi, osm_poi = get_case(1)

In [30]:
lw_poi

Unnamed: 0,title,summary,lat,lng,geometry
0,Fairhaven Surf Life Saving Club,Fairhaven is a well known surf beach. The beac...,-38.468759,144.084459,POINT (144.08446 -38.46876)
1,Beach walk,"From Sprout Creek, Eastern View, Moggs Creek, ...",-38.468542,144.089693,POINT (144.08969 -38.46854)
2,Rock pools,See what sort of shells and stones you can col...,-38.468459,144.09242,POINT (144.09242 -38.46846)
3,Sand dunes,The beautiful rolling sand dunes shape the bea...,-38.468418,144.095318,POINT (144.09532 -38.46842)
4,Painkalac Creek,The creek separates Aireys Inlet from Fairhave...,-38.46839,144.097312,POINT (144.09731 -38.46839)
5,Rocks and caves under the light house,There are more rockpools and rocky outcrops to...,-38.468822,144.100861,POINT (144.10086 -38.46882)
6,Aireys Inlet playground and picnic ground,There is small skateboard ramp for children to...,-38.466199,144.098772,POINT (144.09877 -38.46620)
7,Aireys Inlet lower shops,"Pick up a coffee, newspaper or Fish and Chips!...",-38.465536,144.098801,POINT (144.09880 -38.46554)
8,Loutit Bay lookout,Return to Painkalac Creek inlet and walk to th...,-38.467916,144.103435,POINT (144.10344 -38.46792)
9,Historical homestead and building,At the lighthouse is the original homestead fo...,-38.468048,144.103832,POINT (144.10383 -38.46805)


In [31]:
osm_poi

Unnamed: 0,element_type,osmid,name,amenity,natural,leisure,geometry
0,node,831201200,,toilets,,,POINT (144.09837 -38.46594)
1,node,831201305,,toilets,,,POINT (144.10104 -38.46734)
2,node,831201411,,bbq,,,POINT (144.09857 -38.46592)
3,node,831201826,,shelter,,,POINT (144.09864 -38.46595)
4,node,5315720235,,,,picnic_table,POINT (144.10073 -38.46681)
5,node,8568393481,,waste_basket,,,POINT (144.10029 -38.46666)
6,way,30501938,Painkalac Creek Estuary,,water,,"POLYGON ((144.09591 -38.46359, 144.09625 -38.4..."
7,way,69366065,,parking,,,"POLYGON ((144.10000 -38.46659, 144.09998 -38.4..."
8,way,69366070,,parking,,,"POLYGON ((144.09836 -38.46586, 144.09836 -38.4..."
9,way,69366078,,,,playground,"POLYGON ((144.10198 -38.46581, 144.10217 -38.4..."


In [32]:
# projection 

lw_projected = lw_poi.to_crs("EPSG:32755")
osm_projected = osm_poi.to_crs("EPSG:32755")

In [33]:
poi_case = 6  # analysing a specific POI in the fetched case 

lw_poi.iloc[poi_case]

title               Aireys Inlet playground and picnic ground
summary     There is small skateboard ramp for children to...
lat                                                -38.466199
lng                                                144.098772
geometry                    POINT (144.09877169 -38.46619881)
Name: 6, dtype: object

In [34]:
osm_projected['distance_to_{}'.format(poi_case)] = osm_projected.distance(lw_projected.iloc[poi_case]['geometry'])
osm_projected.sort_values(by='distance_to_{}'.format(poi_case))

Unnamed: 0,element_type,osmid,name,amenity,natural,leisure,geometry,distance_to_6
17,way,283542690,,parking,,,"POLYGON ((246877.221 5738486.891, 246882.476 5...",0.902516
14,way,69560073,Aireys Inlet Reserve,,,park,"POLYGON ((246878.250 5738531.576, 246875.373 5...",8.479294
10,way,69366081,Aireys Inlet Skate Park,,,pitch,"POLYGON ((246840.654 5738478.916, 246862.497 5...",13.688306
12,way,69366108,,,,playground,"POLYGON ((246848.251 5738495.055, 246867.426 5...",15.267822
3,node,831201826,,shelter,,,POINT (246863.920 5738496.515),30.304774
2,node,831201411,,bbq,,,POINT (246857.373 5738499.131),35.68771
8,way,69366070,,parking,,,"POLYGON ((246838.886 5738505.614, 246838.829 5...",37.170695
0,node,831201200,,toilets,,,POINT (246840.125 5738496.254),45.318589
6,way,30501938,Painkalac Creek Estuary,,water,,"POLYGON ((246617.006 5738750.566, 246645.487 5...",51.867211
13,way,69366116,,parking,,,"POLYGON ((246876.055 5738551.594, 246888.002 5...",78.70546


#### Conclusions:

With manual investigation, we find out that spatial criterion can be a case to filter unwanted records but matching process needs more information than just matching by location because:

1. POIs visible vs. POIs in nearby: Sometime people describe a place or object in nearby, sometimes the actual POI is far and the location in leisure walk is just a place to see that POI
2. OSM and LW location errors
3. No match: Not always we can match POIs in LW to OSM, as there are cases missing...
4. Multiple matches: Not always a POI described in LW can be matched with only 1 OSM record - different conceptualization, possible ambiguity in description or OSM data -- e.g., a playground described in a park, but in OSM we have three different objects labelled as playground and all near to the location provided in LW.

In [35]:
# todo - maybe creating a dataset as well! the task is actually difficult!
def generate_req_id(row):
    return row['element_type'][0].upper()+str(row['osmid'])

osm_poi['req_id'] = osm_poi.apply(generate_req_id, axis=1)
logger.debug(f'{osm_poi.req_id.tolist()}')

In [36]:
req_ids = set()

# read all osm ids and save in a file
for idx, path in enumerate(paths):
    if os.path.isfile('dataset/features-osm-{}.geojson'.format(idx)):
        osm_poi = gpd.read_file('dataset/features-osm-{}.geojson'.format(idx))
        osm_poi['req_id'] = osm_poi.apply(generate_req_id, axis=1)
        req_ids.update(osm_poi.req_id.tolist())

In [37]:
headers = {"Content-Type": "application/json; charset=utf-8"}
address_endpoint_template = "https://nominatim.openstreetmap.org/lookup?osm_ids={}&format=json&extratags=1"

def download_osm_details(rids):
    resp = requests.get(address_endpoint_template.format(','.join(rids), headers=headers))
    return resp.json()

In [38]:
req_ids = list(req_ids)

all_osm_info = []

if os.path.isfile('dataset/osm-detailed-pois.json'):
    with open('dataset/osm-detailed-pois.json', 'r') as fp:
        all_osm_info = json.load(fp)
else:
    bucket_size = 50  # maximum value for OSM lookup!
    for i in range(0, len(req_ids), bucket_size):
        try:
            all_osm_info.append(download_osm_details(req_ids[i:i+bucket_size]))
            logger.info('bucket done: {}'.format(i))
            time.sleep(0.5)
        except Exception as e:
            logger.warning(e)
            logger.warning('error in bucket: {}'.format(i))

In [39]:
all_osm_list = []
for bucket in all_osm_info:
    all_osm_list.extend(bucket)

In [40]:
with open('dataset/osm-detailed-pois.json', 'w', encoding='utf-8') as fp:
    json.dump(all_osm_info, fp)

with open('dataset/processed-osm-detailed-pois.json', 'w', encoding='utf-8') as fp:
    json.dump(all_osm_list, fp)
    
logger.info('Detailed information about OSM pois are stored in `dataset/osm-detailed-pois.json`')

[32m2025-01-25 20:27:16.738[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mDetailed information about OSM pois are stored in `dataset/osm-detailed-pois.json`[0m


In [41]:
print(len(req_ids))
print(len(all_osm_info))
print(len(all_osm_list))
osm_poi_details_df = pd.DataFrame(all_osm_list)
osm_poi_details_df.head()

84203
1685
27900


Unnamed: 0,place_id,licence,osm_type,osm_id,lat,lon,class,type,place_rank,importance,addresstype,name,display_name,address,extratags,boundingbox
0,50105769,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,210529635,-37.77131895,144.88922947664923,amenity,parking,30,1e-05,amenity,David Jones Carpark,"David Jones Carpark, Primary Place, Maribyrnon...","{'amenity': 'David Jones Carpark', 'road': 'Pr...","{'parking': 'multi-storey', 'building': 'parki...","[-37.7716973, -37.7708987, 144.8879962, 144.89..."
1,50264145,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,1005592702,-37.755842,144.79434671579116,amenity,parking,30,1e-05,amenity,,"Ken Jordan Road, Cairnlea, Melbourne, City of ...","{'road': 'Ken Jordan Road', 'suburb': 'Cairnle...",{'parking': 'street_side'},"[-37.7559597, -37.7557312, 144.7942967, 144.79..."
2,49748802,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,948227337,-38.33887055,144.72523383795718,leisure,swimming_pool,30,1e-05,leisure,,"Stonecutters Road, Portsea, Melbourne, Shire o...","{'road': 'Stonecutters Road', 'suburb': 'Ports...",,"[-38.3389150, -38.3388240, 144.7251718, 144.72..."
3,50013438,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,542417354,-37.9853244,145.2116291306154,amenity,parking,30,1e-05,amenity,,"Robinson Street, Dandenong, Melbourne, City of...","{'road': 'Robinson Street', 'suburb': 'Dandeno...",,"[-37.9856572, -37.9849960, 145.2114001, 145.21..."
4,50156137,"Data © OpenStreetMap contributors, ODbL 1.0. h...",node,678349689,-37.800412,144.966749,amenity,restaurant,30,1e-05,amenity,Il Cantuccio,"Il Cantuccio, 209, Lygon Street, Little Italy,...","{'amenity': 'Il Cantuccio', 'house_number': '2...","{'phone': '+61 3 9347 9959', 'cuisine': 'itali...","[-37.8004620, -37.8003620, 144.9666990, 144.96..."


In [42]:
osm_poi_details_df[(osm_poi_details_df['osm_type'] == 'way') & (osm_poi_details_df['osm_id'] == 542417354)]['display_name'].values[0]  # example

'Robinson Street, Dandenong, Melbourne, City of Greater Dandenong, Victoria, 3177, Australia'

In [43]:
def enrich(row):
    info = osm_poi_details_df[(osm_poi_details_df['osm_type'] == row['element_type']) & 
    (osm_poi_details_df['osm_id'] == row['osmid'])]
    if len(info) > 0:
        t_name = ' '.join(info['display_name'].values[0].split(',')[:2])
        if info['extratags'].values[0] is not None:
            t_name += ' '.join([k+' '+v for (k,v) in info['extratags'].values[0].items()])
    else:
        t_name = ''
    h_name = ''
    parents = osm_poi.loc[(osm_poi.geometry.contains(row.geometry)) & (osm_poi.id != row.id)]['name'].values.tolist()
    if len(parents) > 0:
        h_name += ' in '+ ', '.join([p for p in parents if p != 'nan'])
    p_name = ''
    if row['amenity'] != 'nan':
        p_name += 'amenity {} '.format(row['amenity']).replace('_', ' ')
    if row['natural'] != 'nan':
        p_name += 'natural {} '.format(row['natural']).replace('_', ' ')
    if row['leisure'] != 'nan':
        p_name += 'leisure {} '.format(row['leisure']).replace('_', ' ')
    if row['name'] == 'nan':
        return p_name+ t_name + h_name
    return row['name'] + ' ' + p_name + t_name + h_name

In [44]:
osm_poi.reset_index(inplace=True)
osm_poi = osm_poi.rename(columns= {'index': 'id'})
osm_poi.head()

Unnamed: 0,id,element_type,osmid,name,amenity,natural,leisure,geometry,req_id
0,0,node,10889194475,,,tree,,POINT (145.05762 -37.65706),N10889194475
1,1,node,10889194476,,,tree,,POINT (145.05774 -37.65697),N10889194476
2,2,node,10889194477,,,tree,,POINT (145.05782 -37.65706),N10889194477
3,3,node,10889194478,,,tree,,POINT (145.05787 -37.65710),N10889194478
4,4,node,10889194481,,,tree,,POINT (145.05794 -37.65696),N10889194481


In [45]:
osm_poi['full_name'] = osm_poi.apply(enrich, axis=1)
osm_poi.head()

Unnamed: 0,id,element_type,osmid,name,amenity,natural,leisure,geometry,req_id,full_name
0,0,node,10889194475,,,tree,,POINT (145.05762 -37.65706),N10889194475,natural tree
1,1,node,10889194476,,,tree,,POINT (145.05774 -37.65697),N10889194476,natural tree
2,2,node,10889194477,,,tree,,POINT (145.05782 -37.65706),N10889194477,natural tree
3,3,node,10889194478,,,tree,,POINT (145.05787 -37.65710),N10889194478,natural tree
4,4,node,10889194481,,,tree,,POINT (145.05794 -37.65696),N10889194481,natural tree


In [46]:
osm_poi.loc[(osm_poi.geometry.contains(osm_poi.loc[70].geometry)) & (osm_poi.index != 70)]['name'].values.tolist()

['Mill Park Recreation Reserve']

In [47]:
osm_poi.iloc[70]['full_name']

'leisure pitch Lady Penrhyn Avenue  Mill Parksport softball in Mill Park Recreation Reserve'

In [48]:
for idx, path in enumerate(paths):
    if os.path.isfile('dataset/features-osm-{}.geojson'.format(idx)):
        if os.path.isfile("dataset/features-osm-poi-{}.geojson".format(idx)):
            continue
        osm_poi = gpd.read_file('dataset/features-osm-{}.geojson'.format(idx))
        logger.info('analysing: {0} - number of features: {1}'.format(idx, len(osm_poi)))
        osm_poi.reset_index(inplace=True)
        osm_poi = osm_poi.rename(columns= {'index': 'id'})
        osm_poi['full_name'] = osm_poi.apply(enrich, axis=1)
        osm_poi.to_file("dataset/features-osm-poi-{}.geojson".format(idx), driver='GeoJSON')
        logger.info('enriched features for path {0} out of {1} is loaded from OSM and saved ...'.format(idx, len(paths)))

In [49]:
osm_poi.head()

Unnamed: 0,id,element_type,osmid,name,amenity,natural,leisure,geometry,req_id,full_name
0,0,node,10889194475,,,tree,,POINT (145.05762 -37.65706),N10889194475,natural tree
1,1,node,10889194476,,,tree,,POINT (145.05774 -37.65697),N10889194476,natural tree
2,2,node,10889194477,,,tree,,POINT (145.05782 -37.65706),N10889194477,natural tree
3,3,node,10889194478,,,tree,,POINT (145.05787 -37.65710),N10889194478,natural tree
4,4,node,10889194481,,,tree,,POINT (145.05794 -37.65696),N10889194481,natural tree


### Matching LW POIs to OSM POIs: Experiment

**Aim**: To investigate how to perform automatic matching of POIs in LW to OSM, or provide a set of candidate for a semi-automatic matching process (filtering: automatic, matching: manual)

Using:

- *Spatial criterion*: nearby or contained
- *Thematic criterion*: topic representation of POI with types in OSM POIs
- *Linguistic criterion*: description of the POI with detailed contextual information from OSM (name, type, hierarchy)

**Note**: The POIs might be missing in OSM data

In [50]:
def get_case_with_details(idx):
    record = dataset[idx]  
    pois = {'title': [], 'summary': [], 'lat': [], 'lng': []}
    for poi in record['pois']:
        pois['title'].append(poi['title'])
        pois['summary'].append(poi['summary'])
        pois['lat'].append(poi['lat'])
        pois['lng'].append(poi['lng'])
        
    df = pd.DataFrame(pois)
    gdf = gpd.GeoDataFrame(df[['title', 'summary', 'lat', 'lng']],
                           geometry=gpd.points_from_xy(df.lng, df.lat), crs="EPSG:4326")
    
    if os.path.isfile('dataset/features-osm-poi-{}.geojson'.format(idx)):
        osm_pois = gpd.read_file('dataset/features-osm-poi-{}.geojson'.format(idx))
    else:
        logger.warning('OSM features are not loaded - potentially empty dataframe')
        osm_pois = None
    return gdf, osm_pois

In [51]:
test_case_idx = 2
test_case_gdf, test_case_pois = get_case_with_details(test_case_idx)

In [52]:
test_case_gdf = test_case_gdf.to_crs("EPSG:32755")
test_case_gdf

Unnamed: 0,title,summary,lat,lng,geometry
0,1. Tramway signal box,Built in 1928 soon after the electrification o...,-37.806953,144.962813,POINT (320663.065 5813648.747)
1,2. City Baths,"Built in 1903, the design reflected the social...",-37.807382,144.96299,POINT (320679.723 5813601.482)
2,3. Magistrates Court,Built on the site of the earlier Supreme Court...,-37.808828,144.966112,POINT (320958.059 5813447.052)
3,4. Old Melbourne Gaol,Built between 1851 - 1864. As the oldest survi...,-37.807569,144.96571,POINT (320919.660 5813585.973)
4,5. Eight Hour Day Monument,"Built in 1923, the monument commemorates the E...",-37.807126,144.965808,POINT (320927.197 5813635.299)
5,6. Trades Hall,"Built in stages from 1873 - 1926, Trades Hall ...",-37.806905,144.965989,POINT (320942.628 5813660.156)
6,7. Medley Hall,"Built in 1893 as a private residence, the buil...",-37.805803,144.967618,POINT (321083.333 5813785.590)
7,8. Lygon shop corner,Lygon Buildings is architecturally significant...,-37.804863,144.966279,POINT (320963.170 5813887.275)
8,9. Matthais House,A two storeyed stucco faced bluestone house of...,-37.803827,144.967759,POINT (321091.019 5814005.140)
9,10. Sacred Heart Catholic Church,Built in 1855-56. In the 1930s and 1940s the C...,-37.803051,144.969378,POINT (321231.646 5814094.295)


In [53]:
test_case_pois = test_case_pois.to_crs("EPSG:32755")
logger.info(f'size of the OSM POI dataframe: {len(test_case_pois.id)}')
test_case_pois.head()

[32m2025-01-25 20:27:34.150[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1msize of the OSM POI dataframe: 2380[0m


Unnamed: 0,id,element_type,osmid,name,amenity,natural,leisure,full_name,geometry
0,0,node,242538793,,post_box,,,amenity post box Queensberry Street Carlton,POINT (320727.351 5813899.378)
1,1,node,242540159,,telephone,,,amenity telephone Swanston Street East End Th...,POINT (320749.536 5813412.933)
2,2,node,242823091,,telephone,,,amenity telephone Swanston Street East End Th...,POINT (320730.759 5813422.172)
3,3,node,242823102,,post_box,,,amenity post box Pelham Street Carlton,POINT (321220.734 5814072.447)
4,4,node,242823114,,toilets,,,amenity toilets Rathdowne Street Carltonfee n...,POINT (321272.049 5814137.476)


In [54]:
test_row_id = 6
test_row = test_case_gdf.loc[test_row_id]
test_row

title                                          7. Medley Hall
summary     Built in 1893 as a private residence, the buil...
lat                                                -37.805803
lng                                                144.967618
geometry         POINT (321083.33274179127 5813785.590421503)
Name: 6, dtype: object

In [55]:
test_case_pois.loc[test_row.geometry.distance(test_case_pois.geometry) < 100]

Unnamed: 0,id,element_type,osmid,name,amenity,natural,leisure,full_name,geometry
2153,2153,way,265141186,Lygon Street Christian Chapel,place_of_worship,,,Lygon Street Christian Chapel amenity place of...,"POLYGON ((321008.928 5813761.985, 320994.084 5..."
2231,2231,way,710777495,,parking,,,amenity parking McDonald Lane Carltonaccess p...,"POLYGON ((321000.885 5813812.657, 320997.612 5..."
2232,2232,way,710777496,,parking,,,amenity parking Elm Tree Place Carltonaccess ...,"POLYGON ((321135.119 5813776.077, 321133.814 5..."
2254,2254,way,743141724,,parking,,,amenity parking McDonald Lane Carltonaccess c...,"POLYGON ((320978.251 5813859.025, 321001.671 5..."
2283,2283,way,831017470,,parking,,,amenity parking Hudson Place Carltonaccess pr...,"POLYGON ((321146.543 5813801.593, 321159.873 5..."
2284,2284,way,831017484,,parking,,,amenity parking Trades Hall Place Carltonacce...,"POLYGON ((321003.601 5813711.301, 321007.125 5..."
2285,2285,way,831017485,,parking,,,amenity parking Trades Hall Place Carltonacce...,"POLYGON ((321009.308 5813760.340, 321010.362 5..."


In [56]:
# geocoding results for all POIs
titles = [pt for pt in data_structure['poi_title']]

In [57]:
geocoder = Nominatim(user_agent='research_app')  # can geocoding be of help?

In [58]:
paths[test_case_idx]

{'min_lat': -37.80922028,
 'max_lat': -37.79740113,
 'min_lng': 144.96275961,
 'max_lng': 144.97345358}

In [59]:
result = geocoder.geocode("Medley Hall", viewbox=[(paths[test_case_idx]['max_lat'], paths[test_case_idx]['max_lng']), 
                                                         (paths[test_case_idx]['min_lat'], paths[test_case_idx]['min_lng'])],
                          bounded=True)

In [60]:
result

In [61]:
def geocode_by_name(name, path):
    name = name.lstrip('0123456789.- ')
    return geocoder.geocode(name, viewbox=[(path['max_lat'], path['max_lng']), 
                                                         (path['min_lat'], path['min_lng'])],
                          bounded=True)

In [62]:
if os.path.isfile('dataset/nominatim-geocoding.json'):
    with open('dataset/nominatim-geocoding.json') as fp:
        nominatim_output = json.load(fp)
    logger.info('nominatim dump file is already loaded')
else:
    geocoding_results = []
    for idx, path in enumerate(paths):
        record = dataset[idx]
        for poi in record['pois']:
            name = poi['title']
            result = geocode_by_name(name, path)
            geocoding_results.append(result)
            if result is not None:
                logger.info(name, path)
        if idx%10 == 0:
            time.sleep(1)
            logger.info('idx: {}'.format(idx))

    nominatim_output = {}
    counter = 0
    for idx, path in enumerate(paths):
        record = dataset[idx]
        for poi in record['pois']:
            geocoding_result = geocoding_results[counter]
        
            name = poi['title']
            description = poi['summary']
            lat = poi['lat']
            lng = poi['lng']

            nominatim_output[counter] = {'walk_id': idx, 'title': name, 'summary': description, 'lat': lat, 'lng': lng}
            if geocoding_result is not None:
                nominatim_output[counter]['osm'] = geocoding_result.raw
            else:
                nominatim_output[counter]['osm'] = None
            counter += 1

[32m2025-01-25 20:27:42.353[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mnominatim dump file is already loaded[0m


In [65]:
nominatim_output['23']

{'walk_id': 2,
 'title': '13. Royal Exhibition Building',
 'summary': 'The Royal Exhibition Building is the only surviving Great Hall that once housed a 19th-century international exhibition and is still used for exhibitions. ',
 'lat': -37.80513488,
 'lng': 144.97123539,
 'osm': {'place_id': 17546919,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
  'osm_type': 'way',
  'osm_id': 4817059,
  'lat': '-37.804666850000004',
  'lon': '144.9714669305319',
  'class': 'historic',
  'type': 'building',
  'place_rank': 30,
  'importance': 0.39044459367468287,
  'addresstype': 'historic',
  'name': 'Royal Exhibition Building',
  'display_name': 'Royal Exhibition Building, 9, Nicholson Street, Carlton, Melbourne, City of Melbourne, Victoria, 3053, Australia',
  'boundingbox': ['-37.8051500', '-37.8041865', '144.9705305', '144.9724671']}}

In [66]:
logger.debug(f'all nominatim outputs: {nominatim_output}')
logger.info(f'size of nominatim outputs: {len(nominatim_output.keys())}')

[32m2025-01-25 20:27:52.327[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1msize of nominatim outputs: 4392[0m


## Labelling Process:

- label if the geocoding was correct - if geocoding was successful
- find 10 most likely using textual and spatial criteria and record the element as a match

In [71]:
def get_feature_from_osm(lat, lng, dist=200, tags=tags):
    return ox.features_from_point((lat, lng), tags, dist)

In [72]:
case_id = 23
test_case = nominatim_output[str(case_id)]

In [73]:
test_case

{'walk_id': 2,
 'title': '13. Royal Exhibition Building',
 'summary': 'The Royal Exhibition Building is the only surviving Great Hall that once housed a 19th-century international exhibition and is still used for exhibitions. ',
 'lat': -37.80513488,
 'lng': 144.97123539,
 'osm': {'place_id': 17546919,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
  'osm_type': 'way',
  'osm_id': 4817059,
  'lat': '-37.804666850000004',
  'lon': '144.9714669305319',
  'class': 'historic',
  'type': 'building',
  'place_rank': 30,
  'importance': 0.39044459367468287,
  'addresstype': 'historic',
  'name': 'Royal Exhibition Building',
  'display_name': 'Royal Exhibition Building, 9, Nicholson Street, Carlton, Melbourne, City of Melbourne, Victoria, 3053, Australia',
  'boundingbox': ['-37.8051500', '-37.8041865', '144.9705305', '144.9724671']}}

In [74]:
dist_threshold = 200
features = get_feature_from_osm(test_case['lat'], test_case['lng'], dist=dist_threshold)
features

Unnamed: 0_level_0,Unnamed: 1_level_0,access,amenity,changing_table,fee,operator,operator:wikidata,toilets:disposal,unisex,wheelchair,geometry,...,nodes,layer,leaf_type,ways,addr:suburb,type,wikipedia,intermittent,salt,water
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,319157917,yes,toilets,no,no,Melbourne City Council,Q56477763,flush,yes,yes,POINT (144.96920 -37.80626),...,,,,,,,,,,
node,368393200,,cinema,,,Melbourne museum,,,,,POINT (144.97064 -37.80351),...,,,,,,,,,,
node,371974432,,fountain,,,,,,,,POINT (144.97138 -37.80545),...,,,,,,,,,,
node,767585574,,bench,,,,,,,,POINT (144.97296 -37.80630),...,,,,,,,,,,
node,767689503,,parking_entrance,,yes,,,,,,POINT (144.96975 -37.80406),...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
way,1249828891,,,,,,,,,,"POLYGON ((144.97256 -37.80592, 144.97252 -37.8...",...,"[11618112476, 11618112477, 11618112478, 116181...",,,,,,,,,
way,1249828892,,,,,,,,,,"POLYGON ((144.96983 -37.80600, 144.96980 -37.8...",...,"[11618112487, 11618112488, 11618112489, 116181...",,,,,,,,,
relation,6614802,,,,,Melbourne City Council,,,,,"MULTIPOLYGON (((144.96913 -37.80719, 144.96903...",...,"[[[4421481857, 4421481858, 4421481859, 4421481...",,,"[444681638, 444681637]",Carlton,multipolygon,en:Carlton Gardens,,,
relation,17205856,,,,,,,,,,"POLYGON ((144.97226 -37.80608, 144.97222 -37.8...",...,"[[[371970971, 371970972, 11618112514, 37197097...",,,"[33005170, 1249828890, 1249828891]",,multipolygon,,no,no,pond


In [75]:
logger.info(f'list of features from OSM results: {list(features.columns)}')

[32m2025-01-25 20:29:38.982[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mlist of features from OSM results: ['access', 'amenity', 'changing_table', 'fee', 'operator', 'operator:wikidata', 'toilets:disposal', 'unisex', 'wheelchair', 'geometry', 'check_date', 'name', 'payment:mastercard', 'payment:visa', 'phone', 'screen', 'website', 'wikidata', 'artist', 'covered', 'drinking_water', 'indoor', 'backrest', 'source', 'parking', 'bicycle_parking', 'fountain', 'leisure', 'brand', 'brand:wikidata', 'brand:wikipedia', 'operator:wikipedia', 'payment:cash', 'payment:credit_cards', 'toilets:wheelchair', 'natural', 'capacity', 'location', 'material', 'recycling:cans', 'recycling:glass_bottles', 'recycling:paper', 'recycling_type', 'nodes', 'layer', 'leaf_type', 'ways', 'addr:suburb', 'type', 'wikipedia', 'intermittent', 'salt', 'water'][0m


In [76]:
for key, value in nominatim_output.items():
    logger.debug('key: {}'.format(key))
    if os.path.isfile('dataset/osm-poi-{0}-dist-{1}-features.geojson'.format(key, dist_threshold)):
        logger.debug('already investigated...')
        continue;
    try:
        # only set for next round - first run dist=200 to collect initial results and the empty frames are completed by considering 1000 meters thresholds.
        features = get_feature_from_osm(value['lat'], value['lng'], dist=1000)
        cols = list(features.columns)
        cols.remove('geometry')
        features[cols] = features[cols].astype(str)
        features.to_file('dataset/osm-poi-{0}-dist-{1}-features.geojson'.format(key, dist_threshold), driver="GeoJSON") 
        logger.info('done')
    except Exception as e:
        logger.warning(e)

In [77]:
import pyproj
from shapely.geometry import Point
from shapely.ops import transform

wgs84_pt = Point(test_case['lng'], test_case['lat'])
wgs84 = pyproj.CRS('EPSG:4326')
utm = pyproj.CRS('EPSG:32755')

project = pyproj.Transformer.from_crs(wgs84, utm, always_xy=True).transform

utm_point = transform(project, wgs84_pt)

features = gpd.read_file('dataset/osm-poi-{0}-dist-{1}-features.geojson'.format(case_id, dist_threshold))
features = features.to_crs('EPSG:32755')

features['distance'] = [utm_point.distance(geom) for geom in features.geometry]

In [78]:
features

Unnamed: 0,element_type,osmid,highway,traffic_signals:direction,access,amenity,fee,operator,toilets:disposal,unisex,...,source:population,name:mk,short_name,political_division,heritage,heritage:operator,heritage:website,area,geometry,distance
0,node,319157917,,,yes,toilets,no,Melbourne City Council,flush,yes,...,,,,,,,,,POINT (321223.384 5813738.129),218.599034
1,node,368393200,,,,cinema,,Melbourne museum,,,...,,,,,,,,,POINT (321343.567 5814046.274),188.373535
2,node,371974432,,,,fountain,,,,,...,,,,,,,,,POINT (321413.415 5813831.827),37.211904
3,node,493873180,crossing,,,,,,,,...,,,,,,,,,POINT (321256.686 5813883.765),144.563983
4,node,501027469,crossing,,,,,,,,...,,,,,,,,,POINT (321600.961 5813827.093),204.586235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,relation,6614802,,,,,,Melbourne City Council,,,...,,,,,,,,,"MULTIPOLYGON (((321219.805 5813634.926, 321210...",3.048837
436,relation,6623509,,,,,,,,,...,,,,,1,whc,http://whc.unesco.org/en/list/1131,,"MULTIPOLYGON (((321415.382 5813963.959, 321416...",3.048837
437,relation,13238592,pedestrian,,,,,,,,...,,,,,,,,yes,"POLYGON ((321571.451 5813919.342, 321569.416 5...",96.331620
438,relation,16464561,,,,,,,,,...,,,,,,,,,"POLYGON ((321163.337 5813990.989, 321180.796 5...",209.794588


In [79]:
not_consider = ['geometry', 'distance', 'element_type', 'osmid']
consider_first = ['name', 'short_name']
cols = list(features.columns)

In [80]:
def generate_textual_descriptions(row, cols=cols):
    full_name = ''
    for c in consider_first:
        if c in cols and row[c] != 'nan':
            full_name += ' ' + row[c]
    for c in cols:
        if c not in consider_first and c not in not_consider and ':' not in c and row[c] != 'nan' and row[c] != 'no' and 'http' not in row[c] and '[' not in row[c] and 'wiki' not in c:
            if row[c] == 'yes':
                full_name += ' ' + c
            else:
                full_name += ' {0} {1}'.format(c, row[c])
    return full_name.strip()

In [81]:
features['full_name'] = features.apply(generate_textual_descriptions, axis=1)
features

Unnamed: 0,element_type,osmid,highway,traffic_signals:direction,access,amenity,fee,operator,toilets:disposal,unisex,...,name:mk,short_name,political_division,heritage,heritage:operator,heritage:website,area,geometry,distance,full_name
0,node,319157917,,,yes,toilets,no,Melbourne City Council,flush,yes,...,,,,,,,,POINT (321223.384 5813738.129),218.599034,access amenity toilets operator Melbourne City...
1,node,368393200,,,,cinema,,Melbourne museum,,,...,,,,,,,,POINT (321343.567 5814046.274),188.373535,IMAX Melbourne amenity cinema operator Melbour...
2,node,371974432,,,,fountain,,,,,...,,,,,,,,POINT (321413.415 5813831.827),37.211904,amenity fountain
3,node,493873180,crossing,,,,,,,,...,,,,,,,,POINT (321256.686 5813883.765),144.563983,highway crossing crossing zebra
4,node,501027469,crossing,,,,,,,,...,,,,,,,,POINT (321600.961 5813827.093),204.586235,highway crossing crossing traffic_signals tact...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,relation,6614802,,,,,,Melbourne City Council,,,...,,,,,,,,"MULTIPOLYGON (((321219.805 5813634.926, 321210...",3.048837,Carlton Gardens operator Melbourne City Counci...
436,relation,6623509,,,,,,,,,...,,,,1,whc,http://whc.unesco.org/en/list/1131,,"MULTIPOLYGON (((321415.382 5813963.959, 321416...",3.048837,Royal Exhibition Building and Carlton Gardens ...
437,relation,13238592,pedestrian,,,,,,,,...,,,,,,,yes,"POLYGON ((321571.451 5813919.342, 321569.416 5...",96.331620,highway pedestrian type multipolygon area
438,relation,16464561,,,,,,,,,...,,,,,,,,"POLYGON ((321163.337 5813990.989, 321180.796 5...",209.794588,building type multipolygon


In [82]:
def compute_similarities_topk(query, sentences, sentence_embeddings, model=sbert_model, k=10, verbose=False):
    query_vec = embed_texts(query)
    scores = util.dot_score(query_vec, sentence_embeddings)[0].cpu().tolist()
    doc_score_pairs = list(zip(sentences, scores))
    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
    if verbose:
        logger.info(f"Query: {query}")
        for doc, score in doc_score_pairs:
            logger.info(f'\t{score}\t{doc}')
    return np.argsort(scores)[-k:]

In [83]:
feature_descriptions = list(features['full_name'])
feature_embeddings = embed_texts(feature_descriptions, model=msmarco_model)
case_description = nominatim_output[str(case_id)]['title'] + ' ' + nominatim_output[str(case_id)]['summary']
k_similar = compute_similarities_topk(case_description, feature_descriptions, feature_embeddings, model=msmarco_model, verbose=True)
k_similar

[32m2025-01-25 20:30:03.818[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities_topk[0m:[36m7[0m - [1mQuery: 13. Royal Exhibition Building The Royal Exhibition Building is the only surviving Great Hall that once housed a 19th-century international exhibition and is still used for exhibitions. [0m
[32m2025-01-25 20:30:03.819[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities_topk[0m:[36m9[0m - [1m	44.04185104370117	Royal Exhibition Building and Carlton Gardens tourism attraction type multipolygon heritage 1[0m
[32m2025-01-25 20:30:03.819[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities_topk[0m:[36m9[0m - [1m	40.16722869873047	Royal Exhibition Building source Vicmap Address historic building tourism attraction building height 20 layer 1[0m
[32m2025-01-25 20:30:03.820[0m | [1mINFO    [0m | [36m__main__[0m:[36mcompute_similarities_topk[0m:[36m9[0m - [1m	39.156288146972656	Royal Exhibition Building Opening h

array([ 46,   1,  26,  27,  44,  49, 298,  43, 297, 436])

In [84]:
features.iloc[k_similar]

Unnamed: 0,element_type,osmid,highway,traffic_signals:direction,access,amenity,fee,operator,toilets:disposal,unisex,...,name:mk,short_name,political_division,heritage,heritage:operator,heritage:website,area,geometry,distance,full_name
46,node,9106562132,,,,,,,,,...,,,,,,,,POINT (321559.646 5813927.545),170.659614,artwork_type sculpture tourism artwork
1,node,368393200,,,,cinema,,Melbourne museum,,,...,,,,,,,,POINT (321343.567 5814046.274),188.373535,IMAX Melbourne amenity cinema operator Melbour...
26,node,4061250667,bus_stop,,,,,,,,...,,,,,,,,POINT (321243.477 5813954.530),179.719018,Exhibition Building/Rathdowne Street highway b...
27,node,4332324003,,,,charging_station,,Museums Victoria,,,...,,,,,,,,POINT (321463.713 5813962.011),114.58008,amenity charging_station operator Museums Vict...
44,node,7248901076,bus_stop,,,,,,,,...,,,,,,,,POINT (321211.629 5813902.092),191.907516,Exhibition Building/Rathdowne Street highway b...
49,node,9307551791,,,yes,,,,,,...,,,,,,,,POINT (321592.650 5813874.406),192.577146,access historic monument inscription To Victor...
298,way,4817074,,,,,yes,Museum Victoria,,,...,,,,,,,,"POLYGON ((321505.691 5814023.082, 321459.066 5...",159.043248,Melbourne Museum fee operator Museum Victoria ...
43,node,6810298878,,,,,,,,,...,,,,,,,,POINT (321503.096 5813888.150),105.093724,Royal Exhibition Building Opening historic mem...
297,way,4817059,,,,,,,,,...,,,,,,,,"POLYGON ((321415.382 5813963.959, 321416.338 5...",11.700249,Royal Exhibition Building source Vicmap Addres...
436,relation,6623509,,,,,,,,,...,,,,1.0,whc,http://whc.unesco.org/en/list/1131,,"MULTIPOLYGON (((321415.382 5813963.959, 321416...",3.048837,Royal Exhibition Building and Carlton Gardens ...


In [86]:
def get_top_k(case_id, k=10, model = msmarco_model, verbose=False):
    test_case = nominatim_output[str(case_id)]
    
    features = gpd.read_file('dataset/osm-poi-{0}-dist-{1}-features.geojson'.format(case_id, dist_threshold))
    cols = list(features.columns)
    if verbose:
        print(features.head())
        logger.info(cols)
    features['full_name'] = features.apply(lambda row: generate_textual_descriptions(row, cols), axis=1)
    
    feature_descriptions = list(features['full_name'])
    feature_embeddings = embed_texts(feature_descriptions, model=model)
    
    case_description = test_case['title'] + ' ' + test_case['summary']
    k_similar = compute_similarities_topk(case_description, feature_descriptions, feature_embeddings, model=model, verbose=verbose)
    return features.iloc[np.flip(k_similar)]

In [200]:
len(top_k_dfs)

4392

In [201]:
case_id

4391

In [202]:
top_k_df.

Unnamed: 0,element_type,osmid,highway,source,surface,leisure,sport,nodes,name,bicycle,...,service,fee,footway,shelter_type,geometry,full_name,osm_id,osm_type,lat,lng
51,way,1105105836,footway,,paved,,,"[ 668646878, 10112835040, 10112835039, 1011283...",,,...,,,,,"LINESTRING (145.05682 -37.65699, 145.05665 -37...",highway footway surface paved,1105105836,way,-37.657301,145.056369
53,way,1105106176,footway,,concrete,,,"[ 10112832233, 11098668126, 10112832241, 10112...",,,...,,,,,"LINESTRING (145.05658 -37.65664, 145.05668 -37...",highway footway surface concrete,1105106176,way,-37.656814,145.056755
50,way,1105105835,service,,,,,"[ 10112835025, 10112835039, 10112835035 ]",,,...,parking_aisle,,,,"LINESTRING (145.05615 -37.65695, 145.05647 -37...",highway service service parking_aisle,1105105835,way,-37.657204,145.056434
49,way,1105105834,service,,,,,"[ 10112835033, 10112835038, 10112835034 ]",,,...,parking_aisle,,,,"LINESTRING (145.05599 -37.65704, 145.05631 -37...",highway service service parking_aisle,1105105834,way,-37.657308,145.056267
48,way,1105105833,service,,,,,"[ 10112835024, 10112835028, 10112835047, 10112...",,,...,parking_aisle,,,,"LINESTRING (145.05583 -37.65713, 145.05583 -37...",highway service service parking_aisle,1105105833,way,-37.657328,145.056424
47,way,1105105832,service,,,,,"[ 10112835024, 978035608, 10112835033, 1011283...",,,...,parking_aisle,,,,"LINESTRING (145.05583 -37.65713, 145.05595 -37...",highway service service parking_aisle,1105105832,way,-37.656905,145.05634
34,way,277564665,,,,,,"[ 2820691440, 10155904575, 10794945280, 107949...",,,...,,,,,"POLYGON ((145.05939 -37.65564, 145.05933 -37.6...",natural wood,277564665,way,-37.655313,145.058038
85,way,1171940509,,,,pitch,softball,"[ 10889194401, 10889194402, 10889194403, 10889...",,,...,,,,,"POLYGON ((145.05677 -37.65641, 145.05675 -37.6...",leisure pitch sport softball,1171940509,way,-37.656364,145.056266
84,way,1171940508,,,,pitch,softball,"[ 10889194386, 10889194387, 10889194388, 10889...",,,...,,,,,"POLYGON ((145.05563 -37.65674, 145.05557 -37.6...",leisure pitch sport softball,1171940508,way,-37.656532,145.055199
83,way,1171940507,,,,pitch,softball,"[ 10889194371, 10889194372, 10889194373, 10889...",,,...,,,,,"POLYGON ((145.05555 -37.65548, 145.05560 -37.6...",leisure pitch sport softball,1171940507,way,-37.6557,145.055967


In [198]:
top_k_df_all = pd.concat(top_k_dfs)
top_k_df_all.to_csv(os.path.join(WRITE_DIR, 'poi_top10.csv'))