# Analysing the annotations 
This analysis includes:

1. describing the POIs using OSM tags
2. describing the relationship between the OSM tags and the topics for matches
3. finding patterns in unmatched topics
4. comparing the matched OSM records with their surrounding features (not selected - including and excluding the route)

In [1]:
# working with files
import os.path
# sys
import sys
# warning off
import warnings
# IO
import json

# requests
import requests

# dataframe 
import numpy as np
import pandas as pd
import geopandas as gpd


# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# logging
from loguru import logger

# set logger level
logger.remove(0)
logger.add(sys.stderr, level="INFO")

warnings.filterwarnings("ignore")

## 2.1. Walking Map Dataset

Creating a detailed dataframe of all POIs, their description, topic and sub-topic and their point-based representation.

In [3]:
# dataset:

# 0. Geodataframe of all POIs
with open('dataset/walkingmaps.json', 'r', encoding='utf-8') as fp:
    dataset = json.load(fp)

data_structure = {
    'record_id': [], 'record_title':[], 'record_description':[],
    'poi_title':[], 'poi_summary':[], 'latitude': [], 'longitude': []}

for record_id, record in enumerate(dataset):
    if 'pois' in record.keys() and len(record['pois']) > 0:
        for poi in record['pois']:
            data_structure['record_id'].append(record_id)
            data_structure['record_title'].append(record['title'])
            data_structure['record_description'].append(record['description'])
            data_structure['poi_title'].append(poi['title'])
            data_structure['poi_summary'].append(poi['summary'])
            data_structure['latitude'].append(poi['lat'])
            data_structure['longitude'].append(poi['lng'])

df = pd.DataFrame(data_structure)
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326")

print(gdf.head())

# 1. POI - topic
poi_classified = pd.read_csv('outputs/dataframes/refined-topic-poi-description-classified.csv')
poi_classified.head()

   record_id                                       record_title  \
0          1  Fairhaven to Aireys Inlet Walk created by tedm...   
1          1  Fairhaven to Aireys Inlet Walk created by tedm...   
2          1  Fairhaven to Aireys Inlet Walk created by tedm...   
3          1  Fairhaven to Aireys Inlet Walk created by tedm...   
4          1  Fairhaven to Aireys Inlet Walk created by tedm...   

                                  record_description  \
0  Apart from the points of interested listed, he...   
1  Apart from the points of interested listed, he...   
2  Apart from the points of interested listed, he...   
3  Apart from the points of interested listed, he...   
4  Apart from the points of interested listed, he...   

                         poi_title  \
0  Fairhaven Surf Life Saving Club   
1                       Beach walk   
2                       Rock pools   
3                       Sand dunes   
4                  Painkalac Creek   

                               

Unnamed: 0.1,Unnamed: 0,index,topic,document,is_found,class,Count,Name,Representation,Representative_Docs,Class ID,Class,Subclass
0,0,0,4,Surf Life Saving Club: is a well known surf be...,False,,129,4_beach_bay_tide_sand,"['beach', 'bay', 'tide', 'sand', 'coast', 'pat...",['Broad Tidal Flats: Bay is sow on the western...,0,nature,natural landmarks
1,1,1,4,"Beach walk: From Sprout Creek, , , to the bott...",False,,129,4_beach_bay_tide_sand,"['beach', 'bay', 'tide', 'sand', 'coast', 'pat...",['Broad Tidal Flats: Bay is sow on the western...,0,nature,natural landmarks
2,2,3,4,Sand dunes: The beautiful rolling sand dune sh...,False,,129,4_beach_bay_tide_sand,"['beach', 'bay', 'tide', 'sand', 'coast', 'pat...",['Broad Tidal Flats: Bay is sow on the western...,0,nature,natural landmarks
3,3,96,4,"Surf' up: A good spot to watch the surf, have ...",False,,129,4_beach_bay_tide_sand,"['beach', 'bay', 'tide', 'sand', 'coast', 'pat...",['Broad Tidal Flats: Bay is sow on the western...,0,nature,natural landmarks
4,4,99,4,"Heading down to the beach: Follow the track, w...",False,,129,4_beach_bay_tide_sand,"['beach', 'bay', 'tide', 'sand', 'coast', 'pat...",['Broad Tidal Flats: Bay is sow on the western...,0,nature,natural landmarks


In [5]:
gdf.reset_index()
detailed_df = pd.merge(gdf.reset_index(), poi_classified, left_on='index', right_on='index', how='inner')
detailed_df.head()

Unnamed: 0.1,index,record_id,record_title,record_description,poi_title,poi_summary,latitude,longitude,geometry,Unnamed: 0,...,document,is_found,class,Count,Name,Representation,Representative_Docs,Class ID,Class,Subclass
0,0,1,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Fairhaven Surf Life Saving Club,Fairhaven is a well known surf beach. The beac...,-38.468759,144.084459,POINT (144.08446 -38.46876),0,...,Surf Life Saving Club: is a well known surf be...,False,,129,4_beach_bay_tide_sand,"['beach', 'bay', 'tide', 'sand', 'coast', 'pat...",['Broad Tidal Flats: Bay is sow on the western...,0,nature,natural landmarks
1,1,1,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Beach walk,"From Sprout Creek, Eastern View, Moggs Creek, ...",-38.468542,144.089693,POINT (144.08969 -38.46854),1,...,"Beach walk: From Sprout Creek, , , to the bott...",False,,129,4_beach_bay_tide_sand,"['beach', 'bay', 'tide', 'sand', 'coast', 'pat...",['Broad Tidal Flats: Bay is sow on the western...,0,nature,natural landmarks
2,2,1,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Rock pools,See what sort of shells and stones you can col...,-38.468459,144.09242,POINT (144.09242 -38.46846),154,...,Rock pools: See what sort of shell and stone y...,False,,11,66_pools_rock_luna_rockpool,"['pools', 'rock', 'luna', 'rockpool', 'bristle...",['Expansive view and little rock pools.: The o...,4,unknown,unknown
3,3,1,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Sand dunes,The beautiful rolling sand dunes shape the bea...,-38.468418,144.095318,POINT (144.09532 -38.46842),2,...,Sand dunes: The beautiful rolling sand dune sh...,False,,129,4_beach_bay_tide_sand,"['beach', 'bay', 'tide', 'sand', 'coast', 'pat...",['Broad Tidal Flats: Bay is sow on the western...,0,nature,natural landmarks
4,4,1,Fairhaven to Aireys Inlet Walk created by tedm...,"Apart from the points of interested listed, he...",Painkalac Creek,The creek separates Aireys Inlet from Fairhave...,-38.46839,144.097312,POINT (144.09731 -38.46839),176,...,Painkalac Creek: The creek sete from and form ...,False,,31,25_creek_leary_roderick_flooding,"['creek', 'leary', 'roderick', 'flooding', 'to...",['Painkalac Creek: The creek sete from and for...,0,nature,natural landmarks


## 2.2 Annotations

Loading and parsing the annotations for each point of interest

In [6]:
# reading annotation files
with open('dataset/annotations.json') as fp:
    raw_annotations = json.load(fp)


annotations = {}
for annotation_record in raw_annotations:
    page = None
    annotation = []
    for info in annotation_record:
        if 'page' in info.keys():
            page = str(info['page'])
        elif 'type' in info.keys() and 'osm_id' in info.keys():
            annotation.append(f"{info['type']}:{info['osm_id']}")
        else:
            logger.error(f'structural issues: {info}')

    if page is not None:
        annotations[page] = annotation

logger.info(f'total annotations: {len(annotations)}')

[32m2024-08-03 09:50:48.667[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mtotal annotations: 4386[0m


In [7]:
annotations

{'0': ['way:1089591567'],
 '1': ['way:1007494584'],
 '2': [],
 '3': [],
 '4': ['way:30501938', 'way:30501933'],
 '5': ['relation:9212157'],
 '6': ['way:69366108', 'way:69366081'],
 '7': ['node:4583492091',
  'node:11158854688',
  'node:11158854687',
  'node:11158854665'],
 '8': ['node:3621405496'],
 '9': [],
 '10': ['node:831201041'],
 '11': [],
 '12': ['way:26564416'],
 '13': ['way:49961266'],
 '14': ['way:26564392', 'node:6758200723'],
 '15': ['node:5371634608'],
 '16': ['node:7228362154', 'way:32710391'],
 '17': ['way:435967756'],
 '18': ['way:436339688',
  'way:436339689',
  'way:436339690',
  'way:1215583957',
  'way:436339691',
  'way:436339692',
  'way:1215583958',
  'way:436339693',
  'way:1215583959',
  'way:436339694',
  'way:436339695',
  'way:1215583956',
  'way:436339696',
  'way:436339697',
  'way:1215583960',
  'way:436339698'],
 '19': [],
 '21': ['node:368393200'],
 '22': ['way:4817074'],
 '23': ['relation:6623509', 'way:4817059', 'node:6810298878'],
 '24': ['relation:6

In [9]:
# download all information from OSM for these matched POIs
api_endpoint = 'https://www.openstreetmap.org/api/0.6/{0}/{1}.json'
headers = {"Content-Type": "application/json; charset=utf-8"}

def get_info(info):
    if info.lower() == 'flag':
        logger.warning('still a flagged record')
        return {}
    otype = info.lower().split(':')[0]
    oid = info.lower().split(':')[1]
    if otype not in ['node', 'way', 'relation']:
        logger.warning('wrong type - {}'.format(otype))
    resp = requests.get(api_endpoint.format(otype, oid), headers=headers)
    return resp.json()

get_info('way:26564416')  # testing

{'version': '0.6',
 'generator': 'CGImap 0.9.3 (2298250 spike-08.openstreetmap.org)',
 'copyright': 'OpenStreetMap and contributors',
 'attribution': 'http://www.openstreetmap.org/copyright',
 'license': 'http://opendatacommons.org/licenses/odbl/1-0/',
 'elements': [{'type': 'way',
   'id': 26564416,
   'timestamp': '2022-11-08T23:44:08Z',
   'version': 13,
   'changeset': 128661596,
   'user': 'MapAbility',
   'uid': 13782447,
   'nodes': [291210982,
    291210974,
    291210988,
    291210981,
    291210998,
    5651652284,
    5651652286,
    5651652285,
    291210982],
   'tags': {'brand': 'YMCA',
    'brand:wikidata': 'Q157169',
    'building': 'yes',
    'heritage': 'yes',
    'leisure': 'sports_centre',
    'name': 'Melbourne City Baths',
    'operator': 'ymca',
    'sport': 'swimming',
    'wikidata': 'Q2786840',
    'wikipedia': 'en:City Baths, Melbourne'}}]}

In [13]:
from tqdm import tqdm
import time 
# save all information about the matched records
osm_detailed_information = {}
for poi_id, matches in tqdm(annotations.items()):
    osm_detailed_information[poi_id] = {}
    for match in matches:
        try:
            osm_detailed_information[poi_id][match] = get_info(match)
            time.sleep(0.5)  # for polite crawling
        except Exception as e:
            logger.error(f'{poi_id}:{match} error', exc_info=True)

  0%|                                                                                                                             | 2/4386 [00:03<1:56:13,  1.59s/it][32m2024-08-03 10:03:47.158[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [31m[1m4:way:30501938 error[0m
  2%|██▉                                                                                                                        | 103/4386 [02:56<2:25:38,  2.04s/it][32m2024-08-03 10:06:40.457[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [31m[1m104:node:flag error[0m
  2%|██▉                                                                                                                        | 104/4386 [02:57<2:05:28,  1.76s/it][32m2024-08-03 10:06:41.566[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [31m[1m105:node:flag error[0m
  3%|███▍                                                                     

In [14]:
with open('outputs/annotations_details_osm_info.json', 'w', encoding='utf-8') as fp:
    json.dump(osm_detailed_information, fp)
logger.info('annotation details are saved and can be loaded')

[32m2024-08-03 11:27:03.843[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mannotation details are saved and can be loaded[0m


In [None]:
# load nearby features:
def get_osm_pois_nearby(poi_keys):
    dist_threshold = 200
    osm_pois = {}
    for key in tqdm.tqdm(poi_keys):
        logger.debug('key: {}'.format(key))
        if os.path.isfile('dataset/osm-poi-{0}-dist-{1}-features.geojson'.format(key, dist_threshold)):
            features =  gpd.read_file('dataset/osm-poi-{0}-dist-{1}-features.geojson'.format(key, dist_threshold))
        else:
            features =  gpd.read_file('dataset/osm-poi-{0}-dist-{1}-features.geojson'.format(key, 1000))
        features = features.to_crs('EPSG:32755')
        osm_pois[key] = features
    return osm_pois