In [1]:
import pandas as pd
import numpy as np
import requests
import json
import os
import plistlib

pd.set_option('max_colwidth', 150)

# Perform query

In [2]:
query = """[out:json];
(
  node["name"]["tourism"="museum"];
  node["name"]["tourism"="artwork"];
  node["name"]["amenity"="fountain"];
  node["name"]["historic"];
  node["name"]["amenity"="theatre"];
  way["name"]["tourism"="museum"];
  way["name"]["tourism"="artwork"];
  way["name"]["amenity"="fountain"];
  way["name"]["historic"];
  way["name"]["amenity"="theatre"];
);
out center;
>;
out qt;
"""

In [3]:
def overpass_query(query=None, overwrite=False):
    if os.path.exists("db.pickle") and not overwrite:
        return pd.read_pickle("db.pickle")
    
    if not query:
        print("Database not found on disk. You must provide a valid query.")
        return
    
    try:
        r = requests.get(url="http://127.0.0.1/api/interpreter", data=query)
        json_result = json.loads(r.content)
        df = pd.DataFrame(json_result["elements"])
        df.to_pickle("db.pickle")
        return df
    except Exception as e:
        print(e)

In [4]:
df = overpass_query(query)

# Database preparation: Nodes and Ways

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219441 entries, 0 to 219440
Data columns (total 7 columns):
type      219441 non-null object
id        219441 non-null int64
lat       203706 non-null float64
lon       203706 non-null float64
tags      39911 non-null object
center    15735 non-null object
nodes     15735 non-null object
dtypes: float64(2), int64(1), object(4)
memory usage: 11.7+ MB


## Get only records with tags and name

In [6]:
df = df[df.tags.notnull()]
df = df[df.apply(lambda x: "name" in x.tags, axis=1)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37896 entries, 0 to 219155
Data columns (total 7 columns):
type      37896 non-null object
id        37896 non-null int64
lat       22161 non-null float64
lon       22161 non-null float64
tags      37896 non-null object
center    15735 non-null object
nodes     15735 non-null object
dtypes: float64(2), int64(1), object(4)
memory usage: 2.3+ MB


## Create latitude and longitude from center for Ways

In [7]:
df_nodes = df.query("type == 'node'")
df_ways = df.query("type == 'way'")

df_ways = df_ways.assign(latitude=df_ways["center"].apply(lambda x: x["lat"]), longitude=df_ways["center"].apply(lambda x: x["lon"]))

In [8]:
df_ways.head()

Unnamed: 0,type,id,lat,lon,tags,center,nodes,latitude,longitude
21919,way,4497863,,,"{'addr:city': 'Milano', 'addr:country': 'IT', 'addr:housenumber': '1', 'addr:street': 'Via Giovanni Rota', 'amenity': 'theatre', 'building': 'publ...","{'lat': 45.4667635, 'lon': 9.1539554}","[27585621, 1532988874, 1532988867, 27585622, 27585619, 27585620, 27585621]",45.466763,9.153955
21920,way,4519679,,,"{'historic': 'archaeological_site', 'historic:civilization': 'western_roman', 'historic:period': 'dominate', 'name': 'Villa Imperiale', 'ruins': '...","{'lat': 45.46513, 'lon': 9.1805469}","[27732704, 2886739086, 2886739198, 304776292, 3333717879, 27732700, 27732702, 304776291, 27732703, 574389898, 1929235329, 1929235326, 27732704]",45.46513,9.180547
21921,way,4998830,,,"{'addr:city': 'Milano', 'addr:country': 'IT', 'addr:housenumber': '1', 'addr:street': 'Largo Antonio Greppi', 'amenity': 'theatre', 'building': 'p...","{'lat': 45.4725633, 'lon': 9.1823066}","[33198078, 33198069, 33198070, 33198071, 33198072, 395252763, 1692325433, 33198073, 4061689060, 33198074, 33198076, 3789138194, 1692325434, 445757...",45.472563,9.182307
21922,way,8035487,,,"{'building': 'yes', 'building:part': 'no', 'castle_type': 'fortress', 'charge': '14 EUR', 'email': 'sspsae-rm.castelsantangelo@beniculturali.it', ...","{'lat': 41.9030762, 'lon': 12.4663312}","[60071946, 5730657087, 2686698641, 2686698614, 2686698541, 2686698487, 2686698470, 2686698459, 2686698454, 60071948, 2686698432, 2686698430, 26866...",41.903076,12.466331
21923,way,11116333,,,"{'building': 'yes', 'historic': 'city_gate', 'name': 'Porta Pia', 'tourism': 'attraction', 'wikidata': 'Q3908761', 'wikipedia': 'it:Porta Pia (Anc...","{'lat': 43.615323, 'lon': 13.5049486}","[99064812, 99064884, 663913930, 99064891, 99064901, 663913929, 99064812]",43.615323,13.504949


## Concatenate nodes and ways

In [9]:
df_nodes = df_nodes.rename(columns={"lat": "latitude", "lon": "longitude"})
df_nodes = df_nodes[["id", "latitude", "longitude", "tags"]]
df_ways = df_ways[["id", "latitude", "longitude", "tags"]]

df = pd.concat([df_nodes, df_ways])
df.head()

Unnamed: 0,id,latitude,longitude,tags
0,21027591,42.662755,11.636329,"{'historic': 'archaeological_site', 'name': 'Via cava del Cavone'}"
1,26864200,44.543438,9.652994,"{'historic': 'memorial', 'name': 'Croce del Pelpi'}"
2,33148157,45.042277,11.75062,"{'historic': 'monument', 'name': 'Pavajon'}"
3,62512260,45.972641,9.453071,"{'historic': 'yes', 'is_in': 'Lecco, Lombardia, Italy', 'name': 'Introbio', 'place': 'village', 'population': '1605'}"
4,64776777,45.936835,11.09518,"{'historic': 'monument', 'name': 'Kaiserschuetzen 1917', 'tourism': 'artwork', 'wheelchair': 'yes'}"


# Database Cleaning

## Name definition

In [10]:
df["name"] = df.apply(lambda x: x.tags["name"], axis=1)
df.head()

Unnamed: 0,id,latitude,longitude,tags,name
0,21027591,42.662755,11.636329,"{'historic': 'archaeological_site', 'name': 'Via cava del Cavone'}",Via cava del Cavone
1,26864200,44.543438,9.652994,"{'historic': 'memorial', 'name': 'Croce del Pelpi'}",Croce del Pelpi
2,33148157,45.042277,11.75062,"{'historic': 'monument', 'name': 'Pavajon'}",Pavajon
3,62512260,45.972641,9.453071,"{'historic': 'yes', 'is_in': 'Lecco, Lombardia, Italy', 'name': 'Introbio', 'place': 'village', 'population': '1605'}",Introbio
4,64776777,45.936835,11.09518,"{'historic': 'monument', 'name': 'Kaiserschuetzen 1917', 'tourism': 'artwork', 'wheelchair': 'yes'}",Kaiserschuetzen 1917


Some elements have only numeric names. Get rid of them.

In [11]:
numeric_names = df["name"].apply(lambda x: x.isdigit())
df[numeric_names].sample(5)

Unnamed: 0,id,latitude,longitude,tags,name
7173,2476565788,45.128646,7.481365,"{'historic': 'wayside_shrine', 'name': '1'}",1
12210,3978711877,46.070858,11.467549,"{'alt_name': 'Nove', 'artwork_type': 'statue', 'cover': 'roof', 'denomination': 'catholic', 'material': 'wood', 'name': '09', 'religion': 'christi...",9
1819,926618290,45.094873,7.649145,"{'artist_name': 'Luigi Nervo', 'height': '3.6', 'historic': 'monument', 'length': '8.6', 'name': '1706', 'source': 'http://www.comune.torino.it/pa...",1706
12264,3990486582,46.121866,11.17413,"{'disused': 'yes', 'historic': 'mine', 'man_made': 'mineshaft', 'name': '09', 'resource': 'silver;barite'}",9
12216,3978712652,46.070485,11.467019,"{'alt_name': 'Dieci', 'artwork_type': 'statue', 'cover': 'roof', 'denomination': 'catholic', 'material': 'wood', 'name': '10', 'religion': 'christ...",10


In [12]:
# Keep only not numeric names
df = df[~numeric_names]

## Tags

In [14]:
# Load Monument Tags
with open("../Monuments/Support Files/MonumentCategories.plist", "rb") as f:
    monumentTags = pd.DataFrame.from_dict(plistlib.load(f)["categories"], orient="index")
monumentTags

Unnamed: 0,priority,description,category
archaeological_site,1000,"{'it': 'Sito Archeologico', 'en': 'Archaeological Site'}","{'it': 'Siti Archeologici', 'en': 'Archaeological Sites'}"
artwork,250,"{'it': 'Opera d'arte', 'en': 'Artwork'}","{'it': 'Opere d'arte', 'en': 'Artworks'}"
cemetery,1000,"{'it': 'Cimitero - Tomba', 'en': 'Cimitery - Tomb'}","{'it': 'Cimiteri - Tombe', 'en': 'Cimiteries - Tombs'}"
fountain,1000,"{'it': 'Fontana', 'en': 'Fountain'}","{'it': 'Fontane', 'en': 'Fountains'}"
memorial,250,"{'it': 'Memoriale', 'en': 'Memorial'}","{'it': 'Memoriali', 'en': 'Memorials'}"
monument,1000,"{'it': 'Monumento', 'en': 'Monument'}","{'it': 'Monumenti', 'en': 'Monuments'}"
museum,1000,"{'it': 'Museo', 'en': 'Museum'}","{'it': 'Musei', 'en': 'Museums'}"
palace,750,"{'it': 'Palazzo Storico', 'en': 'Historical Palace'}","{'it': 'Palazzi Storici', 'en': 'Historical Palaces'}"
place_of_worship,1000,"{'it': 'Luogo di Culto', 'en': 'Place of Worship'}","{'it': 'Luoghi di Culto', 'en': 'Places of Worship'}"
ruin,750,"{'it': 'Rovina', 'en': 'Ruin'}","{'it': 'Rovine', 'en': 'Ruins'}"


Let's find whose entires have categories matching those required

In [15]:
def find_most_significant_category(tags):
    tag_values = list(tags.values())
    for i, value in enumerate(tag_values):
        if value == "tomb" or value == "tombstone":
            tag_values[i] = "cemetery"
            
    categories = list(set(tag_values) & set(monumentTags.index))
    return categories if len(categories) > 0 else None

significant_tags = df["tags"].apply(find_most_significant_category)

In [16]:
insignificant_tags = significant_tags.isna()
print(f"Found {insignificant_tags.sum()} entries without significant categories.")

Found 16186 entries without significant categories.


In [17]:
significant_tags = significant_tags.dropna()
print("Numer of significant tags before selection: %d" % len(significant_tags))

Numer of significant tags before selection: 21654


Let's find whose entries have more than one desired category

### Not unique categories

In [18]:
not_unique_categories = significant_tags.loc[significant_tags[significant_tags.apply(lambda x: len(x) > 1)].index]
print(f"{len(not_unique_categories)} entires have more than one desired categoriy.")

1353 entires have more than one desired categoriy.


In [19]:
df.loc[not_unique_categories.index].assign(categories=not_unique_categories)[["name", "categories"]].head(5)

Unnamed: 0,name,categories
4,Kaiserschuetzen 1917,"[artwork, monument]"
97,Santa Maria in Portuno,"[archaeological_site, place_of_worship]"
115,Redentore,"[artwork, memorial, statue]"
124,Chiesa di San Gervasio in Bulgaria,"[archaeological_site, place_of_worship]"
221,Statua di Walther von der Vogelweide - Walther-von-der-Vogelweide-Denkmal,"[artwork, statue]"


In [20]:
multiple_categories = []
for l in not_unique_categories.values:
    if sorted(l) not in multiple_categories:
        multiple_categories.append(sorted(l))
print(f"Found {len(multiple_categories)} different groups of multiple categories")

Found 41 different groups of multiple categories


In [21]:
multiple_categories

[['artwork', 'monument'],
 ['archaeological_site', 'place_of_worship'],
 ['artwork', 'memorial', 'statue'],
 ['artwork', 'statue'],
 ['monument', 'place_of_worship'],
 ['memorial', 'statue'],
 ['artwork', 'fountain'],
 ['fountain', 'memorial'],
 ['archaeological_site', 'museum'],
 ['artwork', 'memorial'],
 ['fountain', 'monument'],
 ['artwork', 'monument', 'statue'],
 ['artwork', 'cemetery', 'statue'],
 ['artwork', 'place_of_worship'],
 ['monument', 'museum'],
 ['monument', 'statue'],
 ['artwork', 'place_of_worship', 'statue'],
 ['artwork', 'fountain', 'monument'],
 ['memorial', 'monument'],
 ['artwork', 'fountain', 'statue'],
 ['archaeological_site', 'theatre'],
 ['archaeological_site', 'villa'],
 ['archaeological_site', 'cemetery'],
 ['artwork', 'cemetery'],
 ['archaeological_site', 'artwork'],
 ['memorial', 'place_of_worship', 'statue'],
 ['cemetery', 'place_of_worship'],
 ['archaeological_site', 'artwork', 'statue'],
 ['fountain', 'memorial', 'statue'],
 ['cemetery', 'statue'],
 ['

### Select one category based on priority
If multiple categories occur, choose the one with highest priority

In [22]:
def choose_category(categories):
    return monumentTags.loc[categories].priority.idxmax()

In [23]:
# Filter category based on priority
significant_tags = significant_tags.apply(lambda x: x[0])
filtered_categories = not_unique_categories.apply(choose_category)
significant_tags.loc[filtered_categories.index] = filtered_categories

# Dump cleaned Dataframe
Create a new column _category_

In [24]:
clean_df = df.assign(category=significant_tags).loc[significant_tags.index]
clean_df = clean_df.reset_index(drop=True)
clean_df.sample(50)

Unnamed: 0,id,latitude,longitude,tags,name,category
12847,5672052145,44.470487,11.100138,"{'historic': 'monument', 'name': 'Ai Caduti Di Tutte Le Guerre', 'source': 'local knowledge;streetlevel imagery'}",Ai Caduti Di Tutte Le Guerre,monument
18852,312065617,42.41971,12.108508,"{'addr:city': 'Viterbo', 'addr:country': 'IT', 'addr:housenumber': '33', 'addr:postcode': '01100', 'addr:street': 'Via Casa di Santa Rosa', 'build...",Casa di Santa Rosa,museum
18510,288741816,45.067959,7.511637,"{'amenity': 'theatre', 'building': 'yes', 'name': 'Teatro San Martino'}",Teatro San Martino,theatre
15044,34362827,41.901851,12.522041,"{'artist_name': 'Leopoldo Ansiglioni', 'historic': 'memorial', 'memorial': 'statue', 'name': 'Statua di Cristo che ascende al cielo', 'url': 'http...",Statua di Cristo che ascende al cielo,statue
11075,5083410189,45.408495,11.874444,"{'artist_name': 'Kenny Random', 'artwork_type': 'graffiti', 'name': 'Strani Incontri (Kenny Random)', 'tourism': 'artwork', 'website': 'http://www...",Strani Incontri (Kenny Random),artwork
4684,2510967479,44.353738,7.388513,"{'amenity': 'place_of_worship', 'denomination': 'catholic', 'historic': 'wayside_shrine', 'name': 'pilone votivo', 'religion': 'christian'}",pilone votivo,place_of_worship
9691,4555499617,44.363605,8.464712,"{'description': 'Ruderi, o edifici trasformati, di mulini datati tra il secolo XVII ed il XIX che, oltre le granaglie, macinavano fritta di vetro ...",Mulino da colore,archaeological_site
8052,3876206555,40.832936,14.231188,"{'amenity': 'fountain', 'name': 'Gruppo Europa', 'wikidata': 'Q3777357', 'wikipedia': 'it:Gruppo Europa'}",Gruppo Europa,fountain
7360,3612967724,39.080161,17.136035,"{'historic': 'archaeological_site', 'name': 'Cannoni'}",Cannoni,archaeological_site
12560,5564683848,44.992881,10.425062,"{'historic': 'monument', 'name': 'Statua con stagno'}",Statua con stagno,monument


Reset index

In [25]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21654 entries, 0 to 21653
Data columns (total 6 columns):
id           21654 non-null int64
latitude     21654 non-null float64
longitude    21654 non-null float64
tags         21654 non-null object
name         21654 non-null object
category     21654 non-null object
dtypes: float64(2), int64(1), object(3)
memory usage: 1015.2+ KB


In [26]:
def dataframe_to_plist(df):
    with open("../Monuments/Support Files/Monuments.plist", "wb") as fp:
        df.index = df.index.astype(str)
        plistlib.dump({"monuments": df.to_dict(orient="records")}, fp)

dataframe_to_plist(clean_df)

# Statistics

In [27]:
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
counts = clean_df.groupby("category").count().sort_values("index", ascending=False)["index"]
_, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=counts, y=counts.index, palette=sns.color_palette("Blues_d", n_colors=len(counts.index)))
plt.xlabel("Counts")
plt.show()

KeyError: 'index'

In [76]:
clean_df[clean_df.category == "ruins"].sample(20)

Unnamed: 0,id,latitude,longitude,tags,name,category
3671,1916249860,41.496999,12.991387,"{'historic': 'ruins', 'historic:civilization': 'ancient_roman', 'name': 'Archi di San Lidano', 'website': 'http://www.lazioturismo.it/asp/scheda_a...",Archi di San Lidano,ruins
22709,380499122,44.581067,8.189462,"{'building': 'yes', 'height': '30.3', 'historic': 'ruins', 'name': 'Torre di Castel Martino'}",Torre di Castel Martino,ruins
6222,2725284456,41.936704,12.77762,"{'historic': 'ruins', 'name': 'Accademia'}",Accademia,ruins
21268,286933999,45.093539,6.709435,"{'building': 'collapsed', 'historic': 'ruins', 'name': 'Teleferica'}",Teleferica,ruins
21987,327544428,43.909189,7.911325,"{'historic': 'ruins', 'name': 'S. Bernardo'}",S. Bernardo,ruins
9885,4222144533,45.963983,11.245207,"{'historic': 'ruins', 'name': 'ex-Osteria alla Stanga'}",ex-Osteria alla Stanga,ruins
23149,416297304,40.488692,14.969804,"{'historic': 'ruins', 'name': 'Thesauros'}",Thesauros,ruins
17126,58974108,46.041987,10.498619,"{'building': 'yes', 'historic': 'ruins', 'name': 'Malga Campo di Sopra'}",Malga Campo di Sopra,ruins
9185,3880594620,40.285437,15.907123,"{'historic': 'ruins', 'name': 'foro'}",foro,ruins
22106,335530415,43.684755,12.146746,"{'ele': '893', 'historic': 'ruins', 'name': 'Valdistori'}",Valdistori,ruins


In [77]:
clean_df[clean_df["name"].str.contains("pompei", False)]

Unnamed: 0,id,latitude,longitude,tags,name,category
7393,3174625766,41.21489,14.695782,"{'historic': 'monument', 'name': 'Effige della Madonna di Pompei'}",Effige della Madonna di Pompei,monument
16653,26707240,40.750919,14.486876,"{'area': 'yes', 'heritage': '1', 'heritage:operator': 'whc', 'historic': 'archaeological_site', 'historic:civilization': 'ancient_roman', 'importa...",Scavi archeologici di Pompei,archaeological_site
22473,369518385,45.595218,11.815971,"{'amenity': 'place_of_worship', 'building': 'chapel', 'denomination': 'catholic', 'historic': 'wayside_shrine', 'name': 'Beata Vergine di Pompei',...",Beata Vergine di Pompei,place_of_worship
22894,395802069,40.749328,14.484842,"{'historic': 'archaeological_site', 'name': 'Foro di Pompei', 'name:cs': 'pompejské forum', 'name:ko': '폼페이 포럼(포로 디 폼페이)', 'website': 'http://www....",Foro di Pompei,archaeological_site
