In [1]:
import osmium as osm
import pandas as pd

In [2]:
class TagGenomeHandler(osm.SimpleHandler):
    """Handler that records every tag of every element it is given.

    Each record stored in ``self.taggenome`` is the list
    ``[elem_type, elem.id, elem.version, tag.k, tag.v]``.
    """

    def __init__(self):
        super().__init__()
        # Flat list of tag records, filled by tag_inventory().
        self.taggenome = []

    def tag_inventory(self, elem, elem_type):
        """Append one record per tag of ``elem``, labelled with ``elem_type``."""
        self.taggenome.extend(
            [elem_type, elem.id, elem.version, tag.k, tag.v]
            for tag in elem.tags
        )

    def node(self, n):
        """Record the tags of ``n`` under the type label "node"."""
        self.tag_inventory(n, "node")

    def way(self, w):
        """Record the tags of ``w`` under the type label "way"."""
        self.tag_inventory(w, "way")

    def relation(self, r):
        """Record the tags of ``r`` under the type label "relation"."""
        self.tag_inventory(r, "relation")

In [3]:
# Run the handler over the history file, then tabulate the collected tag records.
taghandler = TagGenomeHandler()
taghandler.apply_file("data/ottgat.osh.pbf")
colnames = ['type', 'id', 'version', 'tagkey', 'tagvalue']
tag_genome = pd.DataFrame(data=taghandler.taggenome, columns=colnames)

In [4]:
osm_history = pd.read_csv("output/elements.csv")
# Left join: every (type, id, version) row of the history is kept, with its
# tags attached when tag_genome has any for that element version.
enhanced_tag_genome = osm_history[['type', 'id', 'version']].merge(
    tag_genome,
    how='left',
    on=['type', 'id', 'version'],
)
enhanced_tag_genome.to_csv("output/enhance.csv")

In [5]:
# Number of distinct tag keys seen for each element type.
tag_genome['tagkey'].groupby(tag_genome['type']).nunique()

type
node        721
relation    500
way         748
Name: tagkey, dtype: int64

In [6]:
# Count tag records per (tag key, element type), one column per type;
# combinations that never occur are filled with 0.
tagkeycount = (tag_genome.groupby(['tagkey','type'])['type']
               .count()
               .unstack()
               .fillna(0))
# Row-wise total across the type columns. The vectorised DataFrame.sum is used
# rather than apply(sum, axis=1), which runs the Python builtin row by row and
# relies on pandas' deprecated substitution of builtins in apply.
tagkeycount['total'] = tagkeycount.sum(axis=1)
tagkeycount = tagkeycount.sort_values('total', ascending=False)
tagkeycount.to_csv("output/count.csv")

In [7]:
def tagkey_analysis(genome, pivot_var=['type']):
    """Count unique element ids per tag key, pivoted on the last pivot variable.

    Parameters
    ----------
    genome : DataFrame with at least the columns 'tagkey', 'id' and every
        name listed in ``pivot_var``.
    pivot_var : column names to group by in addition to 'tagkey'; the last
        one becomes the columns of the result.

    Returns
    -------
    DataFrame of unique-id counts, with missing combinations filled with 0.
    """
    group_keys = ['tagkey'] + list(pivot_var)
    unique_ids = genome.groupby(group_keys)['id'].nunique()
    pivoted = unique_ids.unstack()
    return pivoted.fillna(0)
# Unique elements per (tag key, type), one column per version.
tagkey_overview = tagkey_analysis(enhanced_tag_genome, pivot_var=['type', 'version'])
# Keep the five rows with the largest version-1 count and the first five columns.
tagkey_overview2 = (
    tagkey_overview
    .sort_values(by=1, ascending=False)
    .iloc[:5, :5]
)

In [8]:
# Export the tag-key overview extract.
tagkey_overview2.to_csv(path_or_buf="output/overview.csv")

In [9]:
def total_elem(genome, pivot_var=['type', 'version']):
    """Count unique element ids per combination of the pivot variables.

    The result is indexed by every pivot variable except the last, which
    becomes the columns; missing combinations are filled with 0.
    """
    unique_ids = genome.groupby(pivot_var)['id'].nunique()
    pivoted = unique_ids.unstack()
    return pivoted.fillna(0)

In [10]:
# Unique elements per type and version, restricted to the first five columns.
totalelem = total_elem(enhanced_tag_genome)
totalelem = totalelem.iloc[:, :5]

In [11]:
# Export the per-type, per-version element totals.
totalelem.to_csv(path_or_buf="output/totalelem.csv")

In [12]:
def tag_frequency(genome, pivot_var=['type', 'version']):
    """Return the percentage of unique elements carrying each tag key.

    For every element type, the per-tag-key unique-element counts from
    ``tagkey_analysis`` are divided by that type's unique-element totals from
    ``total_elem``.

    Parameters
    ----------
    genome : DataFrame accepted by ``total_elem`` and ``tagkey_analysis``.
    pivot_var : grouping columns forwarded to both helpers. It must contain
        'type', because the counts are regrouped on the index level 'type'.

    Returns
    -------
    DataFrame of percentages rounded to two decimals.
    """
    total_uniqelem = total_elem(genome, pivot_var)
    tagcount = tagkey_analysis(genome, pivot_var)
    # For each type, divide its tag counts by the matching row of totals
    # (aligned on the columns), then stack the per-type results back together.
    tag_freq = pd.concat([
        group / total_uniqelem.loc[elem_type]
        for elem_type, group in tagcount.groupby(level='type')
    ])
    # Scale to percent BEFORE rounding: rounding first and multiplying by 100
    # afterwards reintroduces float noise (e.g. 0.07 * 100 == 7.000000000000001).
    # Two decimals on the percentage keeps the precision of the former
    # four-decimal rounding of the proportion.
    return (100 * tag_freq).round(2)

In [13]:
# Twenty most frequent tag keys at version 1, shown for a few selected versions.
tagfreq = (
    tag_frequency(enhanced_tag_genome, ['type','version'])
    .sort_values(by=1, ascending=False)
    .head(20)
    .loc[:, [1,3,5,10,15]]
)
tagfreq.to_csv("output/tagfreq.csv")

In [14]:
def tagvalue_analysis(genome, key, pivot_var=['type']):
    """Count unique element ids per value of the tag key ``key``.

    Only rows whose 'tagkey' equals ``key`` are considered. Counts are grouped
    by 'tagvalue' plus every name in ``pivot_var``; the last pivot variable
    becomes the columns and missing combinations are filled with 0.
    """
    selected = genome.loc[genome['tagkey'] == key]
    group_keys = ['tagvalue'] + list(pivot_var)
    unique_ids = selected.groupby(group_keys)['id'].nunique()
    return unique_ids.unstack().fillna(0)

In [15]:
# Unique elements per 'highway' value and type, one column per version.
tagvalue_overview = tagvalue_analysis(tag_genome, 'highway', ['type', 'version'])
# Keep the five rows with the largest version-1 count and the first seven columns.
tagvalue2 = tagvalue_overview.sort_values(1, ascending=False).iloc[:5,:7]
# Written to its own file: "output/highway.csv" is also the target of the
# highway frequency export in this notebook, which would overwrite this table.
tagvalue2.to_csv("output/highway_count.csv")

In [16]:
# Unique elements per 'building' value and type, one column per version.
tagvalue_overview = tagvalue_analysis(tag_genome, 'building', ['type', 'version'])
# Keep the five rows with the largest version-1 count and the first seven columns.
tagvalue2 = tagvalue_overview.sort_values(1, ascending=False).iloc[:5,:7]
# Written to its own file: "output/building.csv" is also the target of the
# building frequency export in this notebook, which would overwrite this table.
tagvalue2.to_csv("output/building_count.csv")

In [17]:
def tot_values(genome, key, pivot_var=['type', 'version']):
    """Count unique element ids tagged with ``key`` per pivot combination.

    Only rows whose 'tagkey' equals ``key`` are considered. The last pivot
    variable becomes the columns; missing combinations are filled with 0.
    """
    selected = genome.loc[genome['tagkey'] == key]
    unique_ids = selected.groupby(pivot_var)['id'].nunique()
    return unique_ids.unstack().fillna(0)

In [18]:
# Unique 'highway'-tagged elements per type, for a selection of versions.
totval = tot_values(tag_genome, 'highway').loc[:, [1,2,3,4,5,10,15]]
totval.to_csv(path_or_buf="output/highway_total.csv")

In [19]:
# Unique 'building'-tagged elements per type, for a selection of versions.
totval2 = tot_values(tag_genome, 'building').loc[:, [1,2,3,4,5,10,15]]
totval2.to_csv(path_or_buf="output/building_total.csv")

In [20]:
def tagvalue_frequency(genome, key, pivot_var=['type', 'version']):
    """Return, per element type, the percentage share of each value of ``key``.

    The per-value unique-element counts from ``tagvalue_analysis`` are divided
    by the per-type totals of elements tagged with ``key`` from ``tot_values``.

    Parameters
    ----------
    genome : DataFrame accepted by ``tot_values`` and ``tagvalue_analysis``.
    key : tag key whose values are analysed.
    pivot_var : grouping columns forwarded to both helpers. It must contain
        'type', because the counts are regrouped on the index level 'type'.

    Returns
    -------
    DataFrame of percentages rounded to four decimals.
    """
    total_uniqelem = tot_values(genome, key, pivot_var)
    # Forward the caller's pivot_var so the counts and the totals are grouped
    # the same way (it was previously hard-coded to ['type', 'version']).
    tagcount = tagvalue_analysis(genome, key, pivot_var=pivot_var)
    # The loop variable is named elem_type so it does not shadow `key`.
    tag_freq = pd.concat([
        group / total_uniqelem.loc[elem_type]
        for elem_type, group in tagcount.groupby(level='type')
    ])
    return (100*tag_freq).round(4)

In [21]:
# 'highway' value frequencies, index levels swapped, sorted by the version-1 column.
tagvalue_freq = (
    tagvalue_frequency(tag_genome, 'highway', ['type','version'])
    .swaplevel()
    .sort_values(by=1, ascending=False)
)
tagvalue_freq.to_csv("output/highway.csv")

In [22]:
# 'highway' value frequencies for relations, selected versions only.
tagvalue_freq_rel = tagvalue_freq.loc['relation'][[1,3,5,10,15]]
tagvalue_freq_rel.to_csv(path_or_buf="output/highway_rel.csv")

In [23]:
# 'highway' value frequencies for ways, selected versions only.
tagvalue_freq_way = tagvalue_freq.loc['way'][[1,3,5,10,15]]
tagvalue_freq_way.to_csv(path_or_buf="output/highway_way.csv")

In [24]:
# 'highway' value frequencies for nodes, selected versions only.
tagvalue_freq_node = tagvalue_freq.loc['node'][[1,3,5,10,15]]
tagvalue_freq_node.to_csv(path_or_buf="output/highway_node.csv")

In [25]:
# 'building' value frequencies, index levels swapped, sorted by the version-1 column.
tagvalue_freq2 = (
    tagvalue_frequency(tag_genome, 'building', ['type','version'])
    .swaplevel()
    .sort_values(by=1, ascending=False)
)
tagvalue_freq2.to_csv("output/building.csv")

In [26]:
# 'building' value frequencies for relations, selected versions only.
tagvalue_freq2_rel = tagvalue_freq2.loc['relation'][[1,3,5,10,15]]
tagvalue_freq2_rel.to_csv(path_or_buf="output/building_rel.csv")

In [27]:
# 'building' value frequencies for ways, selected versions only.
tagvalue_freq2_way = tagvalue_freq2.loc['way'][[1,3,5,10,15]]
tagvalue_freq2_way.to_csv(path_or_buf="output/building_way.csv")

In [28]:
# 'building' value frequencies for nodes, selected versions only.
tagvalue_freq2_node = tagvalue_freq2.loc['node'][[1,3,5,10,15]]
tagvalue_freq2_node.to_csv(path_or_buf="output/building_node.csv")