In [2]:
import osmium
import re
import datetime

import geopandas as gpd
import pandas as pd
import numpy as np

from collections import defaultdict

In [7]:
# class HistoryHandler(osmium.SimpleHandler):
#     def __init__(self):
#         osmium.SimpleHandler.__init__(self)
#         self.count_bid = 0
#         self.area_to_bid = {}
#         self.relation_to_bid = {}

#     def get_area_to_bid(self):
#         return self.area_to_bid

#     def relation(self, r):
#         tags = dict(r.tags)
        
#         # Qualifiers
#         if not ('building' in tags or 'building:part' in tags or tags.get('type') == 'building'):
#             return
#         # Disqualifiers
#         if (tags.get('location') == 'underground' or 'bridge' in tags):
#             return

#         # print(r)
        
#         if r.id not in self.relation_to_bid:
#             self.relation_to_bid[r.id] = self.count_bid
#             self.count_bid +=1
        
#         for member in r.members:
#             if member.ref not in self.area_to_bid:
#                 self.area_to_bid[member.ref] = self.relation_to_bid[r.id]

In [8]:
# h = HistoryHandler()
# h.apply_file("data/osm/rec_historical.osm.pbf")
# area_to_bid = h.get_area_to_bid()

In [4]:
class HistoryHandler(osmium.SimpleHandler):
    """Osmium handler that records metadata and tags of building ways.

    Every way passed to :meth:`way` that passes the building filter is stored
    as one row in ``history_data``.
    """

    def __init__(self):
        super().__init__()
        # Rows of [id, version, visible, timestamp, uid, changeset, ntags, tags].
        self.history_data = []

    def way(self, w):
        """Append a row for ``w`` if it is tagged as a building.

        A way qualifies when it has a ``building`` or ``building:part`` key, or
        ``type=building``. It is skipped when ``location=underground`` or when
        it has a ``bridge`` key.
        """
        tags = dict(w.tags)

        is_building = (
            'building' in tags
            or 'building:part' in tags
            or tags.get('type') == 'building'
        )
        if not is_building:
            return

        is_excluded = tags.get('location') == 'underground' or 'bridge' in tags
        if is_excluded:
            return

        row = [
            w.id,
            w.version,
            w.visible,
            pd.Timestamp(w.timestamp),
            w.uid,
            w.changeset,
            len(w.tags),
            tags,
        ]
        self.history_data.append(row)

In [5]:
# Run the handler over the history extract and collect the recorded rows.
handler = HistoryHandler()
handler.apply_file("data/osm/rec_historical.osm.pbf")

# Column order must match the row layout built in HistoryHandler.way().
colnames = ['id', 'version', 'visible', 'ts', 'uid', 'chgset', 'ntags', 'tags']
history = (
    pd.DataFrame(handler.history_data, columns=colnames)
    .sort_values(by=['id', 'ts'])
)
history.head(10)

# Cases that still need to be handled:
# The history data contains erroneous buildings that were later rectified or removed.
# Example 1: way 29163144 after version 28, see https://www.openstreetmap.org/way/29163144/history
# Example 2: way 44118744 was deleted in version 4, 10 years ago, see https://www.openstreetmap.org/way/44118744

Unnamed: 0,id,version,visible,ts,uid,chgset,ntags,tags
0,29163144,27,True,2015-09-30 03:22:51+00:00,1772173,34338585,1,{'building': 'yes'}
1,29163144,28,True,2015-09-30 04:03:06+00:00,1772173,34338585,3,"{'building': 'yes', 'natural': 'coastline', 's..."
2,44118744,2,True,2011-03-10 18:03:41+00:00,186193,7516216,2,"{'amenity': 'public_building', 'building': 'yes'}"
3,44118744,3,True,2012-04-12 12:23:22+00:00,614513,11275702,2,"{'amenity': 'public_building', 'building': 'yes'}"
4,51816211,2,True,2010-03-06 22:52:16+00:00,148877,4055921,2,"{'building': 'yes', 'tourism': 'museum'}"
5,51816211,3,True,2015-09-21 21:17:57+00:00,1772173,34171819,3,"{'building': 'yes', 'roof:shape': 'gabled', 't..."
6,51816211,4,True,2015-12-14 12:31:36+00:00,31385,35944309,4,"{'addr:housename': 'Armazém 12', 'building': '..."
7,51816211,5,True,2022-02-17 13:07:05+00:00,8107931,117520835,3,"{'addr:housename': 'Armazém 12', 'building': '..."
8,51816216,2,True,2010-03-06 22:52:16+00:00,148877,4055921,3,"{'building': 'yes', 'name': 'Livraria Cultura'..."
9,51816216,3,True,2015-09-21 20:03:44+00:00,1772173,34170456,3,"{'building': 'yes', 'name': 'Livraria Cultura'..."


In [45]:
# Per-building statistics derived from `history`:
#   id        - way id of the building
#   nedits    - total number of tag edits (keys removed + added + changed)
#               summed over every pair of consecutive rows of that building
#   nusers    - number of distinct uids among the building's rows
#   nversions - `version` value of the building's last row
#   ntags     - `ntags` value of the building's last row
# "Last row" follows the row order of `history` (the order the frame is in
# when this cell runs).
# NOTE: the name `id` shadows the Python builtin; the list names are kept
# unchanged in case later cells read them.
id = []
nedits = []
nusers = []
nversions = []
ntags = []


def _count_tag_edits(prev_tags, curr_tags):
    """Return the number of tag edits between two tag dictionaries.

    An edit is a key present only in ``prev_tags`` (deletion), a key present
    only in ``curr_tags`` (addition), or a key present in both whose value
    differs (change).
    """
    removed = prev_tags.keys() - curr_tags.keys()
    added = curr_tags.keys() - prev_tags.keys()
    changed = sum(
        1
        for key in prev_tags.keys() & curr_tags.keys()
        if prev_tags[key] != curr_tags[key]
    )
    return len(removed) + len(added) + changed


# groupby() walks the frame once instead of re-filtering it for every building.
# sort=False yields the groups in order of first appearance, and rows keep
# their original order inside each group.
for item_id, all_versions in history.groupby('id', sort=False):
    tag_history = all_versions['tags'].tolist()

    # Compare each row's tags with the tags of the row right before it.
    num_edits = sum(
        _count_tag_edits(prev_tags, curr_tags)
        for prev_tags, curr_tags in zip(tag_history, tag_history[1:])
    )

    # int(...) keeps plain Python integers in the result lists.
    id.append(int(item_id))
    nedits.append(num_edits)
    nusers.append(all_versions['uid'].nunique())
    nversions.append(int(all_versions['version'].iloc[-1]))
    ntags.append(int(all_versions['ntags'].iloc[-1]))

In [38]:
# Note: the latest version number is also available in the latest pbf file.

In [None]:
# TODO: resolve the relation id vs. way id issue; ask the professor for help.
# TODO: compute the trustworthiness indicators for each building described in the paper; check whether the required attributes are available.

In [27]:
class OSMHandler(osmium.SimpleHandler):
    """Osmium handler that records id, version and tag count of building ways.

    Every way passed to :meth:`way` that passes the building filter is stored
    as one row in ``history_data``.
    """

    def __init__(self):
        super().__init__()
        # Rows of [id, version, ntags].
        self.history_data = []

    def way(self, w):
        """Append a row for ``w`` if it is tagged as a building.

        A way qualifies when it has a ``building`` or ``building:part`` key, or
        ``type=building``. It is skipped when ``location=underground`` or when
        it has a ``bridge`` key.
        """
        tags = dict(w.tags)

        is_building = (
            'building' in tags
            or 'building:part' in tags
            or tags.get('type') == 'building'
        )
        if not is_building:
            return

        is_excluded = tags.get('location') == 'underground' or 'bridge' in tags
        if is_excluded:
            return

        row = [w.id, w.version, len(w.tags)]
        self.history_data.append(row)

In [29]:
# Run the handler over the pbf extract and collect the recorded rows.
handler = OSMHandler()
handler.apply_file("data/osm/rec.osm.pbf")

# Column order must match the row layout built in OSMHandler.way().
colnames = ['id', 'version_no', 'ntags']
df = (
    pd.DataFrame(handler.history_data, columns=colnames)
    .sort_values(by=['id'])
)
df.head(10)

Unnamed: 0,id,version,ntags
0,51816211,5,3
1,51816216,7,4
2,51816217,8,12
3,51816218,8,12
4,51816220,3,3
5,51816221,7,10
6,51816245,3,1
7,51816246,2,1
8,51816248,5,3
9,51816249,8,8
