In [69]:
import osmium
import re
import datetime

import geopandas as gpd
import pandas as pd
import numpy as np

from collections import defaultdict

In [72]:
class HistoryHandler(osmium.SimpleHandler):
    """Collect the building-related way versions seen while reading an OSM file.

    Every accepted way is appended to ``history_data`` as one row:
    ``[id, node refs, version, visible, timestamp, uid, changeset, tag count, tags]``.
    """

    def __init__(self):
        super().__init__()
        # One list per accepted way version, in the order the callback fired.
        self.history_data = []

    def way(self, w):
        """Record ``w`` if its tags mark it as a building and do not exclude it."""
        tags = dict(w.tags)

        # Qualifiers: any of these tags marks the way as building-related.
        is_building = (
            'building' in tags
            or 'building:part' in tags
            or tags.get('type') == 'building'
        )
        # Disqualifiers: underground structures and anything carrying a bridge tag.
        is_excluded = tags.get('location') == 'underground' or 'bridge' in tags
        if not is_building or is_excluded:
            return

        # Keep only the node ids; the node objects themselves are not stored.
        node_refs = [node.ref for node in w.nodes]

        record = [
            w.id,
            node_refs,
            w.version,
            w.visible,
            pd.Timestamp(w.timestamp),
            w.uid,
            w.changeset,
            len(w.tags),
            tags,
        ]
        self.history_data.append(record)

In [74]:
# Run the handler over the history extract; rows accumulate in h.history_data.
h = HistoryHandler()
h.apply_file("data/osm/rec_historical.osm.pbf")

# Column labels, listed in the same order as the rows built in HistoryHandler.way.
colnames = ['id', 'nodes', 'version', 'visible', 'ts', 'uid', 'chgset', 'ntags', 'tags']

# One row per recorded way version, ordered by way id and then by timestamp.
history = (
    pd.DataFrame(h.history_data, columns=colnames)
    .sort_values(by=['id', 'ts'])
)
history.head(10)

Unnamed: 0,id,nodes,version,visible,ts,uid,chgset,ntags,tags
0,29163144,"[100182902, 1206805535, 100184767, 1572244647,...",27,True,2015-09-30 03:22:51+00:00,1772173,34338585,1,{'building': 'yes'}
1,29163144,"[100182902, 1206805535, 100184767, 1572244647,...",28,True,2015-09-30 04:03:06+00:00,1772173,34338585,3,"{'building': 'yes', 'natural': 'coastline', 's..."
2,44118744,"[560628970, 560628974, 560628977, 560628980, 5...",2,True,2011-03-10 18:03:41+00:00,186193,7516216,2,"{'amenity': 'public_building', 'building': 'yes'}"
3,44118744,[560628970],3,True,2012-04-12 12:23:22+00:00,614513,11275702,2,"{'amenity': 'public_building', 'building': 'yes'}"
4,51816211,"[661046196, 661046197, 661046231, 661046232, 6...",2,True,2010-03-06 22:52:16+00:00,148877,4055921,2,"{'building': 'yes', 'tourism': 'museum'}"
5,51816211,"[661046196, 661046197, 661046231, 661046232, 6...",3,True,2015-09-21 21:17:57+00:00,1772173,34171819,3,"{'building': 'yes', 'roof:shape': 'gabled', 't..."
6,51816211,"[661046196, 661046197, 661046231, 661046232, 6...",4,True,2015-12-14 12:31:36+00:00,31385,35944309,4,"{'addr:housename': 'Armazém 12', 'building': '..."
7,51816211,"[661046196, 661046197, 661046231, 661046232, 6...",5,True,2022-02-17 13:07:05+00:00,8107931,117520835,3,"{'addr:housename': 'Armazém 12', 'building': '..."
8,51816216,"[661046178, 661046175, 661046176, 661046177, 6...",2,True,2010-03-06 22:52:16+00:00,148877,4055921,3,"{'building': 'yes', 'name': 'Livraria Cultura'..."
9,51816216,"[661046178, 661046175, 661046177, 661046179, 6...",3,True,2015-09-21 20:03:44+00:00,1772173,34170456,3,"{'building': 'yes', 'name': 'Livraria Cultura'..."


In [42]:
# Per-building metrics: number of tag edits, number of users, number of versions, number of tags.

# Number of direct confirmations: different users confirming -> geometry (same nodes), name, highway (road_type), oneway, maxspeed, tunnel, bridge.
# For buildings, we only consider geometry for direct confirmations, as correctly mapping the footprints is the most important aspect,
# and the other tag changes seemed very minor (e.g. a spelling error in the name).
# TODO: discuss this choice with the professor.

def _count_version_changes(nodes_seq, tags_seq, uid_seq):
    """Compare consecutive versions of one building and tally what changed.

    Parameters
    ----------
    nodes_seq : list of list
        Node-id list of each version, in comparison order.
    tags_seq : list of dict
        Tag dict of each version, aligned with ``nodes_seq``.
    uid_seq : list
        User id of each version, aligned with ``nodes_seq``.

    Returns
    -------
    tuple of int
        ``(num_edits, direct_confirmations, num_rollbacks)`` where ``num_edits``
        is the sum of tag deletions, additions and value changes over all
        consecutive version pairs.
    """
    num_edits = 0
    direct_confirmations = 0
    num_rollbacks = 0

    for i in range(1, len(nodes_seq)):
        prev_nodes, cur_nodes = nodes_seq[i - 1], nodes_seq[i]
        prev_tags, cur_tags = tags_seq[i - 1], tags_seq[i]

        # Rollback heuristic: if the geometry or the tags of the current version
        # differ from the previous one, check whether the current version is
        # identical (nodes AND tags) to any older version. A version is counted
        # as at most ONE rollback, even if it matches several older versions.
        # (Author's note from the original run: 65 buildings had rollback history.)
        if cur_nodes != prev_nodes or cur_tags != prev_tags:
            if any(
                cur_nodes == nodes_seq[k] and cur_tags == tags_seq[k]
                for k in range(i - 1)
            ):
                num_rollbacks += 1

        # Direct confirmation: a different user saved the same geometry.
        # We could also take tags into account here (ignoring simple ones such
        # as addr:city, building, ...). First consult with professor.
        if uid_seq[i] != uid_seq[i - 1] and cur_nodes == prev_nodes:
            direct_confirmations += 1

        # Tag edits between the two versions: removed keys, new keys, and
        # keys present in both whose value changed.
        deleted = sum(1 for key in prev_tags if key not in cur_tags)
        added = sum(1 for key in cur_tags if key not in prev_tags)
        changed = sum(
            1 for key in cur_tags
            if key in prev_tags and cur_tags[key] != prev_tags[key]
        )
        num_edits += deleted + added + changed

    return num_edits, direct_confirmations, num_rollbacks


# Per-building result columns; entry j of every list describes the same building.
# NOTE: `id` shadows the Python builtin of the same name; the name is kept so
# that any other cell reading this list keeps working.
id = []
nedits = []
nusers = []
nversions = []
ntags = []
nconfirmations = []
nrollbacks = []

# A single groupby pass replaces re-filtering the whole frame once per building.
# sort=False keeps buildings in order of first appearance in `history`, and rows
# inside each group keep the order they have in `history`.
for item_id, all_versions in history.groupby('id', sort=False):
    num_edits, direct_confirmations, num_rollbacks = _count_version_changes(
        all_versions['nodes'].tolist(),
        all_versions['tags'].tolist(),
        all_versions['uid'].tolist(),
    )

    id.append(item_id)
    nedits.append(num_edits)
    nusers.append(all_versions['uid'].nunique())  # Number of unique users contributing to the building
    nversions.append(all_versions['version'].iloc[-1])  # Version number of the last row of the building
    ntags.append(all_versions['ntags'].iloc[-1])  # Number of tags in the last row of the building
    nconfirmations.append(direct_confirmations)
    nrollbacks.append(num_rollbacks)

In [50]:
# res = filter(lambda x: x > 0, nrollbacks)
# list(res)

id                                                 385461812
nodes      [3888122507, 3888122664, 3888122744, 388812274...
version                                                    1
visible                                                 True
ts                                 2015-12-11 19:38:42+00:00
uid                                                  3149129
chgset                                              35891668
ntags                                                      1
tags                                     {'building': 'yes'}
Name: 35685, dtype: object