In [55]:
import osmium
import geopandas as gpd

import pandas as pd
import numpy as np

from collections import defaultdict

In [81]:
class RelationHandler(osmium.SimpleHandler):

    def __init__(self):
        osmium.SimpleHandler.__init__(self)
        self.history_relation = []
        self.count_bid = 1
        self.to_bid = defaultdict(lambda: 0)
        
    def relation(self, r):
        tags = dict(r.tags)
        ways_list = []

        if not self.to_bid[r.id]:
            self.to_bid[r.id] = self.count_bid
            self.count_bid +=1

        for member in r.members:
            ways_list.append(member.ref)
            if not self.to_bid[member.ref]:
                self.to_bid[member.ref] = self.to_bid[r.id]

        self.history_relation.append([
                                r.id,
                                self.to_bid[r.id],
                                ways_list,
                                r.version,
                                r.visible,
                                pd.Timestamp(r.timestamp),
                                r.uid,
                                r.changeset,
                                len(r.tags),
                                tags,
                                'R'])

class WayHandler(osmium.SimpleHandler):

    def __init__(self, to_bid):
        osmium.SimpleHandler.__init__(self)
        self.history_way = []
        self.to_bid = to_bid
        max_key = max(self.to_bid, key=self.to_bid.get)
        self.count_bid = self.to_bid[max_key]
        
    def way(self, w):
        tags = dict(w.tags)
        nodes = []

        for i in list(w.nodes):
            nodes.append(i.ref)

        if not self.to_bid[w.id]:
            self.to_bid[w.id] = self.count_bid
            self.count_bid +=1
        
        self.history_way.append([
                                w.id,
                                self.to_bid[w.id],
                                nodes,
                                w.version,
                                w.visible,
                                pd.Timestamp(w.timestamp),
                                w.uid,
                                w.changeset,
                                len(w.tags),
                                tags,
                                'W'])

In [82]:
h_r = RelationHandler()
h_r.apply_file("data/osm/rec_historical.osm.pbf")
to_bid = h_r.to_bid

h_w = WayHandler(to_bid)
h_w.apply_file("data/osm/rec_historical.osm.pbf")

In [84]:
colnames = ['id', 'bid', 'nodes/ways', 'version', 'visible', 'ts', 'uid', 'chgset', 'ntags', 'tags', 'type']
history = pd.DataFrame(h_r.history_relation + h_w.history_way, columns=colnames)
history = history.sort_values(by=['id', 'ts'])

In [88]:
# Extract all ways: building or non-building
history

Unnamed: 0,id,bid,nodes/ways,version,visible,ts,uid,chgset,ntags,tags,type
0,53556,1,"[24602302, 24602307, 24602366, 24602394, 24794...",1,True,2008-11-24 00:26:00+00:00,31385,788874,3,"{'name': 'Rodovia Governador Mário Covas', 're...",R
1,53556,1,"[5082900, 14628111, 14633684, 14695039, 146950...",2,True,2008-11-24 00:41:14+00:00,31385,788874,3,"{'name': 'Rodovia Governador Mário Covas', 're...",R
2,53556,1,"[5082900, 14628111, 14633684, 14695039, 146950...",3,True,2008-11-27 00:02:40+00:00,31385,801314,3,"{'name': 'Rodovia Governador Mário Covas', 're...",R
3,53556,1,"[5082900, 14628111, 14633684, 14695039, 146950...",4,True,2008-11-30 21:15:42+00:00,31385,815109,4,"{'name': 'Rodovia Governador Mário Covas', 're...",R
4,53556,1,"[5082900, 14628111, 14633684, 14695039, 146950...",5,True,2008-12-03 23:32:37+00:00,31385,97585,4,"{'name': 'Rodovia Governador Mário Covas', 're...",R
...,...,...,...,...,...,...,...,...,...,...,...
436630,1074467256,207229,"[9855394773, 4331459750, 9855394774, 985539477...",2,True,2022-06-29 10:00:20+00:00,10529631,122994474,1,{'highway': 'service'},W
436631,1074467257,207230,"[9855394780, 4331459750, 9855394779]",1,True,2022-06-29 09:49:35+00:00,10529631,122993956,1,{'highway': 'service'},W
436632,1074467257,207230,"[9855394780, 9855394779]",2,True,2022-06-29 10:00:20+00:00,10529631,122994474,1,{'highway': 'service'},W
436633,1075801297,3333,"[8958474330, 5268439509]",1,True,2022-07-04 15:23:03+00:00,10460642,123197465,0,{},W


In [94]:
# number of tag edits, number of users, number of versions, number of tags

def building_qualifer(tags):
   return ( ('building' in tags) or ('building:part' in tags) or (tags.get('type') == 'building') ) and ( (tags.get('location') != 'underground') or ('bridge' not in tags) )

def extract_direct_indicators(df):
   direct_indicators = []

   mp = defaultdict(int)

   for item in df.to_numpy():
      item_id = item[0]
      bid = item[1]
      tags = item[9]
      item_type = item[10]
      qualifies = building_qualifer(tags)

      if(not mp[item_id] and qualifies):
         mp[item_id] = 1

         all_versions = df[df['id']==item_id]
         n = len(all_versions)
         
         num_rollbacks = 0
         num_edits = 0
         num_users = all_versions['uid'].nunique() # Number of unique users contributing to per building
         num_versions = all_versions.iloc[n-1]['version'] # Number of versions of the building
         num_tags = all_versions.iloc[n-1]['ntags'] # Number of tags in the latest version of the building
         visibility = "B" # visibility: D: deleted / NB: not building / B: building

         deletion = 0
         addition = 0
         change = 0
         direct_confirmations = 0

         for i in range(1, n):
            
            prev = all_versions.iloc[i-1] 
            cur = all_versions.iloc[i]

            prev_tags = prev['tags']
            cur_tags = cur['tags']

            prev_n_w = prev['nodes/ways']
            cur_n_w = cur['nodes/ways']

            prev_uid = prev['uid']
            cur_uid = cur['uid']

            # Rollback: 
            # If the geometry or the tags of cur does not match with the prev, check if it matches with any other previous versions. If so, count it as a rollback.
            if(cur_n_w != prev_n_w or cur_tags != prev_tags):
               for k in range(i-1):
                  if(cur_n_w == all_versions.iloc[k]['nodes/ways'] and cur_tags == all_versions.iloc[k]['tags']):
                     num_rollbacks += 1

            # Direct Confirmations:         
            if(cur_uid != prev_uid and cur_n_w == prev_n_w): # If different users mapped the same geometry for a building, we would count it as direct confirmation
               direct_confirmations += 1

            for key in prev_tags: # Number of deleted tags in the newer version
               if key not in cur_tags:
                  deletion += 1

            for key in cur_tags: # Number of newly added tags in the newer version
               if key not in prev_tags:
                  addition += 1

            for key in cur_tags: # Number of tags that were changed in the newer version
               if key in prev_tags:
                  if(cur_tags[key] != prev_tags[key]):
                     change += 1

            if(i == n-1):
               num_nw = len(cur['nodes/ways'])
               if(num_tags == 0 and num_nw == 0): visibility = "D" # Deleted: if on the latest version -> no tags and no nodes
               elif(not building_qualifer(cur_tags)): visibility = "NB" # Not Building: else if on the latest version -> no tags that qualify for a building

         # Edits
         num_edits = deletion + addition + change

         direct_indicators.append([item_id,
                                 bid,
                                 num_versions,
                                 num_users,
                                 num_edits,
                                 num_tags,
                                 direct_confirmations,
                                 num_rollbacks,
                                 visibility,
                                 item_type])


   colnames = ['id', 'bid', 'nversions', 'nusers', 'nedits', 'ntags', 'dir_confirmations', 'nrollbacks', 'visibility', 'type']
   dir_ind = pd.DataFrame(direct_indicators, columns=colnames)
   
   return dir_ind

In [95]:
dir_ind = extract_direct_indicators(history)
dir_ind

Unnamed: 0,id,bid,nversions,nusers,nedits,ntags,dir_confirmations,nrollbacks,visibility,type
0,2402565,24,6,5,10,2,4,1,B,R
1,2514001,25,3,3,10,12,2,0,B,R
2,2924667,38,8,7,13,7,6,1,NB,R
3,2959361,39,2,2,1,2,1,0,B,R
4,2959362,40,2,2,1,2,1,0,B,R
...,...,...,...,...,...,...,...,...,...,...
145602,1064791239,207185,1,1,0,3,0,0,B,W
145603,1064791240,207186,1,1,0,1,0,0,B,W
145604,1070775598,207196,1,1,0,11,0,0,B,W
145605,1071164414,207197,1,1,0,6,0,0,B,W
