In [2]:
import osmium
import geopandas as gpd

import pandas as pd
import numpy as np

from collections import defaultdict

In [49]:
class RelationHandler(osmium.SimpleHandler):

    def __init__(self):
        osmium.SimpleHandler.__init__(self)
        self.history_relation = []
        self.count_bid = 0
        self.area_to_bid = {}
        self.relation_to_bid = {}
        
    def relation(self, r):
        tags = r.tags
        ways = []

        if r.id not in self.relation_to_bid:
            self.relation_to_bid[r.id] = self.count_bid
            self.count_bid +=1

        for member in r.members:
            ways.append[member.ref]
            if member.ref not in self.area_to_bid:
                self.area_to_bid[member.ref] = self.relation_to_bid[r.id]

        self.history_relation.append([
                                r.id,
                                ways,
                                r.version,
                                r.visible,
                                pd.Timestamp(r.timestamp),
                                r.uid,
                                r.changeset,
                                len(r.tags),
                                tags])

class WayHandler(osmium.SimpleHandler):

    def __init__(self):
        osmium.SimpleHandler.__init__(self)
        self.history_way = []
        self.building_id = []
        
    def way(self, w):
        tags = dict(w.tags)
        nodes = []

        for i in list(w.nodes):
            nodes.append(i.ref)
        
        self.history_way.append([
                                w.id,
                                nodes,
                                w.version,
                                w.visible,
                                pd.Timestamp(w.timestamp),
                                w.uid,
                                w.changeset,
                                len(w.tags),
                                tags])
# Do the same for relation!!
# Then add geometry to all the columns with latest osm data

In [50]:
h_r = RelationHandler()
h_r.apply_file("data/osm/rec_historical.osm.pbf")
dict_ways = h_r.dict_ways # We use this later for building qualifier

2402565
2402565
2402565
2402565
2514001
2514001
2514001
2924667
2959361
2959362
2959364
3524853
3524853
3524853
5329288
5329288
5329288
5467511
5467511
5467511
5467511
5469203
5469203
5523952
5523964
5524000
5524000
5524000
5524001
5524001
5524002
5535457
5535457
5535457
5535457
5535458
5535458
5535458
5536822
5536822
5536823
5536823
5536823
5536824
5536825
5536826
5536827
5536827
5536827
5538564
5540680
5543656
5543656
5543656
5543656
5543656
5543656
5543680
5543680
5543681
5543765
5543766
5547909
5547910
5547911
5547911
5547912
5547913
5547971
5547971
5547971
5553772
5553772
5553772
5553773
5560076
5560077
5560077
5560271
5560271
5560272
5560507
5560507
5560508
5560509
5560601
5560602
5562423
5562423
5562424
5562425
5562425
5562425
5562498
5562498
5562498
5562499
5562499
5562500
5562500
5562501
5562501
5562501
5595498
5596627
5596627
5640916
5640917
5640918
5640919
5640920
5640921
5640922
5650009
5650009
5669798
5669798
5669798
5669799
5669800
5669801
5669801
5669914
5669916
5670151


In [45]:
h_w = WayHandler()
h_w.apply_file("data/osm/rec_historical.osm.pbf")

colnames = ['id', 'nodes', 'version', 'visible', 'ts', 'uid', 'chgset', 'ntags', 'tags']
history = pd.DataFrame(h_w.history_way, columns=colnames)
history = history.sort_values(by=['id', 'ts'])

In [46]:
# Extract all ways: building or non-building
history

Unnamed: 0,id,nodes/ways,version,visible,ts,uid,chgset,ntags,tags,type
0,11282816,"[100185698, 100186729, 100186546, 100185698]",1,True,2007-11-06 18:06:22+00:00,6270,422103,4,"{'converted_by': 'mikes', 'created_by': 'coast...",W
1,11282816,"[100185698, 100186729, 100186546, 100185698]",2,True,2008-01-11 17:33:16+00:00,7396,603937,4,"{'converted_by': 'mikes', 'created_by': 'coast...",W
2,11282816,[],3,False,2008-12-21 22:21:18+00:00,1596,547324,0,{},W
3,11282820,"[100190254, 100191473, 100192532, 100194876, 1...",1,True,2007-11-06 18:06:22+00:00,6270,422103,4,"{'converted_by': 'mikes', 'created_by': 'coast...",W
4,11282820,[],2,False,2008-12-21 23:13:10+00:00,1596,547324,0,{},W
...,...,...,...,...,...,...,...,...,...,...
421463,1074467256,"[9855394773, 4331459750, 9855394774, 985539477...",2,True,2022-06-29 10:00:20+00:00,10529631,122994474,1,{'highway': 'service'},W
421464,1074467257,"[9855394780, 4331459750, 9855394779]",1,True,2022-06-29 09:49:35+00:00,10529631,122993956,1,{'highway': 'service'},W
421465,1074467257,"[9855394780, 9855394779]",2,True,2022-06-29 10:00:20+00:00,10529631,122994474,1,{'highway': 'service'},W
421466,1075801297,"[8958474330, 5268439509]",1,True,2022-07-04 15:23:03+00:00,10460642,123197465,0,{},W


In [47]:
# number of tag edits, number of users, number of versions, number of tags

# number of direct confirmations: different users confirming -> geometry (same nodes), name, highway (road_type), oneway, maxspeed, tunnel, bridge
# for buildings, we are only considering geometry for direct confirmations, as correctly mapping the footprints is the most imporant aspect
# And the other changes in tags seemed very minor. Spelling error in name etc.
# but need to discuss with professor

def building_qualifer(tags, id):
   qualifier = ( ('building' in tags) or ('building:part' in tags) or (tags.get('type') == 'building') ) and ( (tags.get('location') != 'underground') or ('bridge' not in tags) )
   part_of_relation = (dict_ways[id] == 1)
   if(part_of_relation): print(id, part_of_relation)
   return qualifier or part_of_relation

def extract_direct_indicators(df):
   direct_indicators = []

   mp = defaultdict(int)

   for item in df.to_numpy():
      item_id = item[0]
      tags = item[8]
      qualifies = building_qualifer(tags, item_id)

      if(not mp[item_id] and qualifies):
         mp[item_id] = 1

         all_versions = df[df['id']==item_id]
         n = len(all_versions)
         
         num_rollbacks = 0
         num_edits = 0
         num_users = all_versions['uid'].nunique() # Number of unique users contributing to per building
         num_versions = all_versions.iloc[n-1]['version'] # Number of versions of the building
         num_tags = all_versions.iloc[n-1]['ntags'] # Number of tags in the latest version of the building
         visibility = "B" # visibility: deleted / not building / building

         deletion = 0
         addition = 0
         change = 0
         direct_confirmations = 0

         for i in range(1, n):
            
            prev = all_versions.iloc[i-1] 
            cur = all_versions.iloc[i]

            prev_tags = prev['tags']
            cur_tags = cur['tags']

            prev_nodes = prev['nodes']
            cur_nodes = cur['nodes']

            prev_uid = prev['uid']
            cur_uid = cur['uid']

            # Rollback: 
            # If the geometry or the tags of cur does not match with the prev, check if it matches with any other previous versions. If so, count it as a rollback.
            # In total, 65 buildings had rollback history
            if(cur_nodes != prev_nodes or cur_tags != prev_tags):
               for k in range(i-1):
                  if(cur_nodes == all_versions.iloc[k]['nodes'] and cur_tags == all_versions.iloc[k]['tags']):
                     num_rollbacks += 1

            # Direct Confirmations:         
            # We could also take into account tags for direct confirmation. But ignoring simple tags such as:
            # addr:city, building, ... etc. First consult with professor.
            if(cur_uid != prev_uid and cur_nodes == prev_nodes): # If different users mapped the same geometry for a building, we would count it as direct confirmation
               direct_confirmations += 1

            for key in prev_tags: # Number of deleted tags in the newer version
               if key not in cur_tags:
                  deletion += 1

            for key in cur_tags: # Number of newly added tags in the newer version
               if key not in prev_tags:
                  addition += 1

            for key in cur_tags: # Number of tags that were changed in the newer version
               if key in prev_tags:
                  if(cur_tags[key] != prev_tags[key]):
                     change += 1

            if(i == n-1):
               num_nodes = len(cur['nodes'])
               if(num_tags == 0 and num_nodes == 0): visibility = "D" # Deleted: if on the latest version -> no tags and no nodes
               elif(not building_qualifer(cur_tags)): visibility = "NB" # Not Building: else if on the latest version -> no tags that qualify for a building

         # Edits
         num_edits = deletion + addition + change

         direct_indicators.append([item_id,
                                 num_versions,
                                 num_users,
                                 num_edits,
                                 num_tags,
                                 direct_confirmations,
                                 num_rollbacks,
                                 visibility])


   colnames = ['id', 'nversions', 'nusers', 'nedits', 'ntags', 'dir_confirmations', 'nrollbacks', 'visibility']
   dir_ind = pd.DataFrame(direct_indicators, columns=colnames)
   
   return dir_ind

In [48]:
dir_ind = extract_direct_indicators(history)
dir_ind

KeyError: 'nodes'