In [2]:
import osmium
import re
import datetime

import geopandas as gpd
import pandas as pd
import numpy as np

from collections import defaultdict

In [3]:
class HistoryHandler(osmium.SimpleHandler):

    def __init__(self):
        osmium.SimpleHandler.__init__(self)
        self.history_data = []
        
    def way(self, w):
        tags = dict(w.tags)

        
        
        nodes = []
        for i in list(w.nodes):
            nodes.append(i.ref)
        
        self.history_data.append([
                                  w.id,
                                  nodes,
                                  w.version,
                                  w.visible,
                                  pd.Timestamp(w.timestamp),
                                  w.uid,
                                  w.changeset,
                                  len(w.tags),
                                  tags])

In [4]:
# Extract all ways: building or non-building

h = HistoryHandler()
h.apply_file("data/osm/rec_historical.osm.pbf")
colnames = ['id', 'nodes', 'version', 'visible', 'ts', 'uid', 'chgset', 'ntags', 'tags']
history = pd.DataFrame(h.history_data, columns=colnames)
history = history.sort_values(by=['id', 'ts'])

In [11]:
# number of tag edits, number of users, number of versions, number of tags

# number of direct confirmations: different users confirming -> geometry (same nodes), name, highway (road_type), oneway, maxspeed, tunnel, bridge
# for buildings, we are only considering geometry for direct confirmations, as correctly mapping the footprints is the most imporant aspect
# And the other changes in tags seemed very minor. Spelling error in name etc.
# but need to discuss with professor

def building_qualifer(tags):
   return ( ('building' in tags) or ('building:part' in tags) or (tags.get('type') == 'building') ) and ( (tags.get('location') != 'underground') or ('bridge' not in tags) )

def extract_direct_indicators(df):
   direct_indicators = []

   mp = defaultdict(int)

   for item in df.to_numpy():
      item_id = item[0]
      tags = item[8]
      qualifies = building_qualifer(tags)

      if(not mp[item_id] and qualifies):
         mp[item_id] = 1

         all_versions = df[df['id']==item_id]
         n = len(all_versions)
         
         num_rollbacks = 0
         num_edits = 0
         num_users = all_versions['uid'].nunique() # Number of unique users contributing to per building
         num_versions = all_versions.iloc[n-1]['version'] # Number of versions of the building
         num_tags = all_versions.iloc[n-1]['ntags'] # Number of tags in the latest version of the building
         visibility = "B" # visibility: deleted / not building / building

         deletion = 0
         addition = 0
         change = 0
         direct_confirmations = 0

         for i in range(1, n):
            
            prev = all_versions.iloc[i-1] 
            cur = all_versions.iloc[i]

            prev_tags = prev['tags']
            cur_tags = cur['tags']

            prev_nodes = prev['nodes']
            cur_nodes = cur['nodes']

            prev_uid = prev['uid']
            cur_uid = cur['uid']

            # Rollback: 
            # If the geometry or the tags of cur does not match with the prev, check if it matches with any other previous versions. If so, count it as a rollback.
            # In total, 65 buildings had rollback history
            if(cur_nodes != prev_nodes or cur_tags != prev_tags):
               for k in range(i-1):
                  if(cur_nodes == all_versions.iloc[k]['nodes'] and cur_tags == all_versions.iloc[k]['tags']):
                     num_rollbacks += 1

            # Direct Confirmations:         
            # We could also take into account tags for direct confirmation. But ignoring simple tags such as:
            # addr:city, building, ... etc. First consult with professor.
            if(cur_uid != prev_uid and cur_nodes == prev_nodes): # If different users mapped the same geometry for a building, we would count it as direct confirmation
               direct_confirmations += 1

            for key in prev_tags: # Number of deleted tags in the newer version
               if key not in cur_tags:
                  deletion += 1

            for key in cur_tags: # Number of newly added tags in the newer version
               if key not in prev_tags:
                  addition += 1

            for key in cur_tags: # Number of tags that were changed in the newer version
               if key in prev_tags:
                  if(cur_tags[key] != prev_tags[key]):
                     change += 1

            if(i == n-1):
               num_nodes = len(cur['nodes'])
               if(num_tags == 0 and num_nodes == 0): visibility = "D" # Deleted: if on the latest version -> no tags and no nodes
               elif(not building_qualifer(cur_tags)): visibility = "NB" # Not Building: else if on the latest version -> no tags that qualify for a building

         # Edits
         num_edits = deletion + addition + change

         direct_indicators.append([item_id,
                                 num_versions,
                                 num_users,
                                 num_edits,
                                 num_tags,
                                 direct_confirmations,
                                 num_rollbacks,
                                 visibility])


   colnames = ['id', 'nversions', 'nusers', 'nedits', 'ntags', 'dir_confirmations', 'nrollbacks', 'visibility']
   dir_ind = pd.DataFrame(direct_indicators, columns=colnames)
   
   return dir_ind

In [12]:
dir_ind = extract_direct_indicators(history)
dir_ind

Unnamed: 0,id,nversions,nusers,nedits,ntags,dir_confirmations,nrollbacks,visibility
0,29163144,38,14,19,1,4,2,NB
1,44118744,4,3,5,0,0,0,D
2,51816211,5,4,4,3,3,0,B
3,51816216,7,4,6,4,1,0,B
4,51816217,8,6,12,12,4,0,B
...,...,...,...,...,...,...,...,...
145156,1064791239,1,1,0,3,0,0,B
145157,1064791240,1,1,0,1,0,0,B
145158,1070775598,1,1,0,11,0,0,B
145159,1071164414,1,1,0,6,0,0,B


### Read the latest OSM File along with building tags for recife!!

In [42]:
# Read all possible tags of building
file = open('data/all-tags.txt', 'r')
Lines = file.readlines()
col_names = [line[:-1] for line in Lines]

In [43]:
types_dict = {'id': int, 'height': float, 'min_height': float}
types_dict.update({col: str for col in col_names if col not in types_dict})
latest_osm = pd.read_csv('data/all-building-features.csv', dtype = types_dict, index_col = 0)

In [46]:
del_or_edited = []

hist_id = dir_ind['id'].to_list()
latest_id = latest_osm['id'].to_list()
for i in hist_id:
    if i not in latest_id:
        del_or_edited.append(i)