# Building OSM Full History objects from History Files

> Test case: Nepal

In [1]:
import osmium as o
import numpy as np
import sys, os, json, pprint, copy
from collections import Counter

Main runtime. Unfortunately, it is single-threaded (can osmium run in parallel in native C?).

In [2]:
class FileHandler(o.SimpleHandler):
    """
        osmium handler that gathers node and way versions while a file is read.

        Results are written to the module-level dicts `nodes`, `ways` and
        `node_locations`, which must exist before the handler is applied.
    """

    def __init__(self):
        o.SimpleHandler.__init__(self)
        # Running totals of callbacks seen; only used for progress output.
        self.w_cnt = 0
        self.n_cnt = 0

    def node(self, n):
        '''
            Only save nodes which have extra attributes...
            TODO: If the first version of a node has 0 tags, but version 2 has tags, then version 1 may not show up in the history... maybe?
        '''
        self.n_cnt += 1
        node_id = n.id

        # A node starts being tracked at its first tagged version; once
        # tracked, every later version is appended regardless of tags.
        if node_id in nodes:
            nodes[node_id].add(n)
        elif len(n.tags) > 0:
            nodes[node_id] = Node(n)

        # Cache the location of every node version (tagged or not), keyed by
        # node id, so way geometries can be rebuilt later.
        node_locations.setdefault(node_id, []).append({
            'v': n.version,
            'c': n.changeset,
            'g': [n.location.lon, n.location.lat]
        })

        if self.n_cnt % 10000 == 0:
            sys.stderr.write("\r{0} nodes processed".format(self.n_cnt))

    def way(self, w):
        self.w_cnt += 1
        way_id = w.id

        # First sighting creates the Way; later versions extend its history.
        if way_id in ways:
            ways[way_id].add(w)
        else:
            ways[way_id] = Way(w)

        if self.w_cnt % 1000 == 0:
            sys.stderr.write("\r{0} ways processed".format(self.w_cnt))

In [3]:
class OSMObject:
    """
        Version history of a single OSM object.

        Subclasses must implement `get_geometry(w)`, returning a GeoJSON-style
        geometry dict with a 'coordinates' entry for one version of the object.

        Attributes:
            id:      id of the object, taken from the first version seen.
            history: list of per-version dicts (see `add`). It is sorted by
                     version only after `process_history()` has run.
    """

    # Keys written by process_history(); cleared before every recomputation
    # so a re-run never leaves stale values behind.
    _DERIVED_KEYS = ('new_tags', 'deleted_tags', 'changed_tags', 'seconds_since_last_edit')

    # Bookkeeping keys that are never exported inside '@object_history'.
    _INTERNAL_KEYS = ('timestamp', 'tags', 'geometry')

    def __init__(self, w):
        self.id = w.id
        self.history = []
        self.add(w)

    def add(self, w):
        """
            Append one version of the object to the history.

            `w` must expose version, uid, user, timestamp (with isoformat())
            and an iterable of tags carrying `.k` / `.v`.
        """
        self.history.append({
            'version': w.version,
            # TODO: w.deleted / w.visible are deliberately not recorded; the
            # original author found they did not behave as expected.
            'uid': w.uid,
            'user': w.user,
            'created_at': w.timestamp.isoformat(),
            'timestamp': w.timestamp,
            'tags': {t.k: t.v for t in w.tags},
            'geometry': self.get_geometry(w),
        })

    def process_history(self):
        """
            Sort the history by version and annotate each version with its
            tag changes relative to the previous version.

            Adds, where applicable: 'new_tags', 'deleted_tags', 'changed_tags'
            (key -> (old, new)) and 'seconds_since_last_edit'. The first
            version reports all of its tags as 'new_tags'.
        """
        if not self.history:
            return

        # Sort BEFORE touching index 0: versions are not guaranteed to have
        # been added in order, and the "first version" must be the lowest one.
        self.history.sort(key=lambda rev: rev['version'])

        for rev in self.history:
            for key in self._DERIVED_KEYS:
                rev.pop(key, None)

        first = self.history[0]
        if first['tags']:
            first['new_tags'] = dict(first['tags'])

        for prev, cur in zip(self.history, self.history[1:]):
            prev_tags, cur_tags = prev['tags'], cur['tags']
            prev_keys, cur_keys = set(prev_tags), set(cur_tags)

            new_keys = cur_keys - prev_keys
            if new_keys:
                cur['new_tags'] = {k: cur_tags[k] for k in sorted(new_keys)}

            del_keys = prev_keys - cur_keys
            if del_keys:
                cur['deleted_tags'] = {k: prev_tags[k] for k in sorted(del_keys)}

            changed_tags = {
                k: (prev_tags[k], cur_tags[k])
                for k in sorted(prev_keys & cur_keys)
                if prev_tags[k] != cur_tags[k]
            }
            if changed_tags:
                cur['changed_tags'] = changed_tags

            cur['seconds_since_last_edit'] = int(
                (cur['timestamp'] - prev['timestamp']).total_seconds())

    def as_geojson(self, geometries=True):
        """
            Build a GeoJSON Feature from the last entry of the history.

            Properties are that entry's tags plus '@id', '@created_at',
            '@uid', '@user' and '@version'. When there is more than one
            version, '@object_history' lists every version without its
            timestamp object, tags and geometry; a version whose coordinates
            differ from the previous one gets a 'geometry_change' entry.

            geometries: if True, 'geometry_change' holds the old/new
                coordinates; otherwise only the lengths of the old/new
                coordinate lists (reported under 'nodes').
                NOTE(review): for Point geometries that length is the number
                of coordinate components, not a node count.
        """
        latest = self.history[-1]

        properties = copy.deepcopy(latest['tags'])
        properties['@id'] = self.id
        properties['@created_at'] = latest['created_at']
        properties['@uid'] = latest['uid']
        properties['@user'] = latest['user']
        properties['@version'] = latest['version']

        geojson = {
            'type': 'Feature',
            'properties': properties,
            'geometry': copy.deepcopy(latest['geometry']),
        }

        if len(self.history) > 1:
            object_history = []
            prev_coords = None
            for idx, rev in enumerate(self.history):
                # Copy only what is exported; the (potentially large)
                # geometries are not duplicated just to be dropped again.
                entry = {
                    key: copy.deepcopy(value)
                    for key, value in rev.items()
                    if key not in self._INTERNAL_KEYS
                }

                coords = rev['geometry']['coordinates']
                if idx > 0 and coords != prev_coords:
                    if geometries:
                        entry['geometry_change'] = {
                            'old': copy.deepcopy(prev_coords),
                            'new': copy.deepcopy(coords),
                        }
                    else:
                        entry['geometry_change'] = {
                            'old': {'nodes': len(prev_coords)},
                            'new': {'nodes': len(coords)},
                        }

                object_history.append(entry)
                prev_coords = coords

            properties['@object_history'] = object_history

        return geojson

    def __str__(self):
        """
            Override str() function so when calling print(), we get back the full information
        """
        lines = ["ID: {0}; revisions: {1}".format(self.id, len(self.history) - 1)]
        if len(self.history) > 1:
            lines.append("-------------------------------------------------------------------------------")
            for rev in self.history:
                lines.append("({0}) - {1} - {2}, Nodes: {3}".format(
                    rev['version'], rev['user'], rev['created_at'],
                    len(rev['geometry']['coordinates'])))
                if 'new_tags' in rev:
                    lines.append("\tNew Tags: {0}".format(rev['new_tags']))
                if 'deleted_tags' in rev:
                    lines.append("\tDeleted Tags: {0}".format(rev['deleted_tags']))
                if 'changed_tags' in rev:
                    lines.append("\tChanged Tags: {0}".format(rev['changed_tags']))
            lines.append("===============================================================================")
        return "\n".join(lines)
            
class Way(OSMObject):
    """
        History of an OSM way whose geometry is rebuilt from the cached node
        locations in the module-level `node_locations` dict.

        Attributes:
            errors: number of node references that had no cached location and
                    were therefore left out of a geometry.
    """

    def __init__(self, w):
        # Must be set before OSMObject.__init__, which calls get_geometry().
        self.errors = 0
        OSMObject.__init__(self, w)

    def get_geometry(self, w):
        """
            Return a GeoJSON LineString for this version of the way.

            For each referenced node, the location chosen is the cached
            version with the highest changeset id that is not greater than
            the way's changeset. If every cached version is newer than the
            way (seen with some editor-produced data), the version with the
            lowest changeset id is used instead.

            Node references without any cached location are skipped and
            counted in `self.errors` instead of aborting the whole import.
        """
        coords = []
        for n in w.nodes:
            versions = node_locations.get(n.ref)
            if not versions:
                self.errors += 1
                continue

            if len(versions) > 1:
                # Sort by changeset id (in place, on the shared cache).
                versions.sort(key=lambda x: x['c'])
                not_newer = [x for x in versions if x['c'] <= w.changeset]
                chosen = not_newer[-1] if not_newer else versions[0]
            else:
                chosen = versions[0]

            coords.append(chosen['g'])

        return {"type": "LineString", "coordinates": coords}
        
class Node(OSMObject):
    """History of an OSM node; each version's geometry is a GeoJSON Point."""

    def __init__(self, n):
        self.errors = 0
        OSMObject.__init__(self, n)

    def get_geometry(self, n):
        """Return the node's location as a Point with [lon, lat] coordinates."""
        location = n.location
        point = [location.lon, location.lat]
        return {"type": "Point", "coordinates": point}

Run the handler to create the objects

## Now process the objects

In [None]:
# Global stores filled in by FileHandler while the file is read;
# Way.get_geometry looks node positions up in node_locations.
node_locations = {}
nodes = {}
ways = {}

h = FileHandler()
h.apply_file('/data/osm/nepal.osh.pbf', locations=True)

8160000 nodes processed

In [None]:
# Sort each way's versions and compute its per-version tag changes.
for idx, (w_id, way) in enumerate(ways.items()):
    way.process_history()
    if not idx % 1000:
        sys.stderr.write("\r{0} ways processed".format(idx))

In [None]:
# Sort each tracked node's versions and compute its per-version tag changes.
for idx, (n_id, node) in enumerate(nodes.items()):
    node.process_history()
    if not idx % 1000:
        sys.stderr.write("\r{0} nodes processed".format(idx))

In [None]:
# Choose a sample of the data to print.
# A seeded generator keeps the sample identical across re-runs, and
# replace=False guarantees the same object is never picked twice. The size is
# capped so that small extracts (fewer than SAMPLE_SIZE objects) still work.
SAMPLE_SIZE = 10
SAMPLE_SEED = 42
sample_rng = np.random.RandomState(SAMPLE_SEED)

sample_ways = sample_rng.choice(list(ways.keys()), min(SAMPLE_SIZE, len(ways)), replace=False)
sample_nodes = sample_rng.choice(list(nodes.keys()), min(SAMPLE_SIZE, len(nodes)), replace=False)

In [None]:
# Sampled ways: text summary followed by the full GeoJSON feature.
for way in (ways[w_id] for w_id in sample_ways):
    print(way)
    pprint.pprint(way.as_geojson())

# Sampled nodes: text summary, then the feature without and with geometries.
for node in (nodes[n_id] for n_id in sample_nodes):
    print(node)
    for with_geometries in (False, True):
        pprint.pprint(node.as_geojson(geometries=with_geometries))

In [None]:
# Write one GeoJSON feature per line: all ways first, then all tracked nodes.
with open('nepal_history_no_geometry.geojsonl','w') as outFile:
    outFile.writelines(
        json.dumps(way.as_geojson(geometries=False)) + "\n"
        for way in ways.values()
    )
    outFile.writelines(
        json.dumps(node.as_geojson(geometries=False)) + "\n"
        for node in nodes.values()
    )

Tippecanoe Instructions

    $ tippecanoe -o nepal_history_no_geometry.mbtiles -Pf -ps -pt -pf -pk -Z12 -z12 -d14 -l osm -n osm nepal_history_no_geometry.geojsonl

The geometries are still _not quite right_... but they're close?