# Building OSM Full History objects from History Files

Test case: **Boulder, CO**

In [16]:
import sys, os, json, pprint
import osmium as o; import numpy as np

### Classes & Functions

In [2]:
class FileHandler(o.SimpleHandler):
    """
        osmium handler that collects object versions into the module-level
        `nodes` and `ways` dictionaries (keyed by object id).

        n_cnt / w_cnt count every node / way version passed to the handler.
    """

    def __init__(self):
        super().__init__()
        self.w_cnt = 0
        self.n_cnt = 0

    def node(self, n):
        """
            Record one node version. A node id that is not tracked yet is only
            started when this version carries at least one tag; versions of an
            id that is already tracked are always appended.
        """
        self.n_cnt += 1
        tracked = nodes.get(n.id)
        if tracked is not None:
            tracked.add(n)
        elif len(n.tags) > 0:
            nodes[n.id] = Node(n)

    def way(self, w):
        """
            Record one way version (every way is tracked) and write a progress
            line to stderr after each 1000 way versions.
        """
        self.w_cnt += 1
        tracked = ways.get(w.id)
        if tracked is None:
            ways[w.id] = Way(w)
        else:
            tracked.add(w)
        if self.w_cnt % 1000 == 0:
            # "\r" rewinds to the start of the line so the counter updates in place.
            sys.stderr.write("\r{0} ways processed".format(self.w_cnt))

In [5]:
class OSMObject:
    """
        Full version history of a single OSM object.

        Each entry of `self.history` is a dict describing one version
        (version, uid, user, created_at, timestamp, tags, geometry).
        Subclasses must implement `get_geometry(obj)`, which `add` calls for
        every version to obtain a GeoJSON geometry dict.
    """

    # Keys written by process_history(); cleared before each recomputation.
    _DERIVED_KEYS = ('new_tags', 'deleted_tags', 'changed_tags', 'seconds_since_last_edit')

    def __init__(self, w):
        """
            Start a history from `w`, the first version seen of this object.
        """
        self.id = w.id
        self.history = []
        self.add(w)

    def add(self, w):
        """
            Append one version of the object to the history.

            `w` must expose version, uid, user, timestamp (a datetime) and an
            iterable `tags` whose items have `.k` and `.v`.
        """
        self.history.append(
            {
                'version': w.version,
                # 'deleted':w.deleted, # These are not doing their part
                # 'visible':w.visible, #TODO: Why don't these work properly?

                'uid': w.uid,
                'user': w.user,
                'created_at': w.timestamp.isoformat(),
                # Kept as a datetime for the time-delta arithmetic in
                # process_history(); left out of the GeoJSON output.
                'timestamp': w.timestamp,

                'tags': {t.k: t.v for t in w.tags},
                'geometry': self.get_geometry(w)
            })

    def process_history(self):
        """
            Annotate every version with the tag changes relative to the
            previous version.

            Sorts the history by version, then sets (only when non-empty):
              new_tags      - tags present now but not in the previous version
                              (for the first version: all of its tags)
              deleted_tags  - tags of the previous version that are gone now
              changed_tags  - {key: (old_value, new_value)}
            and, for every version after the first,
              seconds_since_last_edit - whole seconds since the previous version.

            Safe to call repeatedly; earlier annotations are discarded first.
        """
        if not self.history:
            return

        # Sort BEFORE looking at the "first" version, otherwise an out-of-order
        # history would mark the wrong entry as the initial version.
        self.history.sort(key=lambda x: x['version'])

        # Drop results of an earlier run so stale annotations cannot survive.
        for entry in self.history:
            for key in self._DERIVED_KEYS:
                entry.pop(key, None)

        first = self.history[0]
        if first['tags']:
            first['new_tags'] = dict(first['tags'])

        for prev, curr in zip(self.history, self.history[1:]):
            prev_tags = prev['tags']
            curr_tags = curr['tags']
            prev_keys = set(prev_tags)
            these_keys = set(curr_tags)

            new_keys = these_keys - prev_keys
            if new_keys:
                curr['new_tags'] = {k: curr_tags[k] for k in new_keys}

            del_keys = prev_keys - these_keys
            if del_keys:
                curr['deleted_tags'] = {k: prev_tags[k] for k in del_keys}

            changed_tags = {
                k: (prev_tags[k], curr_tags[k])
                for k in these_keys & prev_keys
                if curr_tags[k] != prev_tags[k]
            }
            if changed_tags:
                curr['changed_tags'] = changed_tags

            curr['seconds_since_last_edit'] = int((curr['timestamp'] - prev['timestamp']).total_seconds())

    def as_geojson(self):
        """
            Return a GeoJSON Feature dict for the last entry of the history.

            Properties are that version's tags plus @id, @created_at, @uid,
            @user and @version. When there is more than one version, the whole
            history is attached as @object_history.

            The last history entry is treated as the latest version, so call
            process_history() first if versions may have been added out of order.
        """
        latest = self.history[-1]

        properties = latest['tags'].copy()
        properties['@id'] = self.id
        properties['@created_at'] = latest['created_at']
        properties['@uid'] = latest['uid']
        properties['@user'] = latest['user']
        properties['@version'] = latest['version']

        if len(self.history) > 1:
            # Export copies without the datetime 'timestamp' (not JSON
            # serialisable) instead of deleting it from self.history, so the
            # object can still be processed after it has been exported.
            properties['@object_history'] = [
                {k: v for k, v in entry.items() if k != 'timestamp'}
                for entry in self.history
            ]

        return {
            'type': 'Feature',
            'properties': properties,
            'geometry': latest['geometry'],
        }

    def __str__(self):
        """
            Override str() function so when calling print(), we get back the full information
        """
        parts = ["ID: {0}; revisions: {1}".format(self.id, len(self.history)-1)]
        if len(self.history) > 1:
            parts.append("\n-------------------------------------------------------------------------------")
            for entry in self.history:
                # NOTE(review): for Point geometries 'coordinates' is [lon, lat],
                # so the "Nodes" figure is always 2 there - confirm intent.
                parts.append("\n({0}) - {1} - {2}, Nodes: {3}".format(entry['version'], entry['user'], entry['created_at'], len(entry['geometry']['coordinates'])))
                if 'new_tags' in entry:
                    parts.append("\n\tNew Tags: {0}".format(entry['new_tags']))
                if 'deleted_tags' in entry:
                    parts.append("\n\tDeleted Tags: {0}".format(entry['deleted_tags']))
                if 'changed_tags' in entry:
                    parts.append("\n\tChanged Tags: {0}".format(entry['changed_tags']))
            parts.append("\n===============================================================================")
        return "".join(parts)
            
class Way(OSMObject):
    """
        History of an OSM way; every version's geometry is a GeoJSON LineString.
    """

    def __init__(self, w):
        # Initialised ahead of the base constructor, which already calls
        # get_geometry() through add().
        self.errors = 0
        super().__init__(w)

    def get_geometry(self, w):
        """
            Build a LineString from the locations attached to the way's node refs.

            Currently does not work properly for historical geometries because pyosmium library does not yet support
            multimap index. See https://github.com/Project-EPIC/epic-osm/blob/master/modules/domain_objects/osm_to_mongo.rb#L53 
            for example of how this can be implemented.
        """
        locations = (ref.location for ref in w.nodes)
        return {
            "type": "LineString",
            "coordinates": [[loc.lon, loc.lat] for loc in locations],
        }
        # Earlier attempt based on the osmium geometry factory, kept for reference:
#         try:
#             geom = o.geom.WKBFactory.create_linestring(w.nodes)
#         except Exception as e:
#             self.errors += 1
#             print(context(e))
#             sys.exit(1)
        
class Node(OSMObject):
    """
        History of an OSM node; every version's geometry is a GeoJSON Point.
    """

    def __init__(self, n):
        # Initialised ahead of the base constructor, mirroring Way.
        self.errors = 0
        super().__init__(n)

    def get_geometry(self, n):
        """
            Return the node's location as a GeoJSON Point ([lon, lat] order).
        """
        position = n.location
        return {
            "type": "Point",
            "coordinates": [position.lon, position.lat],
        }

# 1.  Load the file into memory as Nodes & Ways
Unfortunately this is all single-threaded, but the performance is not _as bad_ as I thought it would be.

In [6]:
# Module-level stores filled by FileHandler while reading: object id -> Node / Way.
nodes = {}
ways = {}
h = FileHandler()
# locations=True so that way node refs carry the locations read in Way.get_geometry().
h.apply_file('/data/osm/boulder.osh.pbf', locations=True)

32000 ways processed

### (Optional) Process a sample of the data first?

In [11]:
# Choose sample of data and process it!
# Draw two random way ids and two random node ids, then process and print each one.
sample_ways = np.random.choice(list(ways), 2)
sample_nodes = np.random.choice(list(nodes), 2)

for w_id in sample_ways:
    way = ways[w_id]
    way.process_history()
    print(way)

for n_id in sample_nodes:
    node = nodes[n_id]
    node.process_history()
    print(node)

ID: 438929638; revisions: 0
ID: 117092123; revisions: 0
ID: 176492340; revisions: 1
-------------------------------------------------------------------------------
(1) - DaveHansenTiger - 2007-12-18T07:45:43+00:00, Nodes: 2
	New Tags: {'source': 'tiger_import_dch_v0.6_20070809', 'tiger:county': 'Boulder, CO', 'tiger:upload_uuid': 'bulk_upload.pl-1b7afdac-0ecb-47b2-8901-e595d422ff1f', 'tiger:tlid': '188160245:188160246:188204332:188160253:188160212:188160215:188160216'}
(2) - woodpeck_fixbot - 2009-11-10T11:11:49+00:00, Nodes: 2
	Deleted Tags: {'source': 'tiger_import_dch_v0.6_20070809', 'tiger:county': 'Boulder, CO', 'tiger:upload_uuid': 'bulk_upload.pl-1b7afdac-0ecb-47b2-8901-e595d422ff1f', 'tiger:tlid': '188160245:188160246:188204332:188160253:188160212:188160215:188160216'}
ID: 176546835; revisions: 1
-------------------------------------------------------------------------------
(1) - DaveHansenTiger - 2007-12-18T08:25:11+00:00, Nodes: 2
	New Tags: {'source': 'tiger_import_dch_v0.6

# 2. Process histories for all of the data

In [12]:
# Annotate the tag changes of every collected way, then of every collected node.
for way in ways.values():
    way.process_history()

for node in nodes.values():
    node.process_history()

# 3. Write out GeoJSON for tippecanoe

In [19]:
# One GeoJSON Feature per line: all ways first, then all nodes.
with open('boulder_history_proof_of_concept.geojsonl', 'w') as oFile:
    for collection in (ways, nodes):
        for osm_object in collection.values():
            feature = osm_object.as_geojson()
            oFile.write(json.dumps(feature) + "\n")