In [1]:
import osmium as osm
import pandas as pd
import numpy as np

In [2]:
class TimelineHandler(osm.SimpleHandler):
    def __init__(self):
        osm.SimpleHandler.__init__(self)
        self.elements = []

    def add_elements(self, e, elem_type):
        self.elements.append([elem_type,
                              e.id,
                              e.version,
                              e.visible,
                              pd.Timestamp(e.timestamp),
                              e.uid,
                              e.changeset])

    def node(self, n):
        self.add_elements(n, 'node')

    def way(self, w):
        self.add_elements(w, 'way')

    def relation(self, r):
        self.add_elements(r, 'relation')

In [3]:
tlhandler = TimelineHandler()
tlhandler.apply_file("data/ottgat.osh.pbf")
colnames = ['type', 'id', 'version', 'visible', 'ts', 'uid', 'chgset']
elements = pd.DataFrame(tlhandler.elements, columns=colnames)
elements = elements.sort_values(by=['type', 'id', 'ts'])

In [4]:
user_md = (elements.groupby('uid')['ts']
            .agg(["min", "max"])
            .reset_index())
user_md.columns = ['uid', 'first_at', 'last_at']
user_md['lifespan'] = ((user_md.last_at - user_md.first_at)
                        / pd.Timedelta('1d'))
extraction_date = elements.ts.max()
user_md['n_inscription_days'] = ((extraction_date - user_md.first_at)
                                  / pd.Timedelta('1d'))
elements['ts_round'] = elements.ts.apply(lambda x: x.round('d'))
user_md['n_activity_days'] = (elements
                              .groupby('uid')['ts_round']
                              .nunique()
                              .reset_index())['ts_round']
user_md.sort_values(by=['first_at'])

Unnamed: 0,uid,first_at,last_at,lifespan,n_inscription_days,n_activity_days
15,2490,2006-05-24 03:44:12+00:00,2011-04-19 19:44:48+00:00,1791.667083,4141.838866,2
26,3600,2006-09-21 22:11:45+00:00,2010-07-07 21:18:35+00:00,1384.963079,4021.069734,30
29,4011,2006-10-05 17:45:08+00:00,2006-10-22 02:42:46+00:00,16.373356,4007.254884,2
0,0,2006-10-14 20:54:08+00:00,2008-11-02 19:52:09+00:00,749.956956,3998.123634,9
17,2585,2006-10-21 17:19:39+00:00,2012-07-21 00:43:52+00:00,2099.308484,3991.272581,3
10,1399,2007-03-18 17:09:39+00:00,2007-03-18 17:09:39+00:00,0.000000,3843.279525,1
49,7204,2007-04-24 19:20:13+00:00,2009-06-03 19:58:01+00:00,771.026250,3806.188854,5
12,1679,2007-05-26 04:35:31+00:00,2017-09-10 15:34:28+00:00,3760.457604,3774.803229,642
57,8363,2007-05-28 19:16:01+00:00,2016-11-16 18:04:34+00:00,3459.950382,3772.191771,2
61,9078,2007-06-13 22:58:50+00:00,2007-06-24 20:56:49+00:00,10.915266,3756.037037,6


In [5]:
newfeature = (elements.groupby(['uid'])['id']
              .count()
              .reset_index()
              .fillna(0))
newfeature.columns = ['uid', "n_total_modif"]

In [6]:
user_md = pd.merge(user_md, newfeature, on='uid', how="outer").fillna(0)

In [7]:
newfeature = (elements.query('type == "node"').groupby(['uid'])['id']
              .count()
              .reset_index()
              .fillna(0))
newfeature.columns = ['uid', "n_total_modif_node"]

In [8]:
user_md = pd.merge(user_md, newfeature, on='uid', how="outer").fillna(0)

In [9]:
newfeature = (elements.query('type == "way"').groupby(['uid'])['id']
              .count()
              .reset_index()
              .fillna(0))
newfeature.columns = ['uid', "n_total_modif_way"]

In [10]:
user_md = pd.merge(user_md, newfeature, on='uid', how="outer").fillna(0)

In [11]:
newfeature = (elements.query('type == "relation"').groupby(['uid'])['id']
              .count()
              .reset_index()
              .fillna(0))
newfeature.columns = ['uid', "n_total_modif_relation"]

In [12]:
user_md = pd.merge(user_md, newfeature, on='uid', how="outer").fillna(0)

In [13]:
osmelem_versioning = (elements.groupby(['type', 'id'])['version']
            .agg(["first", "last"])
            .reset_index())
osmelem_versioning.columns = ['type', 'id', 'vmin', 'vmax']

In [14]:
elements = pd.merge(elements, osmelem_versioning, on=['type', 'id'])
elements['init'] = elements.version == elements.vmin
elements['up_to_date'] = elements.version == elements.vmax
# note that 'elements' is sorted by type, id and ts
elements['willbe_corr'] = np.logical_and(elements.id.diff(-1)==0,
                                          elements.uid.diff(-1)!=0)
elements['willbe_autocorr'] = np.logical_and(elements.id.diff(-1)==0,
                                                 elements.uid
                                                 .diff(-1)==0)

In [15]:
def create_count_features(metadata, element_type, data, grp_feat, res_feat, feature_suffix):
    feature_name = 'n_'+ element_type + '_modif' + feature_suffix
    newfeature = (data.groupby([grp_feat])[res_feat]
                  .count()
                  .reset_index()
                  .fillna(0))
    newfeature.columns = [grp_feat, feature_name]
    metadata = pd.merge(metadata, newfeature, on=grp_feat, how="outer").fillna(0)
    return metadata

def extract_modif_features(metadata, data, element_type):
    typed_data = data.query('type==@element_type')
    metadata = create_count_features(metadata, element_type, typed_data,
                               'uid', 'id', '')
    metadata = create_count_features(metadata, element_type,
                               typed_data.query("init"),
                               'uid', 'id', "_cr")
    metadata = create_count_features(metadata, element_type,
                               typed_data.query("not init and visible"),
                               'uid', 'id', "_imp")
    metadata = create_count_features(metadata, element_type,
                               typed_data.query("not init and not visible"),
                               'uid', 'id', "_del")
    metadata = create_count_features(metadata, element_type,
                               typed_data.query("up_to_date"),
                               'uid', 'id', "_utd")
    metadata = create_count_features(metadata, element_type,
                               typed_data.query("willbe_corr"),
                               'uid', 'id', "_cor")
    metadata = create_count_features(metadata, element_type,
                               typed_data.query("willbe_autocorr"),
                               'uid', 'id', "_autocor")
    return metadata

In [16]:
user_md = extract_modif_features(user_md, elements, 'node')
user_md = extract_modif_features(user_md, elements, 'way')
user_md = extract_modif_features(user_md, elements, 'relation')
user_md = user_md.set_index('uid')
user_md.to_csv("usermd.csv")