In [None]:
import pandas as pd, sys
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime, calendar, time
import matplotlib.dates as mdates
import matplotlib.colors as colors
import matplotlib.ticker
%matplotlib inline

### Set the city for analysis
Loads the city file from `../data`

In [None]:
# Place identifier; it is used below as the filename prefix for the input
# files under ../data and for the output files this notebook writes.
PLACE = "detroit_michigan"

Read in the versions file

In [None]:
# Column names, in file order, for the stats file read in the next cell
# (that file is loaded with `names=header`, i.e. it has no header row).
header = [
    'id', 'type', 'amenity', 'highway', 'building', 'name',
    'length', 'area',
    'aA', 'aD', 'aM',
    'user', 'version', 'minorVersion',
    'validSince', 'validUntil',
    'changeset', 'center',
]

In [None]:
# Read the tab-separated stats file for PLACE, order the rows by their
# validSince value and renumber the index from zero.
stats_path = '../data/' + PLACE + '-all-stats.data'
df = (
    pd.read_csv(stats_path, names=header, sep='\t', index_col=None)
      .sort_values(by='validSince')
      .reset_index(drop=True)
)
df.head(2)

Calculate new columns and convert to Python objects

In [None]:
# validSince is interpreted as a Unix timestamp in seconds (the previous code
# multiplied it by 1e9 to obtain nanoseconds). Convert the whole column at
# once and keep only the calendar date as datetime.date objects.
df['date'] = pd.to_datetime(df['validSince'], unit='s').dt.date

# The literal string "false" is replaced by NaN everywhere in the frame.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
df = df.replace("false", np.nan)
df['validUntil'] = df['validUntil'].astype(float)

# Cast length and area to floats, and version to the smallest integer type.
# FIX: area was previously computed from df.length, which silently
# overwrote every area value with the object's length.
df['length'] = pd.to_numeric(df['length'], downcast='float')
df['area'] = pd.to_numeric(df['area'], downcast='float')
df['version'] = pd.to_numeric(df['version'], downcast='integer')

In [None]:
# Headline numbers: non-null id rows, distinct ids and distinct users.
n_edits = df.id.count()
n_objects = df.id.nunique()
n_users = df.user.nunique()
print("Loaded {:,} edits to {:,} objects from {:,} users".format(n_edits, n_objects, n_users))

# 1. Kilometers of roads over time

In [None]:
sns.set_style('whitegrid')

# Rows that carry a highway tag.
hw = df[pd.notnull(df.highway)]

# Cumulative summed length per date, split into first versions ("added")
# and later versions ("edited"). The aggregation is passed by name ('sum'):
# handing the Python builtin `sum` to aggregate() is deprecated in recent
# pandas releases and emits a FutureWarning.
added_length = hw[hw.version == 1].groupby('date').aggregate({'length': 'sum'}).cumsum()
edited_length = hw[hw.version > 1].groupby('date').aggregate({'length': 'sum'}).cumsum()

ax = added_length.plot(figsize=(12,8), legend=None)
edited_length.plot(ax=ax, legend=None)
ax.set_ylabel("Kilometers of road edits",fontsize=16)
ax.set_title("Kilometers of road edited vs. added over time",fontsize=16);
ax.legend(['Added','Edited'], loc=0, fontsize=12);

# 2 Buildings Over time

In [None]:
sns.set_style('whitegrid')

# Rows that carry a building tag.
buildings = df[~pd.isnull(df.building)]

# "Added" = version 1 with minorVersion 0; "Edited" = any version above 1.
# Both curves are running totals of the distinct ids seen on each date.
first_versions = buildings[(buildings.version==1) & (buildings.minorVersion==0)]
later_versions = buildings[buildings.version>1]
added_per_date = first_versions.groupby('date').aggregate({'id':'nunique'})
edited_per_date = later_versions.groupby('date').aggregate({'id':'nunique'})

ax = added_per_date.cumsum().plot(figsize=(12,8),legend=None)
edited_per_date.cumsum().plot(ax=ax,legend=None)
ax.set_ylabel("Buildings Edited",fontsize=16)
ax.set_title("Number of buildings edited vs. added over time",fontsize=16);
ax.legend(['Added','Edited'], loc=0, fontsize=12);

### Top Building Contributors (Added)

In [None]:
# Count first-version building rows (version 1, minorVersion 0) per user
# and show the ten users with the highest counts.
is_first_version = (buildings.version==1) & (buildings.minorVersion==0)
created_per_user = buildings[is_first_version].groupby('user').aggregate({'id':'count'})
created_per_user.sort_values(by='id',ascending=False).head(10)

### Top Building Contributors (Recently)

In [None]:
# Same ranking as the previous cell, restricted to rows dated strictly after
# the cutoff below.
recent_cutoff = datetime.date(2018,8,1)
is_recent_first_version = (
    (buildings.version==1)
    & (buildings.minorVersion==0)
    & (buildings.date > recent_cutoff)
)
recent_per_user = buildings[is_recent_first_version].groupby('user').aggregate({'id':'count'})
recent_per_user.sort_values(by='id',ascending=False).head(10)

# 3 Amenities Over time

In [None]:
sns.set_style('whitegrid')

# Rows that carry an amenity tag.
amenities = df[~pd.isnull(df.amenity)]

# "Added" = version 1 with minorVersion 0; "Edited" = any version above 1.
# Both curves are running totals of the distinct ids seen on each date.
first_versions = amenities[(amenities.version==1) & (amenities.minorVersion==0)]
later_versions = amenities[amenities.version>1]
added_per_date = first_versions.groupby('date').aggregate({'id':'nunique'})
edited_per_date = later_versions.groupby('date').aggregate({'id':'nunique'})

ax = added_per_date.cumsum().plot(figsize=(12,8),legend=None)
edited_per_date.cumsum().plot(ax=ax,legend=None)
ax.set_ylabel("Amenities Edited",fontsize=16)
ax.set_title("Number of `Amenities` edited vs. added over time",fontsize=16);
ax.legend(['Added','Edited'], loc=0, fontsize=12);

# 4 Contributors per week ... an editing pulse?

In [None]:
# Distinct users per date, smoothed with a rolling mean over 7 consecutive
# rows of that per-date table.
users_per_date = df.groupby('date').aggregate({'user':'nunique'})
smoothed_users = users_per_date.rolling(7).mean()
ax = smoothed_users.plot(figsize=(15,8),legend=None)
ax.set_ylabel("Number of Users",fontsize=16)
ax.set_title("Contributors editing per week",fontsize=16);

# 5 Edits per week?

In [None]:
# Number of rows (edits) per date, smoothed with a rolling mean over 7
# consecutive rows of that per-date table.
edits_per_date = df.groupby('date').aggregate({'id':'count'})
smoothed_edits = edits_per_date.rolling(7).mean()
ax = smoothed_edits.plot(figsize=(15,8),legend=None)
ax.set_ylabel("Number of edits",fontsize=16)
ax.set_title("Edits per week",fontsize=16);

# 6 Users over time

In [None]:
# Kudos to Seth on this algorithm :)
# Step 1: find each user's earliest edit date.
# Step 2: count how many users have their first edit on each date.
# The aggregation is named as the string 'min': passing the Python builtin
# `min` to aggregate() is deprecated in recent pandas releases.
first_edit_per_user = df.groupby('user').aggregate({'date': 'min'}).reset_index()
unique_users_over_time = first_edit_per_user.groupby('date').aggregate({'user':'count'})

In [None]:
sns.set_style('whitegrid')

# Running total of users by the date of their first edit.
cumulative_users = unique_users_over_time.cumsum()
ax = cumulative_users.plot(figsize=(15,8),legend=None)
ax.set_xlabel('Year', fontsize=15)
ax.set_ylabel('Number of Unique Contributors', fontsize=15);

# 7 Contributor Co-Editing Network

In [None]:
import networkx as nx

In [None]:
# Build a directed graph from the users edge list. Each line is expected to
# be "source,target,weight"; only edges whose weight exceeds the threshold
# are kept.
MIN_EDGE_WEIGHT = 100

G = nx.DiGraph()
# The context manager guarantees the file is closed (it used to be left open).
with open('../data/'+PLACE+'-users.edgelist','r') as edge_file:
    for line in edge_file:
        if not line.strip():
            # A blank line (e.g. at the end of the file) would otherwise
            # break the three-way unpacking below.
            continue
        sourceLabel, targetLabel, weight = line.split(",")
        weight = int(weight)
        if weight > MIN_EDGE_WEIGHT:
            G.add_edge(sourceLabel, targetLabel, weight=weight)

In [None]:
# Number of nodes that survived the edge-weight filter.
len(G)

In [None]:
# Draw the graph with a spring layout and node labels.
draw_options = {'with_labels': True}
nx.draw_spring(G, **draw_options)

In [None]:
# Export the filtered graph as GraphML so it can be opened in an external
# viewer. The message previously called the file "GML" (a different format)
# and misspelled "directory".
graphml_path = '../data/'+PLACE+'.graphml'
nx.write_graphml(G, graphml_path);
print("View the GraphML file in the data directory: " + graphml_path)
print("Recommend downloading GEPHI: https://gephi.org/ to view this file")

# 8 Versions

In [None]:
# One bin per integer version. np.arange excludes its stop value, so the
# edges must run up to max+1; the previous edges stopped at max-1, which
# dropped the highest version from the histogram and merged the one below it
# into the preceding bin. int() avoids overflow when the column has been
# downcast to a small integer dtype.
max_version = int(buildings.version.max())
version_bins = np.arange(1, max_version + 2, 1)
ax = buildings.version.hist(figsize=(10,4), bins=version_bins)
ax.set_yscale('log'); ax.set_title('Histogram of Object Versions', fontsize=16)
ax.set_xlabel("Version",fontsize=14); ax.set_ylabel("Number of Objects (log-scale)",fontsize=14);

### Most edited buildings?

In [None]:
# Building rows ordered by descending version; show the first five.
buildings_by_version = buildings.sort_values(by='version', ascending=False)
buildings_by_version.head()

# 9 Minor Versions?

In [None]:
# Frequency of each minorVersion value; show the five most common.
minor_version_counts = df['minorVersion'].value_counts()
minor_version_counts.head()

In [None]:
# Sum of minorVersion over the rows where it is positive.
positive_minor = df.loc[df.minorVersion > 0, 'minorVersion']
minor_edit_total = int(positive_minor.sum())
print("Number of uncounted minor geometry edits: {:,}".format(minor_edit_total))

In [None]:
# Number of building rows with minorVersion > 0 per date, drawn as points.
df2 = buildings[buildings.minorVersion>0].groupby('date').aggregate({'id':'count'})

# FIX: capture the axes returned by plot(). Previously `ax` still pointed at
# the figure from an earlier cell, so the title and labels below were applied
# to that older plot instead of this one.
ax = df2.plot(figsize=(15,8), legend=None, marker='.', linewidth=0)
ax.set_title("When Minor Versions Happen (Geometry Only Changes)",fontsize=16); ax.set_ylabel("Number of edits")
ax.set_xlabel("Date", fontsize=14);

# 10 Evolution of the Map: Roads --> Buildings?

In [None]:
# Running totals of newly created rows (version 1, minorVersion 0) that have
# a non-null building / highway / amenity value, per date.
created = df[(df.version==1) & (df.minorVersion==0)]
created_per_date = created.groupby('date').aggregate({'building':'count','highway':'count','amenity':'count'})
ax = created_per_date.cumsum().plot(figsize=(15,8))
ax.set_title('Creation of Objects overtime');
# Scale names must be lowercase: newer Matplotlib releases no longer accept
# 'LOG' and raise an error for it.
ax.set_yscale('log')

# Learning about Names

In [None]:
def extract_key(attributeString, attr):
    """Return the value stored under `attr` in a serialized tag string.

    The string is a list of tags separated by "|-|", each tag written as
    "key-->value".

    Parameters
    ----------
    attributeString : str or any
        Serialized tags. Anything that is not a string (e.g. NaN for a
        missing cell) yields NaN.
    attr : str
        The key to look up.

    Returns
    -------
    str or float
        The value of the first tag whose key equals `attr`, or `np.nan` when
        the input is not a string or the key is absent.

    Raises
    ------
    ValueError
        If any tag in the string lacks the "-->" separator.
    """
    if not isinstance(attributeString, str):
        return np.nan

    # Parse every tag before looking anything up, so a malformed tag is
    # reported even when the requested key appears earlier in the string.
    pairs = []
    for tag in attributeString.split("|-|"):
        # partition() splits on the first "-->" only, so a value that itself
        # contains "-->" is kept whole instead of being truncated.
        key, separator, value = tag.partition("-->")
        if not separator:
            raise ValueError(
                "Malformed tag {!r} in attribute string {!r}".format(tag, attributeString)
            )
        pairs.append((key, value))

    for key, value in pairs:
        if key == attr:
            return value
    # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
    return np.nan

In [None]:
# Pull the 'name' entry out of each row's aA tag string; rows without one get NaN.
df['nameAdded'] = df['aA'].apply(extract_key, args=('name',))

## When were names added to roads?

In [None]:
# This section asks when names were added to roads, so keep the rows that
# HAVE a highway tag. The previous filter, pd.isnull(df.highway), selected
# exactly the opposite set (everything that is not a road).
roads = df[pd.notnull(df.highway)]
names_added_per_date = roads.groupby('date').aggregate({'nameAdded':'count'})
ax = names_added_per_date.plot(figsize=(15,8))
ax.set_title('When the `name` attribute was added to objects on the map',fontsize=18);

# Top 15 Mappers of 2018?

In [None]:
# Restrict to calendar year 2018: inclusive of January 1st and exclusive of
# anything from 2019 onwards. The previous filter (date > 2018-01-01) dropped
# January 1st and had no upper bound.
in_2018 = (df.date >= datetime.date(2018,1,1)) & (df.date < datetime.date(2019,1,1))
edits_per_user_2018 = df[in_2018].groupby('user').aggregate({'id':'count'})
edits_per_user_2018.sort_values(by='id',ascending=False).head(15)


## A heatmap of building edits?

In [33]:
import json, os, subprocess, sys

# Build one GeoJSON Feature per date: a GeometryCollection holding a Point
# for every building edit on that date, plus the day offset from 2005-01-01
# and the number of edits.
featColl = {'type':'FeatureCollection','features':[]}
for date, edits in df[pd.notnull(df.building)].groupby('date'):
    try:
        geometries = [
            {'type':'Point','coordinates':[float(y) for y in center.split(",")]}
            for center in edits.center
        ]
        geojson = {
            'geometry':{'type':'GeometryCollection','geometries':geometries},
            'type':"Feature",
            'properties': {'days_since_2005': (date - datetime.date(2005,1,1) ).days, 'edits':len(edits)}
        }
        featColl['features'].append(geojson)
    except (AttributeError, TypeError, ValueError):
        # Best effort: skip a date whose `center` values are missing or not
        # parseable as comma-separated numbers. Only these data errors are
        # swallowed; the former bare `except:` also hid things like
        # KeyboardInterrupt.
        print("err--moving on")

# Close the file explicitly so the data is flushed to disk before the web
# server below starts serving it.
with open("../docs/data/"+PLACE+".geojson",'w') as geojson_file:
    json.dump(featColl, geojson_file)

print("Starting a local webserver in the `docs` directory: ")
# SimpleHTTPServer only exists on Python 2; its Python 3 name is http.server.
# Launch it with the interpreter running this kernel, in the background,
# without going through a shell.
server_module = "http.server" if sys.version_info[0] >= 3 else "SimpleHTTPServer"
server = subprocess.Popen([sys.executable, "-m", server_module, "8000"], cwd="../docs")
print("\n\nCheck it out at: http://localhost:8000/index.html?data="+PLACE+".geojson")

Starting a local webserver in the `docs` directory: 


Check it out at: http://localhost:8000/index.html?data=detroit_michigan.geojson
