In [None]:
'''
Convert 1 and 2 into two separate python modules!!
'''

In [69]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [93]:
import geopandas as gpd
from shapely.geometry import box
import pyrosm

import pandas as pd
import numpy as np

from datetime import datetime

from extract_history_data import *
from extract_indirect_indicator_features import *
from extract_points import *
from extract_multipolygons import *
from extract_linestrings import *

### Handler for historical data

In [104]:
# Feed the history extract through the handler once; `h` is queried further
# down for the POI, road and building histories.
history_pbf = "data/osm/historical/rec_historical.osm.pbf"
h = HistoryHandler()
h.apply_file(history_pbf)

### Handler for extracting geometries from latest osm data

In [106]:
# All three geometry handlers read the same extract, so the path is named once.
# NOTE(review): the path points into data/osm/historical/ although this section
# is about the latest data (the disabled PyrOSM snippet further down reads
# data/osm/latest/rec.osm.pbf) — confirm this is the intended file.
osm_pbf = "data/osm/historical/rec.osm.pbf"

h_n = NodeHandler()
h_n.apply_file(osm_pbf)

h_wr = AreaHandler()
h_wr.apply_file(osm_pbf)

# NOTE(review): locations=True presumably makes node coordinates available
# while ways are processed — confirm against WayHandler.
h_roads = WayHandler()
h_roads.apply_file(osm_pbf, locations=True)

Note: these handlers are reused several times below to extract POIs, buildings, and roads from the OSM data.

# POI

### Extract all nodes, ways, relations from history data that are POIs

In [110]:
# Collect the history rows matching poi_qualifier and order them by element id,
# then by timestamp within each id.
colnames = ['id', 'visible', 'ts', 'uid', 'tags', 'osm_type']
data_poi = h.get_data(poi_qualifier)
history = (
    pd.DataFrame(data_poi, columns=colnames)
    .sort_values(by=['id', 'ts'])
)

### Extract indirect indicator features (nusers and ts) from history data

In [111]:
# Derive the indirect-indicator features from the sorted POI history.
# NOTE(review): extract_ii_features is defined outside this notebook; the
# section heading names `nusers` and `ts` as its features — confirm the exact
# output schema in that module.
poi = extract_ii_features(history, poi_qualifier)

### Extract geometries of all POI

In [112]:
# POIs can be nodes or areas, so query the node handler and the area handler
# with the same qualifier and stack the two results.
poi_n, poi_wr = (handler.get_gdf(poi_qualifier) for handler in (h_n, h_wr))

poi_geom = pd.concat([poi_n, poi_wr])

### Merge POI indirect indicator data with its geometries

In [113]:
# Inner join: keep only POIs present in both the history features and the
# geometry table. Both frames use the same key names, so `on=` is sufficient.
poi_indir = pd.merge(poi, poi_geom, how='inner', on=['id', 'osm_type'])

In [114]:
# Re-wrap the merged frame as a GeoDataFrame in EPSG:4326 and label every row
# as a POI so the item types can be told apart after concatenation.
poi_indir = gpd.GeoDataFrame(poi_indir, crs="EPSG:4326").assign(item_type='poi')

#

# Roads

### Extract all ways from history data that are roads/streets

In [115]:
# Collect the history rows matching highway_qualifier and order them by element
# id, then by timestamp within each id.
colnames = ['id', 'visible', 'ts', 'uid', 'tags', 'osm_type']
data_road = h.get_data(highway_qualifier)

history = (
    pd.DataFrame(data_road, columns=colnames)
    .sort_values(by=['id', 'ts'])
)

### Extract indirect indicator features (nusers and ts) from history data

In [116]:
# Derive the indirect-indicator features from the sorted road history.
# NOTE(review): extract_ii_features is defined outside this notebook — confirm
# its output schema in that module.
road = extract_ii_features(history, highway_qualifier)

### Extract road geometries with Osmium *(a PyrOSM-based alternative is kept, commented out, in the cell below)*

In [117]:
# Alternative kept for reference (currently disabled) — extraction with PyrOSM:
# osm = pyrosm.OSM("data/osm/latest/rec.osm.pbf")
# road_geom = osm.get_network()[['id','geometry']] # Will use sidewalks column for workshop paper

# Active path — extraction with Osmium, via the WayHandler instance h_roads:
road_geom = h_roads.get_gdf(highway_qualifier)

### Merge road indirect indicator data with its geometries

In [119]:
# Inner join: keep only roads present in both the history features and the
# geometry table. Both frames use the same key names, so `on=` is sufficient.
roads_indir = pd.merge(road, road_geom, how='inner', on=['id', 'osm_type'])

In [120]:
# Re-wrap the merged frame as a GeoDataFrame in EPSG:4326 and label every row
# as a road so the item types can be told apart after concatenation.
roads_indir = gpd.GeoDataFrame(roads_indir, crs="EPSG:4326").assign(item_type='road')

# Buildings

### Extract all ways and relations from history data that are buildings

In [121]:
# Collect the history rows matching building_qualifier and order them by
# element id, then by timestamp within each id.
colnames = ['id', 'visible', 'ts', 'uid', 'tags', 'osm_type']
data = h.get_data(building_qualifier)

history = (
    pd.DataFrame(data, columns=colnames)
    .sort_values(by=['id', 'ts'])
)

### Extract indirect indicator features (nusers and ts) from history data

In [122]:
# Derive the indirect-indicator features from the sorted building history.
# NOTE(review): extract_ii_features is defined outside this notebook — confirm
# its output schema in that module.
building = extract_ii_features(history, building_qualifier)

### Extract geometries of all buildings

In [123]:
# Building geometries are taken from h_wr (the AreaHandler instance) only;
# the node handler is not queried for buildings.
building_geom = h_wr.get_gdf(building_qualifier)

### Merge building indirect indicator data with its geometries

In [124]:
# Inner join on (id, osm_type): keep only buildings present in both the history
# features and the geometry table.
building_indir = pd.merge(building, building_geom,  how='inner', on = ['id', 'osm_type'])

In [125]:
# Re-wrap the merged frame as a GeoDataFrame in EPSG:4326 and label every row
# as a building so the item types can be told apart after concatenation.
building_indir = gpd.GeoDataFrame(building_indir, crs="EPSG:4326").assign(item_type='building')

In [133]:
# Stack buildings, roads and POIs into one frame with a fresh 0..n-1 index,
# then reproject from EPSG:4326 (lon/lat) to EPSG:3395 (World Mercator) so the
# coordinates are in metres.
gdf = gpd.GeoDataFrame(
    pd.concat([building_indir, roads_indir, poi_indir], ignore_index=True),
    crs="EPSG:4326",
).to_crs('epsg:3395')
gdf

Unnamed: 0,id,nusers,ts,osm_type,tags,geometry,item_type
0,2402565,"[336460, 8570285, 1122708]",2019-01-25 15:01:07+00:00,R,{'building': 'stadium'},"MULTIPOLYGON (((-3884170.471 -890506.042, -388...",building
1,2514001,"[336460, 612405, 1772368]",2019-05-19 04:37:30+00:00,R,"{'addr:housenumber': '1086', 'addr:street': 'A...","MULTIPOLYGON (((-3884767.165 -892038.983, -388...",building
2,2959361,[651869],2017-04-09 13:39:57+00:00,R,{'building': 'yes'},"MULTIPOLYGON (((-3886256.787 -904237.600, -388...",building
3,2959362,[651869],2017-04-09 13:39:57+00:00,R,{'building': 'yes'},"MULTIPOLYGON (((-3886313.816 -904215.648, -388...",building
4,2959364,[651869],2017-04-09 13:39:58+00:00,R,{'building': 'yes'},"MULTIPOLYGON (((-3886251.433 -904154.516, -388...",building
...,...,...,...,...,...,...,...
193337,9806910817,[16236481],2022-06-09 16:44:23+00:00,N,"{'amenity': 'place_of_worship', 'name': 'Assem...",POINT (-3884533.283 -881911.919),poi
193338,9820145717,[1557079],2022-06-15 14:27:36+00:00,N,"{'addr:housenumber': '313', 'addr:street': 'Ru...",POINT (-3879687.045 -889272.991),poi
193339,9831609521,[15781871],2022-06-19 19:27:29+00:00,N,"{'amenity': 'restaurant', 'cuisine': 'pizza', ...",POINT (-3884907.918 -882420.685),poi
193340,9863865267,[1557079],2022-07-03 13:16:04+00:00,N,"{'addr:city': 'Olinda', 'addr:housenumber': '1...",POINT (-3879689.650 -889509.998),poi


### Divide the GeoDataFrame into 1 km × 1 km cells and compute the indirect indicators for all buildings with respect to their cell

In [21]:
# TODO: one more consideration — a building can be part of multiple cells,
# because `.cx` selects every geometry that intersects the cell's bounding box.
# Decide whether each building should instead be assigned to exactly one cell.

size = 1000  # cell edge length in CRS units (metres, if gdf is in EPSG:3395)
xmin, ymin, xmax, ymax = gdf.total_bounds
cell_width = cell_height = size

# One row per cell that contains at least one building; the column order must
# match the `colnames` list used to build the DataFrame from these rows.
indir_indicators = []

# cell_no numbers only the cells that are kept (i.e. that contain buildings).
cell_no = 0
for x0 in np.arange(xmin, xmax + cell_width, cell_width):
    for y0 in np.arange(ymin, ymax + cell_height, cell_height):
        x1 = x0 + cell_width
        y1 = y0 + cell_height

        cell = gdf.cx[x0:x1, y0:y1]

        buildings = cell[cell['item_type'] == 'building']

        # Read the ids by column name instead of by position in a to_numpy()
        # row: this no longer depends on 'id' being the first column and avoids
        # converting every column (geometries included) to an object array.
        bid = buildings['id'].tolist()

        # Cells without buildings are never recorded, so skip them before
        # computing any indicators.
        # NOTE(review): assumes compute_ii_indicators has no side effects — it
        # is only used for its return value here.
        if not bid:
            continue

        # short form: uc -> user count, le_time -> last edit time
        roads = cell[cell['item_type'] == 'road']
        road_cnt, road_uc, road_le_time = compute_ii_indicators(roads)

        building_cnt, building_uc, building_le_time = compute_ii_indicators(buildings)

        pois = cell[cell['item_type'] == 'poi']
        poi_cnt, poi_uc, poi_le_time = compute_ii_indicators(pois)

        indir_indicators.append([
            cell_no,
            bid,
            road_cnt,
            road_uc,
            road_le_time,
            building_cnt,
            building_uc,
            building_le_time,
            poi_cnt,
            poi_uc,
            poi_le_time,
        ])

        cell_no += 1

In [22]:
# Column labels, in the same order in which each row was appended to
# indir_indicators.
colnames = [
    'cell_no', 'bid',
    'road_cnt', 'road_uc', 'road_le_time',
    'building_cnt', 'building_uc', 'building_le_time',
    'poi_cnt', 'poi_uc', 'poi_le_time',
]
indir = pd.DataFrame(data=indir_indicators, columns=colnames)

In [23]:
# Display the per-cell indicator table (rendered as a truncated preview).
indir

Unnamed: 0,cell_no,bid,road_cnt,road_uc,road_le_time,building_cnt,building_uc,building_le_time,poi_cnt,poi_uc,poi_le_time
0,0,"[649762976, 649762978, 649762980, 649762982, 8...",28,14,12,23,3,7,0,0,0
1,1,"[649762910, 649762914, 649762916, 649762919, 6...",54,22,8,56,3,22,2,2,70
2,2,"[683804184, 751199360, 751199361, 751199362, 7...",90,24,7,690,2,7,3,2,7
3,3,"[11243064, 12497977, 483259351, 483260229, 484...",157,41,5,996,11,5,45,8,5
4,4,"[590299017, 742250779, 742250780, 742250781, 7...",80,23,10,696,1,20,5,2,40
...,...,...,...,...,...,...,...,...,...,...,...
377,377,[885994795],16,19,8,1,1,20,0,0,0
378,378,"[885900392, 885900397]",7,9,13,2,1,20,0,0,0
379,379,"[153236225, 153236231, 153252396, 153252972, 4...",17,21,9,7,2,71,0,0,0
380,380,[681001036],22,25,12,1,1,40,2,2,26


In [27]:
# Mean and median (the 50% row) of the per-cell indicators.
# NOTE(review): road_le_time is not in this column list — confirm that the
# omission is intentional.
stats = indir.describe().loc[:, [
    'road_cnt', 'road_uc',
    'building_cnt', 'building_uc', 'building_le_time',
    'poi_cnt', 'poi_uc', 'poi_le_time',
]]
stats.filter(items=['mean', '50%'], axis=0)

Unnamed: 0,road_cnt,road_uc,building_cnt,building_uc,building_le_time,poi_cnt,poi_uc,poi_le_time
mean,125.774869,38.65445,379.044503,6.379581,15.651832,27.434555,7.554974,10.861257
50%,120.0,39.0,94.0,4.0,9.0,9.0,4.0,6.5


# Extract Time Indicator

In [24]:
def diff_month(d1, d2):
    """Return the number of calendar months from ``d2`` to ``d1``.

    Only the ``year`` and ``month`` attributes of the arguments are used; days
    and times are ignored, so two dates in the same calendar month give 0. The
    result is negative when ``d1`` falls in an earlier month than ``d2``.
    """
    years_apart = d1.year - d2.year
    months_apart = d1.month - d2.month
    return 12 * years_apart + months_apart

In [25]:
# Take one reference instant for all rows, in UTC to match the `ts` column
# (its displayed values carry a +00:00 offset). Only year and month are used.
now = pd.Timestamp.now(tz="UTC")

# Select rows and columns in a single .loc call and copy the result, so the new
# column is added to an independent frame rather than to a slice of gdf
# (avoids pandas' SettingWithCopyWarning).
time = gdf.loc[gdf['item_type'] == 'building', ['id', 'ts']].copy()
time['last_edit(months)'] = [diff_month(now, ts) for ts in time['ts']]

In [26]:
# Display the per-building last-edit table (rendered as a truncated preview).
time

Unnamed: 0,id,ts,last_edit(months)
0,2402565,2019-01-25 15:01:07+00:00,43
1,2514001,2019-05-19 04:37:30+00:00,39
2,2959361,2017-04-09 13:39:57+00:00,64
3,2959362,2017-04-09 13:39:57+00:00,64
4,2959364,2017-04-09 13:39:58+00:00,64
...,...,...,...
140736,1064791239,2022-05-29 07:43:21+00:00,3
140737,1064791240,2022-05-29 07:43:21+00:00,3
140738,1070775598,2022-06-17 22:02:40+00:00,2
140739,1071164414,2022-06-19 03:46:38+00:00,2


In [31]:
# Mean and median (the 50% row) of the months elapsed since each building's
# last edit.
stats = time.describe().loc[:, ['last_edit(months)']]
stats.filter(items=['mean', '50%'], axis=0)

Unnamed: 0,last_edit(months)
mean,33.911746
50%,32.0


In [None]:
# If the last update was a long time ago, the paper defines the item as more reliable. That seems a bit contradictory: shouldn't an item be more reliable when it has more recent updates?