# Dataset due diligence: gdf / gdf_nodes

This notebook reproduces the core exploratory checks in a dedicated place. It focuses on:
- Geometry point order vs (u, v) order
- Duplicates in edges and nodes
- LineString point counts (m = 2)
- Consistency between gdf and gdf_nodes


In [1]:
# Tool for exploration

from web_map import open_google_maps

#example
lat, lon = 51.564, 0.00227
open_google_maps((lat, lon), open_in='inline')


In [21]:
from pyrosm import OSM
import geopandas as gpd
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=FutureWarning, module="pyrosm")

DATA_PATH = "data/greater-london-260126.osm.pbf"

osm_parser = OSM(DATA_PATH)

gdf_nodes, gdf = osm_parser.get_network(network_type="walking", nodes=True)

gdf_nodes = gdf_nodes.set_index("id")

display(gdf.head(3))
display(gdf_nodes.head(3))


Unnamed: 0,access,area,bicycle,bicycle_road,bridge,busway,cycleway,est_width,foot,footway,...,width,id,timestamp,version,tags,osm_type,geometry,u,v,length
0,,,,,,,,,,,...,,74,1759688079,12,"{""visible"":false,""sidewalk:both"":""separate""}",way,"LINESTRING (-0.19312 51.60173, -0.19305 51.60179)",196101,12544100541,8.815
1,,,,,,,,,,,...,,74,1759688079,12,"{""visible"":false,""sidewalk:both"":""separate""}",way,"LINESTRING (-0.19305 51.60179, -0.19297 51.60186)",12544100541,2121445348,9.297
2,,,,,,,,,,,...,,75,1690031606,12,"{""visible"":false}",way,"LINESTRING (-0.17679 51.61732, -0.17679 51.61683)",196055,1030634587,54.719


Unnamed: 0_level_0,changeset,version,timestamp,lat,visible,tags,lon,geometry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
196101,0,8,1758701752,51.601727,False,"{'highway': 'traffic_signals', 'traffic_signal...",-0.193124,POINT (-0.19312 51.60173)
12544100541,0,4,1758701752,51.601791,False,"{'crossing': 'traffic_signals', 'crossing:mark...",-0.193051,POINT (-0.19305 51.60179)
2121445348,0,3,1564910827,51.60186,False,,-0.192974,POINT (-0.19297 51.60186)


In [3]:
print(f"gdf rows: {len(gdf):,}")
print(f"gdf_nodes rows: {len(gdf_nodes):,}")
print("gdf CRS:", gdf.crs)
print("gdf_nodes CRS:", gdf_nodes.crs)
print("gdf geometry types:")
print(gdf.geometry.geom_type.value_counts())


gdf rows: 2,155,872
gdf_nodes rows: 1,947,925
gdf CRS: epsg:4326
gdf_nodes CRS: epsg:4326
gdf geometry types:
LineString    2155872
Name: count, dtype: int64


In [4]:
# Check that each geometry is a LineString with exactly 2 points
n_coords = gdf.geometry.count_coordinates()
print("n_coords value counts (top 5):")
print(n_coords.value_counts(dropna=False).head())

non_two = gdf[n_coords != 2]
print(f"Non-2-point geometries: {len(non_two):,}")
if len(non_two):
    display(non_two.head())


n_coords value counts (top 5):
2    2155872
Name: count, dtype: int64
Non-2-point geometries: 0


In [5]:
# Duplicate checks

dup_uv = gdf.duplicated(subset=["u", "v"]).sum()
geom_wkb = gdf.geometry.apply(lambda g: g.wkb if g is not None else None) # WKB for hashing
dup_uv_geom = gdf.assign(_geom_wkb=geom_wkb).duplicated(subset=["u", "v", "_geom_wkb"]).sum()

print(f"Duplicate edges by (u, v): {dup_uv:,}")
print(f"Duplicate edges by (u, v, geometry): {dup_uv_geom:,}")

print("gdf_nodes index unique:", gdf_nodes.index.is_unique)
print("Duplicate node coordinates (lat, lon):", gdf_nodes.duplicated(subset=["lat", "lon"]).sum())

geom_wkb_nodes = gdf_nodes.geometry.apply(lambda g: g.wkb if g is not None else None)
print("Duplicate node geometries:", geom_wkb_nodes.duplicated().sum())


Duplicate edges by (u, v): 127
Duplicate edges by (u, v, geometry): 127
gdf_nodes index unique: True
Duplicate node coordinates (lat, lon): 311
Duplicate node geometries: 311


In [30]:
# Check that the coordinates in the edges match the coordinates in the nodes

from shapely import Point
import numpy as np

def numpy_haversine_distance(arr_1: np.ndarray, arr_2: np.ndarray) -> np.ndarray:
    rarr_1 = np.radians(arr_1)
    rarr_2 = np.radians(np.array(arr_2))
    dlat = rarr_1[:, 0] - rarr_2[:, 0]
    dlon = rarr_1[:, 1] - rarr_2[:, 1]
    return numpy_haversine_function(dlat) + np.cos(rarr_1[:, 0]) * np.cos(rarr_2[:, 0]) * numpy_haversine_function(dlon)

def numpy_geodesic_distance(arr_1: np.ndarray, arr_2: np.ndarray) -> np.ndarray:
    R = 6_371_000  # Earth's radius in meters
    haversine_vals = numpy_haversine_distance(arr_1, arr_2)
    return 2 * R * np.arcsin(np.sqrt(haversine_vals))

def numpy_haversine_function(arr: np.ndarray):
    '''Pointwise Haversine function'''
    arr = np.sin(arr / 2) ** 2
    return arr

# Get the coordinates from the edges
# Note - use of boundary maps cycles to 0
gdf['u_coords'] = gdf.geometry.apply(lambda g: Point(g.coords[0]))
gdf['v_coords'] = gdf.geometry.apply(lambda g: Point(g.coords[1]))

# Prepare the nodes data
gdf_nodes_appendage = gdf_nodes[['lat', 'lon']]
gdf_nodes_appendage['point'] = gdf_nodes_appendage.apply(lambda row: Point(row['lon'], row['lat']), axis=1)
gdf_nodes_appendage = gdf_nodes_appendage.set_geometry('point')

# Merge nodes data using u<->id for the first point, v<->id for the second point
gdf_merged = gdf.merge(gdf_nodes_appendage[['point']].rename(columns={'point': 'point_u'}), how='left', right_index=True, left_on='u')
gdf_merged = gdf_merged.merge(gdf_nodes_appendage[['point']].rename(columns={'point': 'point_v'}), how='left', right_index=True, left_on='v')


# Get the distances between the u coordinates from different sources of the same edge
# reorder to (lat, lon) for the haversine functions
edge_u = gdf_merged['u_coords'].get_coordinates().values[:, ::-1]
node_u = gdf_merged['point_u'].get_coordinates().values[:, ::-1]

# Get the distances between the v coordinates from different sources of the same edge
# reorder to (lat, lon) for the haversine functions
edge_v = gdf_merged['v_coords'].get_coordinates().values[:, ::-1]
node_v = gdf_merged['point_v'].get_coordinates().values[:, ::-1]

v_distances = numpy_geodesic_distance(edge_v, node_v)
u_distances = numpy_geodesic_distance(edge_u, node_u)

print(f"v point mismatches: {(v_distances != 0).sum():,}")
print(f"u point mismatches: {(u_distances != 0).sum():,}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf_nodes_appendage['point'] = gdf_nodes_appendage.apply(lambda row: Point(row['lon'], row['lat']), axis=1)


v point mismatches: 0
u point mismatches: 0
