## Baseline GCN testing
Notebook to build and evaluate a graph convolutional network (GCN) against an edge betweenness centrality (EBC) baseline for predicting the number of passing bicyclists in Copenhagen.
- Preprocess EBC for graph DONE
- Assign Metrics from data
- Create Torch Graph
- Evaluate against SOTA

In [1]:
import glob
import os
import pickle
from collections import Counter

import torch
from torch_geometric.data import Data
import torch_geometric as tg
import osmnx as ox
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import folium
from folium import plugins
from folium.plugins import HeatMap
from shapely.geometry import Point, LineString, Polygon
import shapely
import momepy as mp
import esda
import seaborn as sns
from shapely.strtree import STRtree
from tqdm import tqdm

sns.set_theme()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/christianrasmussen/Documents/thesis/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/christianrasmussen/Documents/thesis/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/christianrasmussen/Documents/thesis/.venv/lib/python3.12/site-packages/ipykernel/

In [None]:
# Centre point of the study area (latitude, longitude) and the `dist` value
# that is forwarded to the OSMnx download calls in `get_city_graph`.
# NOTE(review): OSMnx interprets `dist` in metres -- confirm against its docs.
lat, lon = 55.6867243, 12.5700724
dist = 10000

# OSM tag keys to download. Commented-out keys are deliberately disabled and
# kept in place so they can be toggled between experiments.
features = [
    'amenity',
    'shop',
    'building',
    # 'aerialway',
    # 'aeroway',
    # 'barrier',
    # 'boundary',
    # 'craft',
    # 'emergency',
    # 'highway',
    # 'historic',
    # 'landuse',
    # 'leisure',
    # 'healthcare',
    # 'military',
    'natural',
    # 'office',
    # 'power',
    # 'public_transport',
    # 'railway',
    # 'place',
    # 'service',
    # 'tourism',
    # 'waterway',
    # 'route',
    # 'water'
]

# Tag keys whose individual OSM *values* are used as labels in
# `get_city_graph`; every other key in `features` is labelled by the key name.
# The explicit list below used to be assigned and then immediately overwritten,
# so it never took effect; it is kept only as a documented alternative.
# expand_features = ['shop', 'route', 'highway', 'waterway', 'width', 'length', 'aerialway', 'power', 'healthcare']
expand_features = features


In [3]:
# for item in ['shop', 'route', 'highway', 'waterway', 'width', 'length', 'aerialway', 'power', 'healthcare']:
# 	if item in features:
# 		features.remove(item)


In [4]:
def get_city_graph(lat, lon, dist, features, expand_features):
    """Download the bike network and surrounding OSM features for one area.

    Parameters
    ----------
    lat, lon : float
        Centre point handed to OSMnx as ``(lat, lon)``.
    dist : int or float
        Search distance forwarded unchanged to both OSMnx calls.
    features : list of str
        OSM tag keys to download (each is requested with ``True``).
    expand_features : list of str
        Tag keys whose tag *value* becomes the label of a feature. Keys in
        ``features`` that are not listed here label the feature with the key
        name itself.

    Returns
    -------
    g
        The graph returned by ``ox.graph_from_point`` (``network_type='bike'``).
    gdf
        Edge GeoDataFrame from ``mp.nx_to_gdf`` reprojected to EPSG:3857, with
        rows lacking a geometry removed and the index reset.
    amenities
        Downloaded features that have a geometry and a label; the label is
        stored in the ``'amenity'`` column.

    Notes
    -----
    Label priority: key-name labels of non-expanded tags come first (the last
    matching key in ``features`` wins), then the values of the expanded tags
    are used in the order given by ``expand_features``.
    """
    g = ox.graph_from_point((lat, lon), dist=dist, network_type='bike', simplify=True, retain_all=False)

    tags = {feat: True for feat in features}
    amenities = ox.features.features_from_point((lat, lon), tags=tags, dist=dist)
    # Independent copy so the column assignments below never write to a view.
    amenities = amenities[amenities.geometry.notnull()].copy()

    # Object dtype: the labels are strings, so a float NaN column would have
    # to be upcast on first assignment (pandas warns about exactly that).
    coarse_label = pd.Series(np.nan, index=amenities.index, dtype=object)
    for feat in features:
        # A requested tag that matched nothing has no column at all.
        if feat in expand_features or feat not in amenities.columns:
            continue
        coarse_label.loc[amenities[feat].notnull()] = feat

    # Build the final label in a separate Series. Writing the scratch column
    # into 'amenity' first (as the cell used to do) wiped the real OSM
    # 'amenity' values before they could be used as a fill source.
    label = coarse_label.copy()
    for feat in expand_features:
        if feat in amenities.columns:
            label = label.fillna(amenities[feat])

    # 'new_col' is retained only because earlier versions returned it.
    amenities['new_col'] = coarse_label
    amenities['amenity'] = label
    amenities = amenities[amenities['amenity'].notnull()].copy()

    gdf = mp.nx_to_gdf(g, points=False, lines=True, spatial_weights=True).to_crs(epsg=3857)
    gdf = gdf[gdf.geometry.notnull()].reset_index(drop=True)
    return g, gdf, amenities

g, gdf, amenities = get_city_graph(lat,
                                    lon,
                                    dist,
                                    features = features,
                                    expand_features = expand_features)


  amenities.loc[amenities[feat].notnull(), 'new_col'] = feat
  gdf = mp.nx_to_gdf(g, points=False, lines=True, spatial_weights=True).to_crs(epsg=3857)


In [5]:
### Build the line graph and carry the street-edge data over to it

def create_linegraph(g):
    """Return the line graph of ``g`` with edge data copied onto its nodes.

    ``g`` is first converted with ``nx.Graph(g)``. Every node of the returned
    graph is an edge key of that converted graph and receives that edge's
    attributes. Every edge of the returned graph gets a ``'weight'`` equal to
    the sum of the ``'length'`` attributes of the two street edges it joins.
    """
    simple = nx.Graph(g)
    line = nx.line_graph(simple)

    # Copy each street edge's attributes onto the matching line-graph node.
    for edge_key in line:
        line.nodes[edge_key].update(simple.edges[edge_key])

    # Weight of a hop between two street edges: their combined length.
    for first, second, data in line.edges(data=True):
        data['weight'] = simple.edges[first]['length'] + simple.edges[second]['length']
    return line

H = create_linegraph(g)


### EBC Calculation

In [6]:


def calc_bc(shortest_paths, graph):
    """Count how often each node lies on the supplied shortest paths.

    Parameters
    ----------
    shortest_paths : mapping
        ``shortest_paths[source]`` is a mapping whose values are paths
        (sequences of nodes) starting at ``source``.
    graph
        Object exposing ``nodes`` and ``number_of_nodes()``.

    Returns
    -------
    dict
        For every node in ``graph.nodes``, the number of paths containing it
        (each path counts a node at most once, endpoints included) divided by
        the squared number of nodes.
    """
    visits = dict.fromkeys(graph.nodes, 0)
    for source in tqdm(graph.nodes):
        for route in shortest_paths[source].values():
            # De-duplicate so a node is credited once per path.
            for visited in frozenset(route):
                visits[visited] += 1
    denominator = graph.number_of_nodes() ** 2
    return {node: hits / denominator for node, hits in visits.items()}

# Weighted shortest paths between all line-graph nodes, limited by `cutoff`
# (expressed in the same units as the 'weight' attribute), then the
# path-visit share of every node.
path_iter = nx.all_pairs_dijkstra_path(H, cutoff=1000, weight='weight')
ebc = dict(path_iter)
bc = calc_bc(ebc, H)


100%|██████████| 78168/78168 [00:29<00:00, 2672.34it/s]


In [7]:
# Re-key the centrality values by their (u, v) node tuples and attach them to
# the line graph under the 'bc' attribute.
bc2 = {(u, v): bc[(u, v)] for u, v in bc}

nx.set_node_attributes(H, values=bc2, name='bc')


In [8]:
# Node and edge GeoDataFrames of the street graph (kept from the original cell).
nodes, edges = mp.nx_to_gdf(g)

# Traffic counts: declare the file's coordinates as EPSG:4326, then reproject
# to EPSG:3857 so they can be clipped with the bounds of `gdf` in the next cell.
filepath = '../data/raw/trafiktaelling.json'
gdf2 = gpd.GeoDataFrame.from_file(filepath)
gdf2 = gdf2.set_crs(epsg=4326)
gdf2 = gdf2.to_crs(epsg=3857)

# Keep only the columns used downstream and the rows that actually carry a
# bicycle count. `.copy()` makes the result independent of `gdf2`, because
# the next cell modifies `gdf_new`.
gdf_new = gdf2[['id', 'vejnavn', 'geometry', 'aadt_cykler']]
gdf_new = gdf_new[gdf_new['aadt_cykler'].notnull()].copy()


  nodes, edges = mp.nx_to_gdf(g)


In [9]:
# Clip the counting stations to the bounding box of the street network and
# hand them on in EPSG:4326.
# The stations are first brought into the CRS of `gdf`, so the clip always
# compares like with like -- including when this cell is re-run after
# `gdf_new` has already been converted to EPSG:4326.
xmin, ymin, xmax, ymax = gdf.total_bounds
gdf_new = gdf_new.to_crs(gdf.crs).cx[xmin:xmax, ymin:ymax].to_crs(epsg=4326)


In [10]:
# Attach every counting station to its nearest street segment and store the
# station's bicycle count ('aadt') on the matching line-graph node. When
# several stations map to the same node, the closest one wins.
# NOTE(review): `gdf_new` is in EPSG:4326 at this point and the edge
# geometries presumably are too, so "nearest" and the stored distance are
# measured in degrees -- confirm, and consider projecting both layers first.
edge_records = list(g.edges(data=True))
linestrings = [data.get('geometry') for _, _, data in edge_records]
from_node = [u for u, _, _ in edge_records]
to_node = [v for _, v, _ in edge_records]

tree = STRtree(linestrings)
for _, row in tqdm(gdf_new.iterrows(), total=len(gdf_new)):
    point = row['geometry']
    if point is None:
        continue
    nearest_edge_idx = tree.nearest(point)
    # No candidate (e.g. empty tree or empty point geometry).
    if nearest_edge_idx is None:
        continue
    nearest_edge = linestrings[nearest_edge_idx]
    nearest_edge_distance = nearest_edge.distance(point)
    # Use the tree's index directly. Looking the geometry up again with
    # `list.index` is a linear scan per row and can return a different edge
    # that merely has an equal geometry.
    start_node = from_node[nearest_edge_idx]
    end_node = to_node[nearest_edge_idx]

    # Line-graph nodes are stored in one orientation only; try both.
    if (start_node, end_node) not in H.nodes:
        if (end_node, start_node) not in H.nodes:
            continue
        start_node, end_node = end_node, start_node

    segment = H.nodes[(start_node, end_node)]
    if ('aadt' not in segment
            or 'aadt_distance' not in segment
            or segment['aadt_distance'] > nearest_edge_distance):
        segment['aadt'] = row['aadt_cykler']
        segment['aadt_distance'] = nearest_edge_distance


100%|██████████| 670/670 [09:22<00:00,  1.19it/s]


In [11]:
# Nodes without a matched counting station get a count of 0.
# NOTE(review): this makes "not measured" indistinguishable from "zero
# traffic" in the target vector -- confirm that downstream training masks it.
for _, attrs in H.nodes(data=True):
    attrs.setdefault('aadt', 0)


In [12]:
# Count, for every line-graph node that has a geometry, how many downloaded
# features of each label lie nearest to it, and store one attribute per label.
# NOTE(review): the labels become attribute names directly, so a label that
# equals an existing attribute name would overwrite that attribute -- confirm
# that no such collision exists for the chosen tags.
segment_geoms = [(node, geom) for node, geom in H.nodes(data='geometry') if geom is not None]
nodes = [node for node, _ in segment_geoms]
linestrings = [geom for _, geom in segment_geoms]
assert len(nodes) == len(linestrings)

# Polygons are represented by their centroid; other geometry types are used as-is.
amenities['geometry'] = amenities['geometry'].apply(lambda x: x.centroid if x.geom_type == 'Polygon' else x)
tree = STRtree(linestrings)

# Tally in a separate mapping instead of a temporary 'amenity' list on the
# node: the temporary key had to be popped afterwards, which also deleted the
# count of any feature whose label is literally 'amenity'.
amenity_counts = {}
for geom, amenity in zip(amenities['geometry'], amenities['amenity']):
    nearest_idx = tree.nearest(geom)
    # No candidate (e.g. empty geometry or no node geometries at all).
    if nearest_idx is None:
        continue
    amenity_counts.setdefault(nodes[nearest_idx], Counter())[amenity] += 1

for node, counts in amenity_counts.items():
    H.nodes[node].update(counts)


In [13]:
# Reduce every node's attributes to plain numeric features:
# 1. drop identifiers, geometry and bookkeeping keys,
# 2. convert everything that is not already an int/float with `float()`,
# 3. drop whatever cannot be converted (lists, free-text values, ...).
non_feature_keys = ('geometry', 'osmid', 'name', 'highway', 'ref', 'aadt_dist', 'aadt_distance')

for _, attrs in H.nodes(data=True):
    for key in non_feature_keys:
        attrs.pop(key, None)

    # Iterate over a snapshot because keys are removed inside the loop.
    for key in list(attrs):
        if type(attrs[key]) in (int, float):
            continue
        try:
            attrs[key] = float(attrs[key])
        except (TypeError, ValueError):
            # Only "not convertible" is expected here; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            attrs.pop(key, None)


In [14]:
# Every attribute name present on at least one node, in order of first appearance.
all_feats = list(dict.fromkeys(
    key
    for _, attrs in H.nodes(data=True)
    for key in attrs
))


In [15]:
# Give every node the full feature set; features it lacks are set to 0.
for _, attrs in H.nodes(data=True):
    for feat in all_feats:
        attrs.setdefault(feat, 0)


In [16]:
### save node features in dataframe
node_features = []
for node in H.nodes(data=True):
    node_features.append(node[1])
node_features = pd.DataFrame(node_features).drop(columns=['aadt'])


In [17]:
# Convert the line graph into a torch_geometric `Data` object:
#   x          -- one row per node, one column per feature in `all_feats` except 'aadt'
#   y          -- the 'aadt' value of every node
#   edge_index -- node-index pairs of the line-graph edges, in both directions
feature_names = [feat for feat in all_feats if feat != 'aadt']

node_list, x, y = [], [], []
for node, feats in H.nodes(data=True):
    node_list.append(node)
    x.append([feats[feat] for feat in feature_names])
    y.append(feats['aadt'])

node_idx = {node: idx for idx, node in enumerate(node_list)}
edge_index = [[node_idx[s], node_idx[t]] for s, t in H.edges]

# reshape(-1, 2) keeps the tensor two-dimensional even when there are no edges.
edge_index = torch.tensor(edge_index, dtype=torch.long).reshape(-1, 2).t().contiguous()
if edge_index.numel() > 0:
    # `H.edges` lists each undirected edge once. Message passing follows
    # `edge_index` as directed pairs, so add the reverse direction and drop
    # duplicates to make information flow both ways.
    edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)
    edge_index = torch.unique(edge_index, dim=1)

x = torch.tensor(x, dtype=torch.float)
y = torch.tensor(y, dtype=torch.float)

linegraph = Data()
linegraph.num_nodes = len(node_list)
linegraph.x = x
linegraph.y = y
linegraph.edge_index = edge_index


In [18]:
# ### create file structure based on number of node features
# config_folder = glob.glob('../data/graphs/configs/*.txt')
# num_folder = None

# if len(config_folder) == 0:
#     print('Creating new folder')
#     os.mkdir('../data/graphs/1')
#     os.mkdir('../data/graphs/1/models')
#     with open('../data/graphs/configs/1.txt', 'w') as f:
#         f.write(f"features: {' '.join(features)}\n")
#         f.write(f"expand_features: {' '.join(expand_features)}\n")
#         f.write(f"distance: {dist}\n")

# for file in config_folder:
#     same_features = False
#     same_expand_features = False
#     same_distance = False
#     with open(file, 'r') as f:
#         config = f.readlines()
#     for line in config:
#         if 'distance' in line:
#             if int(line.split(':')[-1]) == dist:
#                 same_distance = True
#         if 'features' in line:
#             if set(line.split(':')[-1].split()) == set(features):
#                 same_features = True
#         if 'expand_features' in line:
#             if set(line.split(':')[-1].split()) == set(expand_features):
#                 same_expand_features = True
#     if same_distance and same_features and same_expand_features:
#         num_folder = file.split('/')[-1].split('.')[0]
#     else:
#         num_folder = len(config_folder) + 1
#         os.makedirs(f'../data/graphs/{num_folder}/models', exist_ok=True)
#         with open(f'../data/graphs/configs/{num_folder}.txt', 'w') as f:
#             f.write(f"features: {' '.join(features)}\n")
#             f.write(f"expand_features: {' '.join(expand_features)}\n")
#             f.write(f"distance: {dist}\n")
#     # Ensure the directory exists before writing the file
#     os.makedirs(f'../data/graphs/{num_folder}', exist_ok=True)
#     with open(f'../data/graphs/{num_folder}/linegraph_tg.pkl', 'wb') as f:
#         pickle.dump(linegraph, f)
#     node_features.to_csv(f'../data/graphs/{num_folder}/node_features.csv', index=False)

# for file in config_folder:
#     same_features = False
#     same_expand_features = False
#     same_distance = False
#     with open(file, 'r') as f:
#         config = f.readlines()
#     for line in config:
#         if 'distance' in line:
#             if int(line.split(':')[-1]) == dist:
#                 same_distance = True
#         if 'features' in line:
#             if set(line.split(':')[-1].split()) == set(features):
#                 same_features = True
#         if 'expand_features' in line:
#             if set(line.split(':')[-1].split()) == set(expand_features):
#                 same_expand_features = True
#     if same_distance and same_features and same_expand_features:
#         num_folder = file.split('/')[-1].split('.')[0]
#     else:
#         num_folder = len(config_folder) + 1
#         try:
#             os.mkdir(f'../data/graphs/{num_folder}')
#             os.mkdir(f'../data/graphs/{num_folder}/models')
#         except:
#             pass
#         with open(f'../data/graphs/configs/{num_folder}.txt', 'w') as f:
#             f.write(f"features: {' '.join(features)}\n")
#             f.write(f"expand_features: {' '.join(expand_features)}\n")
#             f.write(f"distance: {dist}\n")
#     with open(f'../data/graphs/{num_folder}/linegraph_tg.pkl', 'wb') as f:
#         pickle.dump(linegraph, f)


In [19]:
import os
import glob
import pickle

# Paths of all configuration files written by earlier runs.
config_pattern = '../data/graphs/configs/*.txt'
config_folder = glob.glob(config_pattern)

# Helper function to check if config matches
def config_matches(file_path, features, expand_features, dist):
    """Tell whether a stored config file describes the given settings.

    The file is read as ``key: value`` lines. ``distance`` is parsed as an
    integer; every other value is split on whitespace into a set, so the
    order of the listed names does not matter.

    Parameters
    ----------
    file_path : str
        Path of the config file to read.
    features, expand_features : iterable of str
        Tag lists of the current run.
    dist : int
        Distance of the current run.

    Returns
    -------
    bool
        ``True`` only when ``features``, ``expand_features`` and ``distance``
        in the file all equal the given values. Lines without a colon
        (e.g. blank lines) are ignored, and a ``distance`` that is not an
        integer makes the file a non-match.
    """
    config_dict = {}
    with open(file_path, 'r') as f:
        for line in f:
            key, separator, value = line.partition(':')
            if not separator:
                continue
            # Strip before comparing so "distance : 5" is recognised as well.
            key = key.strip()
            value = value.strip()
            if key == 'distance':
                try:
                    config_dict[key] = int(value)
                except ValueError:
                    return False
            else:
                config_dict[key] = set(value.split())

    return (
        config_dict.get('features', set()) == set(features) and
        config_dict.get('expand_features', set()) == set(expand_features) and
        config_dict.get('distance', None) == dist
    )

# Pick the numbered output folder for this run: reuse the folder whose stored
# config equals the current settings, otherwise allocate a new one.
def _write_config(folder_id, features, expand_features, dist):
    """Create the folders for ``folder_id`` and write its config file."""
    # The configs directory is not guaranteed to exist on a fresh checkout.
    os.makedirs('../data/graphs/configs', exist_ok=True)
    os.makedirs(f'../data/graphs/{folder_id}/models', exist_ok=True)
    with open(f'../data/graphs/configs/{folder_id}.txt', 'w') as f:
        f.write(f"features: {' '.join(features)}\n")
        f.write(f"expand_features: {' '.join(expand_features)}\n")
        f.write(f"distance: {dist}\n")


num_folder = None
for file in config_folder:
    if config_matches(file, features, expand_features, dist):
        num_folder = os.path.splitext(os.path.basename(file))[0]
        break

if num_folder is None:
    if not config_folder:
        print('Creating new folder structure...')
    # Continue after the highest id in use. Counting the files instead would
    # reuse -- and overwrite -- an existing id once a config has been deleted.
    config_stems = [os.path.splitext(os.path.basename(path))[0] for path in config_folder]
    used_ids = [int(stem) for stem in config_stems if stem.isdigit()]
    num_folder = str(max(used_ids, default=0) + 1)
    _write_config(num_folder, features, expand_features, dist)

# Persist the results of this run in its numbered folder: the node feature
# table as CSV and the torch_geometric graph as a pickle.
graph_dir = f'../data/graphs/{num_folder}'
os.makedirs(graph_dir, exist_ok=True)

node_features.to_csv(f'../data/graphs/{num_folder}/node_features.csv', index=False)

with open(f'../data/graphs/{num_folder}/linegraph_tg.pkl', 'wb') as graph_file:
    pickle.dump(linegraph, graph_file)
